update dependencies, fix up vulkan initialization, start model loading to gpu

2025-11-21 18:45:12 +11:00 · 2025-11-21 18:45:12 +11:00 · bd4e1cc07e
commit bd4e1cc07e
parent 25899ff448
12 changed files with 2612 additions and 2659 deletions
--- a/assets/models/DamagedHelmet.glb
+++ b/assets/models/DamagedHelmet.glb
--- a/assets/shaders/gradient.comp.spv
+++ b/assets/shaders/gradient.comp.spv
--- a/assets/shaders/pbr.frag.spv
+++ b/assets/shaders/pbr.frag.spv
--- a/assets/shaders/pbr.vert.spv
+++ b/assets/shaders/pbr.vert.spv
--- a/dub.json
+++ b/dub.json
@@ -7,7 +7,7 @@
 			"targetType": "executable",
 			"targetName": "Gears",
 			"targetPath": "build",
-			"sourceFiles-linux": ["build/libvma.a", "build/libstb.a", "build/libm3d.a", "build/libcglm.a"],
+			"sourceFiles-linux": ["build/libvma.a", "build/libstb.a", "build/libm3d.a", "build/libcglm.a", "build/libcgltf.a"],
 			"sourceFiles-windows": [],
 			"importPaths": ["src/gears", "src/dlib", "src/dlib/external/xxhash", "src/VulkanRenderer"],
 			"sourcePaths": ["src/gears", "src/dlib", "src/dlib/external/xxhash", "src/VulkanRenderer"],
@@ -17,7 +17,7 @@
 			"preGenerateCommands-linux": ["./build.sh"],
 			"preGenerateCommands-windows": [],
 			"dflags": ["-Xcc=-mno-sse", "-P-I/usr/include/freetype2", "-Jbuild", "-Jassets/fonts"],
-			"dflags-dmd": ["-P=-DSTBI_NO_SIMD"]
+			"dflags-dmd": []
 		},
 		{
 			"name": "packer",
--- a/external/inteli/bmi2intrin.d
+++ b/external/inteli/bmi2intrin.d
@@ -1,363 +1,363 @@
-/**
-* BMI2 intrinsics.
-* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=BMI2
-*
-* Copyright: Copyright Johan Engelen 2021.
-* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
-*/
-module inteli.bmi2intrin;
-
-import inteli.internals;
-
-nothrow @nogc pure @safe:
-
-/// Copy all bits from unsigned 32-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index.
-uint _bzhi_u32 (uint a, uint index)
-{
-    static if (GDC_or_LDC_with_BMI2)
-    {
-        if (!__ctfe)
-            return __builtin_ia32_bzhi_si(a, index);
-        else
-            return bzhi!uint(a, index);
-    }
-    else
-    {
-        return bzhi!uint(a, index);
-    }
-}
-unittest
-{
-    static assert (_bzhi_u32(0x1234_5678, 5) == 0x18);
-           assert (_bzhi_u32(0x1234_5678, 5) == 0x18);
-    static assert (_bzhi_u32(0x1234_5678, 10) == 0x278);
-           assert (_bzhi_u32(0x1234_5678, 10) == 0x278);
-    static assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678);
-           assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678);
-}
-
-/// Copy all bits from unsigned 64-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index.
-ulong _bzhi_u64 (ulong a, uint index)
-{
-    static if (GDC_or_LDC_with_BMI2)
-    {
-        if (!__ctfe)
-        {
-            version(X86_64)
-            {
-                // This instruction not available in 32-bit x86.
-                return __builtin_ia32_bzhi_di(a, index);
-            }
-            else
-                return bzhi!ulong(a, index);
-        }
-        else
-            return bzhi!ulong(a, index);
-    }
-    else
-    {
-        return bzhi!ulong(a, index);
-    }
-}
-unittest
-{
-    static assert (_bzhi_u64(0x1234_5678, 5) == 0x18);
-           assert (_bzhi_u64(0x1234_5678, 5) == 0x18);
-    static assert (_bzhi_u64(0x1234_5678, 10) == 0x278);
-           assert (_bzhi_u64(0x1234_5678, 10) == 0x278);
-    static assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678);
-           assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678);
-    static assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
-           assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
-}
-
-// Helper function for BZHI
-private T bzhi(T)(T a, uint index)
-{
-    /+
-        n := index[7:0]
-        dst := a
-        IF (n < number of bits)
-            dst[MSB:n] := 0
-        FI
-    +/
-    enum numbits = T.sizeof*8;
-    T dst = a;
-    if (index < numbits)
-    {
-        T mask = (T(1) << index) - 1;
-        dst &= mask;
-    }
-    return dst;
-}
-
-/// Multiply unsigned 32-bit integers `a` and `b`, store the low 32-bits of the result in dst, 
-/// and store the high 32-bits in `hi`. This does not read or write arithmetic flags.
-/// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say.
-///       But, those particular semantics don't exist at the level of intrinsics.
-uint _mulx_u32 (uint a, uint b, uint* hi)
-{
-    // Note: that does NOT generate mulx with LDC, and there seems to be no way to do that for
-    // some reason, even with LLVM IR.
-    // Also same with GDC.
-    ulong result = cast(ulong) a * b;
-    *hi = cast(uint) (result >>> 32);
-    return cast(uint)result;
-}
-@system unittest
-{
-    uint hi;
-    assert (_mulx_u32(0x1234_5678, 0x1234_5678, &hi) == 0x1DF4_D840);
-    assert (hi == 0x014B_66DC);
-}
-
-/// Multiply unsigned 64-bit integers `a` and `b`, store the low 64-bits of the result in dst, and 
-/// store the high 64-bits in `hi`. This does not read or write arithmetic flags.
-/// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say.
-///       But, those particular semantics don't exist at the level of intrinsics.
-ulong _mulx_u64 (ulong a, ulong b, ulong* hi)
-{
-    /+
-        dst[63:0] := (a * b)[63:0]
-        MEM[hi+63:hi]  := (a * b)[127:64]
-    +/
-
-    static if (LDC_with_optimizations)
-    {
-        static if (__VERSION__ >= 2094)
-            enum bool withLDCIR = true;
-        else
-            enum bool withLDCIR = false;
-    }
-    else
-    {
-        enum bool withLDCIR = false;
-    }
-
-    static if (withLDCIR)
-    {
-        // LDC x86: Generates mulx from -O0
-        enum ir = `
-            %4 = zext i64 %0 to i128
-            %5 = zext i64 %1 to i128
-            %6 = mul nuw i128 %5, %4
-            %7 = lshr i128 %6, 64
-            %8 = trunc i128 %7 to i64
-            store i64 %8, i64* %2, align 8
-            %9 = trunc i128 %6 to i64
-            ret i64 %9`;
-        return LDCInlineIR!(ir, ulong, ulong, ulong, ulong*)(a, b, hi);
-    }
-    else
-    {
-        /+ Straight-forward implementation with `ucent`:
-        ucent result = cast(ucent) a * b;
-        *hi = cast(ulong) ((result >>> 64) & 0xFFFF_FFFF_FFFF_FFFF);
-        return cast(ulong) (result & 0xFFFF_FFFF_FFFF_FFFF);
-        +/
-
-        /+
-            Implementation using 64bit math is more complex...
-            a * b = (a_high << 32 + a_low) * (b_high << 32 + b_low)
-                  = (a_high << 32)*(b_high << 32) + (a_high << 32)*b_low + a_low* (b_high << 32) + a_low*b_low
-                  = (a_high*b_high) << 64 + (a_high*b_low) << 32 + (a_low*b_high) << 32 + a_low*b_low
-                  = c2 << 64 + c11 << 32 + c12 << 32 + c0
-                  = z1 << 64  +  z0
-        // The sums may overflow, so we need to carry the carry (from low 64bits to high 64bits). We can do that
-        // by separately creating the sum to get the high 32 bits of z0 using 64bit math. The high 32 bits of that
-        // intermediate result is then the 'carry' that we need to add when calculating z1's sum.
-            z0 = (c0 & 0xFFFF_FFFF) + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) << 32
-        The carry part from z0's sum = (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32
-            z1 = c2 + (c11 >> 32 + c12 >> 32 + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32
-        +/
-
-        const ulong a_low = a & 0xFFFF_FFFF;
-        const ulong a_high = a >>> 32;
-        const ulong b_low = b & 0xFFFF_FFFF;
-        const ulong b_high = b >>> 32;
-
-        const ulong c2 = a_high*b_high;
-        const ulong c11 = a_high*b_low;
-        const ulong c12 = a_low*b_high;
-        const ulong c0 = a_low*b_low;
-
-        const ulong common_term = (c0 >> 32) + (c11 & 0xFFFF_FFFF) + (c12 & 0xFFFF_FFFF);
-        const ulong z0 = (c0 & 0xFFFF_FFFF) + (common_term << 32);
-        const ulong z1 = c2 + (c11 >> 32) + (c12 >> 32) + (common_term >> 32);
-
-        *hi = z1;
-        return z0;
-    }
-}
-@system unittest
-{
-    ulong hi;
-    // 0x1234_5678_9ABC_DEF0 * 0x1234_5678_9ABC_DEF0 == 0x14b_66dc_33f6_acdc_a5e2_0890_f2a5_2100
-    assert (_mulx_u64(0x1234_5678_9ABC_DEF0, 0x1234_5678_9ABC_DEF0, &hi) == 0xa5e2_0890_f2a5_2100);
-    assert (hi == 0x14b_66dc_33f6_acdc);
-}
-
-/// Deposit contiguous low bits from unsigned 32-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero.
-uint _pdep_u32 (uint a, uint mask)
-{
-    static if (GDC_or_LDC_with_BMI2)
-    {
-        if (!__ctfe)
-            return __builtin_ia32_pdep_si(a, mask);
-        else
-            return pdep!uint(a, mask);
-    }
-    else
-    {
-        return pdep!uint(a, mask);
-    }
-}
-unittest
-{
-    static assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708);
-           assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708);
-}
-
-/// Deposit contiguous low bits from unsigned 64-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero.
-ulong _pdep_u64 (ulong a, ulong mask)
-{
-    static if (GDC_or_LDC_with_BMI2)
-    {
-        if (!__ctfe)
-        {
-            version(X86_64)
-            {
-                // This instruction not available in 32-bit x86.
-                return __builtin_ia32_pdep_di(a, mask);
-            }
-            else
-                return pdep!ulong(a, mask);
-        }
-        else
-            return pdep!ulong(a, mask);
-    }
-    else
-    {
-        return pdep!ulong(a, mask);
-    }
-}
-unittest
-{
-    static assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201);
-           assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201);
-}
-
-// Helper function for PDEP
-private T pdep(T)(T a, T mask)
-{
-    /+
-        tmp := a
-        dst := 0
-        m := 0
-        k := 0
-        DO WHILE m < 32
-            IF mask[m] == 1
-                dst[m] := tmp[k]
-                k := k + 1
-            FI
-            m := m + 1
-        OD
-    +/
-    T dst;
-    T k_bitpos = 1;
-    T m_bitpos = 1; // for each iteration, this has one bit set to 1 in the position probed
-    foreach (m; 0..T.sizeof*8)
-    {
-        if (mask & m_bitpos)
-        {
-            dst |= (a & k_bitpos) ? m_bitpos : 0;
-            k_bitpos <<= 1;
-        }
-        m_bitpos <<= 1;
-    }
-    return dst;
-}
-
-
-/// Extract bits from unsigned 32-bit integer `a` at the corresponding bit locations specified by 
-/// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero.
-uint _pext_u32 (uint a, uint mask)
-{
-    static if (GDC_or_LDC_with_BMI2)
-    {
-        if (!__ctfe)
-            return __builtin_ia32_pext_si(a, mask);
-        else
-            return pext!uint(a, mask);
-    }
-    else
-    {
-        return pext!uint(a, mask);
-    }
-}
-unittest
-{
-    static assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468);
-           assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468);
-}
-
-/// Extract bits from unsigned 64-bit integer `a` at the corresponding bit locations specified by 
-/// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero.
-ulong _pext_u64 (ulong a, ulong mask)
-{
-    static if (GDC_or_LDC_with_BMI2)
-    {
-        if (!__ctfe)
-        {
-            version(X86_64)
-            {
-                // This instruction not available in 32-bit x86.
-                return __builtin_ia32_pext_di(a, mask);
-            }
-            else
-                return pext!ulong(a, mask);
-        }
-        else
-            return pext!ulong(a, mask);
-    }
-    else
-    {
-        return pext!ulong(a, mask);
-    }
-}
-unittest
-{
-    static assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531);
-           assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531);
-}
-
-// Helper function for PEXT
-private T pext(T)(T a, T mask)
-{
-    /+
-        tmp := a
-        dst := 0
-        m := 0
-        k := 0
-        DO WHILE m < number of bits in T
-            IF mask[m] == 1
-                dst[k] := tmp[m]
-                k := k + 1
-            FI
-            m := m + 1
-        OD
-    +/
-    T dst;
-    T k_bitpos = 1;
-    T m_bitpos = 1; // for each iteration, this has one bit set to 1 in the position probed
-    foreach (m; 0..T.sizeof*8)
-    {
-        if (mask & m_bitpos)
-        {
-            dst |= (a & m_bitpos) ? k_bitpos : 0;
-            k_bitpos <<= 1;
-        }
-        m_bitpos <<= 1;
-    }
-    return dst;
-}
+/**
+* BMI2 intrinsics.
+* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=BMI2
+*
+* Copyright: Copyright Johan Engelen 2021.
+* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
+*/
+module inteli.bmi2intrin;
+
+import inteli.internals;
+
+nothrow @nogc pure @safe:
+
+/// Copy all bits from unsigned 32-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index.
+uint _bzhi_u32 (uint a, uint index)
+{
+    static if (GDC_or_LDC_with_BMI2)
+    {
+        if (!__ctfe)
+            return __builtin_ia32_bzhi_si(a, index);
+        else
+            return bzhi!uint(a, index);
+    }
+    else
+    {
+        return bzhi!uint(a, index);
+    }
+}
+unittest
+{
+    static assert (_bzhi_u32(0x1234_5678, 5) == 0x18);
+           assert (_bzhi_u32(0x1234_5678, 5) == 0x18);
+    static assert (_bzhi_u32(0x1234_5678, 10) == 0x278);
+           assert (_bzhi_u32(0x1234_5678, 10) == 0x278);
+    static assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678);
+           assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678);
+}
+
+/// Copy all bits from unsigned 64-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index.
+ulong _bzhi_u64 (ulong a, uint index)
+{
+    static if (GDC_or_LDC_with_BMI2)
+    {
+        if (!__ctfe)
+        {
+            version(X86_64)
+            {
+                // This instruction not available in 32-bit x86.
+                return __builtin_ia32_bzhi_di(a, index);
+            }
+            else
+                return bzhi!ulong(a, index);
+        }
+        else
+            return bzhi!ulong(a, index);
+    }
+    else
+    {
+        return bzhi!ulong(a, index);
+    }
+}
+unittest
+{
+    static assert (_bzhi_u64(0x1234_5678, 5) == 0x18);
+           assert (_bzhi_u64(0x1234_5678, 5) == 0x18);
+    static assert (_bzhi_u64(0x1234_5678, 10) == 0x278);
+           assert (_bzhi_u64(0x1234_5678, 10) == 0x278);
+    static assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678);
+           assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678);
+    static assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
+           assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
+}
+
+// Helper function for BZHI
+private T bzhi(T)(T a, uint index)
+{
+    /+
+        n := index[7:0]
+        dst := a
+        IF (n < number of bits)
+            dst[MSB:n] := 0
+        FI
+    +/
+    enum numbits = T.sizeof*8;
+    T dst = a;
+    if (index < numbits)
+    {
+        T mask = (T(1) << index) - 1;
+        dst &= mask;
+    }
+    return dst;
+}
+
+/// Multiply unsigned 32-bit integers `a` and `b`, store the low 32-bits of the result in dst, 
+/// and store the high 32-bits in `hi`. This does not read or write arithmetic flags.
+/// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say.
+///       But, those particular semantics don't exist at the level of intrinsics.
+uint _mulx_u32 (uint a, uint b, uint* hi)
+{
+    // Note: that does NOT generate mulx with LDC, and there seems to be no way to do that for
+    // some reason, even with LLVM IR.
+    // Also same with GDC.
+    ulong result = cast(ulong) a * b;
+    *hi = cast(uint) (result >>> 32);
+    return cast(uint)result;
+}
+@system unittest
+{
+    uint hi;
+    assert (_mulx_u32(0x1234_5678, 0x1234_5678, &hi) == 0x1DF4_D840);
+    assert (hi == 0x014B_66DC);
+}
+
+/// Multiply unsigned 64-bit integers `a` and `b`, store the low 64-bits of the result in dst, and 
+/// store the high 64-bits in `hi`. This does not read or write arithmetic flags.
+/// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say.
+///       But, those particular semantics don't exist at the level of intrinsics.
+ulong _mulx_u64 (ulong a, ulong b, ulong* hi)
+{
+    /+
+        dst[63:0] := (a * b)[63:0]
+        MEM[hi+63:hi]  := (a * b)[127:64]
+    +/
+
+    static if (LDC_with_optimizations)
+    {
+        static if (__VERSION__ >= 2094)
+            enum bool withLDCIR = true;
+        else
+            enum bool withLDCIR = false;
+    }
+    else
+    {
+        enum bool withLDCIR = false;
+    }
+
+    static if (withLDCIR)
+    {
+        // LDC x86: Generates mulx from -O0
+        enum ir = `
+            %4 = zext i64 %0 to i128
+            %5 = zext i64 %1 to i128
+            %6 = mul nuw i128 %5, %4
+            %7 = lshr i128 %6, 64
+            %8 = trunc i128 %7 to i64
+            store i64 %8, i64* %2, align 8
+            %9 = trunc i128 %6 to i64
+            ret i64 %9`;
+        return LDCInlineIR!(ir, ulong, ulong, ulong, ulong*)(a, b, hi);
+    }
+    else
+    {
+        /+ Straight-forward implementation with `ucent`:
+        ucent result = cast(ucent) a * b;
+        *hi = cast(ulong) ((result >>> 64) & 0xFFFF_FFFF_FFFF_FFFF);
+        return cast(ulong) (result & 0xFFFF_FFFF_FFFF_FFFF);
+        +/
+
+        /+
+            Implementation using 64bit math is more complex...
+            a * b = (a_high << 32 + a_low) * (b_high << 32 + b_low)
+                  = (a_high << 32)*(b_high << 32) + (a_high << 32)*b_low + a_low* (b_high << 32) + a_low*b_low
+                  = (a_high*b_high) << 64 + (a_high*b_low) << 32 + (a_low*b_high) << 32 + a_low*b_low
+                  = c2 << 64 + c11 << 32 + c12 << 32 + c0
+                  = z1 << 64  +  z0
+        // The sums may overflow, so we need to carry the carry (from low 64bits to high 64bits). We can do that
+        // by separately creating the sum to get the high 32 bits of z0 using 64bit math. The high 32 bits of that
+        // intermediate result is then the 'carry' that we need to add when calculating z1's sum.
+            z0 = (c0 & 0xFFFF_FFFF) + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) << 32
+        The carry part from z0's sum = (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32
+            z1 = c2 + (c11 >> 32 + c12 >> 32 + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32
+        +/
+
+        const ulong a_low = a & 0xFFFF_FFFF;
+        const ulong a_high = a >>> 32;
+        const ulong b_low = b & 0xFFFF_FFFF;
+        const ulong b_high = b >>> 32;
+
+        const ulong c2 = a_high*b_high;
+        const ulong c11 = a_high*b_low;
+        const ulong c12 = a_low*b_high;
+        const ulong c0 = a_low*b_low;
+
+        const ulong common_term = (c0 >> 32) + (c11 & 0xFFFF_FFFF) + (c12 & 0xFFFF_FFFF);
+        const ulong z0 = (c0 & 0xFFFF_FFFF) + (common_term << 32);
+        const ulong z1 = c2 + (c11 >> 32) + (c12 >> 32) + (common_term >> 32);
+
+        *hi = z1;
+        return z0;
+    }
+}
+@system unittest
+{
+    ulong hi;
+    // 0x1234_5678_9ABC_DEF0 * 0x1234_5678_9ABC_DEF0 == 0x14b_66dc_33f6_acdc_a5e2_0890_f2a5_2100
+    assert (_mulx_u64(0x1234_5678_9ABC_DEF0, 0x1234_5678_9ABC_DEF0, &hi) == 0xa5e2_0890_f2a5_2100);
+    assert (hi == 0x14b_66dc_33f6_acdc);
+}
+
+/// Deposit contiguous low bits from unsigned 32-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero.
+uint _pdep_u32 (uint a, uint mask)
+{
+    static if (GDC_or_LDC_with_BMI2)
+    {
+        if (!__ctfe)
+            return __builtin_ia32_pdep_si(a, mask);
+        else
+            return pdep!uint(a, mask);
+    }
+    else
+    {
+        return pdep!uint(a, mask);
+    }
+}
+unittest
+{
+    static assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708);
+           assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708);
+}
+
+/// Deposit contiguous low bits from unsigned 64-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero.
+ulong _pdep_u64 (ulong a, ulong mask)
+{
+    static if (GDC_or_LDC_with_BMI2)
+    {
+        if (!__ctfe)
+        {
+            version(X86_64)
+            {
+                // This instruction not available in 32-bit x86.
+                return __builtin_ia32_pdep_di(a, mask);
+            }
+            else
+                return pdep!ulong(a, mask);
+        }
+        else
+            return pdep!ulong(a, mask);
+    }
+    else
+    {
+        return pdep!ulong(a, mask);
+    }
+}
+unittest
+{
+    static assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201);
+           assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201);
+}
+
+// Helper function for PDEP
+private T pdep(T)(T a, T mask)
+{
+    /+
+        tmp := a
+        dst := 0
+        m := 0
+        k := 0
+        DO WHILE m < 32
+            IF mask[m] == 1
+                dst[m] := tmp[k]
+                k := k + 1
+            FI
+            m := m + 1
+        OD
+    +/
+    T dst;
+    T k_bitpos = 1;
+    T m_bitpos = 1; // for each iteration, this has one bit set to 1 in the position probed
+    foreach (m; 0..T.sizeof*8)
+    {
+        if (mask & m_bitpos)
+        {
+            dst |= (a & k_bitpos) ? m_bitpos : 0;
+            k_bitpos <<= 1;
+        }
+        m_bitpos <<= 1;
+    }
+    return dst;
+}
+
+
+/// Extract bits from unsigned 32-bit integer `a` at the corresponding bit locations specified by 
+/// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero.
+uint _pext_u32 (uint a, uint mask)
+{
+    static if (GDC_or_LDC_with_BMI2)
+    {
+        if (!__ctfe)
+            return __builtin_ia32_pext_si(a, mask);
+        else
+            return pext!uint(a, mask);
+    }
+    else
+    {
+        return pext!uint(a, mask);
+    }
+}
+unittest
+{
+    static assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468);
+           assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468);
+}
+
+/// Extract bits from unsigned 64-bit integer `a` at the corresponding bit locations specified by 
+/// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero.
+ulong _pext_u64 (ulong a, ulong mask)
+{
+    static if (GDC_or_LDC_with_BMI2)
+    {
+        if (!__ctfe)
+        {
+            version(X86_64)
+            {
+                // This instruction not available in 32-bit x86.
+                return __builtin_ia32_pext_di(a, mask);
+            }
+            else
+                return pext!ulong(a, mask);
+        }
+        else
+            return pext!ulong(a, mask);
+    }
+    else
+    {
+        return pext!ulong(a, mask);
+    }
+}
+unittest
+{
+    static assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531);
+           assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531);
+}
+
+// Helper function for PEXT
+private T pext(T)(T a, T mask)
+{
+    /+
+        tmp := a
+        dst := 0
+        m := 0
+        k := 0
+        DO WHILE m < number of bits in T
+            IF mask[m] == 1
+                dst[k] := tmp[m]
+                k := k + 1
+            FI
+            m := m + 1
+        OD
+    +/
+    T dst;
+    T k_bitpos = 1;
+    T m_bitpos = 1; // for each iteration, this has one bit set to 1 in the position probed
+    foreach (m; 0..T.sizeof*8)
+    {
+        if (mask & m_bitpos)
+        {
+            dst |= (a & m_bitpos) ? k_bitpos : 0;
+            k_bitpos <<= 1;
+        }
+        m_bitpos <<= 1;
+    }
+    return dst;
+}
--- a/external/inteli/smmintrin.d
+++ b/external/inteli/smmintrin.d
--- a/src/VulkanRenderer
+++ b/src/VulkanRenderer
@@ -1 +1 @@
-Subproject commit d46741a48033b5136fa189c1b80a574986e68f64
+Subproject commit c42238a456f5048c7d1b2d5ebd71ecf13bb10ece
--- a/src/dlib
+++ b/src/dlib
@@ -1 +1 @@
-Subproject commit c83ffabce69071a3e7a0af3f26aa420082eeda1f
+Subproject commit 493c17cba26952861ae4c5402f1676013b8317c6
--- a/src/gears/game.d
+++ b/src/gears/game.d
@@ -1,4 +1,5 @@
 import dlib;
+/*
 public import vulkan : PlatformHandles;
 import vulkan : Destroy;
 import vulkan;
@@ -414,6 +415,7 @@ DrawRect(Game* g, f32 p0_x, f32 p0_y, f32 p1_x, f32 p1_y, Vec4 col)
 	AddUIIndices(g);
 }

+/*
 // TODO: integrate this with vulkan again
 Model
 LoadModel(Game* g, string name)
@@ -697,3 +699,4 @@ ReadModel(Game* g, string name)
 		const(char)[] mat_name = m3d.material[i].name[0 .. strlen(m3d.material[i].name)];
 	}
 }
+*/
--- a/src/gears/game2.d
+++ b/src/gears/game2.d
@@ -10,8 +10,6 @@ ImageView[256] TEXTURES;
 Buffer[256]    MATERIALS;
 Buffer[256]    MODEL_STATES;

-DescIndices[DESC_SET_MAX] DESC_INDICES];
-
 struct GameState
 {
 	RenderState rds;
@@ -29,6 +27,7 @@ struct RenderState
 	Pipeline[PID.Max]        pipelines;
 	DescSetLayout            desc_layout_globals;
 	DescSetLayout            desc_layout_resources;
+	DescSetLayout            desc_layout_state;
 	DescSet[2]               desc_set_globals;
 	PipelineLayout           pipeline_layout_pbr;

@@ -41,13 +40,6 @@ struct RenderState
 	Buffer                   globals_buffer;
 }

-struct DescIndices
-{
-	u32 tex;
-	u32 mat;
-	u32 state;
-}
-
 struct ShaderGlobals
 {
 	Vec4 ambient;
@@ -57,41 +49,6 @@ struct ShaderGlobals
 	f32  alpha     = 0.0;
 }

-struct MeshPart
-{
-	u32       mat;
-	u32       offset;
-	u32       length;
-	PushConst pc;
-
-	alias pc this;
-}
-
-struct ModelData
-{
-	MeshPart[]    parts;
-	ModelState    state;
-	Vertex[]      v;
-	u32[]         idx;
-	TextureData[] tex;
-}
-
-struct TextureData
-{
-	u8[] name;
-	u8[] data;
-	u32  width;
-	u32  height;
-	u32  ch;
-}
-
-struct Model
-{
-	Buffer     v_buf;
-	Buffer     i_buf;
-	MeshPart[] parts;
-}
-
 struct ModelRenderInfo
 {
 	PushConst  pc;
@@ -105,15 +62,6 @@ struct ModelState
 	Mat4 matrix;
 }

-struct Material
-{
-	Vec4  ambient;
-	Vec4  diffuse;
-	Vec4  specular;
-	float shininess;
-	float alpha;
-}
-
 enum PBRMod : u32
 {
 	AlbedoValue     = 0x0001,
@@ -182,17 +130,14 @@ struct PushConst
 	}
 }

-struct Vertex
-{
-	Vec4 col;
-	Vec4 tangent;
-	Vec3 pos;
-	Vec3 normal;
-	Vec2 uv;
-}
-
 ModelData g_box;

+void
+RunCycle(GameState* g)
+{
+
+}
+
 GameState
 InitGame(PlatformWindow* window)
 {
@@ -225,7 +170,7 @@ Init(RenderState* rds, PlatformWindow* window)
 		{ binding: 2, descriptorType: DT.StorageImage,    descriptorCount: 1, stageFlags: SS.All },
 	];

-	DescLayoutBinding[3] resource_bindings = [
+	DescLayoutBinding[6] resource_bindings = [
 		{ binding: 0, descriptorType: DT.Image,   descriptorCount: 1, stageFlags: SS.All },
 		{ binding: 1, descriptorType: DT.Image,   descriptorCount: 1, stageFlags: SS.All },
 		{ binding: 2, descriptorType: DT.Image,   descriptorCount: 1, stageFlags: SS.All },
@@ -234,8 +179,12 @@ Init(RenderState* rds, PlatformWindow* window)
 		{ binding: 5, descriptorType: DT.Uniform, descriptorCount: 1, stageFlags: SS.All },
 	];

+	DescLayoutBinding[1] state_bindings = [
+		{ binding: 0, descriptorType: DT.Uniform, descriptorCount: 1, stageFlags: SS.All },
+	];
+
 	Attribute[5] attributes = [
-		{ binding: 0, location: 0, format: FMT.RGBA_F32, offset: Vertex.col.offsetof     },
+		{ binding: 0, location: 0, format: FMT.RGBA_F32, offset: Vertex.color.offsetof   },
 		{ binding: 0, location: 1, format: FMT.RGBA_F32, offset: Vertex.tangent.offsetof },
 		{ binding: 0, location: 2, format: FMT.RGB_F32,  offset: Vertex.pos.offsetof     },
 		{ binding: 0, location: 3, format: FMT.RGB_F32,  offset: Vertex.normal.offsetof  },
@@ -251,8 +200,9 @@ Init(RenderState* rds, PlatformWindow* window)

 	rds.desc_layout_globals   = CreateDescSetLayout(&rds.rd, global_bindings);
 	rds.desc_layout_resources = CreateDescSetLayout(&rds.rd, resource_bindings);
+	rds.desc_layout_state     = CreateDescSetLayout(&rds.rd, state_bindings);

-	rds.pipeline_layout_pbr   = CreatePipelineLayout(&rds.rd, [rds.desc_layout_globals, rds.desc_layout_resources], PushConst.sizeof);
+	rds.pipeline_layout_pbr   = CreatePipelineLayout(&rds.rd, [rds.desc_layout_globals, rds.desc_layout_resources, rds.desc_layout_state], PushConst.sizeof);

 	foreach(i; 0 .. 2)
 	{
@@ -273,8 +223,8 @@ Init(RenderState* rds, PlatformWindow* window)
 	};

 	GfxPipelineInfo pbr_info = {
-		vertex_shader:     LoadAssetData(&rds.frame_arenas[0], "shaders/pbr.vert.spv"),
-		frag_shader:       LoadAssetData(&rds.frame_arenas[0], "shaders/pbr.frag.spv"),
+		vertex_shader:     LoadFile(&rds.frame_arenas[0], "assets/shaders/pbr.vert.spv"),
+		frag_shader:       LoadFile(&rds.frame_arenas[0], "assets/shaders/pbr.frag.spv"),
 		input_rate:        IR.Vertex,
 		input_rate_stride: Vertex.sizeof,
 		layout:            rds.pipeline_layout_pbr,
@@ -315,7 +265,7 @@ Init(RenderState* rds, PlatformWindow* window)

 	CreateBuffer(&rds.rd, &rds.globals_buffer, BT.Uniform, ShaderGlobals.sizeof, false);

-	g_box = MakeBox
+	ModelData md = LoadGLTF(&rds.frame_arenas[0], "assets/models/DamagedHelmet.glb");
 }

 PipelineID
@@ -381,6 +331,7 @@ GetPBRMod(bool albedo = false, bool ambient = false, bool specular = false, bool
 	}
 }

+/*
 ModelData
 MakeBox(RenderState* rds, f32 width, f32 height, Vec4 col)
 {
@@ -428,9 +379,9 @@ Model
 Upload(RenderState* rds, ModelData* data)
 {
 	Model model;
-	u32[] tex_idx   = Alloc!(&rds.frame_arenas[0], data.text.length);
-	u32[] mat_idx   = Alloc!(&rds.frame_arenas[0], data.materials.length);
-	u32[] state_idx = Alloc!(&rds.frame_arenas[0], data.model_states.length);
+	u32[] tex_idx   = Alloc!(u32)(&rds.frame_arenas[0], data.text.length);
+	u32[] mat_idx   = Alloc!(u32)(&rds.frame_arenas[0], data.materials.length);
+	u32[] state_idx = Alloc!(u32)(&rds.frame_arenas[0], data.model_states.length);

 	bool result = true;

@@ -450,20 +401,18 @@ Upload(RenderState* rds, ModelData* data)
 	{
 		Buffer* buf = &rds.materials[rds.imat++];
 		CreateBuffer(&rds.rd, buf, BT.Uniform, Material.sizeof);
-		result = Transfer(&rds.rd, buf, )
+		//result = Transfer(&rds.rd, buf, )
 	}

 	for(u64 i = 0; i < data.model_states.length; i += 1)
 	{
 		Buffer* buf = &rds.model_states[rds.istate++];
-		CreateBuffer(&rds)
+		//CreateBuffer(&rds)
 	}

 	model.parts = data.parts;
 }
-
-DescIndices*
-GetDescIndices()
+*/

 unittest
 {
--- a/src/shaders/structures.layout
+++ b/src/shaders/structures.layout
@@ -44,7 +44,8 @@ layout (set = 2, binding = 4) uniform MaterialData {
 	float shininess;
 	float alpha;
 } Material;
-layout (set = 2, binding = 5) uniform ModelState {
+
+layout (set = 3, binding = 0) uniform ModelState {
 	mat4 model_matrix;
 } State;