add clean up for model transfers, add math and intrinsics libs

This commit is contained in:
matthew 2025-07-20 17:30:29 +10:00
parent 6127a0bb70
commit ba4ccad085
25 changed files with 31148 additions and 75 deletions

View File

@ -9,8 +9,8 @@
"targetPath": "build",
"sourceFiles-linux": ["build/libvma.a", "build/libstb_image.a", "build/libm3d.a"],
"sourceFiles-windows": [],
"importPaths": ["src/gears", "src/shared", "src/generated", "external/xxhash"],
"sourcePaths": ["src/gears", "src/shared", "src/generated", "external/xxhash"],
"importPaths": ["src/gears", "src/shared", "src/generated", "external/xxhash", "external/dplug/math", "external/inteli"],
"sourcePaths": ["src/gears", "src/shared", "src/generated", "external/xxhash", "external/dplug/math", "external/inteli"],
"libs-linux": ["xcb", "X11", "X11-xcb", "vulkan", "stdc++"],
"libs-windows": [],
"preGenerateCommands-linux": ["./build-vma.sh", "build/Codegen", "dub main:packer"],
@ -22,8 +22,8 @@
"targetType": "executable",
"targetPath": "build",
"targetName": "Packer",
"importPaths": ["src/packer", "src/shared", "src/generated", "external/xxhash"],
"sourcePaths": ["src/packer", "src/shared", "src/generated", "external/xxhash"],
"importPaths": ["src/packer", "src/shared", "src/generated", "external/xxhash", "external/dplug/math", "external/inteli"],
"sourcePaths": ["src/packer", "src/shared", "src/generated", "external/xxhash", "external/dplug/math", "external/inteli"],
"sourceFiles-linux": ["build/libstb_image.a", "build/libm3d.a"],
"preGenerateCommands-linux": ["./build-vma.sh"],
"postGenerateCommands-linux": ["build/Packer"],
@ -35,8 +35,8 @@
"targetType": "executable",
"targetPath": "build",
"targetName": "Codegen",
"importPaths": ["src/codegen", "src/shared", "external/xxhash"],
"sourcePaths": ["src/codegen", "src/shared", "external/xxhash"],
"importPaths": ["src/codegen", "src/shared", "external/xxhash", "external/dplug/math", "external/inteli"],
"sourcePaths": ["src/codegen", "src/shared", "external/xxhash", "external/dplug/math", "external/inteli"],
"sourceFiles-linux": ["build/libstb_image.a"],
"preGenerateCommands-linux": ["./build-vma.sh"],
"preGenerateCommands-windows": [],

689
external/dplug/math/box.d vendored Normal file
View File

@ -0,0 +1,689 @@
/**
* N-dimensional half-open interval [a, b[.
*
* Copyright: Copyright Guillaume Piolat 2015-2021.
* Copyright Ahmet Sait 2021.
* Copyright Ryan Roden-Corrent 2016.
* Copyright Nathan Sashihara 2018.
* Copyright Colden Cullen 2014.
*
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module dplug.math.box;
import std.math,
std.traits;
import dplug.math.vector;
/// N-dimensional half-open interval [a, b[.
/// N-dimensional half-open interval [a, b[.
/// `min` is inside the box, `max` is just outside; a box where any
/// dimension has min == max is considered empty.
struct Box(T, int N)
{
    static assert(N > 0);

    public
    {
        alias bound_t = Vector!(T, N);

        bound_t min; // not enforced, the box can have negative volume
        bound_t max;

        /// Construct a box which extends between 2 points.
        /// Boundaries: min is inside the box, max is just outside.
        @nogc this(bound_t min_, bound_t max_) pure nothrow
        {
            min = min_;
            max = max_;
        }

        static if (N == 1)
        {
            /// 1D convenience constructor from two scalars.
            @nogc this(T min_, T max_) pure nothrow
            {
                min.x = min_;
                max.x = max_;
            }
        }

        static if (N == 2)
        {
            /// 2D convenience constructor from four scalars.
            @nogc this(T min_x, T min_y, T max_x, T max_y) pure nothrow
            {
                min = bound_t(min_x, min_y);
                max = bound_t(max_x, max_y);
            }
        }

        static if (N == 3)
        {
            /// 3D convenience constructor from six scalars.
            @nogc this(T min_x, T min_y, T min_z, T max_x, T max_y, T max_z) pure nothrow
            {
                min = bound_t(min_x, min_y, min_z);
                max = bound_t(max_x, max_y, max_z);
            }
        }

        @property
        {
            /// Returns: Dimensions of the box.
            @nogc bound_t size() pure const nothrow
            {
                return max - min;
            }

            /// Sets size of the box assuming min point is the pivot.
            /// Returns: Dimensions of the box.
            @nogc bound_t size(bound_t value) pure nothrow
            {
                max = min + value;
                return value;
            }

            /// Returns: Center of the box.
            @nogc bound_t center() pure const nothrow
            {
                return (min + max) / 2;
            }

            static if (N >= 1)
            {
                /// Returns: Width of the box, always applicable.
                @nogc T width() pure const nothrow @property
                {
                    return max.x - min.x;
                }

                /// Sets width of the box assuming min point is the pivot.
                /// Returns: Width of the box, always applicable.
                @nogc T width(T value) pure nothrow @property
                {
                    max.x = min.x + value;
                    return value;
                }
            }

            static if (N >= 2)
            {
                /// Returns: Height of the box, if applicable.
                @nogc T height() pure const nothrow @property
                {
                    return max.y - min.y;
                }

                /// Sets height of the box assuming min point is the pivot.
                /// Returns: Height of the box, if applicable.
                @nogc T height(T value) pure nothrow @property
                {
                    max.y = min.y + value;
                    return value;
                }
            }

            static if (N >= 3)
            {
                /// Returns: Depth of the box, if applicable.
                @nogc T depth() pure const nothrow @property
                {
                    return max.z - min.z;
                }

                /// Sets depth of the box assuming min point is the pivot.
                /// Returns: Depth of the box, if applicable.
                @nogc T depth(T value) pure nothrow @property
                {
                    max.z = min.z + value;
                    return value;
                }
            }

            /// Returns: Signed volume of the box (product of all dimensions,
            /// can be negative if the box is unsorted).
            @nogc T volume() pure const nothrow
            {
                T res = 1;
                bound_t size = size();
                for(int i = 0; i < N; ++i)
                    res *= size[i];
                return res;
            }

            /// Returns: true if empty, i.e. at least one dimension has
            /// zero extent (min[i] == max[i]).
            @nogc bool empty() pure const nothrow
            {
                // NOTE(review): the local `size` below is computed but unused;
                // the mixin compares min/max directly.
                bound_t size = size();
                mixin(generateLoopCode!("if (min[@] == max[@]) return true;", N)());
                return false;
            }
        }

        /// Returns: true if it contains point (half-open test: min inclusive,
        /// max exclusive).
        @nogc bool contains(bound_t point) pure const nothrow
        {
            assert(isSorted());
            for(int i = 0; i < N; ++i)
                if ( !(point[i] >= min[i] && point[i] < max[i]) )
                    return false;
            return true;
        }

        static if (N >= 2)
        {
            /// Returns: true if it contains point `x`, `y`.
            @nogc bool contains(T x, T y) pure const nothrow
            {
                assert(isSorted());
                if ( !(x >= min.x && x < max.x) )
                    return false;
                if ( !(y >= min.y && y < max.y) )
                    return false;
                return true;
            }
        }

        static if (N >= 3)
        {
            /// Returns: true if it contains point `x`, `y`, `z`.
            @nogc bool contains(T x, T y, T z) pure const nothrow
            {
                assert(isSorted());
                if ( !(x >= min.x && x < max.x) )
                    return false;
                if ( !(y >= min.y && y < max.y) )
                    return false;
                if ( !(z >= min.z && z < max.z) )
                    return false;
                return true;
            }
        }

        /// Returns: true if it contains box other (boundaries may coincide).
        @nogc bool contains(Box other) pure const nothrow
        {
            assert(isSorted());
            assert(other.isSorted());
            mixin(generateLoopCode!("if ( (other.min[@] < min[@]) || (other.max[@] > max[@]) ) return false;", N)());
            return true;
        }

        /// Euclidean squared distance from a point (0 if the point is inside).
        /// See_also: Numerical Recipes Third Edition (2007)
        @nogc real squaredDistance(bound_t point) pure const nothrow
        {
            assert(isSorted());
            real distanceSquared = 0;
            for (int i = 0; i < N; ++i)
            {
                if (point[i] < min[i])
                    distanceSquared += (point[i] - min[i]) ^^ 2;
                if (point[i] > max[i])
                    distanceSquared += (point[i] - max[i]) ^^ 2;
            }
            return distanceSquared;
        }

        /// Euclidean distance from a point.
        /// See_also: squaredDistance.
        @nogc real distance(bound_t point) pure const nothrow
        {
            return sqrt(squaredDistance(point));
        }

        /// Euclidean squared distance from another box (0 if they overlap).
        /// See_also: Numerical Recipes Third Edition (2007)
        @nogc real squaredDistance(Box o) pure const nothrow
        {
            assert(isSorted());
            assert(o.isSorted());
            real distanceSquared = 0;
            for (int i = 0; i < N; ++i)
            {
                if (o.max[i] < min[i])
                    distanceSquared += (o.max[i] - min[i]) ^^ 2;
                if (o.min[i] > max[i])
                    distanceSquared += (o.min[i] - max[i]) ^^ 2;
            }
            return distanceSquared;
        }

        /// Euclidean distance from another box.
        /// See_also: squaredDistance.
        @nogc real distance(Box o) pure const nothrow
        {
            return sqrt(squaredDistance(o));
        }

        /// Assumes sorted boxes.
        /// This function deals with empty boxes correctly.
        /// Returns: Intersection of two boxes.
        @nogc Box intersection(Box o) pure const nothrow
        {
            assert(isSorted());
            assert(o.isSorted());

            // Return an empty box if one of the boxes is empty
            if (empty())
                return this;
            if (o.empty())
                return o;

            Box result = void;
            for (int i = 0; i < N; ++i)
            {
                T maxOfMins = (min.v[i] > o.min.v[i]) ? min.v[i] : o.min.v[i];
                T minOfMaxs = (max.v[i] < o.max.v[i]) ? max.v[i] : o.max.v[i];
                result.min.v[i] = maxOfMins;
                // clamp so the result stays sorted even when boxes are disjoint
                result.max.v[i] = minOfMaxs >= maxOfMins ? minOfMaxs : maxOfMins;
            }
            return result;
        }

        /// Assumes sorted boxes.
        /// This function deals with empty boxes correctly.
        /// Returns: true if the two boxes have a non-empty intersection.
        @nogc bool intersects(Box other) pure const nothrow
        {
            Box inter = this.intersection(other);
            return inter.isSorted() && !inter.empty();
        }

        /// Extends the area of this Box.
        @nogc Box grow(bound_t space) pure const nothrow
        {
            Box res = this;
            res.min -= space;
            res.max += space;
            return res;
        }

        /// Shrink the area of this Box. The box might become unsorted.
        @nogc Box shrink(bound_t space) pure const nothrow
        {
            return grow(-space);
        }

        /// Extends the area of this Box uniformly.
        @nogc Box grow(T space) pure const nothrow
        {
            return grow(bound_t(space));
        }

        /// Translate this Box.
        @nogc Box translate(bound_t offset) pure const nothrow
        {
            return Box(min + offset, max + offset);
        }

        /// Scale the box by factor `scale`, and round the result to integer if needed.
        /// NOTE(review): only the x and y components are scaled; for N >= 3 the
        /// remaining components of the result stay default-initialized — this
        /// appears intended for 2D boxes only. TODO confirm before 3D use.
        @nogc Box scaleByFactor(float scale) const nothrow
        {
            Box res;
            static if (isFloatingPoint!T)
            {
                res.min.x = min.x * scale;
                res.min.y = min.y * scale;
                res.max.x = max.x * scale;
                res.max.y = max.y * scale;
            }
            else
            {
                res.min.x = cast(T)( round(min.x * scale) );
                res.min.y = cast(T)( round(min.y * scale) );
                res.max.x = cast(T)( round(max.x * scale) );
                res.max.y = cast(T)( round(max.y * scale) );
            }
            return res;
        }

        static if (N == 2) // useful for UI that have horizontal and vertical scale
        {
            /// Scale the box by factor `scaleX` horizontally and `scaleY` vertically.
            /// Round the result to integer if needed.
            @nogc Box scaleByFactor(float scaleX, float scaleY) const nothrow
            {
                Box res;
                static if (isFloatingPoint!T)
                {
                    res.min.x = min.x * scaleX;
                    res.min.y = min.y * scaleY;
                    res.max.x = max.x * scaleX;
                    res.max.y = max.y * scaleY;
                }
                else
                {
                    res.min.x = cast(T)( round(min.x * scaleX) );
                    res.min.y = cast(T)( round(min.y * scaleY) );
                    res.max.x = cast(T)( round(max.x * scaleX) );
                    res.max.y = cast(T)( round(max.y * scaleY) );
                }
                return res;
            }
        }

        static if (N >= 2)
        {
            /// Translate this Box by `x`, `y`.
            @nogc Box translate(T x, T y) pure const nothrow
            {
                Box res = this;
                res.min.x += x;
                res.min.y += y;
                res.max.x += x;
                res.max.y += y;
                return res;
            }
        }

        static if (N >= 3)
        {
            /// Translate this Box by `x`, `y`, `z`.
            @nogc Box translate(T x, T y, T z) pure const nothrow
            {
                Box res = this;
                res.min.x += x;
                res.min.y += y;
                res.min.z += z;
                res.max.x += x;
                res.max.y += y;
                res.max.z += z;
                return res;
            }
        }

        /// Shrinks the area of this Box uniformly.
        /// Returns: Shrunk box (might be unsorted).
        @nogc Box shrink(T space) pure const nothrow
        {
            return shrink(bound_t(space));
        }

        /// Expands the box to include point.
        /// Returns: Expanded box.
        @nogc Box expand(bound_t point) pure const nothrow
        {
            import vector = dplug.math.vector;
            return Box(vector.minByElem(min, point), vector.maxByElem(max, point));
        }

        /// Expands the box to include another box.
        /// This function deals with empty boxes correctly.
        /// Returns: Expanded box.
        @nogc Box expand(Box other) pure const nothrow
        {
            assert(isSorted());
            assert(other.isSorted());

            // handle empty boxes
            if (empty())
                return other;
            if (other.empty())
                return this;

            Box result = void;
            for (int i = 0; i < N; ++i)
            {
                T minOfMins = (min.v[i] < other.min.v[i]) ? min.v[i] : other.min.v[i];
                T maxOfMaxs = (max.v[i] > other.max.v[i]) ? max.v[i] : other.max.v[i];
                result.min.v[i] = minOfMins;
                result.max.v[i] = maxOfMaxs;
            }
            return result;
        }

        /// Returns: true if each dimension of the box is >= 0.
        @nogc bool isSorted() pure const nothrow
        {
            for(int i = 0; i < N; ++i)
            {
                if (min[i] > max[i])
                    return false;
            }
            return true;
        }

        /// Returns: Absolute value of the Box to ensure each dimension of the
        /// box is >= 0 (swaps min/max per dimension where needed).
        @nogc Box abs() pure const nothrow
        {
            Box!(T, N) s = this;
            for (int i = 0; i < N; ++i)
            {
                if (s.min.v[i] > s.max.v[i])
                {
                    T tmp = s.min.v[i];
                    s.min.v[i] = s.max.v[i];
                    s.max.v[i] = tmp;
                }
            }
            return s;
        }

        /// Assign with another box of the same dimension and compatible element type.
        @nogc ref Box opAssign(U)(U x) nothrow if (isBox!U)
        {
            static if(is(U.element_t : T))
            {
                static if(U._size == _size)
                {
                    min = x.min;
                    max = x.max;
                }
                else
                {
                    static assert(false, "no conversion between boxes with different dimensions");
                }
            }
            else
            {
                static assert(false, "no conversion from " ~ U.element_t.stringof ~ " to " ~ element_t.stringof);
            }
            return this;
        }

        /// Returns: true if comparing equal boxes.
        @nogc bool opEquals(U)(U other) pure const nothrow if (is(U : Box))
        {
            return (min == other.min) && (max == other.max);
        }

        /// Cast to other box types, casting each element.
        @nogc U opCast(U)() pure const nothrow if (isBox!U)
        {
            U b = void;
            for(int i = 0; i < N; ++i)
            {
                b.min[i] = cast(U.element_t)(min[i]);
                b.max[i] = cast(U.element_t)(max[i]);
            }
            return b; // return a box where each element has been casted
        }

        static if (N == 2)
        {
            /// Helper function to create rectangle with a given point, width and height.
            static @nogc Box rectangle(T x, T y, T width, T height) pure nothrow
            {
                return Box(x, y, x + width, y + height);
            }
        }
    }

    private
    {
        enum _size = N;       // dimension count, used by opAssign's size check
        alias T element_t;    // element type, used by opAssign/opCast
    }
}
/// Instantiate to use a 2D box.
alias box2(T) = Box!(T, 2);
/// Instantiate to use a 3D box.
alias box3(T) = Box!(T, 3);
alias box2i = box2!int;    /// 2D box with integer coordinates.
alias box3i = box3!int;    /// 3D box with integer coordinates.
alias box2f = box2!float;  /// 2D box with float coordinates.
alias box3f = box3!float;  /// 3D box with float coordinates.
alias box2d = box2!double; /// 2D box with double coordinates.
alias box3d = box3!double; /// 3D box with double coordinates.
/// Returns: A 2D rectangle with point `x`,`y`, `width` and `height`.
box2i rectangle(int x, int y, int width, int height) pure nothrow @nogc
{
    immutable int right  = x + width;
    immutable int bottom = y + height;
    return box2i(x, y, right, bottom);
}
/// Returns: A 2D rectangle with point `x`,`y`, `width` and `height`.
box2f rectanglef(float x, float y, float width, float height) pure nothrow @nogc
{
    immutable float right  = x + width;
    immutable float bottom = y + height;
    return box2f(x, y, right, bottom);
}
/// Returns: A 2D rectangle with point `x`,`y`, `width` and `height`.
box2d rectangled(double x, double y, double width, double height) pure nothrow @nogc
{
    immutable double right  = x + width;
    immutable double bottom = y + height;
    return box2d(x, y, right, bottom);
}
unittest
{
    // 2D integer box: basic accessors and point-pair constructor.
    box2i a = box2i(1, 2, 3, 4);
    assert(a.width == 2);
    assert(a.height == 2);
    assert(a.volume == 4);
    box2i b = box2i(vec2i(1, 2), vec2i(3, 4));
    assert(a == b);

    // 3D integer box: size/width/height/depth setters pivot on min.
    box3i q = box3i(-3, -2, -1, 0, 1, 2);
    q.bound_t s = q.bound_t(11, 17, 19);
    q.bound_t q_min = q.min;
    assert((q.size = s) == s);
    assert(q.size == s);
    assert(q.min == q_min);
    assert(q.max == q.min + s);
    assert(q.max - q.min == s);
    assert((q.width = s.z) == s.z);
    assert(q.width == s.z);
    assert(q.min.x == q_min.x);
    assert(q.max.x == q.min.x + s.z);
    assert(q.max.x - q.min.x == s.z);
    assert((q.height = s.y) == s.y);
    assert(q.height == s.y);
    assert(q.min.y == q_min.y);
    assert(q.max.y == q.min.y + s.y);
    assert(q.max.y - q.min.y == s.y);
    assert((q.depth = s.x) == s.x);
    assert(q.depth == s.x);
    assert(q.min.z == q_min.z);
    assert(q.max.z == q.min.z + s.x);
    assert(q.max.z - q.min.z == s.x);
    assert(q.size == s.zyx);

    // abs() sorts each dimension.
    box3i n = box3i(2, 1, 0, -1, -2, -3);
    assert(n.abs == box3i(-1, -2, -3, 2, 1, 0));

    // Element-wise cast between box types.
    box2f bf = cast(box2f)b;
    assert(bf == box2f(1.0f, 2.0f, 3.0f, 4.0f));

    // Same setter checks, floating-point variant.
    box3f qf = box3f(-0, 1f, 2.5f, 3.25f, 5.125f, 7.0625f);
    qf.bound_t sf = qf.bound_t(-11.5f, -17.25f, -19.125f);
    qf.bound_t qf_min = qf.min;
    assert((qf.size = sf) == sf);
    assert(qf.size == sf);
    assert(qf.min == qf_min);
    assert(qf.max == qf.min + sf);
    assert(qf.max - qf.min == sf);
    assert((qf.width = sf.z) == sf.z);
    assert(qf.width == sf.z);
    assert(qf.min.x == qf_min.x);
    assert(qf.max.x == qf.min.x + sf.z);
    assert(qf.max.x - qf.min.x == sf.z);
    assert((qf.height = sf.y) == sf.y);
    assert(qf.height == sf.y);
    assert(qf.min.y == qf_min.y);
    assert(qf.max.y == qf.min.y + sf.y);
    assert(qf.max.y - qf.min.y == sf.y);
    assert((qf.depth = sf.x) == sf.x);
    assert(qf.depth == sf.x);
    assert(qf.min.z == qf_min.z);
    assert(qf.max.z == qf.min.z + sf.x);
    assert(qf.max.z - qf.min.z == sf.x);
    assert(qf.size == sf.zyx);

    // Translation and half-open containment (max is outside).
    box2i c = box2i(0, 0, 1,1);
    assert(c.translate(vec2i(3, 3)) == box2i(3, 3, 4, 4));
    assert(c.translate(3, 3) == box2i(3, 3, 4, 4));
    assert(c.contains(vec2i(0, 0)));
    assert(c.contains(0, 0));
    assert(!c.contains(vec2i(1, 1)));
    assert(!c.contains(1, 1));
    assert(b.contains(b));

    // Expansion to include a point / a box.
    box2i d = c.expand(vec2i(3, 3));
    assert(d.contains(vec2i(2, 2)));
    assert(d == d.expand(d));
    assert(!box2i(0, 0, 4, 4).contains(box2i(2, 2, 6, 6)));

    // Emptiness and intersection of disjoint boxes.
    assert(box2f(0, 0, 0, 0).empty());
    assert(!box2f(0, 2, 1, 1).empty());
    assert(!box2f(0, 0, 1, 1).empty());
    assert(box2i(260, 100, 360, 200).intersection(box2i(100, 100, 200, 200)).empty());

    // union with empty box is identity
    assert(a.expand(box2i(10, 4, 10, 6)) == a);

    // intersection with empty box is empty
    assert(a.intersection(box2i(10, 4, 10, 6)).empty);

    // Rectangle helpers and scaling.
    assert(box2i.rectangle(1, 2, 3, 4) == box2i(1, 2, 4, 6));
    assert(rectangle(1, 2, 3, 4) == box2i(1, 2, 4, 6));
    assert(rectanglef(1, 2, 3, 4) == box2f(1, 2, 4, 6));
    assert(rectangled(1, 2, 3, 4) == box2d(1, 2, 4, 6));
    assert(rectangle(10, 10, 20, 20).scaleByFactor(1.5f) == rectangle(15, 15, 30, 30));
    assert(rectangle(10, 10, 20, 20).scaleByFactor(1.5f, 2.0f) == rectangle(15, 20, 30, 40));
}
/// True if `T` is an instance of the `Box` template.
template isBox(T)
{
    enum bool isBox = is(T : Box!Args, Args...);
}
unittest
{
    // isBox accepts any Box instantiation, rejects vectors.
    static assert( isBox!box2f);
    static assert( isBox!box3d);
    static assert( isBox!(Box!(real, 2)));
    static assert(!isBox!vec2f);
}
/// Get the numeric type used to measure a box's dimensions.
template DimensionType(T : Box!U, U...)
{
    alias DimensionType = U[0];
}
///
unittest
{
    // DimensionType extracts the element type of a Box instantiation.
    static assert(is(DimensionType!box2f == float));
    static assert(is(DimensionType!box3d == double));
}

852
external/dplug/math/matrix.d vendored Normal file
View File

@ -0,0 +1,852 @@
/**
* Custom sized 2D Matrices.
*
* Copyright: Copyright Guillaume Piolat 2015-2021.
* Copyright Aleksandr Druzhinin 2016-2020.
* Copyright Nathan Sashihara 2018.
* Copyright Thibaut Charles 2018.
*
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module dplug.math.matrix;
import std.math,
std.typetuple,
std.traits,
std.typecons;
import dplug.math.vector;
/// Generic non-resizeable matrix with R rows and C columns.
/// Intended for 3D use (size 3x3 and 4x4).
/// Important: <b>Matrices here are in row-major order whereas OpenGL is column-major.</b>
/// Params:
/// T = type of elements
/// R = number of rows
/// C = number of columns
struct Matrix(T, int R, int C)
{
public
{
static assert(R >= 1 && C >= 1);
alias Vector!(T, C) row_t;
alias Vector!(T, R) column_t;
enum bool isSquare = (R == C);
// fields definition
union
{
T[C*R] v; // all elements
row_t[R] rows; // all rows
T[C][R] c; // components
}
@nogc this(U...)(U values) pure nothrow
{
static if ((U.length == C*R) && allSatisfy!(isTAssignable, U))
{
// construct with components
foreach(int i, x; values)
v[i] = x;
}
else static if ((U.length == 1) && (isAssignable!(U[0])) && (!is(U[0] : Matrix)))
{
// construct with assignment
opAssign!(U[0])(values[0]);
}
else static assert(false, "cannot create a matrix from given arguments");
}
/// Construct a matrix from columns.
@nogc static Matrix fromColumns(column_t[] columns) pure nothrow
{
assert(columns.length == C);
Matrix res;
for (int i = 0; i < R; ++i)
for (int j = 0; j < C; ++j)
{
res.c[i][j] = columns[j][i];
}
return res;
}
/// Construct a matrix from rows.
@nogc static Matrix fromRows(row_t[] rows) pure nothrow
{
assert(rows.length == R);
Matrix res;
res.rows[] = rows[];
return res;
}
/// Construct matrix with a scalar.
@nogc this(U)(T x) pure nothrow
{
for (int i = 0; i < _N; ++i)
v[i] = x;
}
/// Assign with a scalar.
@nogc ref Matrix opAssign(U : T)(U x) pure nothrow
{
for (int i = 0; i < R * C; ++i)
v[i] = x;
return this;
}
/// Assign with a samey matrice.
@nogc ref Matrix opAssign(U : Matrix)(U x) pure nothrow
{
for (int i = 0; i < R * C; ++i)
v[i] = x.v[i];
return this;
}
/// Assign from other small matrices (same size, compatible type).
@nogc ref Matrix opAssign(U)(U x) pure nothrow
if (isMatrixInstantiation!U
&& is(U._T : _T)
&& (!is(U: Matrix))
&& (U._R == R) && (U._C == C))
{
for (int i = 0; i < R * C; ++i)
v[i] = x.v[i];
return this;
}
/// Assign with a static array of size R * C.
@nogc ref Matrix opAssign(U)(U x) pure nothrow
if ((isStaticArray!U)
&& is(typeof(x[0]) : T)
&& (U.length == R * C))
{
for (int i = 0; i < R * C; ++i)
v[i] = x[i];
return this;
}
/// Assign with a static array of shape (R, C).
@nogc ref Matrix opAssign(U)(U x) pure nothrow
if ((isStaticArray!U) && isStaticArray!(typeof(x[0]))
&& is(typeof(x[0][0]) : T)
&& (U.length == R)
&& (x[0].length == C))
{
foreach (i; 0..R)
rows[i] = x[i];
return this;
}
/// Assign with a dynamic array of size R * C.
@nogc ref Matrix opAssign(U)(U x) pure nothrow
if ((isDynamicArray!U)
&& is(typeof(x[0]) : T))
{
assert(x.length == R * C);
for (int i = 0; i < R * C; ++i)
v[i] = x[i];
return this;
}
/// Assign with a dynamic array of shape (R, C).
@nogc ref Matrix opAssign(U)(U x) pure nothrow
if ((isDynamicArray!U) && isDynamicArray!(typeof(x[0]))
&& is(typeof(x[0][0]) : T))
{
assert(x.length == R);
foreach (i; 0..R)
{
assert(x[i].length == C);
rows[i] = x[i];
}
return this;
}
/// Return a pointer to content.
@nogc inout(T)* ptr() pure inout nothrow @property
{
return v.ptr;
}
/// Returns a column as a vector
/// Returns: column j as a vector.
@nogc column_t column(int j) pure const nothrow
{
column_t res = void;
for (int i = 0; i < R; ++i)
res.v[i] = c[i][j];
return res;
}
/// Returns a row as a vector
/// Returns: row i as a vector.
@nogc row_t row(int i) pure const nothrow
{
return rows[i];
}
/// Matrix * scalar multiplication.
@nogc Matrix opBinary(string op)(T factor) pure const nothrow if (op == "*")
{
Matrix result = void;
for (int i = 0; i < R; ++i)
{
for (int j = 0; j < C; ++j)
{
result.c[i][j] = c[i][j] * factor;
}
}
return result;
}
/// Matrix * vector multiplication.
@nogc column_t opBinary(string op)(row_t x) pure const nothrow if (op == "*")
{
column_t res = void;
for (int i = 0; i < R; ++i)
{
T sum = 0;
for (int j = 0; j < C; ++j)
{
sum += c[i][j] * x.v[j];
}
res.v[i] = sum;
}
return res;
}
/// Matrix * matrix multiplication.
@nogc auto opBinary(string op, U)(U x) pure const nothrow
if (isMatrixInstantiation!U && (U._R == C) && (op == "*"))
{
Matrix!(T, R, U._C) result = void;
for (int i = 0; i < R; ++i)
{
for (int j = 0; j < U._C; ++j)
{
T sum = 0;
for (int k = 0; k < C; ++k)
sum += c[i][k] * x.c[k][j];
result.c[i][j] = sum;
}
}
return result;
}
/// Matrix add and substraction.
@nogc Matrix opBinary(string op, U)(U other) pure const nothrow
if (is(U : Matrix) && (op == "+" || op == "-"))
{
Matrix result = void;
for (int i = 0; i < R; ++i)
{
for (int j = 0; j < C; ++j)
{
mixin("result.c[i][j] = c[i][j] " ~ op ~ " other.c[i][j];");
}
}
return result;
}
// matrix *= scalar
@nogc ref Matrix opOpAssign(string op, U : T)(U x) pure nothrow if (op == "*")
{
for (int i = 0; i < R * C; ++i)
v[i] *= x;
return this;
}
/// Assignment operator with another samey matrix.
@nogc ref Matrix opOpAssign(string op, U)(U operand) pure nothrow
if (is(U : Matrix) && (op == "*" || op == "+" || op == "-"))
{
mixin("Matrix result = this " ~ op ~ " operand;");
return opAssign!Matrix(result);
}
/// Matrix += <something convertible to a Matrix>
/// Matrix -= <something convertible to a Matrix>
@nogc ref Matrix opOpAssign(string op, U)(U operand) pure nothrow
if ((isConvertible!U) && (op == "*" || op == "+" || op == "-"))
{
Matrix conv = operand;
return opOpAssign!op(conv);
}
/// Cast to other matrix types.
/// If the size are different, the resulting matrix is truncated
/// and/or filled with identity coefficients.
@nogc U opCast(U)() pure const nothrow if (isMatrixInstantiation!U)
{
U res = U.identity();
enum minR = R < U._R ? R : U._R;
enum minC = C < U._C ? C : U._C;
for (int i = 0; i < minR; ++i)
for (int j = 0; j < minC; ++j)
{
res.c[i][j] = cast(U._T)(c[i][j]);
}
return res;
}
@nogc bool opEquals(U)(U other) pure const nothrow if (is(U : Matrix))
{
for (int i = 0; i < R * C; ++i)
if (v[i] != other.v[i])
return false;
return true;
}
@nogc bool opEquals(U)(U other) pure const nothrow
if ((isAssignable!U) && (!is(U: Matrix)))
{
Matrix conv = other;
return opEquals(conv);
}
// +matrix, -matrix, ~matrix, !matrix
@nogc Matrix opUnary(string op)() pure const nothrow if (op == "+" || op == "-" || op == "~" || op == "!")
{
Matrix res = void;
for (int i = 0; i < N; ++i)
mixin("res.v[i] = " ~ op ~ "v[i];");
return res;
}
static if (isSquare && isFloatingPoint!T && R == 1)
{
/// Returns an inverted copy of this matrix
/// Returns: inverse of matrix.
/// Note: Matrix inversion is provided for 1x1, 2x2, 3x3 and 4x4 floating point matrices.
@nogc Matrix inverse() pure const nothrow
{
assert(c[0][0] != 0); // Programming error if matrix is not invertible.
return Matrix( 1 / c[0][0]);
}
}
static if (isSquare && isFloatingPoint!T && R == 2)
{
/// Returns an inverted copy of this matrix
/// Returns: inverse of matrix.
/// Note: Matrix inversion is provided for 1x1, 2x2, 3x3 and 4x4 floating point matrices.
@nogc Matrix inverse() pure const nothrow
{
T det = (c[0][0] * c[1][1] - c[0][1] * c[1][0]);
assert(det != 0); // Programming error if matrix is not invertible.
T invDet = 1 / det;
return Matrix( c[1][1] * invDet, -c[0][1] * invDet,
-c[1][0] * invDet, c[0][0] * invDet);
}
}
static if (isSquare && isFloatingPoint!T && R == 3)
{
/// Returns an inverted copy of this matrix
/// Returns: inverse of matrix.
/// Note: Matrix inversion is provided for 1x1, 2x2, 3x3 and 4x4 floating point matrices.
@nogc Matrix inverse() pure const nothrow
{
T det = c[0][0] * (c[1][1] * c[2][2] - c[2][1] * c[1][2])
- c[0][1] * (c[1][0] * c[2][2] - c[1][2] * c[2][0])
+ c[0][2] * (c[1][0] * c[2][1] - c[1][1] * c[2][0]);
assert(det != 0); // Programming error if matrix is not invertible.
T invDet = 1 / det;
Matrix res = void;
res.c[0][0] = (c[1][1] * c[2][2] - c[2][1] * c[1][2]) * invDet;
res.c[0][1] = -(c[0][1] * c[2][2] - c[0][2] * c[2][1]) * invDet;
res.c[0][2] = (c[0][1] * c[1][2] - c[0][2] * c[1][1]) * invDet;
res.c[1][0] = -(c[1][0] * c[2][2] - c[1][2] * c[2][0]) * invDet;
res.c[1][1] = (c[0][0] * c[2][2] - c[0][2] * c[2][0]) * invDet;
res.c[1][2] = -(c[0][0] * c[1][2] - c[1][0] * c[0][2]) * invDet;
res.c[2][0] = (c[1][0] * c[2][1] - c[2][0] * c[1][1]) * invDet;
res.c[2][1] = -(c[0][0] * c[2][1] - c[2][0] * c[0][1]) * invDet;
res.c[2][2] = (c[0][0] * c[1][1] - c[1][0] * c[0][1]) * invDet;
return res;
}
}
static if (isSquare && isFloatingPoint!T && R == 4)
{
/// Returns an inverted copy of this matrix
/// Returns: inverse of matrix.
/// Note: Matrix inversion is provided for 1x1, 2x2, 3x3 and 4x4 floating point matrices.
@nogc Matrix inverse() pure const nothrow
{
T det2_01_01 = c[0][0] * c[1][1] - c[0][1] * c[1][0];
T det2_01_02 = c[0][0] * c[1][2] - c[0][2] * c[1][0];
T det2_01_03 = c[0][0] * c[1][3] - c[0][3] * c[1][0];
T det2_01_12 = c[0][1] * c[1][2] - c[0][2] * c[1][1];
T det2_01_13 = c[0][1] * c[1][3] - c[0][3] * c[1][1];
T det2_01_23 = c[0][2] * c[1][3] - c[0][3] * c[1][2];
T det3_201_012 = c[2][0] * det2_01_12 - c[2][1] * det2_01_02 + c[2][2] * det2_01_01;
T det3_201_013 = c[2][0] * det2_01_13 - c[2][1] * det2_01_03 + c[2][3] * det2_01_01;
T det3_201_023 = c[2][0] * det2_01_23 - c[2][2] * det2_01_03 + c[2][3] * det2_01_02;
T det3_201_123 = c[2][1] * det2_01_23 - c[2][2] * det2_01_13 + c[2][3] * det2_01_12;
T det = - det3_201_123 * c[3][0] + det3_201_023 * c[3][1] - det3_201_013 * c[3][2] + det3_201_012 * c[3][3];
assert(det != 0); // Programming error if matrix is not invertible.
T invDet = 1 / det;
T det2_03_01 = c[0][0] * c[3][1] - c[0][1] * c[3][0];
T det2_03_02 = c[0][0] * c[3][2] - c[0][2] * c[3][0];
T det2_03_03 = c[0][0] * c[3][3] - c[0][3] * c[3][0];
T det2_03_12 = c[0][1] * c[3][2] - c[0][2] * c[3][1];
T det2_03_13 = c[0][1] * c[3][3] - c[0][3] * c[3][1];
T det2_03_23 = c[0][2] * c[3][3] - c[0][3] * c[3][2];
T det2_13_01 = c[1][0] * c[3][1] - c[1][1] * c[3][0];
T det2_13_02 = c[1][0] * c[3][2] - c[1][2] * c[3][0];
T det2_13_03 = c[1][0] * c[3][3] - c[1][3] * c[3][0];
T det2_13_12 = c[1][1] * c[3][2] - c[1][2] * c[3][1];
T det2_13_13 = c[1][1] * c[3][3] - c[1][3] * c[3][1];
T det2_13_23 = c[1][2] * c[3][3] - c[1][3] * c[3][2];
T det3_203_012 = c[2][0] * det2_03_12 - c[2][1] * det2_03_02 + c[2][2] * det2_03_01;
T det3_203_013 = c[2][0] * det2_03_13 - c[2][1] * det2_03_03 + c[2][3] * det2_03_01;
T det3_203_023 = c[2][0] * det2_03_23 - c[2][2] * det2_03_03 + c[2][3] * det2_03_02;
T det3_203_123 = c[2][1] * det2_03_23 - c[2][2] * det2_03_13 + c[2][3] * det2_03_12;
T det3_213_012 = c[2][0] * det2_13_12 - c[2][1] * det2_13_02 + c[2][2] * det2_13_01;
T det3_213_013 = c[2][0] * det2_13_13 - c[2][1] * det2_13_03 + c[2][3] * det2_13_01;
T det3_213_023 = c[2][0] * det2_13_23 - c[2][2] * det2_13_03 + c[2][3] * det2_13_02;
T det3_213_123 = c[2][1] * det2_13_23 - c[2][2] * det2_13_13 + c[2][3] * det2_13_12;
T det3_301_012 = c[3][0] * det2_01_12 - c[3][1] * det2_01_02 + c[3][2] * det2_01_01;
T det3_301_013 = c[3][0] * det2_01_13 - c[3][1] * det2_01_03 + c[3][3] * det2_01_01;
T det3_301_023 = c[3][0] * det2_01_23 - c[3][2] * det2_01_03 + c[3][3] * det2_01_02;
T det3_301_123 = c[3][1] * det2_01_23 - c[3][2] * det2_01_13 + c[3][3] * det2_01_12;
Matrix res = void;
res.c[0][0] = - det3_213_123 * invDet;
res.c[1][0] = + det3_213_023 * invDet;
res.c[2][0] = - det3_213_013 * invDet;
res.c[3][0] = + det3_213_012 * invDet;
res.c[0][1] = + det3_203_123 * invDet;
res.c[1][1] = - det3_203_023 * invDet;
res.c[2][1] = + det3_203_013 * invDet;
res.c[3][1] = - det3_203_012 * invDet;
res.c[0][2] = + det3_301_123 * invDet;
res.c[1][2] = - det3_301_023 * invDet;
res.c[2][2] = + det3_301_013 * invDet;
res.c[3][2] = - det3_301_012 * invDet;
res.c[0][3] = - det3_201_123 * invDet;
res.c[1][3] = + det3_201_023 * invDet;
res.c[2][3] = - det3_201_013 * invDet;
res.c[3][3] = + det3_201_012 * invDet;
return res;
}
}
/// Returns a transposed copy of this matrix
/// Returns: transposed matrice.
@nogc Matrix!(T, C, R) transposed() pure const nothrow
{
Matrix!(T, C, R) res;
for (int i = 0; i < C; ++i)
for (int j = 0; j < R; ++j)
res.c[i][j] = c[j][i];
return res;
}
static if (isSquare && R > 1)
{
/// Makes a diagonal matrix from a vector.
@nogc static Matrix diag(Vector!(T, R) v) pure nothrow
{
Matrix res = void;
for (int i = 0; i < R; ++i)
for (int j = 0; j < C; ++j)
res.c[i][j] = (i == j) ? v.v[i] : 0;
return res;
}
/// In-place translate by (v, 1)
@nogc void translate(Vector!(T, R-1) v) pure nothrow
{
for (int i = 0; i < R; ++i)
{
T dot = 0;
for (int j = 0; j + 1 < C; ++j)
dot += v.v[j] * c[i][j];
c[i][C-1] += dot;
}
}
/// Make a translation matrix.
@nogc static Matrix translation(Vector!(T, R-1) v) pure nothrow
{
Matrix res = identity();
for (int i = 0; i + 1 < R; ++i)
res.c[i][C-1] += v.v[i];
return res;
}
/// In-place matrix scaling.
void scale(Vector!(T, R-1) v) pure nothrow
{
for (int i = 0; i < R; ++i)
for (int j = 0; j + 1 < C; ++j)
c[i][j] *= v.v[j];
}
/// Make a scaling matrix.
@nogc static Matrix scaling(Vector!(T, R-1) v) pure nothrow
{
Matrix res = identity();
for (int i = 0; i + 1 < R; ++i)
res.c[i][i] = v.v[i];
return res;
}
}
// rotations are implemented for 3x3 and 4x4 matrices.
static if (isSquare && (R == 3 || R == 4) && isFloatingPoint!T)
{
    /// Rotation matrix in the plane spanned by axes `i` and `j`,
    /// by `angle` radians; all other rows/columns stay identity.
    @nogc public static Matrix rotateAxis(int i, int j)(T angle) pure nothrow
    {
        Matrix res = identity();
        const T cosa = cos(angle);
        const T sina = sin(angle);
        res.c[i][i] = cosa;
        res.c[i][j] = -sina;
        res.c[j][i] = sina;
        res.c[j][j] = cosa;
        return res;
    }

    /// Rotate along X axis
    /// Returns: rotation matrix along axis X
    alias rotateAxis!(1, 2) rotateX;

    /// Rotate along Y axis
    /// Returns: rotation matrix along axis Y
    alias rotateAxis!(2, 0) rotateY;

    /// Rotate along Z axis
    /// Returns: rotation matrix along axis Z
    alias rotateAxis!(0, 1) rotateZ;

    /// Similar to the glRotate matrix, however the angle is expressed in radians
    /// See_also: $(LINK http://www.cs.rutgers.edu/~decarlo/428/gl_man/rotate.html)
    @nogc static Matrix rotation(T angle, vec3!T axis) pure nothrow
    {
        Matrix res = identity();
        const T c = cos(angle); // note: shadows the matrix storage `c` in this scope
        const oneMinusC = 1 - c;
        const T s = sin(angle);
        axis = axis.normalized();
        T x = axis.x,
          y = axis.y,
          z = axis.z;
        // Fix: removed the unused locals xy, yz, xz — they were computed
        // but never referenced by the matrix fill below.
        // Axis-angle rotation filling the upper-left 3x3 block
        // (for 4x4 matrices the remainder stays identity).
        res.c[0][0] = x * x * oneMinusC + c;
        res.c[0][1] = x * y * oneMinusC - z * s;
        res.c[0][2] = x * z * oneMinusC + y * s;
        res.c[1][0] = y * x * oneMinusC + z * s;
        res.c[1][1] = y * y * oneMinusC + c;
        res.c[1][2] = y * z * oneMinusC - x * s;
        res.c[2][0] = z * x * oneMinusC - y * s;
        res.c[2][1] = z * y * oneMinusC + x * s;
        res.c[2][2] = z * z * oneMinusC + c;
        return res;
    }
}
// 4x4 specific transformations for 3D usage
static if (isSquare && R == 4 && isFloatingPoint!T)
{
    /// Orthographic projection
    /// Maps the axis-aligned box [left,right]x[bottom,top]x[near,far]
    /// to the [-1,1]^3 clip cube, like glOrtho.
    /// Returns: orthographic projection.
    @nogc static Matrix orthographic(T left, T right, T bottom, T top, T near, T far) pure nothrow
    {
        T dx = right - left,
          dy = top - bottom,
          dz = far - near;
        T tx = -(right + left) / dx;
        T ty = -(top + bottom) / dy;
        T tz = -(far + near) / dz;
        return Matrix(2 / dx,   0,      0,       tx,
                      0,        2 / dy, 0,       ty,
                      0,        0,      -2 / dz, tz,
                      0,        0,      0,       1);
    }

    /// Perspective projection
    /// FOV is the vertical field of view in radians; depth maps to [-1, 1]
    /// like gluPerspective.
    /// Returns: perspective projection.
    @nogc static Matrix perspective(T FOVInRadians, T aspect, T zNear, T zFar) pure nothrow
    {
        T f = 1 / tan(FOVInRadians / 2);
        T d = 1 / (zNear - zFar); // negative, since zNear < zFar
        return Matrix(f / aspect, 0, 0,                    0,
                      0,          f, 0,                    0,
                      0,          0, (zFar + zNear) * d,   2 * d * zFar * zNear,
                      0,          0, -1,                   0);
    }

    /// Look At projection
    /// Note: X = cross(-up, Z) with a negated first row is algebraically the
    /// same as the usual gluLookAt form X = cross(up, Z) — the negations cancel.
    /// Returns: "lookAt" projection.
    /// Thanks to vuaru for corrections.
    @nogc static Matrix lookAt(vec3!T eye, vec3!T target, vec3!T up) pure nothrow
    {
        vec3!T Z = (eye - target).normalized();
        vec3!T X = cross(-up, Z).normalized();
        vec3!T Y = cross(Z, -X);
        return Matrix(-X.x, -X.y, -X.z, dot(X, eye),
                      Y.x,  Y.y,  Y.z,  -dot(Y, eye),
                      Z.x,  Z.y,  Z.z,  -dot(Z, eye),
                      0,    0,    0,    1);
    }
}
}
package
{
    // Compile-time reflection hooks read by sibling dplug.math modules:
    // element type and dimensions of this Matrix instantiation.
    alias T _T;
    enum _R = R;
    enum _C = C;
}
private
{
    // True for types that can be assigned to this Matrix as a whole.
    template isAssignable(T)
    {
        enum bool isAssignable = std.traits.isAssignable!(Matrix, T);
    }

    // Types convertible to Matrix, excluding Matrix itself.
    template isConvertible(T)
    {
        enum bool isConvertible = (!is(T : Matrix)) && isAssignable!T;
    }

    // Scalars assignable to the element type T.
    template isTAssignable(U)
    {
        enum bool isTAssignable = std.traits.isAssignable!(T, U);
    }

    // Types implicitly convertible to a row vector.
    template isRowConvertible(U)
    {
        enum bool isRowConvertible = is(U : row_t);
    }

    // Types implicitly convertible to a column vector.
    template isColumnConvertible(U)
    {
        enum bool isColumnConvertible = is(U : column_t);
    }
}
public
{
    /// Construct an identity matrix: ones on the main diagonal, zeroes elsewhere.
    /// Returns: an identity matrix.
    /// Note: the identity matrix, while only meaningful for square matrices,
    /// is also defined for non-square ones.
    @nogc static Matrix identity() pure nothrow
    {
        Matrix res = void;
        foreach (row; 0 .. R)
        {
            foreach (col; 0 .. C)
                res.c[row][col] = (row == col) ? 1 : 0;
        }
        return res;
    }

    /// Construct a constant matrix where every element equals `x`.
    /// Returns: a constant matrix.
    @nogc static Matrix constant(U)(U x) pure nothrow
    {
        Matrix res = void;
        foreach (k; 0 .. R * C)
            res.v[k] = cast(T)x;
        return res;
    }
}
}
/// True if `U` is some instantiation of `Matrix`.
template isMatrixInstantiation(U)
{
    // Overload-resolution trick: this compiles only when U.init is a Matrix.
    private static void isMatrix(T, int R, int C)(Matrix!(T, R, C) x)
    {
    }

    enum bool isMatrixInstantiation = is(typeof(isMatrix(U.init)));
}
// GLSL is a big inspiration here
// we defines types with more or less the same names

/// 2x2 square matrix.
template mat2x2(T) { alias Matrix!(T, 2, 2) mat2x2; }
/// 3x3 square matrix.
template mat3x3(T) { alias Matrix!(T, 3, 3) mat3x3; }
/// 4x4 square matrix.
template mat4x4(T) { alias Matrix!(T, 4, 4) mat4x4; }

// WARNING: in GLSL, first number is _columns_, second is rows
// It is the opposite here: first number is rows, second is columns
// With this convention mat2x3 * mat3x4 -> mat2x4.

/// 2 rows x 3 columns.
template mat2x3(T) { alias Matrix!(T, 2, 3) mat2x3; }
/// 2 rows x 4 columns.
template mat2x4(T) { alias Matrix!(T, 2, 4) mat2x4; }
/// 3 rows x 2 columns.
template mat3x2(T) { alias Matrix!(T, 3, 2) mat3x2; }
/// 3 rows x 4 columns.
template mat3x4(T) { alias Matrix!(T, 3, 4) mat3x4; }
/// 4 rows x 2 columns.
template mat4x2(T) { alias Matrix!(T, 4, 2) mat4x2; }
/// 4 rows x 3 columns.
template mat4x3(T) { alias Matrix!(T, 4, 3) mat4x3; }

// shorter names for most common matrices
alias mat2x2 mat2;///
alias mat3x3 mat3;///
alias mat4x4 mat4;///

// Define a lot of type names
// Most useful are probably mat4f and mat4d
alias mat2!float mat2f;///
alias mat2!double mat2d;///
alias mat3!float mat3f;///
alias mat3!double mat3d;///
alias mat4!float mat4f;///
alias mat4!double mat4d;///
alias mat2x2!float mat2x2f;///
alias mat2x2!double mat2x2d;///
alias mat3x3!float mat3x3f;///
alias mat3x3!double mat3x3d;///
alias mat4x4!float mat4x4f;///
alias mat4x4!double mat4x4d;///
// Coverage for construction, row/column factories, transpose,
// multiplication, conversion and array-based construction.
unittest
{
    alias mat2i = mat2!int;
    alias mat2x3f = mat2x3!float;
    alias mat3x4f = mat3x4!float;
    alias mat2x4f = mat2x4!float;

    mat2i x = mat2i(0, 1,
                    2, 3);
    assert(x.c[0][0] == 0 && x.c[0][1] == 1 && x.c[1][0] == 2 && x.c[1][1] == 3);

    vec2i[2] cols = [vec2i(0, 2), vec2i(1, 3)];
    mat2i y = mat2i.fromColumns(cols[]);
    assert(y.c[0][0] == 0 && y.c[0][1] == 1 && y.c[1][0] == 2 && y.c[1][1] == 3);
    y = mat2i.fromRows(cols[]);
    assert(y.c[0][0] == 0 && y.c[1][0] == 1 && y.c[0][1] == 2 && y.c[1][1] == 3);
    y = y.transposed();

    assert(x == y);
    x = [0, 1, 2, 3];
    assert(x == y);

    mat2i z = x * y;
    assert(z == mat2i([2, 3, 6, 11]));
    vec2i vz = z * vec2i(2, -1);
    assert(vz == vec2i(1, 1));

    mat2f a = z;
    mat2d ad = a;
    ad += a;
    mat2f w = [4, 5, 6, 7];
    z = cast(mat2i)w;
    assert(w == z);

    // mixed-size multiplication: (2x3) * (3x4) -> (2x4)
    {
        mat2x3f A;
        mat3x4f B;
        mat2x4f C = A * B;
    }

    assert(mat2i.diag(vec2i(1, 2)) == mat2i(1, 0,
                                            0, 2));

    // Construct with a single scalar
    auto D = mat4f(1.0f);
    assert(D.v[] == [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ]);

    // construction from a static array of arrays
    {
        double[4][3] starray = [
            [ 0, 1, 2, 3],
            [ 4, 5, 6, 7,],
            [ 8, 9, 10, 11,],
        ];

        // starray has the shape 3x4
        assert(starray.length == 3);
        assert(starray[0].length == 4);

        auto m = mat3x4!double(starray);
        assert(m.v[] == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, ]);
    }

    // construction from a dynamic array of arrays
    {
        auto dynarray = [
            [ 0, 1, 2, 3],
            [ 4, 5, 6, 7,],
            [ 8, 9, 10, 11,],
        ];

        // dynarray has the shape 3x4
        assert(dynarray.length == 3);
        assert(dynarray[0].length == 4);

        auto m = mat3x4!double(dynarray);
        assert(m.v[] == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, ]);
    }
}
// Issue #206 (matrix *= scalar) not yielding matrix * scalar but matrix * matrix(scalar)
// Pins the distinction between scalar broadcast multiply and full matrix multiply.
unittest
{
    mat4f mvp = mat4f.identity;
    mvp *= 2;
    assert(mvp == mat4f(2, 0, 0, 0,
                        0, 2, 0, 0,
                        0, 0, 2, 0,
                        0, 0, 0, 2));

    mvp = mat4f.identity * 2;
    assert(mvp == mat4f(2, 0, 0, 0,
                        0, 2, 0, 0,
                        0, 0, 2, 0,
                        0, 0, 0, 2));

    // multiplying two all-ones matrices is a real matrix product
    mvp = mat4f(1) * mat4f(1);
    assert(mvp == mat4f(4, 4, 4, 4,
                        4, 4, 4, 4,
                        4, 4, 4, 4,
                        4, 4, 4, 4));

    mvp = mat4f(1);
    mvp *= mat4f(1);
    assert(mvp == mat4f(4, 4, 4, 4,
                        4, 4, 4, 4,
                        4, 4, 4, 4,
                        4, 4, 4, 4));
}

12
external/dplug/math/package.d vendored Normal file
View File

@ -0,0 +1,12 @@
/**
* Math package: rectangles, vectors, matrices.
*
* Copyright: Copyright Guillaume Piolat 2021.
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Note: this is part of the former gfm:math package, hence containing copyright from many GFM contributors.
*/
module dplug.math;
public import dplug.math.vector,
dplug.math.box,
dplug.math.matrix;

823
external/dplug/math/vector.d vendored Normal file
View File

@ -0,0 +1,823 @@
/**
* N-dimensional small vector math.
*
* Copyright: Copyright Guillaume Piolat 2021.
* Copyright Chance Snow 2021.
* Copyright Aleksandr Druzhinin 2018.
* Copyright Nathan Sashihara 2018.
* Copyright Ryan Roden-Corrent 2016.
* Copyright Steven Dwy 2015.
* Copyright Martin Nowak 2015.
* Copyright Tanel Tagaväli 2015.
*
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module dplug.math.vector;
import std.traits,
std.math,
std.array;
import inteli.emmintrin;
/**
* Generic 1D small vector.
* Params:
* N = number of elements
* T = type of elements
*/
/**
 * Generic 1D small vector.
 * Params:
 *    N = number of elements
 *    T = type of elements
 */
struct Vector(T, int N)
{
nothrow:
    public
    {
        static assert(N >= 1);

        // fields definition
        // Components are accessible either through the array `v` or by
        // position/color names (x/y/z/w aliased to r/g/b/a).
        union
        {
            T[N] v;
            struct
            {
                static if (N >= 1)
                {
                    T x;
                    alias x r;
                }
                static if (N >= 2)
                {
                    T y;
                    alias y g;
                }
                static if (N >= 3)
                {
                    T z;
                    alias z b;
                }
                static if (N >= 4)
                {
                    T w;
                    alias w a;
                }
            }
        }

        /// Construct a Vector with a `T[]` or the values as arguments
        @nogc this(Args...)(Args args) pure nothrow
        {
            static if (args.length == 1)
            {
                // Construct a Vector from a single value.
                opAssign!(Args[0])(args[0]);
            }
            else
            {
                // validate the total argument count across scalars and vectors
                template argCount(T...) {
                    static if(T.length == 0)
                        enum argCount = 0; // done recursing
                    else static if(isVector!(T[0]))
                        enum argCount = T[0]._N + argCount!(T[1..$]);
                    else
                        enum argCount = 1 + argCount!(T[1..$]);
                }

                static assert(argCount!Args <= N, "Too many arguments in vector constructor");

                int index = 0;
                foreach(arg; args)
                {
                    static if (isAssignable!(T, typeof(arg)))
                    {
                        v[index] = arg;
                        index++; // has to be on its own line (DMD 2.068)
                    }
                    else static if (isVector!(typeof(arg)) && isAssignable!(T, arg._T))
                    {
                        mixin(generateLoopCode!("v[index + @] = arg[@];", arg._N)());
                        index += arg._N;
                    }
                    else
                        static assert(false, "Unrecognized argument in Vector constructor");
                }
                assert(index == N, "Bad arguments in Vector constructor");
            }
        }

        /// Hash of all components, so a Vector can be used as an AA key.
        size_t toHash() const nothrow @safe
        {
            size_t hash = 0;
            foreach (elem; v) {
                hash = elem.hashOf(hash);
            }
            return hash;
        }

        /// Assign a Vector from a compatible type.
        @nogc ref Vector opAssign(U)(U x) pure nothrow if (isAssignable!(T, U))
        {
            mixin(generateLoopCode!("v[@] = x;", N)()); // copy to each component
            return this;
        }

        /// Assign a Vector with a static array type.
        @nogc ref Vector opAssign(U)(U arr) pure nothrow if ((isStaticArray!(U) && isAssignable!(T, typeof(arr[0])) && (arr.length == N)))
        {
            mixin(generateLoopCode!("v[@] = arr[@];", N)());
            return this;
        }

        /// Assign with a dynamic array.
        /// Size is checked in debug-mode.
        @nogc ref Vector opAssign(U)(U arr) pure nothrow if (isDynamicArray!(U) && isAssignable!(T, typeof(arr[0])))
        {
            assert(arr.length == N);
            mixin(generateLoopCode!("v[@] = arr[@];", N)());
            return this;
        }

        /// Assign from a samey Vector.
        @nogc ref Vector opAssign(U)(U u) pure nothrow if (is(U : Vector))
        {
            v[] = u.v[];
            return this;
        }

        /// Assign from other vectors types (same size, compatible type).
        @nogc ref Vector opAssign(U)(U x) pure nothrow if (isVector!U
                                                       && isAssignable!(T, U._T)
                                                       && (!is(U: Vector))
                                                       && (U._N == _N))
        {
            mixin(generateLoopCode!("v[@] = x.v[@];", N)());
            return this;
        }

        /// Returns: a pointer to content.
        @nogc inout(T)* ptr() pure inout nothrow @property
        {
            return v.ptr;
        }

        /// Element-wise equality with another Vector.
        @nogc bool opEquals(U)(U other) pure const nothrow
            if (is(U : Vector))
        {
            for (int i = 0; i < N; ++i)
            {
                if (v[i] != other.v[i])
                {
                    return false;
                }
            }
            return true;
        }

        /// Equality with anything convertible to a Vector (converted first).
        @nogc bool opEquals(U)(U other) pure const nothrow
            if (isConvertible!U)
        {
            Vector conv = other;
            return opEquals(conv);
        }

        /// Component-wise unary operators: +, -, ~, !.
        @nogc Vector opUnary(string op)() pure const nothrow
            if (op == "+" || op == "-" || op == "~" || op == "!")
        {
            Vector res = void;
            mixin(generateLoopCode!("res.v[@] = " ~ op ~ " v[@];", N)());
            return res;
        }

        /// Component-wise compound assignment with another Vector.
        @nogc ref Vector opOpAssign(string op, U)(U operand) pure nothrow
            if (is(U : Vector))
        {
            mixin(generateLoopCode!("v[@] " ~ op ~ "= operand.v[@];", N)());
            return this;
        }

        /// Compound assignment with anything convertible to a Vector.
        @nogc ref Vector opOpAssign(string op, U)(U operand) pure nothrow if (isConvertible!U)
        {
            Vector conv = operand;
            return opOpAssign!op(conv);
        }

        /// Component-wise binary operators; scalars are broadcast to all lanes.
        @nogc Vector opBinary(string op, U)(U operand) pure const nothrow
            if (is(U: Vector) || (isConvertible!U))
        {
            Vector result = void;
            static if (is(U: T))
                mixin(generateLoopCode!("result.v[@] = cast(T)(v[@] " ~ op ~ " operand);", N)());
            else
            {
                Vector other = operand;
                mixin(generateLoopCode!("result.v[@] = cast(T)(v[@] " ~ op ~ " other.v[@]);", N)());
            }
            return result;
        }

        /// Binary operators with the Vector on the right-hand side.
        @nogc Vector opBinaryRight(string op, U)(U operand) pure const nothrow if (isConvertible!U)
        {
            Vector result = void;
            static if (is(U: T))
                mixin(generateLoopCode!("result.v[@] = cast(T)(operand " ~ op ~ " v[@]);", N)());
            else
            {
                Vector other = operand;
                mixin(generateLoopCode!("result.v[@] = cast(T)(other.v[@] " ~ op ~ " v[@]);", N)());
            }
            return result;
        }

        /// Mutable indexing of a component (unchecked beyond array bounds).
        @nogc ref T opIndex(size_t i) pure nothrow
        {
            return v[i];
        }

        /// Const indexing of a component.
        @nogc ref const(T) opIndex(size_t i) pure const nothrow
        {
            return v[i];
        }

        /// Assign one component by index.
        @nogc T opIndexAssign(U : T)(U x, size_t i) pure nothrow
        {
            return v[i] = x;
        }

        /// Implements swizzling.
        ///
        /// Example:
        /// ---
        /// vec4i vi = [4, 1, 83, 10];
        /// assert(vi.zxxyw == [83, 4, 4, 1, 10]);
        /// ---
        @nogc @property auto opDispatch(string op, U = void)() pure const nothrow if (isValidSwizzle!(op))
        {
            alias Vector!(T, op.length) returnType;
            returnType res = void;
            enum indexTuple = swizzleTuple!op;
            foreach(i, index; indexTuple)
                res.v[i] = v[index];
            return res;
        }

        /// Support swizzling assignment like in shader languages.
        ///
        /// Example:
        /// ---
        /// vec3f v = [0, 1, 2];
        /// v.yz = v.zx;
        /// assert(v == [0, 2, 0]);
        /// ---
        @nogc @property void opDispatch(string op, U)(U x) pure
            if ((op.length >= 2)
                && (isValidSwizzleUnique!op)                  // v.xyy will be rejected
                && is(typeof(Vector!(T, op.length)(x)))) // can be converted to a small vector of the right size
        {
            Vector!(T, op.length) conv = x;
            enum indexTuple = swizzleTuple!op;
            foreach(i, index; indexTuple)
                v[index] = conv[i];
        }

        /// Casting to small vectors of the same size.
        /// Example:
        /// ---
        /// vec4f vf;
        /// vec4d vd = cast!(vec4d)vf;
        /// ---
        @nogc U opCast(U)() pure const nothrow if (isVector!U && (U._N == _N))
        {
            U res = void;
            mixin(generateLoopCode!("res.v[@] = cast(U._T)v[@];", N)());
            return res;
        }

        /// Implement slices operator overloading.
        /// Allows to go back to slice world.
        /// Returns: length.
        @nogc int opDollar() pure const nothrow
        {
            return N;
        }

        /// Slice containing vector values
        /// Returns: a slice which covers the whole Vector.
        @nogc T[] opSlice() pure nothrow
        {
            return v[];
        }

        /// vec[a..b]
        @nogc T[] opSlice(int a, int b) pure nothrow
        {
            return v[a..b];
        }

        /// Squared Euclidean length of the Vector
        /// Returns: squared length.
        @nogc T squaredMagnitude() pure const nothrow
        {
            T sumSquares = 0;
            mixin(generateLoopCode!("sumSquares += v[@] * v[@];", N)());
            return sumSquares;
        }

        /// Squared Euclidean distance between this vector and another one
        /// Returns: squared Euclidean distance.
        @nogc T squaredDistanceTo(Vector v) pure const nothrow
        {
            return (v - this).squaredMagnitude();
        }

        static if (isFloatingPoint!T)
        {
            /// Euclidean length of the vector
            /// Returns: Euclidean length
            @nogc T magnitude() pure const nothrow
            {
                return sqrt(squaredMagnitude());
            }

            /// Inverse Euclidean length of the vector
            /// Returns: Inverse of Euclidean length.
            @nogc T inverseMagnitude() pure const nothrow
            {
                return 1 / sqrt(squaredMagnitude());
            }

            alias fastInverseLength = fastInverseMagnitude;
            /// Faster but less accurate inverse of Euclidean length.
            /// Returns: Inverse of Euclidean length.
            @nogc T fastInverseMagnitude() pure const nothrow
            {
                return inverseSqrt(squaredMagnitude());
            }

            /// Euclidean distance between this vector and another one
            /// Returns: Euclidean distance between this and other.
            @nogc T distanceTo(Vector other) pure const nothrow
            {
                return (other - this).magnitude();
            }

            /// In-place normalization.
            @nogc void normalize() pure nothrow
            {
                auto invMag = inverseMagnitude();
                mixin(generateLoopCode!("v[@] *= invMag;", N)());
            }

            /// Returns a normalized copy of this Vector
            /// Returns: Normalized vector.
            @nogc Vector normalized() pure const nothrow
            {
                Vector res = this;
                res.normalize();
                return res;
            }

            /// Faster but less accurate in-place normalization.
            @nogc void fastNormalize() pure nothrow
            {
                auto invLength = fastInverseMagnitude();
                mixin(generateLoopCode!("v[@] *= invLength;", N)());
            }

            /// Faster but less accurate vector normalization.
            /// Returns: Normalized vector.
            @nogc Vector fastNormalized() pure const nothrow
            {
                Vector res = this;
                res.fastNormalize();
                return res;
            }

            static if (N == 3)
            {
                /// Gets an orthogonal vector from a 3-dimensional vector.
                /// Doesn't normalize the output.
                /// Authors: Sam Hocevar
                /// See_also: Source at $(WEB lolengine.net/blog/2013/09/21/picking-orthogonal-vector-combing-coconuts).
                @nogc Vector getOrthogonalVector() pure const nothrow
                {
                    return abs(x) > abs(z) ? Vector(-y, x, 0.0) : Vector(0.0, -z, y);
                }
            }
        }
    }

    private
    {
        // Compile-time reflection hooks used by the constructor and free functions.
        enum _N = N;
        alias T _T;

        // define types that can be converted to this, but are not the same type
        template isConvertible(T)
        {
            enum bool isConvertible = (!is(T : Vector))
            && is(typeof(
                {
                    T x;
                    Vector v = x;
                }()));
        }

        // define types that can't be converted to this
        template isForeign(T)
        {
            enum bool isForeign = (!isConvertible!T) && (!is(T: Vector));
        }

        // True when `op` is a swizzle string over a single naming class
        // (xyzw or rgba, not mixed) with all letters valid for this N.
        template isValidSwizzle(string op, int lastSwizzleClass = -1)
        {
            static if (op.length == 0)
                enum bool isValidSwizzle = true;
            else
            {
                enum len = op.length;
                enum int swizzleClass = swizzleClassify!(op[0]);
                enum bool swizzleClassValid = (lastSwizzleClass == -1 || (swizzleClass == lastSwizzleClass));
                enum bool isValidSwizzle = (swizzleIndex!(op[0]) != -1)
                                         && swizzleClassValid
                                         && isValidSwizzle!(op[1..len], swizzleClass);
            }
        }

        // True if character c occurs anywhere in s.
        template searchElement(char c, string s)
        {
            static if (s.length == 0)
            {
                enum bool result = false;
            }
            else
            {
                enum string tail = s[1..s.length];
                enum bool result = (s[0] == c) || searchElement!(c, tail).result;
            }
        }

        // True if no character appears twice in s.
        template hasNoDuplicates(string s)
        {
            static if (s.length == 1)
            {
                enum bool result = true;
            }
            else
            {
                enum tail = s[1..s.length];
                enum bool result = !(searchElement!(s[0], tail).result) && hasNoDuplicates!(tail).result;
            }
        }

        // true if the swizzle has at the maximum one time each letter
        template isValidSwizzleUnique(string op)
        {
            static if (isValidSwizzle!op)
                enum isValidSwizzleUnique = hasNoDuplicates!op.result;
            else
                enum bool isValidSwizzleUnique = false;
        }

        // Maps a swizzle letter to a component index, or -1 if invalid for this N.
        template swizzleIndex(char c)
        {
            static if((c == 'x' || c == 'r') && N >= 1)
                enum swizzleIndex = 0;
            else static if((c == 'y' || c == 'g') && N >= 2)
                enum swizzleIndex = 1;
            else static if((c == 'z' || c == 'b') && N >= 3)
                enum swizzleIndex = 2;
            else static if ((c == 'w' || c == 'a') && N >= 4)
                enum swizzleIndex = 3;
            else
                enum swizzleIndex = -1;
        }

        // 0 for the xyzw naming class, 1 for rgba, -1 for anything else.
        template swizzleClassify(char c)
        {
            static if(c == 'x' || c == 'y' || c == 'z' || c == 'w')
                enum swizzleClassify = 0;
            else static if(c == 'r' || c == 'g' || c == 'b' || c == 'a')
                enum swizzleClassify = 1;
            else
                enum swizzleClassify = -1;
        }

        // Array of component indices for a whole swizzle string.
        template swizzleTuple(string op)
        {
            enum opLength = op.length;
            static if (op.length == 0)
                enum swizzleTuple = [];
            else
                enum swizzleTuple = [ swizzleIndex!(op[0]) ] ~ swizzleTuple!(op[1..op.length]);
        }
    }
}
/// True if `T` is some kind of `Vector`
/// (any instantiation, regardless of element type or dimension).
enum isVector(T) = is(T : Vector!U, U...);

///
unittest
{
    static assert(isVector!vec2f);
    static assert(isVector!vec3d);
    static assert(isVector!(vec4!real));
    static assert(!isVector!float);
}
/// Get the numeric type used to measure a vector's coordinates
/// (the `T` of a `Vector!(T, N)`).
alias DimensionType(T : Vector!U, U...) = U[0];

///
unittest
{
    static assert(is(DimensionType!vec2f == float));
    static assert(is(DimensionType!vec3d == double));
}
/// 2-component vector.
template vec2(T) { alias Vector!(T, 2) vec2; }
/// 3-component vector.
template vec3(T) { alias Vector!(T, 3) vec3; }
/// 4-component vector.
template vec4(T) { alias Vector!(T, 4) vec4; }

alias vec2!int vec2i;  ///
alias vec2!float vec2f;  ///
alias vec2!double vec2d;  ///

alias vec3!int vec3i;  ///
alias vec3!float vec3f;  ///
alias vec3!double vec3d;  ///

alias vec4!int vec4i;  ///
alias vec4!float vec4f;  ///
alias vec4!double vec4d;  ///
/// Element-wise minimum.
/// Returns: vector whose each component is min(a[i], b[i]).
@nogc Vector!(T, N) minByElem(T, int N)(const Vector!(T, N) a, const Vector!(T, N) b) pure nothrow
{
    import std.algorithm: min;
    Vector!(T, N) res = void;
    mixin(generateLoopCode!("res.v[@] = min(a.v[@], b.v[@]);", N)());
    return res;
}
/// Element-wise maximum.
/// Returns: vector whose each component is max(a[i], b[i]).
@nogc Vector!(T, N) maxByElem(T, int N)(const Vector!(T, N) a, const Vector!(T, N) b) pure nothrow
{
    import std.algorithm: max;
    Vector!(T, N) res = void;
    mixin(generateLoopCode!("res.v[@] = max(a.v[@], b.v[@]);", N)());
    return res;
}
/// Element-wise absolute value.
/// Returns: vector whose each component is abs(a[i]).
@nogc Vector!(T, N) absByElem(T, int N)(const Vector!(T, N) a) pure nothrow
{
    Vector!(T, N) res = void;
    mixin(generateLoopCode!("res.v[@] = abs(a.v[@]);", N)());
    return res;
}
/// Dot product of two vectors
/// Returns: Dot product (sum of component-wise products).
@nogc T dot(T, int N)(const Vector!(T, N) a, const Vector!(T, N) b) pure nothrow
{
    T sum = 0;
    mixin(generateLoopCode!("sum += a.v[@] * b.v[@];", N)());
    return sum;
}
/// Cross product of two 3D vectors
/// Returns: 3D cross product (right-handed: a x b).
/// Thanks to vuaru for corrections.
@nogc Vector!(T, 3) cross(T)(const Vector!(T, 3) a, const Vector!(T, 3) b) pure nothrow
{
    return Vector!(T, 3)(a.y * b.z - a.z * b.y,
                         a.z * b.x - a.x * b.z,
                         a.x * b.y - a.y * b.x);
}
/// 3D reflect, like the GLSL function.
/// `b` must already be normalized for a geometrically correct reflection.
/// Returns: a reflected by normal b.
@nogc Vector!(T, N) reflect(T, int N)(const Vector!(T, N) a, const Vector!(T, N) b) pure nothrow
{
    return a - (2 * dot(b, a)) * b;
}

///
@nogc unittest
{
    // reflect a 2D vector across the x axis (the normal points along the y axis)
    assert(vec2f(1,1).reflect(vec2f(0,1)) == vec2f(1,-1));
    assert(vec2f(1,1).reflect(vec2f(0,-1)) == vec2f(1,-1));

    // note that the normal must be, well, normalized:
    assert(vec2f(1,1).reflect(vec2f(0,20)) != vec2f(1,-1));

    // think of this like a ball hitting a flat floor at an angle.
    // the x and y components remain unchanged, and the z inverts
    assert(vec3f(2,3,-0.5).reflect(vec3f(0,0,1)) == vec3f(2,3,0.5));
}
/// Angle between two vectors
/// Uses the asin formulation rather than acos(dot) for better numerical
/// accuracy near 0 and PI.
/// Returns: angle between vectors, in radians.
/// See_also: "The Right Way to Calculate Stuff" at $(WEB www.plunk.org/~hatch/rightway.php)
@nogc T angleBetween(T, int N)(const Vector!(T, N) a, const Vector!(T, N) b) pure nothrow
{
    auto aN = a.normalized();
    auto bN = b.normalized();
    auto dp = dot(aN, bN);

    if (dp < 0)
        return T(PI) - 2 * asin((-bN-aN).magnitude / 2);
    else
        return 2 * asin((bN-aN).magnitude / 2);
}
// Layout sanity checks: the anonymous union must introduce no padding.
static assert(vec2f.sizeof == 8);
static assert(vec3d.sizeof == 24);
static assert(vec4i.sizeof == 16);

// Coverage for swizzle validation, construction, operators, casts and hashing.
unittest
{
    static assert(vec2i.isValidSwizzle!"xyx");
    static assert(!vec2i.isValidSwizzle!"xyz");
    static assert(vec4i.isValidSwizzle!"brra");
    static assert(!vec4i.isValidSwizzle!"rgyz");
    static assert(vec2i.isValidSwizzleUnique!"xy");
    static assert(vec2i.isValidSwizzleUnique!"yx");
    static assert(!vec2i.isValidSwizzleUnique!"xx");

    alias vec2l = vec2!long;
    alias vec3ui = vec3!uint;
    alias vec4ub = vec4!ubyte;

    assert(vec2l(0, 1) == vec2i(0, 1));

    int[2] arr = [0, 1];
    int[] arr2 = new int[2];
    arr2[] = arr[];
    vec2i a = vec2i([0, 1]);
    vec2i a2 = vec2i(0, 1);
    immutable vec2i b = vec2i(0);
    assert(b[0] == 0 && b[1] == 0);
    vec2i c = arr;
    vec2l d = arr2;
    assert(a == a2);
    assert(a == c);
    assert(vec2l(a) == vec2l(a));
    assert(vec2l(a) == d);

    // usable as an associative-array key
    int[vec2i] hashMap;
    hashMap[a] = (c - a).squaredMagnitude;
    assert(hashMap[a] == (c - a).squaredMagnitude);

    vec4i x = [4, 5, 6, 7];
    assert(x == x);
    --x[0];
    assert(x[0] == 3);
    ++x[0];
    assert(x[0] == 4);
    x[1] &= 1;
    x[2] = 77 + x[2];
    x[3] += 3;
    assert(x == [4, 1, 83, 10]);
    assert(x.xxywz == [4, 4, 1, 10, 83]);
    assert(x.xxxxxxx == [4, 4, 4, 4, 4, 4, 4]);
    assert(x.abgr == [10, 83, 1, 4]);
    assert(a != b);
    x = vec4i(x.xyz, 166);
    assert(x == [4, 1, 83, 166]);

    vec2l e = a;
    vec2l f = a + b;
    assert(f == vec2l(a));

    vec3ui g = vec3i(78,9,4);
    g ^= vec3i(78,9,4);
    assert(g == vec3ui(0));
    //g[0..2] = 1u;
    //assert(g == [2, 1, 0]);

    assert(vec2i(4, 5) + 1 == vec2i(5,6));
    assert(vec2i(4, 5) - 1 == vec2i(3,4));
    assert(1 + vec2i(4, 5) == vec2i(5,6));
    assert(vec3f(1,1,1) * 0 == 0);
    assert(1.0 * vec3d(4,5,6) == vec3f(4,5.0f,6.0));

    auto dx = vec2i(1,2);
    auto dy = vec2i(4,5);
    auto dp = dot(dx, dy);
    assert(dp == 14 );

    vec3i h = cast(vec3i)(vec3d(0.5, 1.1, -2.2));
    assert(h == [0, 1, -2]);
    assert(h[] == [0, 1, -2]);
    assert(h[1..3] == [1, -2]);
    assert(h.zyx == [-2, 1, 0]);

    h.yx = vec2i(5, 2); // swizzle assignment
    assert(h.xy == [2, 5]);
    assert(-h[1] == -5);
    assert(++h[0] == 3);

    //assert(h == [-2, 1, 0]);
    assert(!__traits(compiles, h.xx = h.yy));
    vec4ub j;

    // larger vectors
    alias Vector!(float, 5) vec5f;
    vec5f l = vec5f(1, 2.0f, 3.0, 4u, 5.0L);
    l = vec5f(l.xyz, vec2i(1, 2));

    // the ctor should not compile if given too many arguments
    static assert(!is(typeof(vec2f(1, 2, 3))));
    static assert(!is(typeof(vec2f(vec2f(1, 2), 3))));
    static assert( is(typeof(vec3f(vec2f(1, 2), 3))));
    static assert( is(typeof(vec3f(1, 2, 3))));

    assert(absByElem(vec3i(-1, 0, 2)) == vec3i(1, 0, 2));
}
private:
/// SSE approximation of reciprocal square root.
/// For `float`, uses the RSQRTSS instruction (fast, ~12-bit precision);
/// other floating-point types fall back to an exact 1/sqrt(x).
@nogc T inverseSqrt(T)(T x) pure nothrow if (isFloatingPoint!T)
{
    static if (is(T == float))
    {
        __m128 V = _mm_set_ss(x);
        V = _mm_rsqrt_ss(V);
        return _mm_cvtss_f32(V);
    }
    else
    {
        return 1 / sqrt(x);
    }
}
package
{
    // This generates small loops for Vector, Matrix, and Box.
    // Time has shown such sort of manually unrolled code works best on both DMD and LDC.
    //
    // Every '@' in formatString is replaced with the loop index; the resulting
    // statement is emitted N times, for indices 0 .. N-1.
    static string generateLoopCode(string formatString, int N)() pure nothrow
    {
        string result;
        for (int i = 0; i < N; ++i)
        {
            string index = ctIntToString(i);
            // replace all @ by indices
            int after = 0;
            int cur = 0;
            for (; cur < formatString.length; ++cur)
            {
                char ch = formatString[cur];
                if (ch == '@')
                {
                    if (cur > after)
                        result ~= formatString[after..cur];
                    result ~= index;
                    after = cur+1;
                }
            }
            if (cur > after)
                result ~= formatString[after..cur];
        }
        return result;
    }

    // Speed-up CTFE conversions, replacement for std.conv
    // Doesn't do the negatives.
    static string ctIntToString(int n) pure nothrow
    {
        // Fix: the table holds exactly the ten digit strings "0".."9".
        // It was previously declared string[16] with only 10 initializers,
        // which is a mismatched static-array initialization.
        static immutable string[10] table = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"];
        if (n < 10)
            return table[n];
        else
        {
            // Build digits right-to-left; int.max has at most 10 digits.
            char[10] r;
            for (int k = 0; k < 10; ++k)
            {
                r[9-k] = cast(char)('0' + n % 10);
                n /= 10;
                if (n == 0)
                    return r[9-k..$].idup;
            }
            return r.idup;
        }
    }
}
// Checks both the multi-digit path and the int.max (10-digit) boundary.
unittest
{
    assert(ctIntToString(132) == "132");
    assert(ctIntToString(2147483647) == "2147483647");
}

5011
external/inteli/avx2intrin.d vendored Normal file

File diff suppressed because it is too large Load Diff

4990
external/inteli/avxintrin.d vendored Normal file

File diff suppressed because it is too large Load Diff

363
external/inteli/bmi2intrin.d vendored Normal file
View File

@ -0,0 +1,363 @@
/**
* BMI2 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=BMI2
*
* Copyright: Copyright Johan Engelen 2021.
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.bmi2intrin;
import inteli.internals;
nothrow @nogc pure @safe:
/// Copy all bits from unsigned 32-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index.
/// Uses the hardware BZHI instruction on GDC/LDC with BMI2; falls back to a
/// portable helper otherwise and during CTFE.
uint _bzhi_u32 (uint a, uint index)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        if (!__ctfe)
            return __builtin_ia32_bzhi_si(a, index);
        else
            return bzhi!uint(a, index);
    }
    else
    {
        return bzhi!uint(a, index);
    }
}
unittest
{
    // static assert exercises the CTFE path, plain assert the runtime path
    static assert (_bzhi_u32(0x1234_5678, 5) == 0x18);
    assert (_bzhi_u32(0x1234_5678, 5) == 0x18);
    static assert (_bzhi_u32(0x1234_5678, 10) == 0x278);
    assert (_bzhi_u32(0x1234_5678, 10) == 0x278);
    static assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678);
    assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678);
}
/// Copy all bits from unsigned 64-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index.
/// The 64-bit builtin is only emitted on X86_64; 32-bit x86 and CTFE use the
/// portable helper.
ulong _bzhi_u64 (ulong a, uint index)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        if (!__ctfe)
        {
            version(X86_64)
            {
                // This instruction not available in 32-bit x86.
                return __builtin_ia32_bzhi_di(a, index);
            }
            else
                return bzhi!ulong(a, index);
        }
        else
            return bzhi!ulong(a, index);
    }
    else
    {
        return bzhi!ulong(a, index);
    }
}
unittest
{
    // static assert exercises the CTFE path, plain assert the runtime path
    static assert (_bzhi_u64(0x1234_5678, 5) == 0x18);
    assert (_bzhi_u64(0x1234_5678, 5) == 0x18);
    static assert (_bzhi_u64(0x1234_5678, 10) == 0x278);
    assert (_bzhi_u64(0x1234_5678, 10) == 0x278);
    static assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678);
    assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678);
    static assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
    assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
}
// Helper function for BZHI
// Portable implementation used for CTFE and targets without BMI2.
private T bzhi(T)(T a, uint index)
{
    /+
    n := index[7:0]
    dst := a
    IF (n < number of bits)
    dst[MSB:n] := 0
    FI
    +/
    enum totalBits = T.sizeof * 8;
    if (index >= totalBits)
        return a; // index at or past the top bit: nothing to clear
    // Keep only the low `index` bits.
    const T lowMask = cast(T)((T(1) << index) - 1);
    return a & lowMask;
}
/// Multiply unsigned 32-bit integers `a` and `b`, store the low 32-bits of the result in dst,
/// and store the high 32-bits in `hi`. This does not read or write arithmetic flags.
/// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say.
/// But, those particular semantics don't exist at the level of intrinsics.
uint _mulx_u32 (uint a, uint b, uint* hi)
{
    // Note: that does NOT generate mulx with LDC, and there seems to be no way to do that for
    // some reason, even with LLVM IR.
    // Also same with GDC.
    const ulong wide = cast(ulong) a * b;
    *hi = cast(uint) (wide >>> 32);
    return cast(uint) (wide & 0xFFFF_FFFF);
}
// @system because it passes a pointer for the high half.
@system unittest
{
    uint hi;
    assert (_mulx_u32(0x1234_5678, 0x1234_5678, &hi) == 0x1DF4_D840);
    assert (hi == 0x014B_66DC);
}
/// Multiply unsigned 64-bit integers `a` and `b`, store the low 64-bits of the result in dst, and
/// store the high 64-bits in `hi`. This does not read or write arithmetic flags.
/// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say.
/// But, those particular semantics don't exist at the level of intrinsics.
ulong _mulx_u64 (ulong a, ulong b, ulong* hi)
{
    /+
    dst[63:0] := (a * b)[63:0]
    MEM[hi+63:hi] := (a * b)[127:64]
    +/

    // The 128-bit inline-IR path needs LDC >= 2.094 with optimizations enabled.
    static if (LDC_with_optimizations)
    {
        static if (__VERSION__ >= 2094)
            enum bool withLDCIR = true;
        else
            enum bool withLDCIR = false;
    }
    else
    {
        enum bool withLDCIR = false;
    }

    static if (withLDCIR)
    {
        // LDC x86: Generates mulx from -O0
        enum ir = `
            %4 = zext i64 %0 to i128
            %5 = zext i64 %1 to i128
            %6 = mul nuw i128 %5, %4
            %7 = lshr i128 %6, 64
            %8 = trunc i128 %7 to i64
            store i64 %8, i64* %2, align 8
            %9 = trunc i128 %6 to i64
            ret i64 %9`;
        return LDCInlineIR!(ir, ulong, ulong, ulong, ulong*)(a, b, hi);
    }
    else
    {
        /+ Straight-forward implementation with `ucent`:
        ucent result = cast(ucent) a * b;
        *hi = cast(ulong) ((result >>> 64) & 0xFFFF_FFFF_FFFF_FFFF);
        return cast(ulong) (result & 0xFFFF_FFFF_FFFF_FFFF);
        +/
        /+
        Implementation using 64bit math is more complex...
        a * b = (a_high << 32 + a_low) * (b_high << 32 + b_low)
              = (a_high << 32)*(b_high << 32) + (a_high << 32)*b_low + a_low* (b_high << 32) + a_low*b_low
              = (a_high*b_high) << 64 + (a_high*b_low) << 32 + (a_low*b_high) << 32 + a_low*b_low
              = c2 << 64 + c11 << 32 + c12 << 32 + c0
              = z1 << 64 + z0
        // The sums may overflow, so we need to carry the carry (from low 64bits to high 64bits). We can do that
        // by separately creating the sum to get the high 32 bits of z0 using 64bit math. The high 32 bits of that
        // intermediate result is then the 'carry' that we need to add when calculating z1's sum.
        z0 = (c0 & 0xFFFF_FFFF) + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) << 32
        The carry part from z0's sum = (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32
        z1 = c2 + (c11 >> 32 + c12 >> 32 + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32
        +/
        const ulong a_low = a & 0xFFFF_FFFF;
        const ulong a_high = a >>> 32;
        const ulong b_low = b & 0xFFFF_FFFF;
        const ulong b_high = b >>> 32;

        // the four 32x32 -> 64 partial products
        const ulong c2 = a_high*b_high;
        const ulong c11 = a_high*b_low;
        const ulong c12 = a_low*b_high;
        const ulong c0 = a_low*b_low;

        const ulong common_term = (c0 >> 32) + (c11 & 0xFFFF_FFFF) + (c12 & 0xFFFF_FFFF);
        const ulong z0 = (c0 & 0xFFFF_FFFF) + (common_term << 32);
        const ulong z1 = c2 + (c11 >> 32) + (c12 >> 32) + (common_term >> 32);

        *hi = z1;
        return z0;
    }
}
@system unittest
{
    ulong hi;
    // 0x1234_5678_9ABC_DEF0 * 0x1234_5678_9ABC_DEF0 == 0x14b_66dc_33f6_acdc_a5e2_0890_f2a5_2100
    assert (_mulx_u64(0x1234_5678_9ABC_DEF0, 0x1234_5678_9ABC_DEF0, &hi) == 0xa5e2_0890_f2a5_2100);
    assert (hi == 0x14b_66dc_33f6_acdc);
}
/// Deposit contiguous low bits from unsigned 32-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero.
/// Uses the PDEP instruction on GDC/LDC with BMI2; portable helper otherwise
/// and during CTFE.
uint _pdep_u32 (uint a, uint mask)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        if (!__ctfe)
            return __builtin_ia32_pdep_si(a, mask);
        else
            return pdep!uint(a, mask);
    }
    else
    {
        return pdep!uint(a, mask);
    }
}
unittest
{
    // static assert exercises the CTFE path, plain assert the runtime path
    static assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708);
    assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708);
}
/// Deposit contiguous low bits from unsigned 64-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero.
/// The 64-bit builtin is only emitted on X86_64; 32-bit x86 and CTFE use the
/// portable helper.
ulong _pdep_u64 (ulong a, ulong mask)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        if (!__ctfe)
        {
            version(X86_64)
            {
                // This instruction not available in 32-bit x86.
                return __builtin_ia32_pdep_di(a, mask);
            }
            else
                return pdep!ulong(a, mask);
        }
        else
            return pdep!ulong(a, mask);
    }
    else
    {
        return pdep!ulong(a, mask);
    }
}
unittest
{
    // static assert exercises the CTFE path, plain assert the runtime path
    static assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201);
    assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201);
}
// Helper function for PDEP (software fallback, CTFE-compatible).
// Deposits the contiguous low bits of `a` into the positions of the set bits
// of `mask`, least-significant mask bit first; all other result bits are zero.
// Iterates only over the set bits of `mask` by repeatedly isolating and
// clearing the lowest one.
private T pdep(T)(T a, T mask)
{
    T result = 0;
    T sourceBit = 1;     // next bit of `a` to deposit
    T remaining = mask;  // mask bits not yet consumed
    while (remaining != 0)
    {
        // Isolate the lowest set bit of the remaining mask.
        const T target = remaining & cast(T)(~remaining + 1);
        if (a & sourceBit)
            result |= target;
        sourceBit <<= 1;
        remaining = cast(T)(remaining & (remaining - 1)); // clear that mask bit
    }
    return result;
}
/// Extract bits from unsigned 32-bit integer `a` at the corresponding bit locations specified by
/// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero.
uint _pext_u32 (uint a, uint mask)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        if (!__ctfe)
            return __builtin_ia32_pext_si(a, mask); // hardware PEXT (BMI2)
        else
            return pext!uint(a, mask); // software fallback, usable at compile time
    }
    else
    {
        // No BMI2 builtin available: always use the software fallback.
        return pext!uint(a, mask);
    }
}
unittest
{
    // static assert exercises the CTFE path, the plain assert the runtime path.
    static assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468);
    assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468);
}
/// Extract bits from unsigned 64-bit integer `a` at the corresponding bit locations specified by
/// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero.
ulong _pext_u64 (ulong a, ulong mask)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        if (!__ctfe)
        {
            version(X86_64)
            {
                // This instruction not available in 32-bit x86.
                return __builtin_ia32_pext_di(a, mask);
            }
            else
                return pext!ulong(a, mask);
        }
        else
            return pext!ulong(a, mask); // CTFE path: software fallback
    }
    else
    {
        // No BMI2 builtin available: always use the software fallback.
        return pext!ulong(a, mask);
    }
}
unittest
{
    // static assert exercises the CTFE path, the plain assert the runtime path.
    static assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531);
    assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531);
}
// Helper function for PEXT (software fallback, CTFE-compatible).
// Gathers the bits of `a` located at the set bits of `mask` and packs them
// into the contiguous low bits of the result, least-significant mask bit
// first; the remaining upper result bits are zero.
// Iterates only over the set bits of `mask` by repeatedly isolating and
// clearing the lowest one.
private T pext(T)(T a, T mask)
{
    T result = 0;
    T destBit = 1;       // next result position to fill
    T remaining = mask;  // mask bits not yet consumed
    while (remaining != 0)
    {
        // Isolate the lowest set bit of the remaining mask.
        const T source = remaining & cast(T)(~remaining + 1);
        if (a & source)
            result |= destBit;
        destBit <<= 1;
        remaining = cast(T)(remaining & (remaining - 1)); // clear that mask bit
    }
    return result;
}

5773
external/inteli/emmintrin.d vendored Normal file

File diff suppressed because it is too large Load Diff

1988
external/inteli/internals.d vendored Normal file

File diff suppressed because it is too large Load Diff

350
external/inteli/math.d vendored Normal file
View File

@ -0,0 +1,350 @@
/**
* Transcendental bonus functions.
*
 * Copyright: Copyright Guillaume Piolat 2016-2020.
* Copyright (C) 2007 Julien Pommier
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.math;
/* Copyright (C) 2007 Julien Pommier
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
(this is the zlib license)
*/
import inteli.emmintrin;
import inteli.internals;
nothrow @nogc:
/// Natural `log` computed for a single 32-bit float.
/// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50
/// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
// #BONUS
float _mm_log_ss(float v) pure @safe
{
    // Broadcast the scalar, take the 4-wide log, and return lane 0.
    return _mm_log_ps(_mm_set1_ps(v)).array[0];
}
/// Natural logarithm computed for 4 simultaneous float.
/// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50
/// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
// #BONUS
__m128 _mm_log_ps(__m128 x) pure @safe
{
    // Coefficients from Julien Pommier's cephes-derived SSE code (see module header).
    static immutable __m128i _psi_inv_mant_mask = [~0x7f800000, ~0x7f800000, ~0x7f800000, ~0x7f800000];
    static immutable __m128 _ps_cephes_SQRTHF = [0.707106781186547524, 0.707106781186547524, 0.707106781186547524, 0.707106781186547524];
    static immutable __m128 _ps_cephes_log_p0 = [7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2];
    static immutable __m128 _ps_cephes_log_p1 = [- 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1];
    static immutable __m128 _ps_cephes_log_p2 = [1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1];
    static immutable __m128 _ps_cephes_log_p3 = [- 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1];
    static immutable __m128 _ps_cephes_log_p4 = [+ 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1];
    static immutable __m128 _ps_cephes_log_p5 = [- 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1];
    static immutable __m128 _ps_cephes_log_p6 = [+ 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1];
    static immutable __m128 _ps_cephes_log_p7 = [- 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1];
    static immutable __m128 _ps_cephes_log_p8 = [+ 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1];
    static immutable __m128 _ps_cephes_log_q1 = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4];
    static immutable __m128 _ps_cephes_log_q2 = [0.693359375, 0.693359375, 0.693359375, 0.693359375];
    /* the smallest non denormalized float number */
    static immutable __m128i _psi_min_norm_pos = [0x00800000, 0x00800000, 0x00800000, 0x00800000];

    __m128i emm0;
    __m128 one = _ps_1;
    // Remember which lanes are <= 0; they are forced to NaN at the end.
    __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
    x = _mm_max_ps(x, cast(__m128)_psi_min_norm_pos); /* cut off denormalized stuff */
    // Extract the IEEE-754 biased exponent field (bits 23..30).
    emm0 = _mm_srli_epi32(cast(__m128i)x, 23);
    /* keep only the fractional part */
    x = _mm_and_ps(x, cast(__m128)_psi_inv_mant_mask);
    x = _mm_or_ps(x, _ps_0p5);
    // Remove the exponent bias (127).
    emm0 = _mm_sub_epi32(emm0, _pi32_0x7f);
    __m128 e = _mm_cvtepi32_ps(emm0);
    e += one;
    // For lanes with mantissa < sqrt(1/2), shift one power of two from the
    // exponent into the mantissa so the polynomial argument stays centered.
    __m128 mask = _mm_cmplt_ps(x, _ps_cephes_SQRTHF);
    __m128 tmp = _mm_and_ps(x, mask);
    x -= one;
    e -= _mm_and_ps(one, mask);
    x += tmp;
    __m128 z = x * x;
    // Degree-8 polynomial evaluated with Horner's scheme.
    __m128 y = _ps_cephes_log_p0;
    y *= x;
    y += _ps_cephes_log_p1;
    y *= x;
    y += _ps_cephes_log_p2;
    y *= x;
    y += _ps_cephes_log_p3;
    y *= x;
    y += _ps_cephes_log_p4;
    y *= x;
    y += _ps_cephes_log_p5;
    y *= x;
    y += _ps_cephes_log_p6;
    y *= x;
    y += _ps_cephes_log_p7;
    y *= x;
    y += _ps_cephes_log_p8;
    y *= x;
    y = y * z;
    // Recombine mantissa polynomial with e * ln(2), split in two parts
    // (q1 + q2 == ln 2) for extra precision.
    tmp = e * _ps_cephes_log_q1;
    y += tmp;
    tmp = z * _ps_0p5;
    y = y - tmp;
    tmp = e * _ps_cephes_log_q2;
    x += y;
    x += tmp;
    x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
    return x;
}
/// Natural `exp` computed for a single float.
/// This is an approximation, valid up to approximately -109dB of accuracy
/// IMPORTANT: NaN input not supported.
// #BONUS
float _mm_exp_ss(float v) pure @safe
{
    // Broadcast the scalar, take the 4-wide exp, and return lane 0.
    return _mm_exp_ps(_mm_set1_ps(v)).array[0];
}
/// Natural `exp` computed for 4 simultaneous float in `x`.
/// This is an approximation, valid up to approximately -109dB of accuracy
/// IMPORTANT: NaN input not supported.
// #BONUS
__m128 _mm_exp_ps(__m128 x) pure @safe
{
    // Coefficients from Julien Pommier's cephes-derived SSE code (see module header).
    static immutable __m128 _ps_exp_hi = [88.3762626647949f, 88.3762626647949f, 88.3762626647949f, 88.3762626647949f];
    static immutable __m128 _ps_exp_lo = [-88.3762626647949f, -88.3762626647949f, -88.3762626647949f, -88.3762626647949f];
    static immutable __m128 _ps_cephes_LOG2EF = [1.44269504088896341, 1.44269504088896341, 1.44269504088896341, 1.44269504088896341];
    static immutable __m128 _ps_cephes_exp_C1 = [0.693359375, 0.693359375, 0.693359375, 0.693359375];
    static immutable __m128 _ps_cephes_exp_C2 = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4];
    static immutable __m128 _ps_cephes_exp_p0 = [1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4];
    static immutable __m128 _ps_cephes_exp_p1 = [1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3];
    static immutable __m128 _ps_cephes_exp_p2 = [8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3];
    static immutable __m128 _ps_cephes_exp_p3 = [4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2];
    static immutable __m128 _ps_cephes_exp_p4 = [1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1];
    static immutable __m128 _ps_cephes_exp_p5 = [5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1];

    __m128 tmp = _mm_setzero_ps(), fx;
    __m128i emm0;
    __m128 one = _ps_1;
    // Clamp the argument to avoid overflowing the 2^n reconstruction below.
    x = _mm_min_ps(x, _ps_exp_hi);
    x = _mm_max_ps(x, _ps_exp_lo);
    /* express exp(x) as exp(g + n*log(2)) */
    fx = x * _ps_cephes_LOG2EF;
    fx += _ps_0p5;
    /* how to perform a floorf with SSE: just below */
    emm0 = _mm_cvttps_epi32(fx);
    tmp = _mm_cvtepi32_ps(emm0);
    /* if greater, subtract 1 */
    __m128 mask = _mm_cmpgt_ps(tmp, fx);
    mask = _mm_and_ps(mask, one);
    fx = tmp - mask;
    // Subtract n*ln(2) in two parts (C1 + C2 == ln 2) for extra precision.
    tmp = fx * _ps_cephes_exp_C1;
    __m128 z = fx * _ps_cephes_exp_C2;
    x -= tmp;
    x -= z;
    z = x * x;
    // Degree-5 polynomial evaluated with Horner's scheme.
    __m128 y = _ps_cephes_exp_p0;
    y *= x;
    y += _ps_cephes_exp_p1;
    y *= x;
    y += _ps_cephes_exp_p2;
    y *= x;
    y += _ps_cephes_exp_p3;
    y *= x;
    y += _ps_cephes_exp_p4;
    y *= x;
    y += _ps_cephes_exp_p5;
    y *= z;
    y += x;
    y += one;
    /* build 2^n */
    emm0 = _mm_cvttps_epi32(fx);
    emm0 = _mm_add_epi32(emm0, _pi32_0x7f); // re-bias the exponent
    emm0 = _mm_slli_epi32(emm0, 23);        // move it into the float exponent field
    __m128 pow2n = cast(__m128)emm0;
    y *= pow2n;
    return y;
}
/// Computes `base^exponent` for a single 32-bit float.
/// This is an approximation, valid up to approximately -100dB of accuracy
/// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
// #BONUS
float _mm_pow_ss(float base, float exponent) pure @safe
{
    // Broadcast both scalars, take the 4-wide pow, and return lane 0.
    return _mm_pow_ps(_mm_set1_ps(base), _mm_set1_ps(exponent)).array[0];
}
/// Computes `base^exponent`, for 4 floats at once.
/// This is an approximation, valid up to approximately -100dB of accuracy
/// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
// #BONUS
__m128 _mm_pow_ps(__m128 base, __m128 exponents) pure @safe
{
    // pow(b, e) == exp(e * log(b))
    __m128 logBase = _mm_log_ps(base);
    return _mm_exp_ps(exponents * logBase);
}
/// Computes `base^exponent`, for 4 floats at once.
/// This is an approximation, valid up to approximately -100dB of accuracy
/// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
// #BONUS
__m128 _mm_pow_ps(__m128 base, float exponent) pure @safe
{
    // Same identity as the vector-exponent overload, with the scalar broadcast.
    __m128 e = _mm_set1_ps(exponent);
    return _mm_exp_ps(e * _mm_log_ps(base));
}
unittest
{
    import std.math;

    // Relative-error comparison used by all accuracy tests below.
    bool approxEquals(double groundTruth, double approx, double epsilon) pure @trusted @nogc nothrow
    {
        if (!isFinite(groundTruth))
            return true; // no need to approximate where this is NaN or infinite
        if (groundTruth == 0) // the approximation should produce zero too if needed
        {
            return approx == 0;
        }
        if (approx == 0)
        {
            // If the approximation produces zero, the error should be below 140 dB
            return ( abs(groundTruth) < 1e-7 );
        }
        if ( ( abs(groundTruth / approx) - 1 ) >= epsilon)
        {
            import core.stdc.stdio;
            debug printf("approxEquals (%g, %g, %g) failed\n", groundTruth, approx, epsilon);
            debug printf("ratio is %f\n", abs(groundTruth / approx) - 1);
        }
        return ( abs(groundTruth / approx) - 1 ) < epsilon;
    }

    // test _mm_log_ps
    for (double mantissa = 0.1; mantissa < 1.0; mantissa += 0.05)
    {
        foreach (exponent; -23..23)
        {
            double x = mantissa * 2.0 ^^ exponent;
            double phobosValue = log(x);
            __m128 v = _mm_log_ps(_mm_set1_ps(x));
            foreach(i; 0..4)
                assert(approxEquals(phobosValue, v.array[i], 1.1e-6));
        }
    }

    // test _mm_exp_ps
    for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.1)
    {
        foreach (exponent; -23..23)
        {
            double x = mantissa * 2.0 ^^ exponent;
            // don't test too high numbers because they saturate FP precision pretty fast
            if (x > 50) continue;
            double phobosValue = exp(x);
            __m128 v = _mm_exp_ps(_mm_set1_ps(x));
            foreach(i; 0..4)
            {
                if (!approxEquals(phobosValue, v.array[i], 3.4e-6))
                {
                    import core.stdc.stdio;
                    // Fix: format string previously read "%fn" (missing '\n' escape).
                    printf("x = %f truth = %f vs estimate = %f\n", x, phobosValue, v.array[i]);
                    assert(false);
                }
            }
        }
    }

    // test that exp(-inf) is 0
    {
        __m128 R = _mm_exp_ps(_mm_set1_ps(-float.infinity));
        float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
        assert(R.array == correct);
    }

    // test log behaviour with NaN and infinities
    // the only guarantee for now is that _mm_log_ps(negative) yield a NaN
    {
        __m128 R = _mm_log_ps(_mm_setr_ps(+0.0f, -0.0f, -1.0f, float.nan));
        // DOESN'T PASS
        // assert(isInfinity(R[0]) && R[0] < 0); // log(+0.0f) = -infinity
        // DOESN'T PASS
        // assert(isInfinity(R[1]) && R[1] < 0); // log(-0.0f) = -infinity
        assert(isNaN(R.array[2])); // log(negative number) = NaN
        // DOESN'T PASS
        //assert(isNaN(R[3])); // log(NaN) = NaN
    }

    // test _mm_pow_ps
    for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.1)
    {
        foreach (exponent; -8..4)
        {
            double powExponent = mantissa * 2.0 ^^ exponent;
            for (double mantissa2 = 0.1; mantissa2 < 1.0; mantissa2 += 0.1)
            {
                foreach (exponent2; -4..4)
                {
                    double powBase = mantissa2 * 2.0 ^^ exponent2;
                    double phobosValue = pow(powBase, powExponent);
                    float fPhobos = phobosValue;
                    if (!isFinite(fPhobos)) continue;
                    __m128 v = _mm_pow_ps(_mm_set1_ps(powBase), _mm_set1_ps(powExponent));
                    foreach(i; 0..4)
                    {
                        if (!approxEquals(phobosValue, v.array[i], 1e-5))
                        {
                            // Fix: import printf locally, as the other failure
                            // branches do; it was not in scope here before.
                            import core.stdc.stdio;
                            printf("%g ^^ %g\n", powBase, powExponent);
                            assert(false);
                        }
                    }
                }
            }
        }
    }
}
private:

// Broadcast constants shared by the approximations above.
static immutable __m128 _ps_1 = [1.0f, 1.0f, 1.0f, 1.0f];          // 1.0f in all lanes
static immutable __m128 _ps_0p5 = [0.5f, 0.5f, 0.5f, 0.5f];        // 0.5f in all lanes
static immutable __m128i _pi32_0x7f = [0x7f, 0x7f, 0x7f, 0x7f];    // IEEE-754 float exponent bias (127)

1072
external/inteli/mmx.d vendored Normal file

File diff suppressed because it is too large Load Diff

1394
external/inteli/nmmintrin.d vendored Normal file

File diff suppressed because it is too large Load Diff

25
external/inteli/package.d vendored Normal file
View File

@ -0,0 +1,25 @@
/**
 * Public API. You can `import inteli;` if want access to all intrinsics, under any circumstances.
 * That's what intel-intrinsics enables.
 *
 * Copyright: Copyright Guillaume Piolat 2016-2020.
 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 */
module inteli;

// Importing with `import inteli;` simply imports all available intrinsics.
public import inteli.types;
public import inteli.mmx;        // MMX
public import inteli.emmintrin;  // SSE2
public import inteli.xmmintrin;  // SSE
public import inteli.pmmintrin;  // SSE3
public import inteli.tmmintrin;  // SSSE3
public import inteli.smmintrin;  // SSE4.1
public import inteli.nmmintrin;  // SSE4.2
public import inteli.shaintrin;  // SHA
public import inteli.bmi2intrin; // BMI2
public import inteli.avxintrin;  // AVX
public import inteli.avx2intrin; // AVX2
public import inteli.math;       // Bonus

294
external/inteli/pmmintrin.d vendored Normal file
View File

@ -0,0 +1,294 @@
/**
* SSE3 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE3
*
* Copyright: Guillaume Piolat 2016-2020.
* Charles Gregory 2019.
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.pmmintrin;
public import inteli.types;
import inteli.internals;
public import inteli.emmintrin;
// Note: this header will work whether you have SSE3 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse3"] or equivalent to actively
// generate SSE3 instruction (they are often enabled with -O1 or greater).
// With GDC, use "dflags-gdc": ["-msse3"] or equivalent to generate SSE3 instructions.
nothrow @nogc:
/// Alternatively add and subtract packed double-precision (64-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
/// Lane 0 is subtracted, lane 1 is added.
__m128d _mm_addsub_pd (__m128d a, __m128d b) pure @trusted
{
    static if (DMD_with_DSIMD_and_SSE3)
    {
        return cast(__m128d) __simd(XMM.ADDSUBPD, cast(void16)a, cast(void16)b);
    }
    else static if (GDC_or_LDC_with_SSE3)
    {
        // GDC and LDC share the same builtin; branches merged for
        // consistency with _mm_hadd_pd and the other SSE3 ops below.
        return __builtin_ia32_addsubpd(a, b);
    }
    else
    {
        // ARM: well optimized starting with LDC 1.18.0 -O2, not disrupted by LLVM 13+
        a.ptr[0] = a.array[0] - b.array[0];
        a.ptr[1] = a.array[1] + b.array[1];
        return a;
    }
}
unittest
{
    auto v1 =_mm_setr_pd(1.0,2.0);
    auto v2 =_mm_setr_pd(1.0,2.0);
    assert(_mm_addsub_pd(v1,v2).array == _mm_setr_pd(0.0,4.0).array);
}
/// Alternatively add and subtract packed single-precision (32-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
/// Even lanes are subtracted, odd lanes are added.
float4 _mm_addsub_ps (float4 a, float4 b) pure @trusted
{
    static if (DMD_with_DSIMD_and_SSE3)
    {
        return cast(__m128) __simd(XMM.ADDSUBPS, cast(void16)a, cast(void16)b);
    }
    else static if (GDC_or_LDC_with_SSE3)
    {
        // GDC and LDC share the same builtin; branches merged for
        // consistency with _mm_hadd_ps and the other SSE3 ops below.
        return __builtin_ia32_addsubps(a, b);
    }
    else
    {
        // Scalar fallback.
        a.ptr[0] -= b.array[0];
        a.ptr[1] += b.array[1];
        a.ptr[2] -= b.array[2];
        a.ptr[3] += b.array[3];
        return a;
    }
}
unittest
{
    auto v1 =_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    auto v2 =_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    assert( _mm_addsub_ps(v1,v2).array == _mm_setr_ps(0.0f, 4.0f, 0.0f, 8.0f).array );
}
/// Horizontally add adjacent pairs of double-precision (64-bit)
/// floating-point elements in `a` and `b`.
/// Result: [a1+a0, b1+b0].
__m128d _mm_hadd_pd (__m128d a, __m128d b) pure @trusted
{
    // PERF: ARM64?
    static if (DMD_with_DSIMD_and_SSE3)
    {
        return cast(__m128d) __simd(XMM.HADDPD, cast(void16)a, cast(void16)b);
    }
    else static if (GDC_or_LDC_with_SSE3)
    {
        return __builtin_ia32_haddpd(a, b);
    }
    else
    {
        // Scalar fallback.
        __m128d res;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = b.array[1] + b.array[0];
        return res;
    }
}
unittest
{
    auto A =_mm_setr_pd(1.5, 2.0);
    auto B =_mm_setr_pd(1.0, 2.0);
    assert( _mm_hadd_pd(A, B).array ==_mm_setr_pd(3.5, 3.0).array );
}
/// Horizontally add adjacent pairs of single-precision (32-bit)
/// floating-point elements in `a` and `b`.
/// Result: [a1+a0, a3+a2, b1+b0, b3+b2].
__m128 _mm_hadd_ps (__m128 a, __m128 b) pure @trusted
{
    static if (DMD_with_DSIMD_and_SSE3)
    {
        return cast(__m128) __simd(XMM.HADDPS, cast(void16)a, cast(void16)b);
    }
    else static if (GDC_or_LDC_with_SSE3)
    {
        return __builtin_ia32_haddps(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        // NEON pairwise add maps directly onto this operation.
        return vpaddq_f32(a, b);
    }
    else
    {
        // Scalar fallback.
        __m128 res;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = a.array[3] + a.array[2];
        res.ptr[2] = b.array[1] + b.array[0];
        res.ptr[3] = b.array[3] + b.array[2];
        return res;
    }
}
unittest
{
    __m128 A =_mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B =_mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    assert( _mm_hadd_ps(A, B).array == _mm_setr_ps(3.0f, 8.0f, 3.5f, 7.5f).array );
}
/// Horizontally subtract adjacent pairs of double-precision (64-bit)
/// floating-point elements in `a` and `b`.
/// Result: [a0-a1, b0-b1].
__m128d _mm_hsub_pd (__m128d a, __m128d b) pure @trusted
{
    static if (DMD_with_DSIMD_and_SSE3)
    {
        return cast(__m128d) __simd(XMM.HSUBPD, cast(void16)a, cast(void16)b);
    }
    else static if (GDC_or_LDC_with_SSE3)
    {
        return __builtin_ia32_hsubpd(a, b);
    }
    else
    {
        // yep, sounds optimal for ARM64 too. Strangely enough.
        __m128d res;
        res.ptr[0] = a.array[0] - a.array[1];
        res.ptr[1] = b.array[0] - b.array[1];
        return res;
    }
}
unittest
{
    auto A =_mm_setr_pd(1.5, 2.0);
    auto B =_mm_setr_pd(1.0, 2.0);
    assert( _mm_hsub_pd(A, B).array ==_mm_setr_pd(-0.5, -1.0).array );
}
/// Horizontally subtract adjacent pairs of single-precision (32-bit)
/// floating-point elements in `a` and `b`.
/// Result: [a0-a1, a2-a3, b0-b1, b2-b3].
__m128 _mm_hsub_ps (__m128 a, __m128 b) pure @trusted
{
    static if (DMD_with_DSIMD_and_SSE3)
    {
        return cast(__m128) __simd(XMM.HSUBPS, cast(void16)a, cast(void16)b);
    }
    else static if (GDC_or_LDC_with_SSE3)
    {
        return __builtin_ia32_hsubps(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        // Flip the sign bit of odd lanes, then a pairwise add computes the
        // horizontal subtraction.
        int4 mask = [0, 0x80000000, 0, 0x80000000];
        a = cast(__m128)(cast(int4)a ^ mask);
        b = cast(__m128)(cast(int4)b ^ mask);
        return vpaddq_f32(a, b);
    }
    else
    {
        // Scalar fallback.
        __m128 res;
        res.ptr[0] = a.array[0] - a.array[1];
        res.ptr[1] = a.array[2] - a.array[3];
        res.ptr[2] = b.array[0] - b.array[1];
        res.ptr[3] = b.array[2] - b.array[3];
        return res;
    }
}
unittest
{
    __m128 A =_mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B =_mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    assert(_mm_hsub_ps(A, B).array == _mm_setr_ps(-1.0f, -2.0f, -0.5f, -0.5f).array);
}
/// Load 128-bits of integer data from unaligned memory.
/// On modern CPUs this is identical to `_mm_loadu_si128`, hence the alias.
// Note: The saying is LDDQU was only ever useful around 2008
// See_also: https://stackoverflow.com/questions/38370622/a-faster-integer-sse-unalligned-load-thats-rarely-used
alias _mm_lddqu_si128 = _mm_loadu_si128;
/// Load a double-precision (64-bit) floating-point element from memory into both elements of result.
__m128d _mm_loaddup_pd (const(double)* mem_addr) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    // Same for GDC with -O1
    const double v = *mem_addr;
    __m128d r;
    r.ptr[0] = v;
    r.ptr[1] = v;
    return r;
}
unittest
{
    double a = 7.5;
    __m128d A = _mm_loaddup_pd(&a);
    double[2] correct = [7.5, 7.5];
    assert(A.array == correct);
}
/// Duplicate the low double-precision (64-bit) floating-point element from `a`.
__m128d _mm_movedup_pd (__m128d a) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    // Something efficient with -O1 for GDC
    __m128d r = a;
    r.ptr[1] = r.array[0];
    return r;
}
unittest
{
    __m128d A = _mm_setr_pd(7.0, 2.5);
    assert(_mm_movedup_pd(A).array == _mm_set_pd(7.0, 7.0).array);
}
/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from `a`.
/// Result: [a1, a1, a3, a3].
__m128 _mm_movehdup_ps (__m128 a) pure @trusted
{
    static if (GDC_with_SSE3)
    {
        return __builtin_ia32_movshdup (a);
    }
    else
    {
        // Generates movshdup since LDC 1.3 with -O1 -mattr=+sse3
        a.ptr[0] = a.array[1];
        a.ptr[2] = a.array[3];
        return a;
    }
}
unittest
{
    __m128 A = _mm_movehdup_ps(_mm_setr_ps(1, 2, 3, 4));
    float[4] correct = [2.0f, 2, 4, 4 ];
    assert(A.array == correct);
}
/// Duplicate even-indexed single-precision (32-bit) floating-point elements from `a`.
/// Result: [a0, a0, a2, a2].
__m128 _mm_moveldup_ps (__m128 a) pure @trusted
{
    static if (GDC_with_SSE3)
    {
        return __builtin_ia32_movsldup (a);
    }
    else
    {
        // Generates movsldup since LDC 1.3 with -O1 -mattr=+sse3
        a.ptr[1] = a.array[0];
        a.ptr[3] = a.array[2];
        return a;
    }
}
unittest
{
    __m128 A = _mm_moveldup_ps(_mm_setr_ps(1, 2, 3, 4));
    float[4] correct = [1.0f, 1, 3, 3 ];
    assert(A.array == correct);
}

268
external/inteli/shaintrin.d vendored Normal file
View File

@ -0,0 +1,268 @@
/**
* SHA intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=SHA
*
* Copyright: Guillaume Piolat 2021.
* Johan Engelen 2021.
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.shaintrin;
// SHA instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#othertechs=SHA
// Note: this header will work whether you have SHA enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sha"] or equivalent to actively
// generate SHA instructions.
// With GDC, use "dflags-gdc": ["-msha"] or equivalent to generate SHA instructions.
public import inteli.types;
import inteli.internals;
nothrow @nogc:
/+
/// Perform an intermediate calculation for the next four SHA1 message values (unsigned 32-bit integers) using previous message values from a and b, and store the result in dst.
__m128i _mm_sha1nexte_epu32(__m128i a, __m128i b) @trusted
{
static if (SHA_builtins)
{
return __builtin_ia32_sha1nexte(cast(int4) a, cast(int4) b);
}
else
{
assert(0);
}
}
unittest
{
}
+/
/+
/// Perform the final calculation for the next four SHA1 message values (unsigned 32-bit integers) using the intermediate result in a and the previous message values in b, and store the result in dst.
__m128i _mm_sha1msg1_epu32(__m128i a, __m128i b) @trusted
{
static if (SHA_builtins)
{
return __builtin_ia32_sha1msg1(cast(int4) a, cast(int4) b);
}
else
{
assert(0);
}
}
unittest
{
}
+/
/+
/// Calculate SHA1 state variable E after four rounds of operation from the current SHA1 state variable a, add that value to the scheduled values (unsigned 32-bit integers) in b, and store the result in dst.
__m128i _mm_sha1msg2_epu32(__m128i a, __m128i b) @trusted
{
static if (SHA_builtins)
{
return __builtin_ia32_sha1msg2(cast(int4) a, cast(int4) b);
}
else
{
assert(0);
}
}
unittest
{
}
+/
/+
/// Perform four rounds of SHA1 operation using an initial SHA1 state (A,B,C,D) from a and some pre-computed sum of the next 4 round message values (unsigned 32-bit integers), and state variable E from b, and store the updated SHA1 state (A,B,C,D) in dst. func contains the logic functions and round constants.
__m128i _mm_sha1rnds4_epu32(__m128i a, __m128i b, const int func) @trusted
{
static if (SHA_builtins)
{
return __builtin_ia32_sha1rnds4(cast(int4) a, cast(int4) b, func);
}
else
{
assert(0);
}
}
+/
/// Perform an intermediate calculation for the next four SHA256 message values
/// (unsigned 32-bit integers) using previous message values from `a` and `b`,
/// and return the result.
/// (Doc fixed: the descriptions of the three SHA256 intrinsics were rotated;
/// this matches the Intel Intrinsics Guide entry for `_mm_sha256msg1_epu32`.)
__m128i _mm_sha256msg1_epu32(__m128i a, __m128i b) @trusted
{
    static if (GDC_or_LDC_with_SHA)
    {
        return __builtin_ia32_sha256msg1(cast(int4) a, cast(int4) b);
    }
    else
    {
        // SHA-256 small sigma-0: ror7 ^ ror18 ^ shr3 (FIPS 180-4).
        static uint sigma0(uint x) nothrow @nogc @safe
        {
            return bitwiseRotateRight_uint(x, 7) ^ bitwiseRotateRight_uint(x, 18) ^ x >> 3;
        }
        int4 dst;
        int4 a4 = cast(int4) a;
        int4 b4 = cast(int4) b;
        uint W4 = b4.array[0];
        uint W3 = a4.array[3];
        uint W2 = a4.array[2];
        uint W1 = a4.array[1];
        uint W0 = a4.array[0];
        dst.ptr[3] = W3 + sigma0(W4);
        dst.ptr[2] = W2 + sigma0(W3);
        dst.ptr[1] = W1 + sigma0(W2);
        dst.ptr[0] = W0 + sigma0(W1);
        return cast(__m128i) dst;
    }
}
unittest
{
    __m128i a = [15, 20, 130, 12345];
    __m128i b = [15, 20, 130, 12345];
    __m128i result = _mm_sha256msg1_epu32(a, b);
    assert(result.array == [671416337, 69238821, 2114864873, 503574586]);
}
/// Perform the final calculation for the next four SHA256 message values
/// (unsigned 32-bit integers) using previous message values from `a` and `b`,
/// and return the result.
/// (Doc fixed: the descriptions of the three SHA256 intrinsics were rotated;
/// this matches the Intel Intrinsics Guide entry for `_mm_sha256msg2_epu32`.)
__m128i _mm_sha256msg2_epu32(__m128i a, __m128i b) @trusted
{
    static if (GDC_or_LDC_with_SHA)
    {
        return __builtin_ia32_sha256msg2(cast(int4) a, cast(int4) b);
    }
    else
    {
        // SHA-256 small sigma-1: ror17 ^ ror19 ^ shr10 (FIPS 180-4).
        static uint sigma1(uint x) nothrow @nogc @safe
        {
            return bitwiseRotateRight_uint(x, 17) ^ bitwiseRotateRight_uint(x, 19) ^ x >> 10;
        }
        int4 dst;
        int4 a4 = cast(int4) a;
        int4 b4 = cast(int4) b;
        uint W14 = b4.array[2];
        uint W15 = b4.array[3];
        // Each new word feeds into the next one, per the message schedule.
        uint W16 = a4.array[0] + sigma1(W14);
        uint W17 = a4.array[1] + sigma1(W15);
        uint W18 = a4.array[2] + sigma1(W16);
        uint W19 = a4.array[3] + sigma1(W17);
        dst.ptr[3] = W19;
        dst.ptr[2] = W18;
        dst.ptr[1] = W17;
        dst.ptr[0] = W16;
        return cast(__m128i) dst;
    }
}
unittest
{
    __m128i a = [15, 20, 130, 12345];
    __m128i b = [15, 20, 130, 12345];
    __m128i result = _mm_sha256msg2_epu32(a, b);
    assert(result.array == [5324815, 505126944, -2012842764, -1542210977]);
}
/// Perform 2 rounds of SHA256 operation using an initial SHA256 state (C,D,G,H)
/// from `a`, an initial SHA256 state (A,B,E,F) from `b`, and a pre-computed sum
/// of the next 2 round message values (unsigned 32-bit integers) and the
/// corresponding round constants from `k`, and return the updated SHA256 state
/// (A,B,E,F).
/// (Doc fixed: the descriptions of the three SHA256 intrinsics were rotated;
/// this matches the Intel Intrinsics Guide entry for `_mm_sha256rnds2_epu32`.)
__m128i _mm_sha256rnds2_epu32(__m128i a, __m128i b, __m128i k) @trusted
{
    // TODO: the pragma(inline, false) prevents a DMD 1.100
    // regression in Linux + x86_64 + -b release-unittest, report that
    version(DigitalMars)
    {
        enum bool workaround = true;
    }
    else
    {
        enum bool workaround = false;
    }
    static if (GDC_or_LDC_with_SHA)
    {
        return __builtin_ia32_sha256rnds2(cast(int4) a, cast(int4) b, cast(int4) k);
    }
    else
    {
        // SHA-256 choice function (FIPS 180-4).
        static uint Ch(uint x, uint y, uint z) nothrow @nogc @safe
        {
            static if (workaround) pragma (inline, false);
            return z ^ (x & (y ^ z));
        }
        // SHA-256 majority function (FIPS 180-4).
        static uint Maj(uint x, uint y, uint z) nothrow @nogc @safe
        {
            static if (workaround) pragma (inline, false);
            return (x & y) | (z & (x ^ y));
        }
        // SHA-256 big Sigma-0: ror2 ^ ror13 ^ ror22.
        static uint sum0(uint x) nothrow @nogc @safe
        {
            static if (workaround) pragma (inline, false);
            return bitwiseRotateRight_uint(x, 2) ^ bitwiseRotateRight_uint(x, 13) ^ bitwiseRotateRight_uint(x, 22);
        }
        // SHA-256 big Sigma-1: ror6 ^ ror11 ^ ror25.
        static uint sum1(uint x) nothrow @nogc @safe
        {
            static if (workaround) pragma (inline, false);
            return bitwiseRotateRight_uint(x, 6) ^ bitwiseRotateRight_uint(x, 11) ^ bitwiseRotateRight_uint(x, 25);
        }
        int4 dst;
        int4 a4 = cast(int4) a;
        int4 b4 = cast(int4) b;
        int4 k4 = cast(int4) k;
        // Unpack the split state: a = (C,D,G,H), b = (A,B,E,F).
        const A0 = b4.array[3];
        const B0 = b4.array[2];
        const C0 = a4.array[3];
        const D0 = a4.array[2];
        const E0 = b4.array[1];
        const F0 = b4.array[0];
        const G0 = a4.array[1];
        const H0 = a4.array[0];
        const W_K0 = k4.array[0];
        const W_K1 = k4.array[1];
        // Round 1.
        const A1 = Ch(E0, F0, G0) + sum1(E0) + W_K0 + H0 + Maj(A0, B0, C0) + sum0(A0);
        const B1 = A0;
        const C1 = B0;
        const D1 = C0;
        const E1 = Ch(E0, F0, G0) + sum1(E0) + W_K0 + H0 + D0;
        const F1 = E0;
        const G1 = F0;
        const H1 = G0;
        // Round 2.
        const A2 = Ch(E1, F1, G1) + sum1(E1) + W_K1 + H1 + Maj(A1, B1, C1) + sum0(A1);
        const B2 = A1;
        const C2 = B1;
        const D2 = C1;
        const E2 = Ch(E1, F1, G1) + sum1(E1) + W_K1 + H1 + D1;
        const F2 = E1;
        const G2 = F1;
        const H2 = G1;
        // Repack the updated (A,B,E,F) state.
        dst.ptr[3] = A2;
        dst.ptr[2] = B2;
        dst.ptr[1] = E2;
        dst.ptr[0] = F2;
        return cast(__m128i) dst;
    }
}
unittest
{
    __m128i a = [15, 20, 130, 12345];
    __m128i b = [15, 20, 130, 12345];
    __m128i k = [15, 20, 130, 12345];
    __m128i result = _mm_sha256rnds2_epu32(a, b, k);
    assert(result.array == [1384123044, -2050674062, 327754346, 956342016]);
}
/// Rotate `value` right by `count` bits (32-bit barrel rotate).
/// `count` must be in 0..31. The left-shift amount is masked so that
/// `count == 0` returns `value` unchanged instead of performing a shift
/// by 32, which is undefined for a 32-bit operand in D.
private uint bitwiseRotateRight_uint(const uint value, const uint count) @safe
{
    assert(count < 8 * uint.sizeof);
    return cast(uint) ((value >> count) | (value << ((uint.sizeof * 8 - count) & 31u)));
}

2215
external/inteli/smmintrin.d vendored Normal file

File diff suppressed because it is too large Load Diff

1322
external/inteli/tmmintrin.d vendored Normal file

File diff suppressed because it is too large Load Diff

456
external/inteli/types.d vendored Normal file
View File

@ -0,0 +1,456 @@
/**
* `core.simd` emulation layer.
*
* Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
* cet 2024.
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.types;
pure:
nothrow:
@nogc:
// Per-compiler detection of whether MMX/SSE/AVX-sized vectors are real
// `__vector` types or must be emulated with static arrays (see `VectorOps`).
version(GNU)
{
    // Note: for GDC support, be sure to use https://explore.dgnu.org/
    // Future: just detect vectors, do not base upon arch.
    version(X86_64)
    {
        enum MMXSizedVectorsAreEmulated = false;
        enum SSESizedVectorsAreEmulated = false;
        // Does GDC support AVX-sized vectors?
        static if (__VERSION__ >= 2100) // Starting at GDC 12.1 only.
        {
            enum AVXSizedVectorsAreEmulated = !(is(__vector(double[4])));
        }
        else
        {
            enum AVXSizedVectorsAreEmulated = true;
        }
        import gcc.builtins;
    }
    else
    {
        // Non-x86_64 GDC targets: emulate everything.
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = true;
        enum AVXSizedVectorsAreEmulated = true;
    }
}
else version(LDC)
{
    public import ldc.simd;
    // Use this alias to mention it should only be used with LDC,
    // for example when emulated shufflevector would just be wasteful.
    alias shufflevectorLDC = shufflevector;
    // LDC has native vector support for all three sizes.
    enum MMXSizedVectorsAreEmulated = false;
    enum SSESizedVectorsAreEmulated = false;
    enum AVXSizedVectorsAreEmulated = false;
}
else version(DigitalMars)
{
    public import core.simd;
    static if (__VERSION__ >= 2100)
    {
        // Note: turning this true is very desirable for DMD performance,
        // but also leads to many bugs being discovered upstream.
        // The fact that it works at all relies on many workardounds.
        // In particular intel-intrinsics with this "on" is a honeypot for DMD backend bugs,
        // and a very strong DMD codegen test suite.
        // What happens typically is that contributors end up on a DMD bug in their PR.
        // But finally, in 2022 D_SIMD has been activated, at least for SSE and some instructions.
        enum bool tryToEnableCoreSimdWithDMD = true;
    }
    else
    {
        enum bool tryToEnableCoreSimdWithDMD = false;
    }
    version(D_SIMD)
    {
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = !tryToEnableCoreSimdWithDMD;
        // Note: with DMD, AVX-sized vectors can't be enabled yet.
        // On linux + x86_64, this will fail since a few operands seem to be missing.
        // FUTURE: enable AVX-sized vectors in DMD. :)
        //
        // Blockers: https://issues.dlang.org/show_bug.cgi?id=24283 and 24284
        // Probably other, unreported issues.
        // NOTE: both branches below are deliberately `true` until the blockers
        // above are resolved; the version(D_AVX) split is kept for that future change.
        version(D_AVX)
            enum AVXSizedVectorsAreEmulated = true;
        else
            enum AVXSizedVectorsAreEmulated = true;
    }
    else
    {
        // Some DMD 32-bit targets don't have D_SIMD
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = true;
        enum AVXSizedVectorsAreEmulated = true;
    }
}
// True if any vector size needs the emulation layer below.
enum CoreSimdIsEmulated = MMXSizedVectorsAreEmulated || SSESizedVectorsAreEmulated || AVXSizedVectorsAreEmulated;
static if (CoreSimdIsEmulated)
{
// core.simd is emulated in some capacity: introduce `VectorOps`.
// Mixed into each emulated vector struct (which declares a `BaseType[N] array`
// member), this template reproduces core.simd's Vector operator surface:
// elementwise unary/binary/compound ops, broadcast and array assignment,
// raw-copy casts between same-sized vectors, and element indexing.
mixin template VectorOps(VectorType, ArrayType: BaseType[N], BaseType, size_t N)
{
enum Count = N;
alias Base = BaseType;
// Raw pointer to the first element, like core.simd's `.ptr`.
BaseType* ptr() return pure nothrow @nogc
{
return array.ptr;
}
// Unary operators (elementwise, via operator-string mixin)
VectorType opUnary(string op)() pure nothrow @safe @nogc
{
VectorType res = void;
mixin("res.array[] = " ~ op ~ "array[];");
return res;
}
// Binary operators (elementwise)
VectorType opBinary(string op)(VectorType other) pure const nothrow @safe @nogc
{
VectorType res = void;
mixin("res.array[] = array[] " ~ op ~ " other.array[];");
return res;
}
// Assigning a BaseType value (broadcast to all lanes)
void opAssign(BaseType e) pure nothrow @safe @nogc
{
array[] = e;
}
// Assigning a static array
void opAssign(ArrayType v) pure nothrow @safe @nogc
{
array[] = v[];
}
// Compound assignment (elementwise)
void opOpAssign(string op)(VectorType other) pure nothrow @safe @nogc
{
mixin("array[] " ~ op ~ "= other.array[];");
}
// Assigning a dyn array
this(ArrayType v) pure nothrow @safe @nogc
{
array[] = v[];
}
// Broadcast constructor
this(BaseType x) pure nothrow @safe @nogc
{
array[] = x;
}
/// We can't support implicit conversion but do support explicit casting.
/// "Vector types of the same size can be implicitly converted among each other."
/// Casting to another vector type is always just a raw copy.
VecDest opCast(VecDest)() pure const nothrow @trusted @nogc
if (VecDest.sizeof == VectorType.sizeof)
{
VecDest dest = void;
// Copy: reinterpret the source bytes as the destination array type.
dest.array[] = (cast(typeof(dest.array))cast(void[VectorType.sizeof])array)[];
return dest;
}
// Element access, like core.simd's Vector indexing.
ref inout(BaseType) opIndex(size_t i) inout return pure nothrow @safe @nogc
{
return array[i];
}
}
}
else
{
public import core.simd;
// GDC cannot convert implicitly __vector from signed to unsigned, but LDC can
// And GDC sometimes needs those unsigned vector types for some intrinsics.
// For internal use only.
package alias ushort8 = Vector!(ushort[8]);
package alias ubyte8 = Vector!(ubyte[8]);
package alias ubyte16 = Vector!(ubyte[16]);
static if (!AVXSizedVectorsAreEmulated)
{
package alias ushort16 = Vector!(ushort[16]);
package alias ubyte32 = Vector!(ubyte[32]);
}
}
// Emulate ldc.simd cmpMask and other masks.
// Note: these should be deprecated on non-LDC,
// since it's slower to generate that code.
version(LDC)
{}
else
{
// TODO: deprecated and write plain versions instead
// Element type of a vector: the type of `V.array[0]`.
private template BaseType(V)
{
alias typeof( ( { V v; return v; }()).array[0]) BaseType;
}
// The "all bits set" lane value for V's element type.
// For floats/doubles this is the bit pattern 0xFFFF... reinterpreted,
// matching what hardware comparison instructions produce; for integers, -1.
private template TrueMask(V)
{
alias Elem = BaseType!V;
static if (is(Elem == float))
{
immutable uint m1 = 0xffffffff;
enum Elem TrueMask = *cast(float*)(&m1);
}
else static if (is(Elem == double))
{
immutable ulong m1 = 0xffffffff_ffffffff;
enum Elem TrueMask = *cast(double*)(&m1);
}
else // integer case
{
enum Elem TrueMask = -1;
}
}
// Lanewise equality: all-ones lane where a[i] == b[i], zero otherwise.
Vec equalMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "oeq" comparison
{
enum size_t Count = Vec.array.length;
Vec result;
foreach(int i; 0..Count)
{
bool cond = a.array[i] == b.array[i];
result.ptr[i] = cond ? TrueMask!Vec : 0;
}
return result;
}
// Lanewise greater-than: all-ones lane where a[i] > b[i], zero otherwise.
Vec greaterMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "ogt" comparison
{
enum size_t Count = Vec.array.length;
Vec result;
foreach(int i; 0..Count)
{
bool cond = a.array[i] > b.array[i];
result.ptr[i] = cond ? TrueMask!Vec : 0;
}
return result;
}
}
unittest
{
    // "ogt" semantics: lanes where lhs > rhs become all-ones, others zero.
    float4 lhs = [1, 3, 5, 7];
    float4 rhs = [2, 3, 4, 5];
    int4 lanes = cast(int4)(greaterMask!float4(lhs, rhs));
    static immutable int[4] expected = [0, 0, 0xffff_ffff, 0xffff_ffff];
    assert(lanes.array == expected);
}
static if (MMXSizedVectorsAreEmulated)
{
/// MMX-like SIMD types
/// 64-bit vectors emulated as a static `array` plus the VectorOps mixin.
struct float2
{
float[2] array;
mixin VectorOps!(float2, float[2]);
}
struct byte8
{
byte[8] array;
mixin VectorOps!(byte8, byte[8]);
}
struct short4
{
short[4] array;
mixin VectorOps!(short4, short[4]);
}
struct int2
{
int[2] array;
mixin VectorOps!(int2, int[2]);
}
struct long1
{
long[1] array;
mixin VectorOps!(long1, long[1]);
}
}
else
{
// For this compiler, defining MMX-sized vectors is working.
public import core.simd;
alias long1 = Vector!(long[1]);
alias float2 = Vector!(float[2]);
alias int2 = Vector!(int[2]);
alias short4 = Vector!(short[4]);
alias byte8 = Vector!(byte[8]);
}
// Whether emulated or native, every MMX-sized type must be exactly 8 bytes.
static assert(float2.sizeof == 8);
static assert(byte8.sizeof == 8);
static assert(short4.sizeof == 8);
static assert(int2.sizeof == 8);
static assert(long1.sizeof == 8);
static if (SSESizedVectorsAreEmulated)
{
/// SSE-like SIMD types
/// 128-bit vectors emulated as a static `array` plus the VectorOps mixin.
struct float4
{
float[4] array;
mixin VectorOps!(float4, float[4]);
}
struct byte16
{
byte[16] array;
mixin VectorOps!(byte16, byte[16]);
}
struct short8
{
short[8] array;
mixin VectorOps!(short8, short[8]);
}
struct int4
{
int[4] array;
mixin VectorOps!(int4, int[4]);
}
struct long2
{
long[2] array;
mixin VectorOps!(long2, long[2]);
}
struct double2
{
double[2] array;
mixin VectorOps!(double2, double[2]);
}
}
// Whether emulated or native, every SSE-sized type must be exactly 16 bytes.
static assert(float4.sizeof == 16);
static assert(byte16.sizeof == 16);
static assert(short8.sizeof == 16);
static assert(int4.sizeof == 16);
static assert(long2.sizeof == 16);
static assert(double2.sizeof == 16);
static if (AVXSizedVectorsAreEmulated)
{
/// AVX-like SIMD types
/// 256-bit vectors emulated as a static `array` plus the VectorOps mixin.
struct float8
{
float[8] array;
mixin VectorOps!(float8, float[8]);
}
struct byte32
{
byte[32] array;
mixin VectorOps!(byte32, byte[32]);
}
struct short16
{
short[16] array;
mixin VectorOps!(short16, short[16]);
}
struct int8
{
int[8] array;
mixin VectorOps!(int8, int[8]);
}
struct long4
{
long[4] array;
mixin VectorOps!(long4, long[4]);
}
struct double4
{
double[4] array;
mixin VectorOps!(double4, double[4]);
}
}
else
{
// Native 256-bit vectors come straight from core.simd.
public import core.simd;
}
// Whether emulated or native, every AVX-sized type must be exactly 32 bytes.
static assert(float8.sizeof == 32);
static assert(byte32.sizeof == 32);
static assert(short16.sizeof == 32);
static assert(int8.sizeof == 32);
static assert(long4.sizeof == 32);
static assert(double4.sizeof == 32);
// Intel-style type names, matching <immintrin.h> conventions.
alias __m256 = float8;
alias __m256i = long4; // long long __vector with ICC, GCC, and clang
alias __m256d = double4;
alias __m128 = float4;
alias __m128i = int4;
alias __m128d = double2;
alias __m64 = long1; // like in Clang, __m64 is a vector of 1 long
/// Builds the 2-bit shuffle immediate used by the `pd`-flavoured shuffle
/// intrinsics: bit 1 selects the lane for the upper result (`x`),
/// bit 0 the lane for the lower result (`y`). Each selector must be 0 or 1.
int _MM_SHUFFLE2(int x, int y) pure @safe
{
    assert(x >= 0 && x <= 1);
    assert(y >= 0 && y <= 1);
    int imm = y;
    imm |= x << 1;
    return imm;
}
/// Builds the 8-bit shuffle immediate used by the `ps`/`epi32`-flavoured
/// shuffle intrinsics: two bits per result lane, packed as z:y:x:w from
/// most to least significant. Each selector must be in 0..3.
int _MM_SHUFFLE(int z, int y, int x, int w) pure @safe
{
    assert(x >= 0 && x <= 3);
    assert(y >= 0 && y <= 3);
    assert(z >= 0 && z <= 3);
    assert(w >= 0 && w <= 3);
    int imm = w;
    imm |= x << 2;
    imm |= y << 4;
    imm |= z << 6;
    return imm;
}
// test assignment from scalar to vector type (broadcast to every lane)
unittest
{
    float4 f = 3.0f;
    float[4] expectedF = [3.0f, 3.0f, 3.0f, 3.0f];
    assert(f.array == expectedF);
    int2 i = 42;
    int[2] expectedI = [42, 42];
    assert(i.array == expectedI);
}

3219
external/inteli/xmmintrin.d vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -14,22 +14,6 @@ void main()
r.Renderer rd = r.Init(&window);
scope(exit) r.Destroy(&rd);
/*
Vec4 f1 = Vec4(r: 2.0, a: 5.5);
Vec4 f2;
Vec4* f = &f1;
asm
{
mov R8, f;
movups XMM0, f1.r.offsetof[R8];
movups f2, XMM0;
}
writeln(f2);
*/
while (true)
{
p.HandleEvents(&window);

View File

@ -32,6 +32,7 @@ enum Format : VkFormat
RGBA_F32 = VK_FORMAT_R32G32B32A32_SFLOAT,
RGBA_UINT = VK_FORMAT_B8G8R8A8_UINT,
RGBA_UNORM = VK_FORMAT_R8G8B8A8_UNORM,
RGBA_SRGB = VK_FORMAT_R8G8B8A8_SRGB,
}
alias FMT = Format;
@ -101,9 +102,17 @@ struct Renderer
PushConst push_const;
Vec3 camera_pos = Vec3(0.0);
Model yoder;
}
struct Camera
{
Vec3 pos = Vec3(0.0);
Vec3 target = Vec3(0.0);
}
struct GlobalUniforms
{
Vec2 res;
@ -128,17 +137,6 @@ extern(C) struct Material
f32 shininess = 0.0;
}
static assert(Material.ambient.offsetof == 0, "ambient offset incorrect");
static assert(Material.diffuse.offsetof == 16, "ambient offset incorrect");
static assert(Material.specular.offsetof == 32, "ambient offset incorrect");
static assert(Material.albedo_texture.offsetof == 48, "ambient offset incorrect");
static assert(Material.ambient_texture.offsetof == 52, "ambient offset incorrect");
static assert(Material.specular_texture.offsetof == 56, "ambient offset incorrect");
static assert(Material.albedo_has_texture.offsetof == 60, "ambient offset incorrect");
static assert(Material.ambient_has_texture.offsetof == 64, "ambient offset incorrect");
static assert(Material.specular_has_texture.offsetof == 68, "ambient offset incorrect");
static assert(Material.shininess.offsetof == 72, "ambient offset incorrect");
struct UIVertex
{
Vec2 p0;
@ -244,7 +242,7 @@ Cycle(Renderer* rd)
SetUniform(rd, &rd.globals);
DrawRect(rd, 150.0, 300.0, 500.0, 700.0, Vec4(r: 0.0, g: 0.0, b: 1.0, a: 1.0));
DrawRect(rd, 150.0, 300.0, 500.0, 700.0, Vec4(0.0, 0.0, 1.0, 1.0));
PrepComputeDrawImage(rd);

View File

@ -732,7 +732,7 @@ CreateImageView(Vulkan* vk, ImageView* view, u32 w, u32 h, u32 ch, u8[] data)
assert(Transfer(vk, &buf, data), "CreateImageView failure: Buffer Transfer error");
ImageView conv_view;
CreateImageView(vk, &conv_view, w, h, VK_FORMAT_R32G32B32A32_SFLOAT);
CreateImageView(vk, &conv_view, w, h, FMT.RGBA_F32);
WriteConvDescriptor(vk, &buf);
WriteConvDescriptor(vk, &conv_view);
@ -783,10 +783,12 @@ CreateImageView(Vulkan* vk, ImageView* view, u32 w, u32 h, u32 ch, u8[] data)
FinishComputePass(vk);
vkWaitForFences(vk.device, 1, &vk.comp_fence, VK_TRUE, 1000000000);
vkWaitForFences(vk.device, 1, &vk.comp_fence, VK_TRUE, u64.max);
//Destroy(vk, &buf);
//Destroy(&conv_view, vk.device, vk.vma);
vkQueueWaitIdle(vk.tfer_queue);
Destroy(vk, &buf);
Destroy(&conv_view, vk.device, vk.vma);
}
}
@ -834,7 +836,7 @@ FinishComputePass(Vulkan* vk)
}
pragma(inline): void
CreateImageView(Vulkan* vk, ImageView* view, u32 w, u32 h, VkFormat format = VK_FORMAT_R8G8B8A8_SRGB)
CreateImageView(Vulkan* vk, ImageView* view, u32 w, u32 h, Format format = FMT.RGBA_UNORM)
{
VmaAllocationCreateInfo alloc_info = {
usage: VMA_MEMORY_USAGE_GPU_ONLY,
@ -849,7 +851,7 @@ CreateImageView(Vulkan* vk, ImageView* view, u32 w, u32 h, VkFormat format = VK_
format: format,
tiling: VK_IMAGE_TILING_OPTIMAL,
initialLayout: VK_IMAGE_LAYOUT_UNDEFINED,
usage: format == VK_FORMAT_R8G8B8A8_SRGB ? (VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT) : (VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT),
usage: format == FMT.RGBA_F32 ? (VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT) : (VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT),
samples: VK_SAMPLE_COUNT_1_BIT,
extent: {
width: w,

View File

@ -1,5 +1,6 @@
import core.memory;
import std.stdint;
import dplug.math;
debug
{
@ -28,3 +29,10 @@ alias b32 = uint;
alias intptr = intptr_t;
alias uintptr = uintptr_t;
alias Vec2 = vec2f;
alias Vec3 = vec3f;
alias Vec4 = vec4f;
alias Mat2 = mat2f;
alias Mat3 = mat3f;
alias Mat4 = mat4f;

View File

@ -359,38 +359,3 @@ Hash(string str)
return xxh3_64bits_withSeed(str.ptr, str.length, HASH_SEED);
}
struct Matrix(T, int S)
{
T[S][S] m;
alias m this;
}
alias Mat2 = Matrix!(f32, 2);
alias Mat3 = Matrix!(f32, 3);
alias Mat4 = Matrix!(f32, 4);
struct Vector(T, int S)
{
union
{
struct
{
T r = 0.0;
T g = 0.0;
static if (S > 2) T b = 0.0;
static if (S > 3) T a = 0.0;
};
struct
{
T x;
T y;
static if (S > 2) T z;
static if (S > 3) T w;
};
T[S] v;
}
}
alias Vec2 = Vector!(f32, 2);
alias Vec3 = Vector!(f32, 3);
alias Vec4 = Vector!(f32, 4);