add clean up for model transfers, add math and intrinsics libs

2025-07-20 17:30:29 +10:00 · 2025-07-20 17:30:29 +10:00 · ba4ccad085
commit ba4ccad085
parent 6127a0bb70
25 changed files with 31148 additions and 75 deletions
--- a/dub.json
+++ b/dub.json
@ -9,8 +9,8 @@
 			"targetPath": "build",
 			"sourceFiles-linux": ["build/libvma.a", "build/libstb_image.a", "build/libm3d.a"],
 			"sourceFiles-windows": [],
-			"importPaths": ["src/gears", "src/shared", "src/generated", "external/xxhash"],
+			"importPaths": ["src/gears", "src/shared", "src/generated", "external/xxhash", "external/dplug/math", "external/inteli"],
-			"sourcePaths": ["src/gears", "src/shared", "src/generated", "external/xxhash"],
+			"sourcePaths": ["src/gears", "src/shared", "src/generated", "external/xxhash", "external/dplug/math", "external/inteli"],
 			"libs-linux": ["xcb", "X11", "X11-xcb", "vulkan", "stdc++"],
 			"libs-windows": [],
 			"preGenerateCommands-linux": ["./build-vma.sh", "build/Codegen", "dub main:packer"],
@ -22,8 +22,8 @@
 			"targetType": "executable",
 			"targetPath": "build",
 			"targetName": "Packer",
-			"importPaths": ["src/packer", "src/shared", "src/generated", "external/xxhash"],
+			"importPaths": ["src/packer", "src/shared", "src/generated", "external/xxhash", "external/dplug/math", "external/inteli"],
-			"sourcePaths": ["src/packer", "src/shared", "src/generated", "external/xxhash"],
+			"sourcePaths": ["src/packer", "src/shared", "src/generated", "external/xxhash", "external/dplug/math", "external/inteli"],
 			"sourceFiles-linux": ["build/libstb_image.a", "build/libm3d.a"],
 			"preGenerateCommands-linux": ["./build-vma.sh"],
 			"postGenerateCommands-linux": ["build/Packer"],
@ -35,8 +35,8 @@
 			"targetType": "executable",
 			"targetPath": "build",
 			"targetName": "Codegen",
-			"importPaths": ["src/codegen", "src/shared", "external/xxhash"],
+			"importPaths": ["src/codegen", "src/shared", "external/xxhash", "external/dplug/math", "external/inteli"],
-			"sourcePaths": ["src/codegen", "src/shared", "external/xxhash"],
+			"sourcePaths": ["src/codegen", "src/shared", "external/xxhash", "external/dplug/math", "external/inteli"],
 			"sourceFiles-linux": ["build/libstb_image.a"],
 			"preGenerateCommands-linux": ["./build-vma.sh"],
 			"preGenerateCommands-windows": [],
--- a/external/dplug/math/box.d
+++ b/external/dplug/math/box.d
@ -0,0 +1,689 @@
 /**
 * N-dimensional half-open interval [a, b[.
 *
 * Copyright: Copyright Guillaume Piolat 2015-2021.
 *            Copyright Ahmet Sait 2021.
 *            Copyright Ryan Roden-Corrent 2016.
 *            Copyright Nathan Sashihara 2018.
 *            Copyright Colden Cullen 2014.
 *
 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 */
 module dplug.math.box;
 import std.math,
       std.traits;
 import dplug.math.vector;
 /// N-dimensional half-open interval [a, b[.
 ///
 /// `min` is the inclusive lower corner and `max` is the exclusive upper corner.
 /// Nothing enforces `min <= max`: a box can be "unsorted" (negative extent on
 /// some axis). Most geometric queries assert `isSorted()` before working.
 /// Params:
 ///   T = type of the coordinates
 ///   N = number of dimensions, must be > 0
 struct Box(T, int N)
 {
    static assert(N > 0);
    public
    {
        alias bound_t = Vector!(T, N);
        bound_t min; // not enforced, the box can have negative volume
        bound_t max;
        /// Construct a box which extends between 2 points.
        /// Boundaries: min is inside the box, max is just outside.
        @nogc this(bound_t min_, bound_t max_) pure nothrow
        {
            min = min_;
            max = max_;
        }
        static if (N == 1)
        {
            /// Construct a 1D box from its two scalar bounds.
            @nogc this(T min_, T max_) pure nothrow
            {
                min.x = min_;
                max.x = max_;
            }
        }
        static if (N == 2)
        {
            /// Construct a 2D box from the coordinates of its two corners.
            @nogc this(T min_x, T min_y, T max_x, T max_y) pure nothrow
            {
                min = bound_t(min_x, min_y);
                max = bound_t(max_x, max_y);
            }
        }
        static if (N == 3)
        {
            /// Construct a 3D box from the coordinates of its two corners.
            @nogc this(T min_x, T min_y, T min_z, T max_x, T max_y, T max_z) pure nothrow
            {
                min = bound_t(min_x, min_y, min_z);
                max = bound_t(max_x, max_y, max_z);
            }
        }
        @property
        {
            /// Returns: Dimensions of the box.
            @nogc bound_t size() pure const nothrow
            {
                return max - min;
            }
            /// Sets size of the box assuming min point is the pivot.
            /// Returns: Dimensions of the box.
            @nogc bound_t size(bound_t value) pure nothrow
            {
                max = min + value;
                return value;
            }
            /// Returns: Center of the box.
            @nogc bound_t center() pure const nothrow
            {
                return (min + max) / 2;
            }
            static if (N >= 1)
            {
                /// Returns: Width of the box, always applicable.
                @nogc T width() pure const nothrow @property
                {
                    return max.x - min.x;
                }
                /// Sets width of the box assuming min point is the pivot.
                /// Returns: Width of the box, always applicable.
                @nogc T width(T value) pure nothrow @property
                {
                    max.x = min.x + value;
                    return value;
                }
            }
            static if (N >= 2)
            {
                /// Returns: Height of the box, if applicable.
                @nogc T height() pure const nothrow @property
                {
                    return max.y - min.y;
                }
                /// Sets height of the box assuming min point is the pivot.
                /// Returns: Height of the box, if applicable.
                @nogc T height(T value) pure nothrow @property
                {
                    max.y = min.y + value;
                    return value;
                }
            }
            static if (N >= 3)
            {
                /// Returns: Depth of the box, if applicable.
                @nogc T depth() pure const nothrow @property
                {
                    return max.z - min.z;
                }
                /// Sets depth of the box assuming min point is the pivot.
                /// Returns: Depth of the box, if applicable.
                @nogc T depth(T value) pure nothrow @property
                {
                    max.z = min.z + value;
                    return value;
                }
            }
            /// Returns: Signed volume of the box (product of the extents on
            /// every axis).
            @nogc T volume() pure const nothrow
            {
                T res = 1;
                // Named `dims` so the local does not shadow the `size` property.
                bound_t dims = size();
                for(int i = 0; i < N; ++i)
                    res *= dims[i];
                return res;
            }
            /// Returns: true if empty, i.e. if `min` equals `max` on at least
            /// one axis.
            @nogc bool empty() pure const nothrow
            {
                mixin(generateLoopCode!("if (min[@] == max[@]) return true;", N)());
                return false;
            }
        }
        /// Returns: true if it contains point.
        /// The `min` bound is inclusive, the `max` bound is exclusive.
        @nogc bool contains(bound_t point) pure const nothrow
        {
            assert(isSorted());
            for(int i = 0; i < N; ++i)
                if ( !(point[i] >= min[i] && point[i] < max[i]) )
                    return false;
            return true;
        }
        static if (N >= 2)
        {
            /// Returns: true if it contains point `x`, `y`.
            /// Only the first two axes are tested.
            @nogc bool contains(T x, T y) pure const nothrow
            {
                assert(isSorted());
                if ( !(x >= min.x && x < max.x) )
                    return false;
                if ( !(y >= min.y && y < max.y) )
                    return false;
                return true;
            }
        }
        static if (N >= 3)
        {
            /// Returns: true if it contains point `x`, `y`, `z`.
            /// Only the first three axes are tested.
            @nogc bool contains(T x, T y, T z) pure const nothrow
            {
                assert(isSorted());
                if ( !(x >= min.x && x < max.x) )
                    return false;
                if ( !(y >= min.y && y < max.y) )
                    return false;
                if ( !(z >= min.z && z < max.z) )
                    return false;
                return true;
            }
        }
        /// Returns: true if it contains box other.
        /// A box contains itself: bounds are compared with `<` and `>`.
        @nogc bool contains(Box other) pure const nothrow
        {
            assert(isSorted());
            assert(other.isSorted());
            mixin(generateLoopCode!("if ( (other.min[@] < min[@]) || (other.max[@] > max[@]) ) return false;", N)());
            return true;
        }
        /// Euclidean squared distance from a point.
        /// Returns 0 when the point is between `min` and `max` on every axis.
        /// See_also: Numerical Recipes Third Edition (2007)
        @nogc real squaredDistance(bound_t point) pure const nothrow
        {
            assert(isSorted());
            real distanceSquared = 0;
            for (int i = 0; i < N; ++i)
            {
                if (point[i] < min[i])
                    distanceSquared += (point[i] - min[i]) ^^ 2;
                if (point[i] > max[i])
                    distanceSquared += (point[i] - max[i]) ^^ 2;
            }
            return distanceSquared;
        }
        /// Euclidean distance from a point.
        /// See_also: squaredDistance.
        @nogc real distance(bound_t point) pure const nothrow
        {
            return sqrt(squaredDistance(point));
        }
        /// Euclidean squared distance from another box.
        /// Only axes on which the two boxes are separated contribute.
        /// See_also: Numerical Recipes Third Edition (2007)
        @nogc real squaredDistance(Box o) pure const nothrow
        {
            assert(isSorted());
            assert(o.isSorted());
            real distanceSquared = 0;
            for (int i = 0; i < N; ++i)
            {
                if (o.max[i] < min[i])
                    distanceSquared += (o.max[i] - min[i]) ^^ 2;
                if (o.min[i] > max[i])
                    distanceSquared += (o.min[i] - max[i]) ^^ 2;
            }
            return distanceSquared;
        }
        /// Euclidean distance from another box.
        /// See_also: squaredDistance.
        @nogc real distance(Box o) pure const nothrow
        {
            return sqrt(squaredDistance(o));
        }
        /// Assumes sorted boxes.
        /// This function deals with empty boxes correctly.
        /// Returns: Intersection of two boxes. When the boxes do not overlap on
        /// an axis, the result has `min == max` on that axis (so it is empty
        /// but still sorted).
        @nogc Box intersection(Box o) pure const nothrow
        {
            assert(isSorted());
            assert(o.isSorted());
            // Return an empty box if one of the boxes is empty
            if (empty())
                return this;
            if (o.empty())
                return o;
            Box result = void;
            for (int i = 0; i < N; ++i)
            {
                T maxOfMins = (min.v[i] > o.min.v[i]) ? min.v[i] : o.min.v[i];
                T minOfMaxs = (max.v[i] < o.max.v[i]) ? max.v[i] : o.max.v[i];
                result.min.v[i] = maxOfMins;
                // Clamp so that the result never becomes unsorted.
                result.max.v[i] = minOfMaxs >= maxOfMins ? minOfMaxs : maxOfMins;
            }
            return result;
        }
        /// Assumes sorted boxes.
        /// This function deals with empty boxes correctly.
        /// Returns: true if the two boxes have a non-empty intersection.
        @nogc bool intersects(Box other) pure const nothrow
        {
            Box inter = this.intersection(other);
            return inter.isSorted() && !inter.empty();
        }
        /// Extends the area of this Box: `space` is subtracted from `min` and
        /// added to `max`.
        @nogc Box grow(bound_t space) pure const nothrow
        {
            Box res = this;
            res.min -= space;
            res.max += space;
            return res;
        }
        /// Shrink the area of this Box. The box might become unsorted.
        @nogc Box shrink(bound_t space) pure const nothrow
        {
            return grow(-space);
        }
        /// Extends the area of this Box by the same amount on every axis.
        @nogc Box grow(T space) pure const nothrow
        {
            return grow(bound_t(space));
        }
        /// Translate this Box.
        @nogc Box translate(bound_t offset) pure const nothrow
        {
            return Box(min + offset, max + offset);
        }
        /// Scale the box by factor `scale`, and round the result to integer if needed.
        /// Every one of the N axes is scaled, for both `min` and `max`.
        @nogc Box scaleByFactor(float scale) const nothrow
        {
            Box res;
            // Loop over all axes instead of hard-coding x/y, so that 1D boxes
            // compile and boxes with 3 or more axes do not keep default values.
            for (int i = 0; i < N; ++i)
            {
                static if (isFloatingPoint!T)
                {
                    res.min.v[i] = min.v[i] * scale;
                    res.max.v[i] = max.v[i] * scale;
                }
                else
                {
                    res.min.v[i] = cast(T)( round(min.v[i] * scale) );
                    res.max.v[i] = cast(T)( round(max.v[i] * scale) );
                }
            }
            return res;
        }
        static if (N == 2) // useful for UI that have horizontal and vertical scale
        {
            /// Scale the box by factor `scaleX` horizontally and `scaleY` vertically.
            /// Round the result to integer if needed.
            @nogc Box scaleByFactor(float scaleX, float scaleY) const nothrow
            {
                Box res;
                static if (isFloatingPoint!T)
                {
                    res.min.x = min.x * scaleX;
                    res.min.y = min.y * scaleY;
                    res.max.x = max.x * scaleX;
                    res.max.y = max.y * scaleY;
                }
                else
                {
                    res.min.x = cast(T)( round(min.x * scaleX) );
                    res.min.y = cast(T)( round(min.y * scaleY) );
                    res.max.x = cast(T)( round(max.x * scaleX) );
                    res.max.y = cast(T)( round(max.y * scaleY) );
                }
                return res;
            }
        }
        static if (N >= 2)
        {
            /// Translate this Box by `x`, `y`.
            @nogc Box translate(T x, T y) pure const nothrow
            {
                Box res = this;
                res.min.x += x;
                res.min.y += y;
                res.max.x += x;
                res.max.y += y;
                return res;
            }
        }
        static if (N >= 3)
        {
            /// Translate this Box by `x`, `y`, `z`.
            @nogc Box translate(T x, T y, T z) pure const nothrow
            {
                Box res = this;
                res.min.x += x;
                res.min.y += y;
                res.min.z += z;
                res.max.x += x;
                res.max.y += y;
                res.max.z += z;
                return res;
            }
        }
        /// Shrinks the area of this Box by the same amount on every axis.
        /// Returns: Shrunk box.
        @nogc Box shrink(T space) pure const nothrow
        {
            return shrink(bound_t(space));
        }
        /// Expands the box to include point.
        /// Returns: Expanded box.
        @nogc Box expand(bound_t point) pure const nothrow
        {
            import vector = dplug.math.vector;
            return Box(vector.minByElem(min, point), vector.maxByElem(max, point));
        }
        /// Expands the box to include another box.
        /// This function deals with empty boxes correctly.
        /// Returns: Expanded box.
        @nogc Box expand(Box other) pure const nothrow
        {
            assert(isSorted());
            assert(other.isSorted());
            // handle empty boxes
            if (empty())
                return other;
            if (other.empty())
                return this;
            Box result = void;
            for (int i = 0; i < N; ++i)
            {
                T minOfMins = (min.v[i] < other.min.v[i]) ? min.v[i] : other.min.v[i];
                T maxOfMaxs = (max.v[i] > other.max.v[i]) ? max.v[i] : other.max.v[i];
                result.min.v[i] = minOfMins;
                result.max.v[i] = maxOfMaxs;
            }
            return result;
        }
        /// Returns: true if each dimension of the box is >= 0,
        /// i.e. `min[i] <= max[i]` on every axis.
        @nogc bool isSorted() pure const nothrow
        {
            for(int i = 0; i < N; ++i)
            {
                if (min[i] > max[i])
                    return false;
            }
            return true;
        }
        /// Returns: Absolute value of the Box to ensure each dimension of the
        /// box is >= 0. Bounds are swapped on every axis where `min > max`.
        @nogc Box abs() pure const nothrow
        {
            Box!(T, N) s = this;
            for (int i = 0; i < N; ++i)
            {
                if (s.min.v[i] > s.max.v[i])
                {
                    T tmp = s.min.v[i];
                    s.min.v[i] = s.max.v[i];
                    s.max.v[i] = tmp;
                }
            }
            return s;
        }
        /// Assign with another box.
        /// The other box must have the same number of dimensions and an
        /// element type implicitly convertible to `T`; anything else is
        /// rejected at compile time.
        @nogc ref Box opAssign(U)(U x) nothrow if (isBox!U)
        {
            static if(is(U.element_t : T))
            {
                static if(U._size == _size)
                {
                    min = x.min;
                    max = x.max;
                }
                else
                {
                    static assert(false, "no conversion between boxes with different dimensions");
                }
            }
            else
            {
                static assert(false, "no conversion from " ~ U.element_t.stringof ~ " to " ~ element_t.stringof);
            }
            return this;
        }
        /// Returns: true if comparing equal boxes.
        @nogc bool opEquals(U)(U other) pure const nothrow if (is(U : Box))
        {
            return (min == other.min) && (max == other.max);
        }
        /// Cast to other box types.
        /// Each coordinate is cast individually to the element type of `U`.
        @nogc U opCast(U)() pure const nothrow if (isBox!U)
        {
            U b = void;
            for(int i = 0; i < N; ++i)
            {
                b.min[i] = cast(U.element_t)(min[i]);
                b.max[i] = cast(U.element_t)(max[i]);
            }
            return b; // return a box where each element has been casted
        }
        static if (N == 2)
        {
            /// Helper function to create rectangle with a given point, width and height.
            static @nogc Box rectangle(T x, T y, T width, T height) pure nothrow
            {
                return Box(x, y, x + width, y + height);
            }
        }
    }
    private
    {
        enum _size = N;     // number of dimensions, read by opAssign on other Box types
        alias T element_t;  // coordinate type, read by opAssign/opCast on other Box types
    }
 }
 /// Instantiate to use a 2D box: `box2!T` is `Box!(T, 2)`.
 alias box2(T) = Box!(T, 2);
 /// Instantiate to use a 3D box: `box3!T` is `Box!(T, 3)`.
 alias box3(T) = Box!(T, 3);
 alias box2i = box2!int;    /// 2D box with integer coordinates.
 alias box3i = box3!int;    /// 3D box with integer coordinates.
 alias box2f = box2!float;  /// 2D box with float coordinates.
 alias box3f = box3!float;  /// 3D box with float coordinates.
 alias box2d = box2!double; /// 2D box with double coordinates.
 alias box3d = box3!double; /// 3D box with double coordinates.
 /// Builds an integer rectangle from its top-left point and its dimensions.
 /// Returns: The `box2i` spanning from (`x`, `y`) to (`x + width`, `y + height`).
 box2i rectangle(int x, int y, int width, int height) pure nothrow @nogc
 {
    const int right = x + width;
    const int bottom = y + height;
    return box2i(x, y, right, bottom);
 }
 /// Builds a `float` rectangle from its top-left point and its dimensions.
 /// Returns: The `box2f` spanning from (`x`, `y`) to (`x + width`, `y + height`).
 box2f rectanglef(float x, float y, float width, float height) pure nothrow @nogc
 {
    const float right = x + width;
    const float bottom = y + height;
    return box2f(x, y, right, bottom);
 }
 /// Builds a `double` rectangle from its top-left point and its dimensions.
 /// Returns: The `box2d` spanning from (`x`, `y`) to (`x + width`, `y + height`).
 box2d rectangled(double x, double y, double width, double height) pure nothrow @nogc
 {
    const double right = x + width;
    const double bottom = y + height;
    return box2d(x, y, right, bottom);
 }
 // Exercises Box construction, the size/width/height/depth setters, abs,
 // casting, translate, contains, expand, empty, intersection, the rectangle
 // helpers and scaleByFactor.
 unittest
 {
    // Scalar constructor and corner-vector constructor build the same box.
    box2i a = box2i(1, 2, 3, 4);
    assert(a.width == 2);
    assert(a.height == 2);
    assert(a.volume == 4);
    box2i b = box2i(vec2i(1, 2), vec2i(3, 4));
    assert(a == b);
    // Integer 3D box: every setter returns the assigned value, keeps `min`
    // fixed and moves `max`.
    box3i q = box3i(-3, -2, -1, 0, 1, 2);
    q.bound_t s = q.bound_t(11, 17, 19);
    q.bound_t q_min = q.min;
    assert((q.size = s) == s);
    assert(q.size == s);
    assert(q.min == q_min);
    assert(q.max == q.min + s);
    assert(q.max -  q.min == s);
    // Width is deliberately set from s.z (and depth from s.x below), so the
    // final size is the swizzle s.zyx.
    assert((q.width = s.z) == s.z);
    assert(q.width == s.z);
    assert(q.min.x == q_min.x);
    assert(q.max.x == q.min.x + s.z);
    assert(q.max.x -  q.min.x == s.z);
    assert((q.height = s.y) == s.y);
    assert(q.height == s.y);
    assert(q.min.y == q_min.y);
    assert(q.max.y == q.min.y + s.y);
    assert(q.max.y -  q.min.y == s.y);
    assert((q.depth = s.x) == s.x);
    assert(q.depth == s.x);
    assert(q.min.z == q_min.z);
    assert(q.max.z == q.min.z + s.x);
    assert(q.max.z -  q.min.z == s.x);
    assert(q.size == s.zyx);
    // abs() swaps the bounds of an unsorted box.
    box3i n = box3i(2, 1, 0, -1, -2, -3);
    assert(n.abs == box3i(-1, -2, -3, 2, 1, 0));
    // Cast from an integer box to a float box.
    box2f bf = cast(box2f)b;
    assert(bf == box2f(1.0f, 2.0f, 3.0f, 4.0f));
    // Same setter checks with float coordinates and negative sizes.
    box3f qf = box3f(-0, 1f, 2.5f, 3.25f, 5.125f, 7.0625f);
    qf.bound_t sf = qf.bound_t(-11.5f, -17.25f, -19.125f);
    qf.bound_t qf_min = qf.min;
    assert((qf.size = sf) == sf);
    assert(qf.size == sf);
    assert(qf.min == qf_min);
    assert(qf.max == qf.min + sf);
    assert(qf.max -  qf.min == sf);
    assert((qf.width = sf.z) == sf.z);
    assert(qf.width == sf.z);
    assert(qf.min.x == qf_min.x);
    assert(qf.max.x == qf.min.x + sf.z);
    assert(qf.max.x -  qf.min.x == sf.z);
    assert((qf.height = sf.y) == sf.y);
    assert(qf.height == sf.y);
    assert(qf.min.y == qf_min.y);
    assert(qf.max.y == qf.min.y + sf.y);
    assert(qf.max.y -  qf.min.y == sf.y);
    assert((qf.depth = sf.x) == sf.x);
    assert(qf.depth == sf.x);
    assert(qf.min.z == qf_min.z);
    assert(qf.max.z == qf.min.z + sf.x);
    assert(qf.max.z -  qf.min.z == sf.x);
    assert(qf.size == sf.zyx);
    // translate and contains: the min corner is inside, the max corner is not.
    box2i c = box2i(0, 0, 1,1);
    assert(c.translate(vec2i(3, 3)) == box2i(3, 3, 4, 4));
    assert(c.translate(3, 3) == box2i(3, 3, 4, 4));
    assert(c.contains(vec2i(0, 0)));
    assert(c.contains(0, 0));
    assert(!c.contains(vec2i(1, 1)));
    assert(!c.contains(1, 1));
    assert(b.contains(b));
    // expand with a point, then with a box.
    box2i d = c.expand(vec2i(3, 3));
    assert(d.contains(vec2i(2, 2)));
    assert(d == d.expand(d));
    assert(!box2i(0, 0, 4, 4).contains(box2i(2, 2, 6, 6)));
    // empty() is true only when min == max on some axis; an unsorted box
    // is not reported as empty.
    assert(box2f(0, 0, 0, 0).empty());
    assert(!box2f(0, 2, 1, 1).empty());
    assert(!box2f(0, 0, 1, 1).empty());
    // Disjoint boxes intersect to an empty box.
    assert(box2i(260, 100, 360, 200).intersection(box2i(100, 100, 200, 200)).empty());
    // union with empty box is identity
    assert(a.expand(box2i(10, 4, 10, 6)) == a);
    // intersection with empty box is empty
    assert(a.intersection(box2i(10, 4, 10, 6)).empty);
    // The rectangle helpers take (x, y, width, height), not two corners.
    assert(box2i.rectangle(1, 2, 3, 4) == box2i(1, 2, 4, 6));
    assert(rectangle(1, 2, 3, 4) == box2i(1, 2, 4, 6));
    assert(rectanglef(1, 2, 3, 4) == box2f(1, 2, 4, 6));
    assert(rectangled(1, 2, 3, 4) == box2d(1, 2, 4, 6));
    // scaleByFactor scales both corners, so position and size both scale.
    assert(rectangle(10, 10, 20, 20).scaleByFactor(1.5f) == rectangle(15, 15, 30, 30));
    assert(rectangle(10, 10, 20, 20).scaleByFactor(1.5f, 2.0f) == rectangle(15, 20, 30, 40));
 }
 /// True if `T` is a kind of Box, i.e. if `T` implicitly converts to some
 /// instantiation `Box!U` of the `Box` template.
 enum isBox(T) = is(T : Box!U, U...);
 unittest
 {
    static assert( isBox!box2f);
    static assert( isBox!box3d);
    static assert( isBox!(Box!(real, 2)));
    static assert(!isBox!vec2f);
 }
 /// Get the numeric type used to measure a box's dimensions.
 /// This is the first template argument of the matched `Box` instantiation
 /// (e.g. `float` for `box2f`).
 alias DimensionType(T : Box!U, U...) = U[0];
 ///
 unittest
 {
    static assert(is(DimensionType!box2f == float));
    static assert(is(DimensionType!box3d == double));
 }
--- a/external/dplug/math/matrix.d
+++ b/external/dplug/math/matrix.d
@ -0,0 +1,852 @@
 /**
 * Custom sized 2D Matrices.
 *
 * Copyright: Copyright Guillaume Piolat 2015-2021.
 *            Copyright Aleksandr Druzhinin 2016-2020.
 *            Copyright Nathan Sashihara 2018.
 *            Copyright Thibaut Charles 2018.
 *
 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 */
 module dplug.math.matrix;
 import std.math,
       std.typetuple,
       std.traits,
       std.typecons;
 import dplug.math.vector;
 /// Generic non-resizeable matrix with R rows and C columns.
 /// Intended for 3D use (size 3x3 and 4x4).
 /// Important: <b>Matrices here are in row-major order whereas OpenGL is column-major.</b>
 /// Params:
 ///   T = type of elements
 ///   R = number of rows
 ///   C = number of columns
 struct Matrix(T, int R, int C)
 {
    public
    {
        static assert(R >= 1 && C >= 1);
        alias Vector!(T, C) row_t;
        alias Vector!(T, R) column_t;
        enum bool isSquare = (R == C);
        // fields definition
        union
        {
            T[C*R] v;        // all elements
            row_t[R] rows;   // all rows
            T[C][R] c;       // components
        }
        @nogc this(U...)(U values) pure nothrow
        {
            static if ((U.length == C*R) && allSatisfy!(isTAssignable, U))
            {
                // construct with components
                foreach(int i, x; values)
                    v[i] = x;
            }
            else static if ((U.length == 1) && (isAssignable!(U[0])) && (!is(U[0] : Matrix)))
            {
                // construct with assignment
                opAssign!(U[0])(values[0]);
            }
            else static assert(false, "cannot create a matrix from given arguments");
        }
        /// Builds a matrix whose j-th column is `columns[j]`.
        /// `columns` must hold exactly C vectors (checked with an assert).
        @nogc static Matrix fromColumns(column_t[] columns) pure nothrow
        {
            assert(columns.length == C);
            Matrix built;
            foreach (rowIndex; 0 .. R)
            {
                foreach (colIndex; 0 .. C)
                    built.c[rowIndex][colIndex] = columns[colIndex][rowIndex];
            }
            return built;
        }
        /// Construct a matrix from rows.
        @nogc static Matrix fromRows(row_t[] rows) pure nothrow
        {
            assert(rows.length == R);
            Matrix res;
            res.rows[] = rows[];
            return res;
        }
        /// Construct matrix with a scalar: every one of the R * C elements is
        /// set to `x`.
        /// NOTE(review): `U` does not appear in the parameter list, so it
        /// presumably cannot be deduced from a call like `Matrix(x)`; such
        /// calls look like they go through the variadic constructor instead.
        /// Confirm before relying on this overload.
        @nogc this(U)(T x) pure nothrow
        {
            // Element count spelled `R * C`, like every other loop over `v`
            // visible in this struct.
            for (int i = 0; i < R * C; ++i)
                v[i] = x;
        }
        /// Assign with a scalar.
        @nogc ref Matrix opAssign(U : T)(U x) pure nothrow
        {
            for (int i = 0; i < R * C; ++i)
                v[i] = x;
            return this;
        }
        /// Assign with a samey matrice.
        @nogc ref Matrix opAssign(U : Matrix)(U x) pure nothrow
        {
            for (int i = 0; i < R * C; ++i)
                v[i] = x.v[i];
            return this;
        }
        /// Assign from other small matrices (same size, compatible type).
        @nogc ref Matrix opAssign(U)(U x) pure nothrow
            if (isMatrixInstantiation!U
                && is(U._T : _T)
                && (!is(U: Matrix))
                && (U._R == R) && (U._C == C))
        {
            for (int i = 0; i < R * C; ++i)
                v[i] = x.v[i];
            return this;
        }
        /// Assign with a static array of size R * C.
        @nogc ref Matrix opAssign(U)(U x) pure nothrow
            if ((isStaticArray!U)
                && is(typeof(x[0]) : T)
                && (U.length == R * C))
        {
            for (int i = 0; i < R * C; ++i)
                v[i] = x[i];
            return this;
        }
        /// Assign with a static array of shape (R, C).
        @nogc ref Matrix opAssign(U)(U x) pure nothrow
            if ((isStaticArray!U) && isStaticArray!(typeof(x[0]))
                && is(typeof(x[0][0]) : T)
                && (U.length == R)
                && (x[0].length == C))
        {
            foreach (i; 0..R)
                rows[i] = x[i];
            return this;
        }
        /// Assign with a dynamic array of size R * C.
        @nogc ref Matrix opAssign(U)(U x) pure nothrow
            if ((isDynamicArray!U)
                && is(typeof(x[0]) : T))
        {
            assert(x.length == R * C);
            for (int i = 0; i < R * C; ++i)
                v[i] = x[i];
            return this;
        }
        /// Assign with a dynamic array of shape (R, C).
        @nogc ref Matrix opAssign(U)(U x) pure nothrow
            if ((isDynamicArray!U) && isDynamicArray!(typeof(x[0]))
                && is(typeof(x[0][0]) : T))
        {
            assert(x.length == R);
            foreach (i; 0..R)
            {
                assert(x[i].length == C);
                rows[i] = x[i];
            }
            return this;
        }
        /// Return a pointer to content.
        @nogc inout(T)* ptr() pure inout nothrow @property
        {
            return v.ptr;
        }
        /// Extracts one column of the matrix.
        /// Returns: a vector of R elements holding column `j`, top row first.
        @nogc column_t column(int j) pure const nothrow
        {
            column_t picked = void;
            foreach (rowIndex; 0 .. R)
                picked.v[rowIndex] = c[rowIndex][j];
            return picked;
        }
        /// Returns a row as a vector
        /// Returns: row i as a vector.
        @nogc row_t row(int i) pure const nothrow
        {
            return rows[i];
        }
        /// Matrix * scalar multiplication.
        /// Returns: a new matrix in which every component is multiplied by
        /// `factor`; this matrix is left untouched.
        @nogc Matrix opBinary(string op)(T factor) pure const nothrow if (op == "*")
        {
            Matrix scaled = void;
            foreach (rowIndex; 0 .. R)
                foreach (colIndex; 0 .. C)
                    scaled.c[rowIndex][colIndex] = c[rowIndex][colIndex] * factor;
            return scaled;
        }
        /// Matrix * vector multiplication.
        @nogc column_t opBinary(string op)(row_t x) pure const nothrow if (op == "*")
        {
            column_t res = void;
            for (int i = 0; i < R; ++i)
            {
                T sum = 0;
                for (int j = 0; j < C; ++j)
                {
                    sum += c[i][j] * x.v[j];
                }
                res.v[i] = sum;
            }
            return res;
        }
        /// Matrix * matrix multiplication.
        @nogc auto opBinary(string op, U)(U x) pure const nothrow
            if (isMatrixInstantiation!U && (U._R == C) && (op == "*"))
        {
            Matrix!(T, R, U._C) result = void;
            for (int i = 0; i < R; ++i)
            {
                for (int j = 0; j < U._C; ++j)
                {
                    T sum = 0;
                    for (int k = 0; k < C; ++k)
                        sum += c[i][k] * x.c[k][j];
                    result.c[i][j] = sum;
                }
            }
            return result;
        }
        /// Matrix add and substraction.
        @nogc Matrix opBinary(string op, U)(U other) pure const nothrow
            if (is(U : Matrix) && (op == "+" || op == "-"))
        {
            Matrix result = void;
            for (int i = 0; i < R; ++i)
            {
                for (int j = 0; j < C; ++j)
                {
                    mixin("result.c[i][j] = c[i][j] " ~ op ~ " other.c[i][j];");
                }
            }
            return result;
        }
        // matrix *= scalar
        @nogc ref Matrix opOpAssign(string op, U : T)(U x) pure nothrow if (op == "*")
        {
            for (int i = 0; i < R * C; ++i)
                v[i] *= x;
            return this;
        }
        /// Assignment operator with another samey matrix.
        @nogc ref Matrix opOpAssign(string op, U)(U operand) pure nothrow 
            if (is(U : Matrix) && (op == "*" || op == "+" || op == "-"))
        {
            mixin("Matrix result = this " ~ op ~ " operand;");
            return opAssign!Matrix(result);
        }
        /// Matrix += <something convertible to a Matrix>
        /// Matrix -= <something convertible to a Matrix>
        /// Matrix *= <something convertible to a Matrix>
        /// The operand is first converted to a `Matrix`, then the same
        /// operator is applied through `opOpAssign!op` on the converted value.
        /// NOTE(review): `isConvertible` is declared outside the visible part
        /// of this file; its exact acceptance rules were not checked here.
        @nogc ref Matrix opOpAssign(string op, U)(U operand) pure nothrow 
            if ((isConvertible!U) && (op == "*" || op == "+" || op == "-"))
        {
            Matrix conv = operand;
            return opOpAssign!op(conv);
        }
        /// Cast to another matrix type.
        /// The result starts as `U.identity()`; the top-left block common to both
        /// shapes is then overwritten with this matrix's coefficients cast to `U._T`.
        /// A larger target therefore keeps identity coefficients, a smaller one truncates.
        @nogc U opCast(U)() pure const nothrow if (isMatrixInstantiation!U)
        {
            // Extent of the block present in both the source and the target.
            enum sharedRows = R < U._R ? R : U._R;
            enum sharedCols = C < U._C ? C : U._C;
            U converted = U.identity();
            foreach (row; 0 .. sharedRows)
                foreach (col; 0 .. sharedCols)
                    converted.c[row][col] = cast(U._T)(c[row][col]);
            return converted;
        }
        /// Coefficient-wise equality with a matrix of the same type.
        /// Returns: `false` at the first differing coefficient, `true` otherwise.
        @nogc bool opEquals(U)(U other) pure const nothrow if (is(U : Matrix))
        {
            foreach (idx; 0 .. R * C)
            {
                if (v[idx] != other.v[idx])
                    return false;
            }
            return true;
        }
        /// Equality with anything assignable to this `Matrix` that is not already one:
        /// the operand is converted first, then compared coefficient by coefficient.
        @nogc bool opEquals(U)(U other) pure const nothrow
            if ((isAssignable!U) && (!is(U: Matrix)))
        {
            Matrix asMatrix = other;
            return opEquals(asMatrix);
        }
        /// Unary operators applied to every coefficient: `+matrix`, `-matrix`, `~matrix`, `!matrix`.
        /// Returns: a new matrix with `op` applied to each of the `R * C` coefficients.
        @nogc Matrix opUnary(string op)() pure const nothrow if (op == "+" || op == "-" || op == "~" || op == "!")
        {
            Matrix res = void; // fully written by the loop below
            // The flat storage holds R * C coefficients; use the same bound as the
            // other flat loops of this struct (scalar `*=`, `opEquals`, `constant`).
            for (int i = 0; i < R * C; ++i)
                mixin("res.v[i] = " ~ op ~ "v[i];");
            return res;
        }
        static if (isSquare && isFloatingPoint!T && R == 1)
        {
            /// Inverse of a 1x1 matrix: the reciprocal of its single coefficient.
            /// Returns: inverse of matrix.
            /// Note: Matrix inversion is provided for 1x1, 2x2, 3x3 and 4x4 floating point matrices.
            @nogc Matrix inverse() pure const nothrow
            {
                const T only = c[0][0];
                assert(only != 0); // Programming error if matrix is not invertible.
                return Matrix(1 / only);
            }
        }
        static if (isSquare && isFloatingPoint!T && R == 2)
        {
            /// Inverse of a 2x2 matrix: swap the diagonal, negate the off-diagonal
            /// and divide by the determinant.
            /// Returns: inverse of matrix.
            /// Note: Matrix inversion is provided for 1x1, 2x2, 3x3 and 4x4 floating point matrices.
            @nogc Matrix inverse() pure const nothrow
            {
                const T determinant = (c[0][0] * c[1][1] - c[0][1] * c[1][0]);
                assert(determinant != 0); // Programming error if matrix is not invertible.
                const T reciprocal = 1 / determinant;
                return Matrix( c[1][1] * reciprocal, -c[0][1] * reciprocal,
                              -c[1][0] * reciprocal,  c[0][0] * reciprocal);
            }
        }
        static if (isSquare && isFloatingPoint!T && R == 3)
        {
            /// Returns an inverted copy of this 3x3 matrix.
            /// The determinant is expanded along the first row; each result coefficient
            /// is a signed 2x2 minor of this matrix multiplied by `1 / det`.
            /// Returns: inverse of matrix.
            /// Note: Matrix inversion is provided for 1x1, 2x2, 3x3 and 4x4 floating point matrices.
            @nogc Matrix inverse() pure const nothrow
            {
                // Determinant: cofactor expansion along row 0.
                T det = c[0][0] * (c[1][1] * c[2][2] - c[2][1] * c[1][2])
                      - c[0][1] * (c[1][0] * c[2][2] - c[1][2] * c[2][0])
                      + c[0][2] * (c[1][0] * c[2][1] - c[1][1] * c[2][0]);
                assert(det != 0); // Programming error if matrix is not invertible.
                T invDet = 1 / det;
                Matrix res = void; // all nine coefficients are assigned below
                // res.c[i][j] is the signed minor obtained by removing row j and
                // column i of this matrix, scaled by invDet.
                res.c[0][0] =  (c[1][1] * c[2][2] - c[2][1] * c[1][2]) * invDet;
                res.c[0][1] = -(c[0][1] * c[2][2] - c[0][2] * c[2][1]) * invDet;
                res.c[0][2] =  (c[0][1] * c[1][2] - c[0][2] * c[1][1]) * invDet;
                res.c[1][0] = -(c[1][0] * c[2][2] - c[1][2] * c[2][0]) * invDet;
                res.c[1][1] =  (c[0][0] * c[2][2] - c[0][2] * c[2][0]) * invDet;
                res.c[1][2] = -(c[0][0] * c[1][2] - c[1][0] * c[0][2]) * invDet;
                res.c[2][0] =  (c[1][0] * c[2][1] - c[2][0] * c[1][1]) * invDet;
                res.c[2][1] = -(c[0][0] * c[2][1] - c[2][0] * c[0][1]) * invDet;
                res.c[2][2] =  (c[0][0] * c[1][1] - c[1][0] * c[0][1]) * invDet;
                return res;
            }
        }
        static if (isSquare && isFloatingPoint!T && R == 4)
        {
            /// Returns an inverted copy of this 4x4 matrix.
            /// Naming used below: `det2_AB_CD` is the 2x2 minor built from rows A, B and
            /// columns C, D; `det3_ABC_DEF` is a 3x3 minor built from rows A, B, C and
            /// columns D, E, F, expanded along its first listed row.
            /// Returns: inverse of matrix.
            /// Note: Matrix inversion is provided for 1x1, 2x2, 3x3 and 4x4 floating point matrices.
            @nogc Matrix inverse() pure const nothrow
            {
                // 2x2 minors of rows 0 and 1, for every pair of columns.
                T det2_01_01 = c[0][0] * c[1][1] - c[0][1] * c[1][0];
                T det2_01_02 = c[0][0] * c[1][2] - c[0][2] * c[1][0];
                T det2_01_03 = c[0][0] * c[1][3] - c[0][3] * c[1][0];
                T det2_01_12 = c[0][1] * c[1][2] - c[0][2] * c[1][1];
                T det2_01_13 = c[0][1] * c[1][3] - c[0][3] * c[1][1];
                T det2_01_23 = c[0][2] * c[1][3] - c[0][3] * c[1][2];
                // 3x3 minors of rows 2, 0, 1, expanded along row 2.
                T det3_201_012 = c[2][0] * det2_01_12 - c[2][1] * det2_01_02 + c[2][2] * det2_01_01;
                T det3_201_013 = c[2][0] * det2_01_13 - c[2][1] * det2_01_03 + c[2][3] * det2_01_01;
                T det3_201_023 = c[2][0] * det2_01_23 - c[2][2] * det2_01_03 + c[2][3] * det2_01_02;
                T det3_201_123 = c[2][1] * det2_01_23 - c[2][2] * det2_01_13 + c[2][3] * det2_01_12;
                // Full determinant: expansion along row 3 using the 3x3 minors above.
                T det = - det3_201_123 * c[3][0] + det3_201_023 * c[3][1] - det3_201_013 * c[3][2] + det3_201_012 * c[3][3];
                assert(det != 0); // Programming error if matrix is not invertible.
                T invDet = 1 / det;
                // 2x2 minors of rows 0 and 3.
                T det2_03_01 = c[0][0] * c[3][1] - c[0][1] * c[3][0];
                T det2_03_02 = c[0][0] * c[3][2] - c[0][2] * c[3][0];
                T det2_03_03 = c[0][0] * c[3][3] - c[0][3] * c[3][0];
                T det2_03_12 = c[0][1] * c[3][2] - c[0][2] * c[3][1];
                T det2_03_13 = c[0][1] * c[3][3] - c[0][3] * c[3][1];
                T det2_03_23 = c[0][2] * c[3][3] - c[0][3] * c[3][2];
                // 2x2 minors of rows 1 and 3.
                T det2_13_01 = c[1][0] * c[3][1] - c[1][1] * c[3][0];
                T det2_13_02 = c[1][0] * c[3][2] - c[1][2] * c[3][0];
                T det2_13_03 = c[1][0] * c[3][3] - c[1][3] * c[3][0];
                T det2_13_12 = c[1][1] * c[3][2] - c[1][2] * c[3][1];
                T det2_13_13 = c[1][1] * c[3][3] - c[1][3] * c[3][1];
                T det2_13_23 = c[1][2] * c[3][3] - c[1][3] * c[3][2];
                // Remaining 3x3 minors, each expanded along its first listed row.
                T det3_203_012 = c[2][0] * det2_03_12 - c[2][1] * det2_03_02 + c[2][2] * det2_03_01;
                T det3_203_013 = c[2][0] * det2_03_13 - c[2][1] * det2_03_03 + c[2][3] * det2_03_01;
                T det3_203_023 = c[2][0] * det2_03_23 - c[2][2] * det2_03_03 + c[2][3] * det2_03_02;
                T det3_203_123 = c[2][1] * det2_03_23 - c[2][2] * det2_03_13 + c[2][3] * det2_03_12;
                T det3_213_012 = c[2][0] * det2_13_12 - c[2][1] * det2_13_02 + c[2][2] * det2_13_01;
                T det3_213_013 = c[2][0] * det2_13_13 - c[2][1] * det2_13_03 + c[2][3] * det2_13_01;
                T det3_213_023 = c[2][0] * det2_13_23 - c[2][2] * det2_13_03 + c[2][3] * det2_13_02;
                T det3_213_123 = c[2][1] * det2_13_23 - c[2][2] * det2_13_13 + c[2][3] * det2_13_12;
                T det3_301_012 = c[3][0] * det2_01_12 - c[3][1] * det2_01_02 + c[3][2] * det2_01_01;
                T det3_301_013 = c[3][0] * det2_01_13 - c[3][1] * det2_01_03 + c[3][3] * det2_01_01;
                T det3_301_023 = c[3][0] * det2_01_23 - c[3][2] * det2_01_03 + c[3][3] * det2_01_02;
                T det3_301_123 = c[3][1] * det2_01_23 - c[3][2] * det2_01_13 + c[3][3] * det2_01_12;
                Matrix res = void; // all sixteen coefficients are assigned below
                // Each coefficient is a signed 3x3 minor scaled by invDet; the
                // assignments are grouped by result column.
                res.c[0][0] = - det3_213_123 * invDet;
                res.c[1][0] = + det3_213_023 * invDet;
                res.c[2][0] = - det3_213_013 * invDet;
                res.c[3][0] = + det3_213_012 * invDet;
                res.c[0][1] = + det3_203_123 * invDet;
                res.c[1][1] = - det3_203_023 * invDet;
                res.c[2][1] = + det3_203_013 * invDet;
                res.c[3][1] = - det3_203_012 * invDet;
                res.c[0][2] = + det3_301_123 * invDet;
                res.c[1][2] = - det3_301_023 * invDet;
                res.c[2][2] = + det3_301_013 * invDet;
                res.c[3][2] = - det3_301_012 * invDet;
                res.c[0][3] = - det3_201_123 * invDet;
                res.c[1][3] = + det3_201_023 * invDet;
                res.c[2][3] = - det3_201_013 * invDet;
                res.c[3][3] = + det3_201_012 * invDet;
                return res;
            }
        }
        /// Builds the transpose of this matrix: coefficient `[i][j]` of the result
        /// is coefficient `[j][i]` of this matrix.
        /// Returns: transposed matrix, of shape `C` x `R`.
        @nogc Matrix!(T, C, R) transposed() pure const nothrow
        {
            Matrix!(T, C, R) flipped;
            foreach (outRow; 0 .. C)
                foreach (outCol; 0 .. R)
                    flipped.c[outRow][outCol] = c[outCol][outRow];
            return flipped;
        }
        static if (isSquare && R > 1)
        {
            /// Makes a diagonal matrix from a vector.
            /// Params:
            ///     v = values placed on the diagonal; every other coefficient is 0.
            @nogc static Matrix diag(Vector!(T, R) v) pure nothrow
            {
                Matrix diagonal = void; // every coefficient is written below
                foreach (row; 0 .. R)
                    foreach (col; 0 .. C)
                        diagonal.c[row][col] = (row == col) ? v.v[row] : 0;
                return diagonal;
            }
            /// In-place translate by (v, 1).
            /// For every row, the dot product of `v` with the row's first `C - 1`
            /// coefficients is added to the row's last coefficient.
            @nogc void translate(Vector!(T, R-1) v) pure nothrow
            {
                foreach (row; 0 .. R)
                {
                    T accumulated = 0;
                    foreach (col; 0 .. C - 1)
                        accumulated += v.v[col] * c[row][col];
                    c[row][C-1] += accumulated;
                }
            }
            /// Make a translation matrix: identity with `v` added into the last
            /// column of the first `R - 1` rows.
            @nogc static Matrix translation(Vector!(T, R-1) v) pure nothrow
            {
                Matrix shifted = identity();
                foreach (row; 0 .. R - 1)
                    shifted.c[row][C-1] += v.v[row];
                return shifted;
            }
            /// In-place matrix scaling.
            /// Multiplies column `j` of every row by `v.v[j]` for the first `C - 1`
            /// columns; the last column is left untouched.
            /// Params:
            ///     v = per-column scale factors.
            // Marked @nogc like every sibling method: the body only does arithmetic
            // on the fixed-size storage, so @nogc callers can use it too.
            @nogc void scale(Vector!(T, R-1) v) pure nothrow
            {
                for (int i = 0; i < R; ++i)
                    for (int j = 0; j + 1 < C; ++j)
                        c[i][j] *= v.v[j];
            }
            /// Make a scaling matrix: identity whose first `R - 1` diagonal
            /// coefficients are replaced by the components of `v`.
            @nogc static Matrix scaling(Vector!(T, R-1) v) pure nothrow
            {
                Matrix scaled = identity();
                foreach (axis; 0 .. R - 1)
                    scaled.c[axis][axis] = v.v[axis];
                return scaled;
            }
        }
        // rotations are implemented for 3x3 and 4x4 matrices.
        static if (isSquare && (R == 3 || R == 4) && isFloatingPoint!T)
        {
            /// Builds a rotation in the plane of axes `i` and `j`: an identity matrix
            /// whose `[i][i]`, `[i][j]`, `[j][i]`, `[j][j]` coefficients are replaced
            /// by `cos`, `-sin`, `sin`, `cos` of `angle`.
            @nogc public static Matrix rotateAxis(int i, int j)(T angle) pure nothrow
            {
                const T cosine = cos(angle);
                const T sine = sin(angle);
                Matrix rotated = identity();
                rotated.c[i][i] = cosine;
                rotated.c[i][j] = -sine;
                rotated.c[j][i] = sine;
                rotated.c[j][j] = cosine;
                return rotated;
            }
            /// Rotate along X axis
            /// Returns: rotation matrix along axis X
            alias rotateAxis!(1, 2) rotateX;
            /// Rotate along Y axis
            /// Returns: rotation matrix along axis Y
            alias rotateAxis!(2, 0) rotateY;
            /// Rotate along Z axis
            /// Returns: rotation matrix along axis Z
            alias rotateAxis!(0, 1) rotateZ;
            /// Similar to the glRotate matrix, however the angle is expressed in radians
            /// Params:
            ///     angle = rotation angle, in radians.
            ///     axis  = rotation axis; it is normalized before use.
            /// Returns: identity matrix whose top-left 3x3 block holds the rotation.
            /// See_also: $(LINK http://www.cs.rutgers.edu/~decarlo/428/gl_man/rotate.html)
            @nogc static Matrix rotation(T angle, vec3!T axis) pure nothrow
            {
                Matrix res = identity();
                // Named cosA/sinA so the locals do not shadow the coefficient field `c`.
                const T cosA = cos(angle);
                const oneMinusC = 1 - cosA;
                const T sinA = sin(angle);
                axis = axis.normalized();
                T x = axis.x,
                  y = axis.y,
                  z = axis.z;
                res.c[0][0] = x * x * oneMinusC + cosA;
                res.c[0][1] = x * y * oneMinusC - z * sinA;
                res.c[0][2] = x * z * oneMinusC + y * sinA;
                res.c[1][0] = y * x * oneMinusC + z * sinA;
                res.c[1][1] = y * y * oneMinusC + cosA;
                res.c[1][2] = y * z * oneMinusC - x * sinA;
                res.c[2][0] = z * x * oneMinusC - y * sinA;
                res.c[2][1] = z * y * oneMinusC + x * sinA;
                res.c[2][2] = z * z * oneMinusC + cosA;
                return res;
            }
        }
        // 4x4 specific transformations for 3D usage
        static if (isSquare && R == 4 && isFloatingPoint!T)
        {
            /// Orthographic projection built from the six clipping planes.
            /// Returns: orthographic projection.
            @nogc static Matrix orthographic(T left, T right, T bottom, T top, T near, T far) pure nothrow
            {
                // Extent of the volume along each axis.
                T width = right - left;
                T height = top - bottom;
                T depth = far - near;
                // Translation terms stored in the last column.
                T offsetX = -(right + left) / width;
                T offsetY = -(top + bottom) / height;
                T offsetZ = -(far + near)   / depth;
                return Matrix(2 / width,    0,          0,      offsetX,
                                0,      2 / height,     0,      offsetY,
                                0,          0,     -2 / depth,  offsetZ,
                                0,          0,          0,         1);
            }
            /// Perspective projection from a field of view, an aspect ratio and
            /// the near/far plane distances.
            /// Returns: perspective projection.
            @nogc static Matrix perspective(T FOVInRadians, T aspect, T zNear, T zFar) pure nothrow
            {
                T focal = 1 / tan(FOVInRadians / 2);
                T invDepth = 1 / (zNear - zFar);
                return Matrix(focal / aspect, 0,                         0,                           0,
                                           0, focal,                     0,                           0,
                                           0, 0, (zFar + zNear) * invDepth, 2 * invDepth * zFar * zNear,
                                           0, 0,                        -1,                           0);
            }
            /// Builds a "lookAt" matrix from an eye position, a target point and an up vector.
            /// Returns: "lookAt" projection.
            /// Thanks to vuaru for corrections.
            @nogc static Matrix lookAt(vec3!T eye, vec3!T target, vec3!T up) pure nothrow
            {
                // Basis vectors; the first row of the result uses the negated xAxis.
                vec3!T zAxis = (eye - target).normalized();
                vec3!T xAxis = cross(-up, zAxis).normalized();
                vec3!T yAxis = cross(zAxis, -xAxis);
                return Matrix(-xAxis.x,    -xAxis.y,    -xAxis.z,      dot(xAxis, eye),
                               yAxis.x,     yAxis.y,     yAxis.z,     -dot(yAxis, eye),
                               zAxis.x,     zAxis.y,     zAxis.z,     -dot(zAxis, eye),
                               0,           0,           0,        1);
            }
        }
    }
    package
    {
        // Template parameters re-exported under package-visible names, so that
        // code holding only a matrix type `U` can read `U._T`, `U._R`, `U._C`.
        alias _T = T;
        enum _R = R;
        enum _C = C;
    }
    private
    {
        /// True when a `U` can be assigned to this `Matrix` type.
        enum bool isAssignable(U) = std.traits.isAssignable!(Matrix, U);

        /// True when `U` is assignable to this `Matrix` without being implicitly
        /// convertible to it already.
        enum bool isConvertible(U) = (!is(U : Matrix)) && isAssignable!U;

        /// True when a `U` can be assigned to the element type `T`.
        enum bool isTAssignable(U) = std.traits.isAssignable!(T, U);

        /// True when `U` implicitly converts to the row vector type.
        enum bool isRowConvertible(U) = is(U : row_t);

        /// True when `U` implicitly converts to the column vector type.
        enum bool isColumnConvertible(U) = is(U : column_t);
    }
    public
    {
        /// Construct an identity matrix: 1 where the row index equals the column
        /// index, 0 everywhere else.
        /// Returns: an identity matrix.
        /// Note: the identity matrix, while only meaningful for square matrices,
        /// is also defined for non-square ones.
        @nogc static Matrix identity() pure nothrow
        {
            Matrix ident = void; // every coefficient is written below
            foreach (row; 0 .. R)
                foreach (col; 0 .. C)
                    ident.c[row][col] = (row == col) ? 1 : 0;
            return ident;
        }
        /// Construct a constant matrix: every coefficient is `x` cast to `T`.
        /// Returns: a constant matrix.
        @nogc static Matrix constant(U)(U x) pure nothrow
        {
            Matrix filled = void; // every coefficient is written below
            foreach (idx; 0 .. R * C)
                filled.v[idx] = cast(T)x;
            return filled;
        }
    }
 }
 /// True when a value of type `U` can be passed to a function templated on
 /// `Matrix!(T, R, C)`, i.e. when `T`, `R` and `C` can be deduced from `U.init`.
 template isMatrixInstantiation(U)
 {
    // Never meant to run: it only exists so that the `is(typeof(...))` test
    // below can check whether the call with `U.init` compiles.
    private static void isMatrix(T, int R, int C)(Matrix!(T, R, C) x)
    {
    }
    enum bool isMatrixInstantiation = is(typeof(isMatrix(U.init)));
 }
 // GLSL is a big inspiration here:
 // we define types with more or less the same names.
 ///
 alias mat2x2(T) = Matrix!(T, 2, 2);
 ///
 alias mat3x3(T) = Matrix!(T, 3, 3);
 ///
 alias mat4x4(T) = Matrix!(T, 4, 4);
 // WARNING: in GLSL, the first number is _columns_ and the second is rows.
 // It is the opposite here: the first number is rows, the second is columns.
 // With this convention mat2x3 * mat3x4 -> mat2x4.
 ///
 alias mat2x3(T) = Matrix!(T, 2, 3);
 ///
 alias mat2x4(T) = Matrix!(T, 2, 4);
 ///
 alias mat3x2(T) = Matrix!(T, 3, 2);
 ///
 alias mat3x4(T) = Matrix!(T, 3, 4);
 ///
 alias mat4x2(T) = Matrix!(T, 4, 2);
 ///
 alias mat4x3(T) = Matrix!(T, 4, 3);
 // Shorter names for the most common (square) matrices.
 alias mat2 = mat2x2;///
 alias mat3 = mat3x3;///
 alias mat4 = mat4x4;///
 // Concrete float/double type names.
 // Most useful are probably mat4f and mat4d.
 alias mat2f = mat2!float;///
 alias mat2d = mat2!double;///
 alias mat3f = mat3!float;///
 alias mat3d = mat3!double;///
 alias mat4f = mat4!float;///
 alias mat4d = mat4!double;///
 alias mat2x2f = mat2x2!float;///
 alias mat2x2d = mat2x2!double;///
 alias mat3x3f = mat3x3!float;///
 alias mat3x3d = mat3x3!double;///
 alias mat4x4f = mat4x4!float;///
 alias mat4x4d = mat4x4!double;///
 // Construction, conversion, product and equality checks for small matrices.
 unittest
 {
    alias mat2i = mat2!int;
    alias mat2x3f = mat2x3!float;
    alias mat3x4f = mat3x4!float;
    alias mat2x4f = mat2x4!float;
    // Constructor arguments are given row by row.
    mat2i x = mat2i(0, 1,
                    2, 3);
    assert(x.c[0][0] == 0 && x.c[0][1] == 1 && x.c[1][0] == 2 && x.c[1][1] == 3);
    // The same two vectors interpreted as columns, then as rows.
    vec2i[2] cols = [vec2i(0, 2), vec2i(1, 3)];
    mat2i y = mat2i.fromColumns(cols[]);
    assert(y.c[0][0] == 0 && y.c[0][1] == 1 && y.c[1][0] == 2 && y.c[1][1] == 3);
    y = mat2i.fromRows(cols[]);
    assert(y.c[0][0] == 0 && y.c[1][0] == 1 && y.c[0][1] == 2 && y.c[1][1] == 3);
    // Transposing the row-built matrix gives back the column-built one.
    y = y.transposed();
    assert(x == y);
    // Assignment from an array literal.
    x = [0, 1, 2, 3];
    assert(x == y);
    // Matrix * matrix and matrix * vector.
    mat2i z = x * y;
    assert(z == mat2i([2, 3, 6, 11]));
    vec2i vz = z * vec2i(2, -1);
    assert(vz == vec2i(1, 1));
    // Conversions between element types, and += with a different element type.
    mat2f a = z;
    mat2d ad = a;
    ad += a;
    mat2f w = [4, 5, 6, 7];
    z = cast(mat2i)w;
    assert(w == z);
    {
        // Non-square product: 2x3 * 3x4 must type-check as 2x4.
        mat2x3f A;
        mat3x4f B;
        mat2x4f C = A * B;
    }
    assert(mat2i.diag(vec2i(1, 2)) == mat2i(1, 0,
                                            0, 2));
    // Construct with a single scalar
    auto D = mat4f(1.0f);
    assert(D.v[] == [1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1, ]);
    {
        // Construction from a static array of rows.
        double[4][3] starray = [
            [ 0,  1,  2,  3],
            [ 4,  5,  6,  7,],
            [ 8,  9, 10, 11,],
        ];
        // starray has the shape 3x4
        assert(starray.length == 3);
        assert(starray[0].length == 4);
        auto m = mat3x4!double(starray);
        assert(m.v[] == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, ]);
    }
    {
        // Construction from a dynamic array of rows.
        auto dynarray = [
            [ 0,  1,  2,  3],
            [ 4,  5,  6,  7,],
            [ 8,  9, 10, 11,],
        ];
        // dynarray has the shape 3x4
        assert(dynarray.length == 3);
        assert(dynarray[0].length == 4);
        auto m = mat3x4!double(dynarray);
        assert(m.v[] == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, ]);
    }
 }
 // Regression test for issue #206: `matrix *= scalar` must scale every coefficient
 // (matrix * scalar) instead of multiplying by a matrix built from the scalar.
 unittest
 {
    // In-place scalar multiplication.
    mat4f mvp = mat4f.identity;
    mvp *= 2;
    assert(mvp == mat4f(2, 0, 0, 0,
                        0, 2, 0, 0,
                        0, 0, 2, 0,
                        0, 0, 0, 2));
    // Binary scalar multiplication gives the same result.
    mvp = mat4f.identity * 2;
    assert(mvp == mat4f(2, 0, 0, 0,
                        0, 2, 0, 0,
                        0, 0, 2, 0,
                        0, 0, 0, 2));
    // Matrix product of two all-ones 4x4 matrices: every coefficient is 4.
    mvp = mat4f(1) * mat4f(1);
    assert(mvp == mat4f(4, 4, 4, 4,
                        4, 4, 4, 4,
                        4, 4, 4, 4,
                        4, 4, 4, 4));
    // Same product through the compound assignment operator.
    mvp = mat4f(1);
    mvp *= mat4f(1);
    assert(mvp == mat4f(4, 4, 4, 4,
                        4, 4, 4, 4,
                        4, 4, 4, 4,
                        4, 4, 4, 4));
 }
--- a/external/dplug/math/package.d
+++ b/external/dplug/math/package.d
@ -0,0 +1,12 @@
 /**
 * Math package: rectangles, vectors, matrices.
 *
 * Copyright: Copyright Guillaume Piolat 2021.
 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Note: this is part of the former gfm:math package, hence containing copyright from many GFM contributors.
 */
 module dplug.math;
 public import dplug.math.vector,
              dplug.math.box,
              dplug.math.matrix;
--- a/external/dplug/math/vector.d
+++ b/external/dplug/math/vector.d
@ -0,0 +1,823 @@
 /**
 * N-dimensional small vector math.
 *
 * Copyright: Copyright Guillaume Piolat 2021.
 *            Copyright Chance Snow 2021.
 *            Copyright Aleksandr Druzhinin 2018.
 *            Copyright Nathan Sashihara 2018.
 *            Copyright Ryan Roden-Corrent 2016.
 *            Copyright Steven Dwy 2015.
 *            Copyright Martin Nowak 2015.
 *            Copyright Tanel Tagaväli 2015.
 * 
 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 */
 module dplug.math.vector;
 import std.traits,
       std.math,
       std.array;
 import inteli.emmintrin;
 /**
 * Generic 1D small vector.
 * Params:
 *    N = number of elements
 *    T = type of elements
 */
 struct Vector(T, int N)
 {
 nothrow:
    public
    {
        static assert(N >= 1);
        // Fields definition.
        // The components are stored once and exposed two ways through a union:
        // as the array `v`, and as named members x/y/z/w (with colour-style
        // aliases r/g/b/a) for as many components as N provides.
        union
        {
            T[N] v;
            struct
            {
                static if (N >= 1)
                {
                    T x;
                    alias x r;
                }
                static if (N >= 2)
                {
                    T y;
                    alias y g;
                }
                static if (N >= 3)
                {
                    T z;
                    alias z b;
                }
                static if (N >= 4)
                {
                    T w;
                    alias w a;
                }
            }
        }
        /// Construct a Vector from a single value, or from a list of scalars
        /// and/or smaller vectors that together provide exactly N components.
        ///
        /// With one argument, the work is delegated to the matching `opAssign`
        /// overload. With several, scalars and vector components are copied into
        /// `v` in argument order; too many components is a compile-time error,
        /// too few is caught by a run-time `assert`.
        @nogc this(Args...)(Args args) pure nothrow
        {
            static if (args.length == 1)
            {
                // Construct a Vector from a single value.
                opAssign!(Args[0])(args[0]);
            }
            else
            {
                // validate the total argument count across scalars and vectors
                // (a vector argument counts for its `_N` components, anything else for 1)
                template argCount(T...) {
                    static if(T.length == 0)
                        enum argCount = 0; // done recursing
                    else static if(isVector!(T[0]))
                        enum argCount = T[0]._N + argCount!(T[1..$]);
                    else
                        enum argCount = 1 + argCount!(T[1..$]);
                }
                static assert(argCount!Args <= N, "Too many arguments in vector constructor");
                int index = 0; // next component of `v` to fill
                foreach(arg; args)
                {
                    static if (isAssignable!(T, typeof(arg)))
                    {
                        // scalar argument: fills one component
                        v[index] = arg;
                        index++; // has to be on its own line (DMD 2.068)
                    }
                    else static if (isVector!(typeof(arg)) && isAssignable!(T, arg._T))
                    {
                        // vector argument: fills `arg._N` consecutive components.
                        // NOTE(review): generateLoopCode is defined outside this view;
                        // presumably it repeats the statement with `@` replaced by each index.
                        mixin(generateLoopCode!("v[index + @] = arg[@];", arg._N)());
                        index += arg._N;
                    }
                    else
                        static assert(false, "Unrecognized argument in Vector constructor");
                }
                assert(index == N, "Bad arguments in Vector constructor");
            }
        }
        /// Hash of the vector: each component is hashed in turn, seeded with
        /// the hash accumulated so far.
        size_t toHash() const nothrow @safe
        {
            size_t seed = 0;
            foreach (i; 0 .. N)
                seed = v[i].hashOf(seed);
            return seed;
        }
        /// Assign a single value, assignable to `T`, to every component.
        /// Returns: this vector, by reference.
        @nogc ref Vector opAssign(U)(U x) pure nothrow if (isAssignable!(T, U))
        {
            // NOTE(review): generateLoopCode is defined outside this view; presumably
            // it repeats the statement for each index, with `@` replaced by the index.
            mixin(generateLoopCode!("v[@] = x;", N)()); // copy to each component
            return this;
        }
        /// Assign from a static array of exactly N elements assignable to `T`.
        /// The length is checked at compile time by the template constraint.
        /// Returns: this vector, by reference.
        @nogc ref Vector opAssign(U)(U arr) pure nothrow if ((isStaticArray!(U) && isAssignable!(T, typeof(arr[0])) && (arr.length == N)))
        {
            mixin(generateLoopCode!("v[@] = arr[@];", N)());
            return this;
        }
        /// Assign from a dynamic array whose elements are assignable to `T`.
        /// The length must be N; this is only verified by an `assert`, so the
        /// check disappears in builds where asserts are disabled.
        /// Returns: this vector, by reference.
        @nogc ref Vector opAssign(U)(U arr) pure nothrow if (isDynamicArray!(U) && isAssignable!(T, typeof(arr[0])))
        {
            assert(arr.length == N);
            mixin(generateLoopCode!("v[@] = arr[@];", N)());
            return this;
        }
        /// Assign from a vector of the same type (or implicitly convertible to it):
        /// the whole component array is copied.
        /// Returns: this vector, by reference.
        @nogc ref Vector opAssign(U)(U u) pure nothrow if (is(U : Vector))
        {
            this.v = u.v;
            return this;
        }
        /// Assign from another vector type with the same number of components
        /// and an element type assignable to `T`; components are converted one by one.
        /// Returns: this vector, by reference.
        @nogc ref Vector opAssign(U)(U x) pure nothrow if (isVector!U
                                                       && isAssignable!(T, U._T)
                                                       && (!is(U: Vector))
                                                       && (U._N == _N))
        {
            mixin(generateLoopCode!("v[@] = x.v[@];", N)());
            return this;
        }
        /// Returns: a pointer to the first component; its constness follows
        /// the constness of the vector (`inout`).
        @nogc inout(T)* ptr() pure inout nothrow @property
        {
            return &v[0];
        }
        /// Component-wise equality with a vector of the same type.
        /// Returns: `false` at the first differing component, `true` otherwise.
        @nogc bool opEquals(U)(U other) pure const nothrow
            if (is(U : Vector))
        {
            foreach (idx; 0 .. N)
                if (v[idx] != other.v[idx])
                    return false;
            return true;
        }
        /// Equality with a value convertible to this vector type: the operand is
        /// converted first, then compared component by component.
        @nogc bool opEquals(U)(U other) pure const nothrow
            if (isConvertible!U)
        {
            Vector asVector = other;
            return opEquals(asVector);
        }
        /// Unary `+`, `-`, `~` and `!` applied to every component.
        /// Returns: a new vector holding the results.
        @nogc Vector opUnary(string op)() pure const nothrow
            if (op == "+" || op == "-" || op == "~" || op == "!")
        {
            Vector res = void; // filled by the generated statements below
            mixin(generateLoopCode!("res.v[@] = " ~ op ~ " v[@];", N)());
            return res;
        }
        /// Compound assignment (`v op= operand`) with a vector of the same type,
        /// applied component by component. The operator itself is not restricted
        /// by the constraint; it only has to compile for `T`.
        /// Returns: this vector, by reference.
        @nogc ref Vector opOpAssign(string op, U)(U operand) pure nothrow
            if (is(U : Vector))
        {
            mixin(generateLoopCode!("v[@] " ~ op ~ "= operand.v[@];", N)());
            return this;
        }
        /// Compound assignment with a value convertible to this vector type:
        /// the operand is converted, then the same-type overload is applied.
        @nogc ref Vector opOpAssign(string op, U)(U operand) pure nothrow if (isConvertible!U)
        {
            Vector asVector = operand;
            return opOpAssign!op(asVector);
        }
        /// Component-wise binary operator `this op operand`.
        /// When the operand implicitly converts to `T` it is used directly as a
        /// scalar on every component; otherwise it is first converted to a `Vector`.
        /// Each result is cast back to `T`.
        @nogc Vector opBinary(string op, U)(U operand) pure const nothrow
            if (is(U: Vector) || (isConvertible!U))
        {
            Vector result = void; // filled by the generated statements below
            static if (is(U: T))
                mixin(generateLoopCode!("result.v[@] = cast(T)(v[@] " ~ op ~ " operand);", N)());
            else
            {
                Vector other = operand;
                mixin(generateLoopCode!("result.v[@] = cast(T)(v[@] " ~ op ~ " other.v[@]);", N)());
            }
            return result;
        }
        /// Component-wise binary operator with the vector on the right-hand side:
        /// `operand op this`. Scalars convertible to `T` are used directly;
        /// other operands are first converted to a `Vector`.
        /// Each result is cast back to `T`.
        @nogc Vector opBinaryRight(string op, U)(U operand) pure const nothrow if (isConvertible!U)
        {
            Vector result = void; // filled by the generated statements below
            static if (is(U: T))
                mixin(generateLoopCode!("result.v[@] = cast(T)(operand " ~ op ~ " v[@]);", N)());
            else
            {
                Vector other = operand;
                mixin(generateLoopCode!("result.v[@] = cast(T)(other.v[@] " ~ op ~ " v[@]);", N)());
            }
            return result;
        }
        /// `vec[i]`: reference to the i-th component (mutable vector).
        @nogc ref T opIndex(size_t i) pure nothrow
        {
            return v[i];
        }
        /// `vec[i]`: reference to the i-th component (const vector).
        @nogc ref const(T) opIndex(size_t i) pure const nothrow
        {
            return v[i];
        }
        /// `vec[i] = x`: stores `x` in the i-th component.
        /// Returns: the result of the assignment.
        @nogc T opIndexAssign(U : T)(U x, size_t i) pure nothrow
        {
            return v[i] = x;
        }
        /// Implements swizzling: reading `vec.<letters>` builds a new vector whose
        /// components are picked from this one, in the order given by the letters.
        ///
        /// Example:
        /// ---
        /// vec4i vi = [4, 1, 83, 10];
        /// assert(vi.zxxyw == [83, 4, 4, 1, 10]);
        /// ---
        @nogc @property auto opDispatch(string op, U = void)() pure const nothrow if (isValidSwizzle!(op))
        {
            Vector!(T, op.length) picked = void; // one component per swizzle letter
            enum sourceIndices = swizzleTuple!op;
            foreach(dst, src; sourceIndices)
                picked.v[dst] = v[src];
            return picked;
        }
        /// Support swizzling assignment like in shader languages: `vec.<letters> = x`
        /// writes the components of `x` into the components named by the letters.
        ///
        /// Example:
        /// ---
        /// vec3f v = [0, 1, 2];
        /// v.yz = v.zx;
        /// assert(v == [0, 2, 0]);
        /// ---
        @nogc @property void opDispatch(string op, U)(U x) pure
            if ((op.length >= 2)
                && (isValidSwizzleUnique!op)                   // v.xyy will be rejected
                && is(typeof(Vector!(T, op.length)(x)))) // can be converted to a small vector of the right size
        {
            // Convert first, so every source component is read before any write.
            Vector!(T, op.length) source = x;
            enum targetIndices = swizzleTuple!op;
            foreach(src, dst; targetIndices)
                v[dst] = source[src];
        }
        /// Casting to small vectors of the same size: every component is cast
        /// to the element type of the target vector.
        /// Example:
        /// ---
        /// vec4f vf;
        /// vec4d vd = cast(vec4d)vf;
        /// ---
        @nogc U opCast(U)() pure const nothrow if (isVector!U && (U._N == _N))
        {
            U res = void; // filled by the generated statements below
            mixin(generateLoopCode!("res.v[@] = cast(U._T)v[@];", N)());
            return res;
        }
        /// Implements `$` inside index and slice expressions on a vector.
        /// Returns: the number of components, N.
        @nogc int opDollar() pure const nothrow
        {
            return N;
        }
        /// `vec[]`: slice containing the vector values.
        /// Returns: a slice which covers the whole Vector.
        @nogc T[] opSlice() pure nothrow
        {
            return v[];
        }
        /// `vec[a..b]`: slice of the components from index `a` (included)
        /// to index `b` (excluded).
        @nogc T[] opSlice(int a, int b) pure nothrow
        {
            return v[a..b];
        }
        /// Squared Euclidean length of the Vector: the sum of the squares of
        /// its components, accumulated in a `T`.
        /// Returns: squared length.
        @nogc T squaredMagnitude() pure const nothrow
        {
            T sumSquares = 0;
            mixin(generateLoopCode!("sumSquares += v[@] * v[@];", N)());
            return sumSquares;
        }
        /// Squared Euclidean distance between this vector and another one,
        /// computed as the squared length of their difference.
        /// Returns: squared Euclidean distance.
        @nogc T squaredDistanceTo(Vector v) pure const nothrow
        {
            // Here `v` is the parameter, not the component array of this vector.
            auto difference = v - this;
            return difference.squaredMagnitude();
        }
        static if (isFloatingPoint!T)
        {
            /// Euclidean length of the vector: square root of the squared magnitude.
            /// Returns: Euclidean length
            @nogc T magnitude() pure const nothrow
            {
                const T squared = squaredMagnitude();
                return sqrt(squared);
            }
            /// Inverse Euclidean length of the vector: 1 divided by the square
            /// root of the squared magnitude.
            /// Returns: Inverse of Euclidean length.
            @nogc T inverseMagnitude() pure const nothrow
            {
                const T squared = squaredMagnitude();
                return 1 / sqrt(squared);
            }
            alias fastInverseLength = fastInverseMagnitude;
            /// Faster but less accurate inverse of Euclidean length.
            /// Returns: Inverse of Euclidean length.
            @nogc T fastInverseMagnitude() pure const nothrow
            {
                return inverseSqrt(squaredMagnitude());
            }
            /// Euclidean distance between this vector and another one
            /// Returns: Euclidean distance between this and other.
            @nogc T distanceTo(Vector other) pure const nothrow
            {
                return (other - this).magnitude();
            }
            /// In-place normalization.
            @nogc void normalize() pure nothrow
            {
                auto invMag = inverseMagnitude();
                mixin(generateLoopCode!("v[@] *= invMag;", N)());
            }
            /// Returns a normalized copy of this Vector
            /// Returns: Normalized vector.
            @nogc Vector normalized() pure const nothrow
            {
                Vector res = this;
                res.normalize();
                return res;
            }
            /// Faster but less accurate in-place normalization.
            @nogc void fastNormalize() pure nothrow
            {
                auto invLength = fastInverseMagnitude();
                mixin(generateLoopCode!("v[@] *= invLength;", N)());
            }
            /// Faster but less accurate vector normalization.
            /// Returns: Normalized vector.
            @nogc Vector fastNormalized() pure const nothrow
            {
                Vector res = this;
                res.fastNormalize();
                return res;
            }
            static if (N == 3)
            {
                /// Gets an orthogonal vector from a 3-dimensional vector.
                /// Doesn’t normalize the output.
                /// Authors: Sam Hocevar
                /// See_also: Source at $(WEB lolengine.net/blog/2013/09/21/picking-orthogonal-vector-combing-coconuts).
                @nogc Vector getOrthogonalVector() pure const nothrow
                {
                    return abs(x) > abs(z) ? Vector(-y, x, 0.0) : Vector(0.0, -z, y);
                }
            }
        }
    }
    private
    {
        // Expose the template parameters as members so that they can be read
        // back from a Vector type (`opCast` uses `U._N` and `U._T`).
        enum _N = N;
        alias T _T;
        // Defines types that can be converted to this, but are not the same type:
        // true when `T` does not implicitly convert to `Vector`, yet the
        // declaration `Vector v = x;` compiles for some `T x`.
        // Note: the template parameter `T` hides the struct's element type `T` here.
        template isConvertible(T)
        {
            enum bool isConvertible = (!is(T : Vector))
            && is(typeof(
                {
                    // Only type-checked through is(typeof(...)); the result is never used at run time.
                    T x;
                    Vector v = x;
                }()));
        }
        // A type is "foreign" when it neither implicitly converts to this
        // Vector type nor satisfies `isConvertible`.
        template isForeign(T)
        {
            enum bool isForeign = !(isConvertible!T || is(T : Vector));
        }
        // True when every letter of `op` names an existing component and all
        // letters belong to the same family (xyzw or rgba).
        // `lastSwizzleClass` carries the family of the previous letter during
        // the recursion; -1 means "no letter seen yet".
        template isValidSwizzle(string op, int lastSwizzleClass = -1)
        {
            static if (op.length != 0)
            {
                enum int headClass = swizzleClassify!(op[0]);
                enum bool headExists = swizzleIndex!(op[0]) != -1;
                enum bool familyMatches = (lastSwizzleClass == -1) || (headClass == lastSwizzleClass);
                enum bool restIsValid = isValidSwizzle!(op[1 .. $], headClass);
                enum bool isValidSwizzle = headExists && familyMatches && restIsValid;
            }
            else
            {
                // The empty swizzle terminates the recursion.
                enum bool isValidSwizzle = true;
            }
        }
        // `searchElement!(c, s).result` is true when character `c` occurs in `s`.
        template searchElement(char c, string s)
        {
            static if (s.length == 0)
            {
                enum bool result = false;
            }
            else
            {
                enum string tail = s[1..s.length];
                enum bool result = (s[0] == c) || searchElement!(c, tail).result;
            }
        }
        // `hasNoDuplicates!s.result` is true when no character occurs more than
        // once in `s`. Strings of length 0 or 1 trivially qualify; handling the
        // empty string in the base case avoids slicing `s[1..0]` at compile time.
        template hasNoDuplicates(string s)
        {
            static if (s.length <= 1)
            {
                enum bool result = true;
            }
            else
            {
                enum tail = s[1..s.length];
                // The head must not reappear later, and the tail must itself be duplicate-free.
                enum bool result = !(searchElement!(s[0], tail).result) && hasNoDuplicates!(tail).result;
            }
        }
        // True if `op` is a valid swizzle (see `isValidSwizzle`) in which each
        // letter appears at most once.
        template isValidSwizzleUnique(string op)
        {
            static if (isValidSwizzle!op)
                enum isValidSwizzleUnique = hasNoDuplicates!op.result;
            else
                enum bool isValidSwizzleUnique = false;
        }
        // Maps a swizzle letter to the component index it designates:
        // x/r -> 0, y/g -> 1, z/b -> 2, w/a -> 3.
        // Yields -1 when the letter is unknown or designates a component that
        // does not exist for this vector's dimension `N`.
        template swizzleIndex(char c)
        {
            static if((c == 'x' || c == 'r') && N >= 1)
                enum swizzleIndex = 0;
            else static if((c == 'y' || c == 'g') && N >= 2)
                enum swizzleIndex = 1;
            else static if((c == 'z' || c == 'b') && N >= 3)
                enum swizzleIndex = 2;
            else static if ((c == 'w' || c == 'a') && N >= 4)
                enum swizzleIndex = 3;
            else
                enum swizzleIndex = -1;
        }
        // Identifies the letter family of a swizzle character:
        // 0 for the positional set (x, y, z, w), 1 for the color set
        // (r, g, b, a) and -1 for anything else.
        template swizzleClassify(char c)
        {
            enum bool isPositional = (c == 'x') || (c == 'y') || (c == 'z') || (c == 'w');
            enum bool isColor = (c == 'r') || (c == 'g') || (c == 'b') || (c == 'a');
            enum swizzleClassify = isPositional ? 0 : (isColor ? 1 : -1);
        }
        // Builds, at compile time, the array of component indices designated by
        // the swizzle string `op` (one `swizzleIndex` per letter, in order).
        template swizzleTuple(string op)
        {
            // NOTE(review): `opLength` is not referenced anywhere inside this template.
            enum opLength = op.length;
            static if (op.length == 0)
                enum swizzleTuple = [];
            else
                // Index of the first letter, followed by the indices of the remaining letters.
                enum swizzleTuple = [ swizzleIndex!(op[0]) ] ~ swizzleTuple!(op[1..op.length]);
        }
    }
 }
 /// True if `T` is some kind of `Vector`, i.e. `T` matches (or implicitly
 /// converts to) an instantiation `Vector!U` for some template arguments `U`.
 enum isVector(T) = is(T : Vector!U, U...);
 ///
 unittest
 {
    static assert(isVector!vec2f);
    static assert(isVector!vec3d);
    static assert(isVector!(vec4!real));
    static assert(!isVector!float); // a scalar is not a Vector
 }
 /// Get the numeric type used to measure a vector's coordinates, i.e. the
 /// first template argument of the `Vector` instantiation.
 alias DimensionType(T : Vector!U, U...) = U[0];
 ///
 unittest
 {
    static assert(is(DimensionType!vec2f == float));
    static assert(is(DimensionType!vec3d == double));
 }
 /// 2-component vector with element type `T`.
 alias vec2(T) = Vector!(T, 2);
 /// 3-component vector with element type `T`.
 alias vec3(T) = Vector!(T, 3);
 /// 4-component vector with element type `T`.
 alias vec4(T) = Vector!(T, 4);
 alias vec2i = vec2!int;     ///
 alias vec2f = vec2!float;   ///
 alias vec2d = vec2!double;  ///
 alias vec3i = vec3!int;     ///
 alias vec3f = vec3!float;   ///
 alias vec3d = vec3!double;  ///
 alias vec4i = vec4!int;     ///
 alias vec4f = vec4!float;   ///
 alias vec4d = vec4!double;  ///
 /// Element-wise minimum.
 /// Returns: a vector whose i-th component is `min(a.v[i], b.v[i])`.
@nogc Vector!(T, N) minByElem(T, int N)(const Vector!(T, N) a, const Vector!(T, N) b) pure nothrow
 {
    import std.algorithm: min;
    Vector!(T, N) res = void; // every component is written by the unrolled statements below
    mixin(generateLoopCode!("res.v[@] = min(a.v[@], b.v[@]);", N)());
    return res;
 }
 /// Element-wise maximum.
 /// Returns: a vector whose i-th component is `max(a.v[i], b.v[i])`.
@nogc Vector!(T, N) maxByElem(T, int N)(const Vector!(T, N) a, const Vector!(T, N) b) pure nothrow
 {
    import std.algorithm: max;
    Vector!(T, N) res = void; // every component is written by the unrolled statements below
    mixin(generateLoopCode!("res.v[@] = max(a.v[@], b.v[@]);", N)());
    return res;
 }
 /// Element-wise absolute value.
 /// Returns: a vector whose i-th component is `abs(a.v[i])`.
@nogc Vector!(T, N) absByElem(T, int N)(const Vector!(T, N) a) pure nothrow
 {
    Vector!(T, N) res = void; // every component is written by the unrolled statements below
    mixin(generateLoopCode!("res.v[@] = abs(a.v[@]);", N)());
    return res;
 }
 /// Dot product of two vectors: the sum over all components of `a.v[i] * b.v[i]`.
 /// The accumulation is done in the element type `T`.
 /// Returns: Dot product.
@nogc T dot(T, int N)(const Vector!(T, N) a, const Vector!(T, N) b) pure nothrow
 {
    T sum = 0;
    // Unrolled at compile time into one multiply-accumulate statement per component.
    mixin(generateLoopCode!("sum += a.v[@] * b.v[@];", N)());
    return sum;
 }
 /// Cross product `a × b` of two 3-dimensional vectors.
 /// Returns: 3D cross product.
 /// Thanks to vuaru for corrections.
@nogc Vector!(T, 3) cross(T)(const Vector!(T, 3) a, const Vector!(T, 3) b) pure nothrow
 {
    alias Vec3 = Vector!(T, 3);
    auto cx = a.y * b.z - a.z * b.y;
    auto cy = a.z * b.x - a.x * b.z;
    auto cz = a.x * b.y - a.y * b.x;
    return Vec3(cx, cy, cz);
 }
 /// N-dimensional reflect, like the GLSL function: computes `a - 2 * dot(b, a) * b`.
 /// Params:
 ///     a = the incident vector.
 ///     b = the normal to reflect about; it must be normalized for the result
 ///         to be a true reflection (see the unit test below).
 /// Returns: a reflected by normal b.
@nogc Vector!(T, N) reflect(T, int N)(const Vector!(T, N) a, const Vector!(T, N) b) pure nothrow
 {
    return a - (2 * dot(b, a)) * b;
 }
 ///
@nogc unittest
 {
    // reflect a 2D vector across the x axis (the normal points along the y axis)
    assert(vec2f(1,1).reflect(vec2f(0,1)) == vec2f(1,-1));
    // the sign of the normal does not matter
    assert(vec2f(1,1).reflect(vec2f(0,-1)) == vec2f(1,-1));
    // note that the normal must be, well, normalized:
    assert(vec2f(1,1).reflect(vec2f(0,20)) != vec2f(1,-1));
    // think of this like a ball hitting a flat floor at an angle.
    // the x and y components remain unchanged, and the z inverts
    assert(vec3f(2,3,-0.5).reflect(vec3f(0,0,1)) == vec3f(2,3,0.5));
 }
 /// Angle between two vectors.
 /// Both inputs are normalized first; the angle is then derived from the
 /// length of the difference (or of the sum, when the dot product is negative)
 /// of the two unit vectors through `asin`, rather than from `acos` of the
 /// dot product.
 /// Returns: angle between vectors.
 /// See_also: "The Right Way to Calculate Stuff" at $(WEB www.plunk.org/~hatch/rightway.php)
@nogc T angleBetween(T, int N)(const Vector!(T, N) a, const Vector!(T, N) b) pure nothrow
 {
    auto aN = a.normalized();
    auto bN = b.normalized();
    auto dp = dot(aN, bN);
    if (dp < 0)
        // Negative dot product: measure against the opposite of aN and subtract from PI.
        return T(PI) - 2 * asin((-bN-aN).magnitude / 2);
    else
        return 2 * asin((bN-aN).magnitude / 2);
 }
 // Layout checks: for these instantiations a Vector occupies exactly
 // N * T.sizeof bytes.
 static assert(vec2f.sizeof == 8);
 static assert(vec3d.sizeof == 24);
 static assert(vec4i.sizeof == 16);
 // General Vector test: swizzle validation, construction, conversion,
 // comparison, hashing, element access, swizzling, arithmetic and casts.
 unittest
 {
    // swizzle validation: letters must exist for the dimension and not mix xyzw with rgba
    static assert(vec2i.isValidSwizzle!"xyx");
    static assert(!vec2i.isValidSwizzle!"xyz");
    static assert(vec4i.isValidSwizzle!"brra");
    static assert(!vec4i.isValidSwizzle!"rgyz");
    static assert(vec2i.isValidSwizzleUnique!"xy");
    static assert(vec2i.isValidSwizzleUnique!"yx");
    static assert(!vec2i.isValidSwizzleUnique!"xx");
    alias vec2l = vec2!long;
    alias vec3ui = vec3!uint;
    alias vec4ub = vec4!ubyte;
    assert(vec2l(0, 1) == vec2i(0, 1));
    // construction from static arrays, slices, component lists and a single scalar
    int[2] arr = [0, 1];
    int[] arr2 = new int[2];
    arr2[] = arr[];
    vec2i a = vec2i([0, 1]);
    vec2i a2 = vec2i(0, 1);
    immutable vec2i b = vec2i(0);
    assert(b[0] == 0 && b[1] == 0);
    vec2i c = arr;
    vec2l d = arr2;
    assert(a == a2);
    assert(a == c);
    assert(vec2l(a) == vec2l(a));
    assert(vec2l(a) == d);
    // a Vector can be used as an associative array key
    int[vec2i] hashMap;
    hashMap[a] = (c - a).squaredMagnitude;
    assert(hashMap[a] == (c - a).squaredMagnitude);
    // element access and element-wise mutation
    vec4i x = [4, 5, 6, 7];
    assert(x == x);
    --x[0];
    assert(x[0] == 3);
    ++x[0];
    assert(x[0] == 4);
    x[1] &= 1;
    x[2] = 77 + x[2];
    x[3] += 3;
    assert(x == [4, 1, 83, 10]);
    // swizzles may repeat letters and may be longer than the source vector
    assert(x.xxywz == [4, 4, 1, 10, 83]);
    assert(x.xxxxxxx == [4, 4, 4, 4, 4, 4, 4]);
    assert(x.abgr == [10, 83, 1, 4]);
    assert(a != b);
    x = vec4i(x.xyz, 166);
    assert(x == [4, 1, 83, 166]);
    // conversions between element types
    vec2l e = a;
    vec2l f = a + b;
    assert(f == vec2l(a));
    vec3ui g = vec3i(78,9,4);
    g ^= vec3i(78,9,4);
    assert(g == vec3ui(0));
    //g[0..2] = 1u;
    //assert(g == [2, 1, 0]);
    // arithmetic with scalars on either side
    assert(vec2i(4, 5) + 1 == vec2i(5,6));
    assert(vec2i(4, 5) - 1 == vec2i(3,4));
    assert(1 + vec2i(4, 5) == vec2i(5,6));
    assert(vec3f(1,1,1) * 0 == 0);
    assert(1.0 * vec3d(4,5,6) == vec3f(4,5.0f,6.0));
    auto dx = vec2i(1,2);
    auto dy = vec2i(4,5);
    auto dp = dot(dx, dy);
    assert(dp == 14 );
    // casting from floating point to integer components
    vec3i h = cast(vec3i)(vec3d(0.5, 1.1, -2.2));
    assert(h == [0, 1, -2]);
    assert(h[] == [0, 1, -2]);
    assert(h[1..3] == [1, -2]);
    assert(h.zyx == [-2, 1, 0]);
    h.yx = vec2i(5, 2); // swizzle assignment
    assert(h.xy == [2, 5]);
    assert(-h[1] == -5);
    assert(++h[0] == 3);
    //assert(h == [-2, 1, 0]);
    // assigning through a swizzle with a repeated letter must not compile
    assert(!__traits(compiles, h.xx = h.yy));
    vec4ub j;
    // larger vectors
    alias Vector!(float, 5) vec5f;
    vec5f l = vec5f(1, 2.0f, 3.0, 4u, 5.0L);
    l = vec5f(l.xyz, vec2i(1, 2));
    // the ctor should not compile if given too many arguments
    static assert(!is(typeof(vec2f(1, 2, 3))));
    static assert(!is(typeof(vec2f(vec2f(1, 2), 3))));
    static assert( is(typeof(vec3f(vec2f(1, 2), 3))));
    static assert( is(typeof(vec3f(1, 2, 3))));
    assert(absByElem(vec3i(-1, 0, 2)) == vec3i(1, 0, 2));
 }
 private:
 /// Reciprocal square root.
 /// For `float` the value goes through the `_mm_rsqrt_ss` intrinsic (an
 /// approximation); every other floating-point type uses `1 / sqrt(x)`.
@nogc T inverseSqrt(T)(T x) pure nothrow if (isFloatingPoint!T)
 {
    static if (!is(T == float))
    {
        return 1 / sqrt(x);
    }
    else
    {
        // scalar -> SSE register -> approximate rsqrt -> back to scalar
        return _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(x)));
    }
 }
 package
 {
    // Compile-time generator of small unrolled loops for Vector, Matrix, and Box.
    // The result is `formatString` repeated N times, where in the i-th copy
    // every '@' is replaced by the decimal text of i (i = 0 .. N-1).
    // Manually unrolled code of this sort has proven to work best on both DMD and LDC.
    static string generateLoopCode(string formatString, int N)() pure nothrow
    {
        string code;
        foreach (iteration; 0 .. N)
        {
            immutable string indexText = ctIntToString(iteration);
            // Start of the pending run of literal text not yet copied to `code`.
            size_t chunkStart = 0;
            foreach (pos, char ch; formatString)
            {
                if (ch != '@')
                    continue;
                code ~= formatString[chunkStart .. pos];
                code ~= indexText;
                chunkStart = pos + 1;
            }
            // Literal text after the last '@' (or the whole string if there was none).
            code ~= formatString[chunkStart .. $];
        }
        return code;
    }
    // Decimal text of a non-negative integer; a CTFE-friendly replacement for
    // std.conv. Negative values are not supported.
    static string ctIntToString(int n) pure nothrow
    {
        static immutable string[16] table = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"];
        if (n < 10)
            return table[n];

        // An int has at most 10 decimal digits; fill the buffer from the right.
        char[10] digits;
        size_t first = digits.length;
        do
        {
            --first;
            digits[first] = cast(char)('0' + n % 10);
            n /= 10;
        }
        while (n != 0 && first > 0);
        return digits[first .. $].idup;
    }
 }
 // ctIntToString: a multi-digit value and the largest `int` (10 digits,
 // which fills the conversion buffer completely).
 unittest
 {
    assert(ctIntToString(132) == "132");
    assert(ctIntToString(2147483647) == "2147483647");
 }
--- a/external/inteli/avx2intrin.d
+++ b/external/inteli/avx2intrin.d
--- a/external/inteli/avxintrin.d
+++ b/external/inteli/avxintrin.d
--- a/external/inteli/bmi2intrin.d
+++ b/external/inteli/bmi2intrin.d
@ -0,0 +1,363 @@
 /**
 * BMI2 intrinsics.
 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=BMI2
 *
 * Copyright: Copyright Johan Engelen 2021.
 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 */
 module inteli.bmi2intrin;
 import inteli.internals;
 nothrow @nogc pure @safe:
 /// Zero the high bits of unsigned 32-bit integer `a`, starting at bit position `index`;
 /// the bits below `index` are copied to the result unchanged.
 uint _bzhi_u32 (uint a, uint index)
 {
    static if (GDC_or_LDC_with_BMI2)
    {
        // The builtin cannot be evaluated at compile time, so CTFE falls
        // through to the portable helper below.
        if (!__ctfe)
            return __builtin_ia32_bzhi_si(a, index);
    }
    return bzhi!uint(a, index);
 }
 unittest
 {
    static assert (_bzhi_u32(0x1234_5678, 5) == 0x18);
           assert (_bzhi_u32(0x1234_5678, 5) == 0x18);
    static assert (_bzhi_u32(0x1234_5678, 10) == 0x278);
           assert (_bzhi_u32(0x1234_5678, 10) == 0x278);
    static assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678);
           assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678);
 }
 /// Zero the high bits of unsigned 64-bit integer `a`, starting at bit position `index`;
 /// the bits below `index` are copied to the result unchanged.
 ulong _bzhi_u64 (ulong a, uint index)
 {
    static if (GDC_or_LDC_with_BMI2)
    {
        version(X86_64)
        {
            // The 64-bit form of the instruction is not available in 32-bit x86,
            // and the builtin cannot be evaluated during CTFE.
            if (!__ctfe)
                return __builtin_ia32_bzhi_di(a, index);
        }
    }
    return bzhi!ulong(a, index);
 }
 unittest
 {
    static assert (_bzhi_u64(0x1234_5678, 5) == 0x18);
           assert (_bzhi_u64(0x1234_5678, 5) == 0x18);
    static assert (_bzhi_u64(0x1234_5678, 10) == 0x278);
           assert (_bzhi_u64(0x1234_5678, 10) == 0x278);
    static assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678);
           assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678);
    static assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
           assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
 }
 // Helper function for BZHI: portable implementation used when the builtin
 // is unavailable and during CTFE.
 // Only the low 8 bits of `index` are significant, exactly as in the
 // instruction's pseudo-code below. Without that truncation an index such as
 // 0x105 would leave `a` untouched here while the hardware clears from bit 5.
 // Params:
 //     a     = source value.
 //     index = bit position from which the high bits are cleared (taken modulo 256).
 // Returns: `a` with bits [MSB:n] cleared when n < number of bits in T,
 //          otherwise `a` unchanged, where n = index[7:0].
 private T bzhi(T)(T a, uint index)
 {
    /+
        n := index[7:0]
        dst := a
        IF (n < number of bits)
            dst[MSB:n] := 0
        FI
    +/
    enum numbits = T.sizeof*8;
    const uint n = index & 0xFF;
    T dst = a;
    if (n < numbits)
    {
        // n < numbits guarantees the shift count is in range.
        T mask = (T(1) << n) - 1;
        dst &= mask;
    }
    return dst;
 }
 unittest
 {
    // Bits of `index` above the low byte are ignored.
    static assert (bzhi!uint(0x1234_5678, 0x105) == 0x18);
           assert (bzhi!uint(0x1234_5678, 0x105) == 0x18);
    static assert (bzhi!uint(0x1234_5678, 0x100) == 0);
    // A low byte >= number of bits leaves the value unchanged.
    static assert (bzhi!uint(0x1234_5678, 32) == 0x1234_5678);
 }
 /// Full 32x32 -> 64-bit unsigned multiplication of `a` and `b`.
 /// The low 32 bits of the product are returned and the high 32 bits are written to `*hi`.
 /// Note: the MULX instruction does not read or write arithmetic flags; this implementation
 ///       may set them, but those semantics don't exist at the level of intrinsics.
 uint _mulx_u32 (uint a, uint b, uint* hi)
 {
    // Note: neither LDC (even with LLVM IR) nor GDC can be made to emit an
    // actual mulx for this, so a plain widening multiply is used.
    const ulong product = cast(ulong) a * b;
    const uint upperHalf = cast(uint) (product >>> 32);
    const uint lowerHalf = cast(uint) product;
    *hi = upperHalf;
    return lowerHalf;
 }
@system unittest
 {
    uint hi;
    assert (_mulx_u32(0x1234_5678, 0x1234_5678, &hi) == 0x1DF4_D840);
    assert (hi == 0x014B_66DC);
 }
 /// Multiply unsigned 64-bit integers `a` and `b`, store the low 64-bits of the result in dst, and
 /// store the high 64-bits in `hi`. This does not read or write arithmetic flags.
 /// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say.
 ///       But, those particular semantics don't exist at the level of intrinsics.
 /// Params:
 ///     a  = first factor.
 ///     b  = second factor.
 ///     hi = receives bits [127:64] of the 128-bit product.
 /// Returns: bits [63:0] of the 128-bit product.
 ulong _mulx_u64 (ulong a, ulong b, ulong* hi)
 {
    /+
        dst[63:0] := (a * b)[63:0]
        MEM[hi+63:hi]  := (a * b)[127:64]
    +/
    // Select the inline-IR path only for LDC with optimizations and a
    // front-end version of at least 2.094.
    static if (LDC_with_optimizations)
    {
        static if (__VERSION__ >= 2094)
            enum bool withLDCIR = true;
        else
            enum bool withLDCIR = false;
    }
    else
    {
        enum bool withLDCIR = false;
    }
    static if (withLDCIR)
    {
        // LDC x86: Generates mulx from -O0
        // The IR widens both operands to i128, multiplies, stores the upper
        // 64 bits through the pointer argument and returns the lower 64 bits.
        // NOTE(review): the IR uses the typed-pointer spelling `i64*`; confirm
        // it is accepted by the LLVM version behind the LDC release in use.
        enum ir = `
            %4 = zext i64 %0 to i128
            %5 = zext i64 %1 to i128
            %6 = mul nuw i128 %5, %4
            %7 = lshr i128 %6, 64
            %8 = trunc i128 %7 to i64
            store i64 %8, i64* %2, align 8
            %9 = trunc i128 %6 to i64
            ret i64 %9`;
        return LDCInlineIR!(ir, ulong, ulong, ulong, ulong*)(a, b, hi);
    }
    else
    {
        /+ Straight-forward implementation with `ucent`:
        ucent result = cast(ucent) a * b;
        *hi = cast(ulong) ((result >>> 64) & 0xFFFF_FFFF_FFFF_FFFF);
        return cast(ulong) (result & 0xFFFF_FFFF_FFFF_FFFF);
        +/
        /+
            Implementation using 64bit math is more complex...
            a * b = ((a_high << 32) + a_low) * ((b_high << 32) + b_low)
                  = (a_high << 32)*(b_high << 32) + (a_high << 32)*b_low + a_low*(b_high << 32) + a_low*b_low
                  = ((a_high*b_high) << 64) + ((a_high*b_low) << 32) + ((a_low*b_high) << 32) + a_low*b_low
                  = (c2 << 64) + (c11 << 32) + (c12 << 32) + c0
                  = (z1 << 64) + z0
        // The sums may overflow, so we need to carry the carry (from low 64bits to high 64bits). We can do that
        // by separately creating the sum to get the high 32 bits of z0 using 64bit math. The high 32 bits of that
        // intermediate result is then the 'carry' that we need to add when calculating z1's sum.
            common = (c0 >> 32) + (c11 & 0xFFFF_FFFF) + (c12 & 0xFFFF_FFFF)
            z0 = (c0 & 0xFFFF_FFFF) + (common << 32)
        The carry part from z0's sum = common >> 32
            z1 = c2 + (c11 >> 32) + (c12 >> 32) + (common >> 32)
        +/
        // Split both operands into 32-bit halves.
        const ulong a_low = a & 0xFFFF_FFFF;
        const ulong a_high = a >>> 32;
        const ulong b_low = b & 0xFFFF_FFFF;
        const ulong b_high = b >>> 32;
        // Four 32x32 -> 64-bit partial products.
        const ulong c2 = a_high*b_high;
        const ulong c11 = a_high*b_low;
        const ulong c12 = a_low*b_high;
        const ulong c0 = a_low*b_low;
        // Sum of three values that are each below 2^32, so it cannot overflow 64 bits.
        const ulong common_term = (c0 >> 32) + (c11 & 0xFFFF_FFFF) + (c12 & 0xFFFF_FFFF);
        const ulong z0 = (c0 & 0xFFFF_FFFF) + (common_term << 32);
        const ulong z1 = c2 + (c11 >> 32) + (c12 >> 32) + (common_term >> 32);
        *hi = z1;
        return z0;
    }
 }
@system unittest
 {
    ulong hi;
    // 0x1234_5678_9ABC_DEF0 * 0x1234_5678_9ABC_DEF0 == 0x14b_66dc_33f6_acdc_a5e2_0890_f2a5_2100
    assert (_mulx_u64(0x1234_5678_9ABC_DEF0, 0x1234_5678_9ABC_DEF0, &hi) == 0xa5e2_0890_f2a5_2100);
    assert (hi == 0x14b_66dc_33f6_acdc);
 }
 /// Scatter the contiguous low bits of unsigned 32-bit integer `a` to the bit positions
 /// that are set in `mask`; all other bits of the result are zero.
 uint _pdep_u32 (uint a, uint mask)
 {
    static if (GDC_or_LDC_with_BMI2)
    {
        // The builtin cannot be evaluated at compile time, so CTFE falls
        // through to the portable helper below.
        if (!__ctfe)
            return __builtin_ia32_pdep_si(a, mask);
    }
    return pdep!uint(a, mask);
 }
 unittest
 {
    static assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708);
           assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708);
 }
 /// Scatter the contiguous low bits of unsigned 64-bit integer `a` to the bit positions
 /// that are set in `mask`; all other bits of the result are zero.
 ulong _pdep_u64 (ulong a, ulong mask)
 {
    static if (GDC_or_LDC_with_BMI2)
    {
        version(X86_64)
        {
            // The 64-bit form of the instruction is not available in 32-bit x86,
            // and the builtin cannot be evaluated during CTFE.
            if (!__ctfe)
                return __builtin_ia32_pdep_di(a, mask);
        }
    }
    return pdep!ulong(a, mask);
 }
 unittest
 {
    static assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201);
           assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201);
 }
 // Helper function for PDEP: portable bit-by-bit implementation.
 private T pdep(T)(T a, T mask)
 {
    /+
        tmp := a
        dst := 0
        m := 0
        k := 0
        DO WHILE m < number of bits in T
            IF mask[m] == 1
                dst[m] := tmp[k]
                k := k + 1
            FI
            m := m + 1
        OD
    +/
    T dst = 0;
    T sourceBit = 1; // next not-yet-consumed low bit of `a`
    // `probe` walks a single set bit from the LSB to the MSB and becomes 0
    // once it is shifted out, after exactly T.sizeof*8 iterations.
    for (T probe = 1; probe != 0; probe <<= 1)
    {
        if ((mask & probe) == 0)
            continue;
        if (a & sourceBit)
            dst |= probe;
        sourceBit <<= 1;
    }
    return dst;
 }
 /// Gather the bits of unsigned 32-bit integer `a` found at the positions set in `mask`
 /// and pack them into the contiguous low bits of the result; the remaining upper bits are zero.
 uint _pext_u32 (uint a, uint mask)
 {
    static if (GDC_or_LDC_with_BMI2)
    {
        // The builtin cannot be evaluated at compile time, so CTFE falls
        // through to the portable helper below.
        if (!__ctfe)
            return __builtin_ia32_pext_si(a, mask);
    }
    return pext!uint(a, mask);
 }
 unittest
 {
    static assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468);
           assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468);
 }
 /// Gather the bits of unsigned 64-bit integer `a` found at the positions set in `mask`
 /// and pack them into the contiguous low bits of the result; the remaining upper bits are zero.
 ulong _pext_u64 (ulong a, ulong mask)
 {
    static if (GDC_or_LDC_with_BMI2)
    {
        version(X86_64)
        {
            // The 64-bit form of the instruction is not available in 32-bit x86,
            // and the builtin cannot be evaluated during CTFE.
            if (!__ctfe)
                return __builtin_ia32_pext_di(a, mask);
        }
    }
    return pext!ulong(a, mask);
 }
 unittest
 {
    static assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531);
           assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531);
 }
 // Helper function for PEXT: portable bit-by-bit implementation.
 private T pext(T)(T a, T mask)
 {
    /+
        tmp := a
        dst := 0
        m := 0
        k := 0
        DO WHILE m < number of bits in T
            IF mask[m] == 1
                dst[k] := tmp[m]
                k := k + 1
            FI
            m := m + 1
        OD
    +/
    T dst = 0;
    T targetBit = 1; // next free low bit of the result
    // `probe` walks a single set bit from the LSB to the MSB and becomes 0
    // once it is shifted out, after exactly T.sizeof*8 iterations.
    for (T probe = 1; probe != 0; probe <<= 1)
    {
        if ((mask & probe) == 0)
            continue;
        if (a & probe)
            dst |= targetBit;
        targetBit <<= 1;
    }
    return dst;
 }
--- a/external/inteli/emmintrin.d
+++ b/external/inteli/emmintrin.d
--- a/external/inteli/internals.d
+++ b/external/inteli/internals.d
--- a/external/inteli/math.d
+++ b/external/inteli/math.d
@ -0,0 +1,350 @@
 /**
 * Transcendental bonus functions.
 *
 * Copyright: Copyright Guillaume Piolat 2016-2020.
 *            Copyright (C) 2007  Julien Pommier
 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 */
 module inteli.math;
 /* Copyright (C) 2007  Julien Pommier
  This software is provided 'as-is', without any express or implied
  warranty.  In no event will the authors be held liable for any damages
  arising from the use of this software.
  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:
  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.
  (this is the zlib license)
 */
 import inteli.emmintrin;
 import inteli.internals;
 nothrow @nogc:
 /// Natural `log` of a single 32-bit float, computed through `_mm_log_ps`.
 /// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50
 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
 // #BONUS
 float _mm_log_ss(float v) pure @safe
 {
    // Broadcast the scalar to all lanes, evaluate the vector version, keep lane 0.
    __m128 broadcast = _mm_set1_ps(v);
    __m128 lanes = _mm_log_ps(broadcast);
    return lanes.array[0];
 }
 /// Natural logarithm computed for 4 simultaneous float.
 /// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50
 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
 /// Lanes where `x <= 0` are OR-ed with the comparison mask at the end, which the code
 /// relies on to produce NaN.
 // #BONUS
 __m128 _mm_log_ps(__m128 x) pure @safe
 {
    // Lane-replicated constants. The mask clears the 8 exponent bits (0x7f800000) and
    // keeps the sign and mantissa bits.
    static immutable __m128i _psi_inv_mant_mask = [~0x7f800000, ~0x7f800000, ~0x7f800000, ~0x7f800000];
    static immutable __m128 _ps_cephes_SQRTHF = [0.707106781186547524, 0.707106781186547524, 0.707106781186547524, 0.707106781186547524];
    // Polynomial coefficients p0..p8, evaluated below with Horner's scheme.
    static immutable __m128 _ps_cephes_log_p0 = [7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2];
    static immutable __m128 _ps_cephes_log_p1 = [- 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1];
    static immutable __m128 _ps_cephes_log_p2 = [1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1];
    static immutable __m128 _ps_cephes_log_p3 = [- 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1];
    static immutable __m128 _ps_cephes_log_p4 = [+ 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1];
    static immutable __m128 _ps_cephes_log_p5 = [- 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1];
    static immutable __m128 _ps_cephes_log_p6 = [+ 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1];
    static immutable __m128 _ps_cephes_log_p7 = [- 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1];
    static immutable __m128 _ps_cephes_log_p8 = [+ 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1];
    // q1 + q2 add up to approximately ln(2) (0.693359375 - 2.12194440e-4); the exponent
    // contribution e*ln(2) is added in two parts below.
    static immutable __m128 _ps_cephes_log_q1 = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4];
    static immutable __m128 _ps_cephes_log_q2 = [0.693359375, 0.693359375, 0.693359375, 0.693359375];
    /* the smallest non denormalized float number */
    static immutable __m128i _psi_min_norm_pos  = [0x00800000,   0x00800000,   0x00800000, 0x00800000];
    __m128i emm0;
    __m128 one = _ps_1;
    // Remember which lanes had x <= 0; they are forced to the mask value at the very end.
    __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
    x = _mm_max_ps(x, cast(__m128)_psi_min_norm_pos);  /* cut off denormalized stuff */
    // Shift the raw bits right by 23 to bring the exponent field down to the low bits.
    emm0 = _mm_srli_epi32(cast(__m128i)x, 23);
    /* keep only the fractional part */
    x = _mm_and_ps(x, cast(__m128)_psi_inv_mant_mask);
    // OR-ing in the bit pattern of 0.5 gives the mantissa the exponent of 0.5.
    x = _mm_or_ps(x, _ps_0p5);
    // Remove the exponent bias (0x7f) and convert the exponent to float.
    emm0 = _mm_sub_epi32(emm0, _pi32_0x7f);
    __m128 e = _mm_cvtepi32_ps(emm0);
    e += one;
    // Per lane: if x < SQRTHF then { e -= 1; x = x + x - 1; } else { x = x - 1; }
    // done branch-free with the comparison mask.
    __m128 mask = _mm_cmplt_ps(x, _ps_cephes_SQRTHF);
    __m128 tmp = _mm_and_ps(x, mask);
    x -= one;
    e -= _mm_and_ps(one, mask);
    x += tmp;
    __m128 z = x * x;
    // Horner evaluation of the degree-8 polynomial in x, then multiplied by x and by z.
    __m128 y = _ps_cephes_log_p0;
    y *= x;
    y += _ps_cephes_log_p1;
    y *= x;
    y += _ps_cephes_log_p2;
    y *= x;
    y += _ps_cephes_log_p3;
    y *= x;
    y += _ps_cephes_log_p4;
    y *= x;
    y += _ps_cephes_log_p5;
    y *= x;
    y += _ps_cephes_log_p6;
    y *= x;
    y += _ps_cephes_log_p7;
    y *= x;
    y += _ps_cephes_log_p8;
    y *= x;
    y = y * z;
    // Add the small part of e*ln(2), subtract z/2, then add the large part of e*ln(2).
    tmp = e * _ps_cephes_log_q1;
    y += tmp;
    tmp = z * _ps_0p5;
    y = y - tmp;
    tmp = e * _ps_cephes_log_q2;
    x += y;
    x += tmp;
    x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
    return x;
 }
 /// Natural `exp` of a single float, computed through `_mm_exp_ps`.
 /// This is an approximation, valid up to approximately -109dB of accuracy
 /// IMPORTANT: NaN input not supported.
 // #BONUS
 float _mm_exp_ss(float v) pure @safe
 {
    // Broadcast the scalar to all lanes, evaluate the vector version, keep lane 0.
    __m128 broadcast = _mm_set1_ps(v);
    __m128 lanes = _mm_exp_ps(broadcast);
    return lanes.array[0];
 }
 /// Natural `exp` computed for 4 simultaneous float in `x`.
 /// This is an approximation, valid up to approximately -109dB of accuracy
 /// IMPORTANT: NaN input not supported.
 /// The input is clamped to [-88.3762626647949, 88.3762626647949] before evaluation.
 // #BONUS
 __m128 _mm_exp_ps(__m128 x) pure @safe
 {
    // Lane-replicated constants: clamp bounds, log2(e), ln(2) split in two parts (C1 + C2),
    // and the polynomial coefficients p0..p5.
    static immutable __m128 _ps_exp_hi         = [88.3762626647949f, 88.3762626647949f, 88.3762626647949f, 88.3762626647949f];
    static immutable __m128 _ps_exp_lo         = [-88.3762626647949f, -88.3762626647949f, -88.3762626647949f, -88.3762626647949f];
    static immutable __m128 _ps_cephes_LOG2EF  = [1.44269504088896341, 1.44269504088896341, 1.44269504088896341, 1.44269504088896341];
    static immutable __m128 _ps_cephes_exp_C1  = [0.693359375, 0.693359375, 0.693359375, 0.693359375];
    static immutable __m128 _ps_cephes_exp_C2  = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4];
    static immutable __m128 _ps_cephes_exp_p0  = [1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4];
    static immutable __m128 _ps_cephes_exp_p1  = [1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3];
    static immutable __m128 _ps_cephes_exp_p2  = [8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3];
    static immutable __m128 _ps_cephes_exp_p3  = [4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2];
    static immutable __m128 _ps_cephes_exp_p4  = [1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1];
    static immutable __m128 _ps_cephes_exp_p5  = [5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1];
    __m128 tmp = _mm_setzero_ps(), fx;
    __m128i emm0;
    __m128 one = _ps_1;
    // Clamp the argument to the supported range.
    x = _mm_min_ps(x, _ps_exp_hi);
    x = _mm_max_ps(x, _ps_exp_lo);
    /* express exp(x) as exp(g + n*log(2)) */
    fx = x * _ps_cephes_LOG2EF;
    fx += _ps_0p5;
    /* how to perform a floorf with SSE: just below */
    emm0 = _mm_cvttps_epi32(fx);
    tmp  = _mm_cvtepi32_ps(emm0);
    /* if greater, subtract 1 */
    __m128 mask = _mm_cmpgt_ps(tmp, fx);
    mask = _mm_and_ps(mask, one);
    fx = tmp - mask;
    // Remove n*ln(2) from x in two steps (C1 then C2), leaving the remainder g in x.
    tmp = fx * _ps_cephes_exp_C1;
    __m128 z = fx * _ps_cephes_exp_C2;
    x -= tmp;
    x -= z;
    z = x * x;
    // Horner evaluation of the polynomial in x; result is y*z + x + 1.
    __m128 y = _ps_cephes_exp_p0;
    y *= x;
    y += _ps_cephes_exp_p1;
    y *= x;
    y += _ps_cephes_exp_p2;
    y *= x;
    y += _ps_cephes_exp_p3;
    y *= x;
    y += _ps_cephes_exp_p4;
    y *= x;
    y += _ps_cephes_exp_p5;
    y *= z;
    y += x;
    y += one;
    /* build 2^n */
    // Add the exponent bias (0x7f) to n and shift it into the exponent field (bit 23 up),
    // then reinterpret the integer bits as float.
    emm0 = _mm_cvttps_epi32(fx);
    emm0 = _mm_add_epi32(emm0, _pi32_0x7f);
    emm0 = _mm_slli_epi32(emm0, 23);
    __m128 pow2n = cast(__m128)emm0;
    y *= pow2n;
    return y;
 }
 /// Computes `base^exponent` for a single 32-bit float, through `_mm_pow_ps`.
 /// This is an approximation, valid up to approximately -100dB of accuracy
 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
 // #BONUS
 float _mm_pow_ss(float base, float exponent) pure @safe
 {
    // Broadcast both scalars, evaluate the vector version, keep lane 0.
    __m128 bases = _mm_set1_ps(base);
    __m128 exponents = _mm_set1_ps(exponent);
    __m128 lanes = _mm_pow_ps(bases, exponents);
    return lanes.array[0];
 }
 /// Computes `base^exponent` lane by lane for 4 floats, as `exp(exponent * log(base))`.
 /// This is an approximation, valid up to approximately -100dB of accuracy
 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
 // #BONUS
 __m128 _mm_pow_ps(__m128 base, __m128 exponents) pure @safe
 {
    __m128 lnBase = _mm_log_ps(base);
    __m128 scaled = exponents * lnBase;
    return _mm_exp_ps(scaled);
 }
 /// Computes `base^exponent` for 4 floats at once, with one scalar `exponent` shared by
 /// all lanes, as `exp(exponent * log(base))`.
 /// This is an approximation, valid up to approximately -100dB of accuracy
 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
 // #BONUS
 __m128 _mm_pow_ps(__m128 base, float exponent) pure @safe
 {
    __m128 broadcastExponent = _mm_set1_ps(exponent);
    __m128 lnBase = _mm_log_ps(base);
    return _mm_exp_ps(broadcastExponent * lnBase);
 }
 unittest
 {
    import std.math;
    // Accepts `approx` when `abs(groundTruth / approx) - 1 < epsilon`.
    // Non-finite ground truths are always accepted; a zero ground truth requires an exact zero.
    bool approxEquals(double groundTruth, double approx, double epsilon) pure @trusted @nogc nothrow
    {
        if (!isFinite(groundTruth))
            return true; // no need to approximate where this is NaN or infinite
        if (groundTruth == 0) // the approximation should produce zero too if needed
        {
            return approx == 0;
        }
        if (approx == 0)
        {
            // If the approximation produces zero, the error should be below 140 dB
            return ( abs(groundTruth) < 1e-7 );
        }
        if ( ( abs(groundTruth / approx) - 1 ) >= epsilon)
        {
            import core.stdc.stdio;
            debug printf("approxEquals (%g, %g, %g) failed\n", groundTruth, approx, epsilon);
            debug printf("ratio is %f\n", abs(groundTruth / approx) - 1);
        }
        return ( abs(groundTruth / approx) - 1 ) < epsilon;
    }
    // test _mm_log_ps
    for (double mantissa = 0.1; mantissa < 1.0; mantissa += 0.05)
    {
        foreach (exponent; -23..23)
        {
            double x = mantissa * 2.0 ^^ exponent;
            double phobosValue = log(x);
            __m128 v = _mm_log_ps(_mm_set1_ps(x));
            foreach(i; 0..4)
                assert(approxEquals(phobosValue, v.array[i], 1.1e-6));
        }
    }
    // test _mm_exp_ps
    for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.1)
    {
        foreach (exponent; -23..23)
        {
            double x = mantissa * 2.0 ^^ exponent;
            // don't test too high numbers because they saturate FP precision pretty fast
            if (x > 50) continue;
            double phobosValue = exp(x);
            __m128 v = _mm_exp_ps(_mm_set1_ps(x));
            foreach(i; 0..4)
            {
                if (!approxEquals(phobosValue, v.array[i], 3.4e-6))
                {
                    import core.stdc.stdio;
                    // Fixed: the format string used to end in a literal 'n' instead of a newline.
                    printf("x = %f   truth = %f vs estimate = %f\n", x, phobosValue, v.array[i]);
                    assert(false);
                }
            }
        }
    }
    // test that exp(-inf) is 0
    {
        __m128 R = _mm_exp_ps(_mm_set1_ps(-float.infinity));
        float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
        assert(R.array == correct);
    }
    // test log behaviour with NaN and infinities
    // the only guarantee for now is that _mm_log_ps(negative) yields a NaN
    {
        __m128 R = _mm_log_ps(_mm_setr_ps(+0.0f, -0.0f, -1.0f, float.nan));
        // DOESN'T PASS
        //  assert(isInfinity(R[0]) && R[0] < 0); // log(+0.0f) = -infinity
        // DOESN'T PASS
        //  assert(isInfinity(R[1]) && R[1] < 0); // log(-0.0f) = -infinity
        assert(isNaN(R.array[2])); // log(negative number) = NaN
        // DOESN'T PASS
        //assert(isNaN(R[3])); // log(NaN) = NaN
    }
    // test _mm_pow_ps
    for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.1)
    {
        foreach (exponent; -8..4)
        {
            double powExponent = mantissa * 2.0 ^^ exponent;
            for (double mantissa2 = 0.1; mantissa2 < 1.0; mantissa2 += 0.1)
            {
                foreach (exponent2; -4..4)
                {
                    double powBase = mantissa2 * 2.0 ^^ exponent2;
                    double phobosValue = pow(powBase, powExponent);
                    float fPhobos = phobosValue;
                    if (!isFinite(fPhobos)) continue;
                    __m128 v = _mm_pow_ps(_mm_set1_ps(powBase), _mm_set1_ps(powExponent));
                    foreach(i; 0..4)
                    {
                        if (!approxEquals(phobosValue, v.array[i], 1e-5))
                        {
                            // Local import, matching the other two printf uses in this unittest.
                            import core.stdc.stdio;
                            printf("%g ^^ %g\n", powBase, powExponent);
                            assert(false);
                        }
                    }
                }
            }
        }
    }
 }
 private:
 // Lane-replicated constants shared by `_mm_log_ps` and `_mm_exp_ps` above.
 // 1.0 in every lane.
 static immutable __m128 _ps_1   = [1.0f, 1.0f, 1.0f, 1.0f];
 // 0.5 in every lane.
 static immutable __m128 _ps_0p5 = [0.5f, 0.5f, 0.5f, 0.5f];
 // 0x7f (127) in every lane: subtracted from / added to the exponent field as its bias.
 static immutable __m128i _pi32_0x7f = [0x7f, 0x7f, 0x7f, 0x7f];
--- a/external/inteli/mmx.d
+++ b/external/inteli/mmx.d
--- a/external/inteli/nmmintrin.d
+++ b/external/inteli/nmmintrin.d
--- a/external/inteli/package.d
+++ b/external/inteli/package.d
@ -0,0 +1,25 @@
 /**
 * Public API. You can `import inteli;` if you want access to all intrinsics, under any circumstances.
 * That's what intel-intrinsics enables.
 *
 * Copyright: Copyright Guillaume Piolat 2016-2020.
 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 */
 module inteli; 
 // Importing with `import inteli;` simply imports all available intrinsics.
 public import inteli.types;
 public import inteli.mmx;        // MMX
 public import inteli.emmintrin;  // SSE2
 public import inteli.xmmintrin;  // SSE
 public import inteli.pmmintrin;  // SSE3
 public import inteli.tmmintrin;  // SSSE3
 public import inteli.smmintrin;  // SSE4.1
 public import inteli.nmmintrin;  // SSE4.2
 public import inteli.shaintrin;  // SHA
 public import inteli.bmi2intrin; // BMI2
 public import inteli.avxintrin;  // AVX
 public import inteli.avx2intrin; // AVX2
 public import inteli.math; // Bonus
--- a/external/inteli/pmmintrin.d
+++ b/external/inteli/pmmintrin.d
@ -0,0 +1,294 @@
 /**
 * SSE3 intrinsics.
 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE3
 *
 * Copyright: Guillaume Piolat 2016-2020.
 *            Charles Gregory 2019.
 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 */
 module inteli.pmmintrin;
 public import inteli.types;
 import inteli.internals;
 public import inteli.emmintrin;
 // Note: this header will work whether you have SSE3 enabled or not.
 // With LDC, use "dflags-ldc": ["-mattr=+sse3"] or equivalent to actively 
 // generate SSE3 instruction (they are often enabled with -O1 or greater).
 // With GDC, use "dflags-gdc": ["-msse3"] or equivalent to generate SSE3 instructions.
 nothrow @nogc:
 /// Alternatively add and subtract packed double-precision (64-bit)
 /// floating-point elements in `a` to/from packed elements in `b`:
 /// the result is `[a0 - b0, a1 + b1]`.
 __m128d _mm_addsub_pd (__m128d a, __m128d b) pure @trusted
 {
    static if (DMD_with_DSIMD_and_SSE3)
    {
        return cast(__m128d) __simd(XMM.ADDSUBPD, cast(void16)a, cast(void16)b);
    }
    else static if (GDC_with_SSE3 || LDC_with_SSE3)
    {
        // GDC and LDC use the same builtin.
        return __builtin_ia32_addsubpd(a, b);
    }
    else
    {
        // Scalar fallback.
        // ARM: well optimized starting with LDC 1.18.0 -O2, not disrupted by LLVM 13+
        a.ptr[0] -= b.array[0];
        a.ptr[1] += b.array[1];
        return a;
    }
 }
 unittest
 {
    // Lane 0 is subtracted, lane 1 is added.
    __m128d lhs = _mm_setr_pd(1.0, 2.0);
    __m128d rhs = _mm_setr_pd(1.0, 2.0);
    double[2] expected = [0.0, 4.0];
    assert(_mm_addsub_pd(lhs, rhs).array == expected);
 }
 /// Alternatively add and subtract packed single-precision (32-bit)
 /// floating-point elements in `a` to/from packed elements in `b`:
 /// the result is `[a0 - b0, a1 + b1, a2 - b2, a3 + b3]`.
 float4 _mm_addsub_ps (float4 a, float4 b) pure @trusted
 {
    static if (DMD_with_DSIMD_and_SSE3)
    {
        return cast(__m128) __simd(XMM.ADDSUBPS, cast(void16)a, cast(void16)b);
    }
    else static if (GDC_with_SSE3 || LDC_with_SSE3)
    {
        // GDC and LDC use the same builtin.
        return __builtin_ia32_addsubps(a, b);
    }
    else
    {
        // Scalar fallback: even lanes subtract, odd lanes add.
        a.ptr[0] = a.array[0] - b.array[0];
        a.ptr[1] = a.array[1] + b.array[1];
        a.ptr[2] = a.array[2] - b.array[2];
        a.ptr[3] = a.array[3] + b.array[3];
        return a;
    }
 }
 unittest
 {
    // Even lanes are subtracted, odd lanes are added.
    __m128 lhs = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 rhs = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    float[4] expected = [0.0f, 4.0f, 0.0f, 8.0f];
    assert(_mm_addsub_ps(lhs, rhs).array == expected);
 }
 /// Horizontally add adjacent pairs of double-precision (64-bit)
 /// floating-point elements in `a` and `b`: the result is `[a0 + a1, b0 + b1]`.
 __m128d _mm_hadd_pd (__m128d a, __m128d b) pure @trusted
 {
    // PERF: ARM64?
    static if (DMD_with_DSIMD_and_SSE3)
    {
        return cast(__m128d) __simd(XMM.HADDPD, cast(void16)a, cast(void16)b);
    }
    else static if (GDC_or_LDC_with_SSE3)
    {
        return __builtin_ia32_haddpd(a, b);
    }
    else
    {
        // Scalar fallback: lane 0 takes the pair from `a`, lane 1 the pair from `b`.
        immutable double pairA = a.array[1] + a.array[0];
        immutable double pairB = b.array[1] + b.array[0];
        __m128d sums;
        sums.ptr[0] = pairA;
        sums.ptr[1] = pairB;
        return sums;
    }
 }
 unittest
 {
    __m128d first  = _mm_setr_pd(1.5, 2.0);
    __m128d second = _mm_setr_pd(1.0, 2.0);
    double[2] expected = [3.5, 3.0];
    assert(_mm_hadd_pd(first, second).array == expected);
 }
 /// Horizontally add adjacent pairs of single-precision (32-bit)
 /// floating-point elements in `a` and `b`:
 /// the result is `[a0 + a1, a2 + a3, b0 + b1, b2 + b3]`.
 __m128 _mm_hadd_ps (__m128 a, __m128 b) pure @trusted
 {
    static if (DMD_with_DSIMD_and_SSE3)
    {
        return cast(__m128) __simd(XMM.HADDPS, cast(void16)a, cast(void16)b);
    }
    else static if (GDC_or_LDC_with_SSE3)
    {
        return __builtin_ia32_haddps(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        return vpaddq_f32(a, b);
    }
    else
    {
        // Scalar fallback: low half comes from `a`, high half from `b`.
        immutable float a01 = a.array[1] + a.array[0];
        immutable float a23 = a.array[3] + a.array[2];
        immutable float b01 = b.array[1] + b.array[0];
        immutable float b23 = b.array[3] + b.array[2];
        __m128 sums;
        sums.ptr[0] = a01;
        sums.ptr[1] = a23;
        sums.ptr[2] = b01;
        sums.ptr[3] = b23;
        return sums;
    }
 }
 unittest
 {
    __m128 first  = _mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 second = _mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    float[4] expected = [3.0f, 8.0f, 3.5f, 7.5f];
    assert(_mm_hadd_ps(first, second).array == expected);
 }
 /// Horizontally subtract adjacent pairs of double-precision (64-bit)
 /// floating-point elements in `a` and `b`: the result is `[a0 - a1, b0 - b1]`.
 __m128d _mm_hsub_pd (__m128d a, __m128d b) pure @trusted
 {
    static if (DMD_with_DSIMD_and_SSE3)
    {
        return cast(__m128d) __simd(XMM.HSUBPD, cast(void16)a, cast(void16)b);
    }
    else static if (GDC_or_LDC_with_SSE3)
    {
        return __builtin_ia32_hsubpd(a, b);
    }
    else
    {
        // Scalar fallback.
        // yep, sounds optimal for ARM64 too. Strangely enough.
        immutable double diffA = a.array[0] - a.array[1];
        immutable double diffB = b.array[0] - b.array[1];
        __m128d diffs;
        diffs.ptr[0] = diffA;
        diffs.ptr[1] = diffB;
        return diffs;
    }
 }
 unittest
 {
    __m128d first  = _mm_setr_pd(1.5, 2.0);
    __m128d second = _mm_setr_pd(1.0, 2.0);
    double[2] expected = [-0.5, -1.0];
    assert(_mm_hsub_pd(first, second).array == expected);
 }
 /// Horizontally subtract adjacent pairs of single-precision (32-bit)
 /// floating-point elements in `a` and `b`:
 /// the result is `[a0 - a1, a2 - a3, b0 - b1, b2 - b3]`.
 __m128 _mm_hsub_ps (__m128 a, __m128 b) pure @trusted
 {
    static if (DMD_with_DSIMD_and_SSE3)
    {
        return cast(__m128) __simd(XMM.HSUBPS, cast(void16)a, cast(void16)b);
    }
    else static if (GDC_or_LDC_with_SSE3)
    {
        return __builtin_ia32_hsubps(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        // Flip the sign bit of the odd lanes, then reuse the pairwise add.
        int4 oddSignBits = [0, 0x80000000, 0, 0x80000000];
        a = cast(__m128)(cast(int4)a ^ oddSignBits);
        b = cast(__m128)(cast(int4)b ^ oddSignBits);
        return vpaddq_f32(a, b);
    }
    else
    {
        // Scalar fallback: low half comes from `a`, high half from `b`.
        immutable float a01 = a.array[0] - a.array[1];
        immutable float a23 = a.array[2] - a.array[3];
        immutable float b01 = b.array[0] - b.array[1];
        immutable float b23 = b.array[2] - b.array[3];
        __m128 diffs;
        diffs.ptr[0] = a01;
        diffs.ptr[1] = a23;
        diffs.ptr[2] = b01;
        diffs.ptr[3] = b23;
        return diffs;
    }
 }
 unittest
 {
    __m128 first  = _mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 second = _mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    float[4] expected = [-1.0f, -2.0f, -0.5f, -0.5f];
    assert(_mm_hsub_ps(first, second).array == expected);
 }
 /// Load 128-bits of integer data from unaligned memory.
 /// This is a plain alias of `_mm_loadu_si128`.
 // Note: The saying is LDDQU was only ever useful around 2008
 // See_also: https://stackoverflow.com/questions/38370622/a-faster-integer-sse-unalligned-load-thats-rarely-used
 alias _mm_lddqu_si128 = _mm_loadu_si128;
 /// Load one double-precision (64-bit) floating-point element from `mem_addr` and
 /// return it duplicated in both elements of the result.
 __m128d _mm_loaddup_pd (const(double)* mem_addr) pure @trusted
 {
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    // Same for GDC with -O1
    immutable double scalar = *mem_addr; // memory is read once
    __m128d both;
    both.ptr[0] = scalar;
    both.ptr[1] = scalar;
    return both;
 }
 unittest
 {
    double source = 7.5;
    double[2] expected = [7.5, 7.5];
    __m128d loaded = _mm_loaddup_pd(&source);
    assert(loaded.array == expected);
 }
 /// Duplicate the low double-precision (64-bit) floating-point element from `a`
 /// into both elements of the result.
 __m128d _mm_movedup_pd (__m128d a) pure @trusted
 {
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    // Something efficient with -O1 for GDC
    immutable double low = a.array[0];
    a.ptr[1] = low;
    return a;
 }
 unittest
 {
    __m128d input = _mm_setr_pd(7.0, 2.5);
    double[2] expected = [7.0, 7.0];
    assert(_mm_movedup_pd(input).array == expected);
 }
 /// Duplicate odd-indexed single-precision (32-bit) floating-point elements from `a`:
 /// the result is `[a1, a1, a3, a3]`.
 __m128 _mm_movehdup_ps (__m128 a) pure @trusted
 {
    static if (GDC_with_SSE3)
    {
        return __builtin_ia32_movshdup (a);
    }
    else
    {
        // Generates movshdup since LDC 1.3 with -O1 -mattr=+sse3
        immutable float odd1 = a.array[1];
        immutable float odd3 = a.array[3];
        a.ptr[0] = odd1;
        a.ptr[2] = odd3;
        return a;
    }
 }
 unittest
 {
    float[4] expected = [2.0f, 2, 4, 4 ];
    __m128 duplicated = _mm_movehdup_ps(_mm_setr_ps(1, 2, 3, 4));
    assert(duplicated.array == expected);
 }
 /// Duplicate even-indexed single-precision (32-bit) floating-point elements from `a`:
 /// the result is `[a0, a0, a2, a2]`.
 __m128 _mm_moveldup_ps (__m128 a) pure @trusted
 {
    static if (GDC_with_SSE3)
    {
        return __builtin_ia32_movsldup (a);
    }
    else
    {
        // Generates movsldup since LDC 1.3 with -O1 -mattr=+sse3
        immutable float even0 = a.array[0];
        immutable float even2 = a.array[2];
        a.ptr[1] = even0;
        a.ptr[3] = even2;
        return a;
    }
 }
 unittest
 {
    float[4] expected = [1.0f, 1, 3, 3 ];
    __m128 duplicated = _mm_moveldup_ps(_mm_setr_ps(1, 2, 3, 4));
    assert(duplicated.array == expected);
 }
--- a/external/inteli/shaintrin.d
+++ b/external/inteli/shaintrin.d
@ -0,0 +1,268 @@
 /**
 * SHA intrinsics.
 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=SHA
 * 
 * Copyright: Guillaume Piolat 2021.
 *            Johan Engelen 2021.
 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 */
 module inteli.shaintrin;
 // SHA instructions
 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#othertechs=SHA
 // Note: this header will work whether you have SHA enabled or not.
 // With LDC, use "dflags-ldc": ["-mattr=+sha"] or equivalent to actively
 // generate SHA instructions.
 // With GDC, use "dflags-gdc": ["-msha"] or equivalent to generate SHA instructions.
 public import inteli.types;
 import inteli.internals;
 nothrow @nogc:
 /+
 /// Calculate SHA1 state variable E after four rounds of operation from the current SHA1 state variable a, add that value to the scheduled values (unsigned 32-bit integers) in b, and store the result in dst.
 __m128i _mm_sha1nexte_epu32(__m128i a, __m128i b) @trusted
 {
    static if (SHA_builtins)
    {
        return __builtin_ia32_sha1nexte(cast(int4) a, cast(int4) b);
    }
    else
    {
        assert(0);
    }
 }
 unittest
 {
 }
 +/
 /+
 /// Perform an intermediate calculation for the next four SHA1 message values (unsigned 32-bit integers) using previous message values from a and b, and store the result in dst.
 __m128i _mm_sha1msg1_epu32(__m128i a, __m128i b) @trusted
 {
    static if (SHA_builtins)
    {
        return __builtin_ia32_sha1msg1(cast(int4) a, cast(int4) b);
    }
    else
    {
        assert(0);
    }
 }
 unittest
 {
 }
 +/
 /+
 /// Perform the final calculation for the next four SHA1 message values (unsigned 32-bit integers) using the intermediate result in a and the previous message values in b, and store the result in dst.
 __m128i _mm_sha1msg2_epu32(__m128i a, __m128i b) @trusted
 {
    static if (SHA_builtins)
    {
        return __builtin_ia32_sha1msg2(cast(int4) a, cast(int4) b);
    }
    else
    {
        assert(0);
    }
 }
 unittest
 {
 }
 +/
 /+
 /// Perform four rounds of SHA1 operation using an initial SHA1 state (A,B,C,D) from a and some pre-computed sum of the next 4 round message values (unsigned 32-bit integers), and state variable E from b, and store the updated SHA1 state (A,B,C,D) in dst. func contains the logic functions and round constants.
 __m128i _mm_sha1rnds4_epu32(__m128i a, __m128i b, const int func) @trusted
 {
    static if (SHA_builtins)
    {
        return __builtin_ia32_sha1rnds4(cast(int4) a, cast(int4) b, func);
    }
    else
    {
        assert(0);
    }
 }
 +/
 /// Perform an intermediate calculation for the next four SHA256 message values (unsigned 32-bit integers) using previous message values from `a` and `b`, and return the result.
 /// Each result lane `i` is `W[i] + sigma0(W[i+1])`, where `W[0..3]` are the lanes of `a` and `W[4]` is lane 0 of `b`.
 __m128i _mm_sha256msg1_epu32(__m128i a, __m128i b) @trusted
 {
    static if (GDC_or_LDC_with_SHA)
    {
        return __builtin_ia32_sha256msg1(cast(int4) a, cast(int4) b);
    }
    else
    {
        // sigma0(w) = ror(w, 7) ^ ror(w, 18) ^ (w >> 3)
        static uint sigma0(uint w) nothrow @nogc @safe
        {
            immutable uint rot7  = bitwiseRotateRight_uint(w, 7);
            immutable uint rot18 = bitwiseRotateRight_uint(w, 18);
            return rot7 ^ rot18 ^ (w >> 3);
        }
        int4 va = cast(int4) a;
        int4 vb = cast(int4) b;
        // Five consecutive message words: four from `a`, one from the low lane of `b`.
        immutable uint w0 = va.array[0];
        immutable uint w1 = va.array[1];
        immutable uint w2 = va.array[2];
        immutable uint w3 = va.array[3];
        immutable uint w4 = vb.array[0];
        int4 sums;
        sums.ptr[0] = w0 + sigma0(w1);
        sums.ptr[1] = w1 + sigma0(w2);
        sums.ptr[2] = w2 + sigma0(w3);
        sums.ptr[3] = w3 + sigma0(w4);
        return cast(__m128i) sums;
    }
 }
 unittest
 {
    __m128i first  = [15, 20, 130, 12345];
    __m128i second = [15, 20, 130, 12345];
    int[4] expected = [671416337, 69238821, 2114864873, 503574586];
    __m128i computed = _mm_sha256msg1_epu32(first, second);
    assert(computed.array == expected);
 }
 /// Perform the final calculation for the next four SHA256 message values (unsigned 32-bit integers) using previous message values from `a` and `b`, and return the result.
 /// The two upper lanes of `b` seed the first two results, which in turn seed the last two.
 __m128i _mm_sha256msg2_epu32(__m128i a, __m128i b) @trusted
 {
    static if (GDC_or_LDC_with_SHA)
    {
        return __builtin_ia32_sha256msg2(cast(int4) a, cast(int4) b);
    }
    else
    {
        // sigma1(w) = ror(w, 17) ^ ror(w, 19) ^ (w >> 10)
        static uint sigma1(uint w) nothrow @nogc @safe
        {
            immutable uint rot17 = bitwiseRotateRight_uint(w, 17);
            immutable uint rot19 = bitwiseRotateRight_uint(w, 19);
            return rot17 ^ rot19 ^ (w >> 10);
        }
        int4 va = cast(int4) a;
        int4 vb = cast(int4) b;
        immutable uint w14 = vb.array[2];
        immutable uint w15 = vb.array[3];
        // Order matters: w18 and w19 depend on the freshly computed w16 and w17.
        immutable uint w16 = va.array[0] + sigma1(w14);
        immutable uint w17 = va.array[1] + sigma1(w15);
        immutable uint w18 = va.array[2] + sigma1(w16);
        immutable uint w19 = va.array[3] + sigma1(w17);
        int4 words;
        words.ptr[0] = w16;
        words.ptr[1] = w17;
        words.ptr[2] = w18;
        words.ptr[3] = w19;
        return cast(__m128i) words;
    }
 }
 unittest
 {
    __m128i first  = [15, 20, 130, 12345];
    __m128i second = [15, 20, 130, 12345];
    int[4] expected = [5324815, 505126944, -2012842764, -1542210977];
    __m128i computed = _mm_sha256msg2_epu32(first, second);
    assert(computed.array == expected);
 }
 /// Perform 2 rounds of SHA256 operation using an initial SHA256 state (C,D,G,H) from `a`, an initial SHA256 state (A,B,E,F) from `b`, and a pre-computed sum of the next 2 round message values (unsigned 32-bit integers) and the corresponding round constants from `k`, and return the updated SHA256 state (A,B,E,F).
 /// Only the two low lanes of `k` are read.
 __m128i _mm_sha256rnds2_epu32(__m128i a, __m128i b, __m128i k) @trusted
 {
    // TODO: the pragma(inline) false prevent a DMD 1.100
    //       regression in Linux + x86_64 + -b release-unittest, report that
    version(DigitalMars)
    {
        enum bool workaround = true;
    }
    else
    {
        enum bool workaround = false;
    }
    static if (GDC_or_LDC_with_SHA)
    {
        return __builtin_ia32_sha256rnds2(cast(int4) a, cast(int4) b, cast(int4) k);
    }
    else
    {
        // Software fallback: the four SHA256 round helper functions.
        // Ch: for each bit, selects y where x is 1 and z where x is 0.
        static uint Ch(uint x, uint y, uint z) nothrow @nogc @safe
        { 
            static if (workaround) pragma (inline, false);
            return z ^ (x & (y ^ z)); 
        }
        // Maj: for each bit, the majority value of x, y and z.
        static uint Maj(uint x, uint y, uint z) nothrow @nogc @safe
        { 
            static if (workaround) pragma (inline, false);
            return (x & y) | (z & (x ^ y)); 
        }
        // sum0: XOR of x rotated right by 2, 13 and 22.
        static uint sum0(uint x) nothrow @nogc @safe
        { 
            static if (workaround) pragma (inline, false);
            return bitwiseRotateRight_uint(x, 2) ^ bitwiseRotateRight_uint(x, 13) ^ bitwiseRotateRight_uint(x, 22); 
        }
        // sum1: XOR of x rotated right by 6, 11 and 25.
        static uint sum1(uint x) nothrow @nogc @safe
        { 
            static if (workaround) pragma (inline, false);
            return bitwiseRotateRight_uint(x, 6) ^ bitwiseRotateRight_uint(x, 11) ^ bitwiseRotateRight_uint(x, 25); 
        }
        int4 dst;
        int4 a4 = cast(int4) a;
        int4 b4 = cast(int4) b;
        int4 k4 = cast(int4) k;
        // Unpack the eight state words: (A,B,E,F) from `b`, (C,D,G,H) from `a`.
        const A0 = b4.array[3];
        const B0 = b4.array[2];
        const C0 = a4.array[3];
        const D0 = a4.array[2];
        const E0 = b4.array[1];
        const F0 = b4.array[0];
        const G0 = a4.array[1];
        const H0 = a4.array[0];
        // Message word + round constant sums for the two rounds.
        const W_K0 = k4.array[0];
        const W_K1 = k4.array[1];
        // Round 1.
        const A1 = Ch(E0, F0, G0) + sum1(E0) + W_K0 + H0 + Maj(A0, B0, C0) + sum0(A0);
        const B1 = A0;
        const C1 = B0;
        const D1 = C0;
        const E1 = Ch(E0, F0, G0) + sum1(E0) + W_K0 + H0 + D0;
        const F1 = E0;
        const G1 = F0;
        const H1 = G0;
        // Round 2, fed by the outputs of round 1.
        const A2 = Ch(E1, F1, G1) + sum1(E1) + W_K1 + H1 + Maj(A1, B1, C1) + sum0(A1);
        const B2 = A1;
        const C2 = B1;
        const D2 = C1;
        const E2 = Ch(E1, F1, G1) + sum1(E1) + W_K1 + H1 + D1;
        const F2 = E1;
        const G2 = F1;
        const H2 = G1;
        // Pack the updated (A,B,E,F) in the same lane layout as `b`.
        dst.ptr[3] = A2;
        dst.ptr[2] = B2;
        dst.ptr[1] = E2;
        dst.ptr[0] = F2;
        return cast(__m128i) dst;
    }
 }
 unittest
 {
    __m128i stateCDGH = [15, 20, 130, 12345];
    __m128i stateABEF = [15, 20, 130, 12345];
    __m128i wk        = [15, 20, 130, 12345];
    int[4] expected = [1384123044, -2050674062, 327754346, 956342016];
    __m128i computed = _mm_sha256rnds2_epu32(stateCDGH, stateABEF, wk);
    assert(computed.array == expected);
 }
 /// Rotate the 32-bit `value` right by `count` bit positions.
 /// Params:
 ///     value = the bits to rotate.
 ///     count = rotation amount, must be in 0 .. 31 (checked by an assert).
 /// Returns: `value` rotated right; `count == 0` returns `value` unchanged.
 private uint bitwiseRotateRight_uint(const uint value, const uint count) @safe
 {
    enum uint bits = 8 * uint.sizeof;
    assert(count < bits);
    // Mask the left-shift amount: for `count == 0` the unmasked amount would be 32,
    // and shifting a 32-bit integer by its full width is not defined by the language.
    // For every other valid `count` the mask is a no-op.
    const uint leftShift = (bits - count) & (bits - 1);
    return (value >> count) | (value << leftShift);
 }
--- a/external/inteli/smmintrin.d
+++ b/external/inteli/smmintrin.d
--- a/external/inteli/tmmintrin.d
+++ b/external/inteli/tmmintrin.d
--- a/external/inteli/types.d
+++ b/external/inteli/types.d
@ -0,0 +1,456 @@
 /**
 * `core.simd` emulation layer.
 *
 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
 *            cet 2024.
 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 */
 module inteli.types;
 pure:
 nothrow:
@nogc:
 // Per-compiler configuration: for each vector width (MMX = 8 bytes, SSE = 16 bytes,
 // AVX = 32 bytes) define a `*SizedVectorsAreEmulated` enum telling the rest of this
 // module whether to declare emulated struct types or to rely on native vectors.
 version(GNU)
 {
    // Note: for GDC support, be sure to use https://explore.dgnu.org/
    // Future: just detect vectors, do not base upon arch.
    version(X86_64)
    {
        enum MMXSizedVectorsAreEmulated = false;
        enum SSESizedVectorsAreEmulated = false;
        // Does GDC support AVX-sized vectors?
        static if (__VERSION__ >= 2100) // Starting at GDC 12.1 only.
        {
            // Probe the compiler: emulate only if the 32-byte vector type does not exist.
            enum AVXSizedVectorsAreEmulated = !(is(__vector(double[4]))); 
        }
        else
        {
            enum AVXSizedVectorsAreEmulated = true;
        }
        import gcc.builtins;
    }
    else
    {
        // Non-x86_64 GDC targets: everything is emulated.
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = true;
        enum AVXSizedVectorsAreEmulated = true;
    }
 }
 else version(LDC)
 {
    public import ldc.simd;
    // Use this alias to mention it should only be used with LDC,
    // for example when emulated shufflevector would just be wasteful.
    alias shufflevectorLDC = shufflevector;
    // With LDC no vector width is emulated.
    enum MMXSizedVectorsAreEmulated = false;
    enum SSESizedVectorsAreEmulated = false;
    enum AVXSizedVectorsAreEmulated = false;
 }
 else version(DigitalMars)
 {
    public import core.simd;
    static if (__VERSION__ >= 2100)
    {
        // Note: turning this true is very desirable for DMD performance,
        // but also leads to many bugs being discovered upstream.
        // The fact that it works at all relies on many workarounds.
        // In particular intel-intrinsics with this "on" is a honeypot for DMD backend bugs,
        // and a very strong DMD codegen test suite.
        // What happens typically is that contributors end up on a DMD bug in their PR.
        // But finally, in 2022 D_SIMD has been activated, at least for SSE and some instructions.
        enum bool tryToEnableCoreSimdWithDMD = true;
    }
    else
    {
        enum bool tryToEnableCoreSimdWithDMD = false;
    }
    version(D_SIMD)
    {
        // MMX-sized vectors stay emulated with DMD; SSE-sized ones are native
        // only when the opt-in switch above is enabled.
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = !tryToEnableCoreSimdWithDMD;
        // Note: with DMD, AVX-sized vectors can't be enabled yet.
        // On linux + x86_64, this will fail since a few operands seem to be missing.
        // FUTURE: enable AVX-sized vectors in DMD. :)
        //
        // Blockers: https://issues.dlang.org/show_bug.cgi?id=24283 and 24284
        //           Probably other, unreported issues.
        // Both branches are deliberately `true` for now; the D_AVX branch is the
        // one to flip once the blockers above are resolved.
        version(D_AVX)
            enum AVXSizedVectorsAreEmulated = true;
        else
            enum AVXSizedVectorsAreEmulated = true;
    }
    else
    {
        // Some DMD 32-bit targets don't have D_SIMD
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = true;
        enum AVXSizedVectorsAreEmulated = true;
    }
 }
 enum CoreSimdIsEmulated = MMXSizedVectorsAreEmulated || SSESizedVectorsAreEmulated || AVXSizedVectorsAreEmulated;
 static if (CoreSimdIsEmulated)
 {
    // core.simd is emulated in some capacity: introduce `VectorOps`
    /// Mixed into each emulated vector struct to give it an interface resembling
    /// a native vector: element-wise operators, scalar broadcast, construction
    /// from a static array, raw-copy casts and indexing.
    ///
    /// The host struct must declare a field named `array` of type `ArrayType`.
    /// `BaseType` and `N` are deduced from `ArrayType` (which is `BaseType[N]`).
    mixin template VectorOps(VectorType, ArrayType: BaseType[N], BaseType, size_t N)
    {
        /// Number of lanes.
        enum Count = N;
        /// Element type of a lane.
        alias Base = BaseType;
        /// Pointer to the first lane of the backing array.
        BaseType* ptr() return pure nothrow @nogc
        {
            return array.ptr;
        }
        // Unary operators, applied lane by lane; the result is a new vector.
        VectorType opUnary(string op)() pure nothrow @safe @nogc
        {
            // `= void` skips default initialization; every lane is written below.
            VectorType res = void;
            mixin("res.array[] = " ~ op ~ "array[];");
            return res;
        }
        // Binary operators, applied lane by lane against another vector of the same type.
        VectorType opBinary(string op)(VectorType other) pure const nothrow @safe @nogc
        {
            // `= void` skips default initialization; every lane is written below.
            VectorType res = void;
            mixin("res.array[] = array[] " ~ op ~ " other.array[];");
            return res;
        }
        // Assigning a BaseType value: broadcasts `e` to every lane.
        void opAssign(BaseType e) pure nothrow @safe @nogc
        {
            array[] = e;
        }
        // Assigning a static array: copies all N elements.
        void opAssign(ArrayType v) pure nothrow @safe @nogc
        {
            array[] = v[];
        }
        // Compound assignment (`+=`, `-=`, ...), applied lane by lane in place.
        void opOpAssign(string op)(VectorType other) pure nothrow @safe @nogc
        {
            mixin("array[] "  ~ op ~ "= other.array[];");
        }
        // Constructing from a static array (`ArrayType` is `BaseType[N]`, not a dynamic array).
        this(ArrayType v) pure nothrow @safe @nogc
        {
            array[] = v[];
        }
        // Broadcast constructor
        this(BaseType x) pure nothrow @safe @nogc
        {
            array[] = x;
        }
        /// We can't support implicit conversion but do support explicit casting.
        /// "Vector types of the same size can be implicitly converted among each other."
        /// Casting to another vector type is always just a raw copy.
        /// Only available when source and destination types have the same size.
        VecDest opCast(VecDest)() pure const nothrow @trusted @nogc
            if (VecDest.sizeof == VectorType.sizeof)
            {
                VecDest dest = void;
                // Copy: view our storage as raw bytes, then as the destination's array type.
                dest.array[] = (cast(typeof(dest.array))cast(void[VectorType.sizeof])array)[];
                return dest;
            }
        /// Access to lane `i` by reference (bounds-checked array indexing).
        ref inout(BaseType) opIndex(size_t i) inout return pure nothrow @safe @nogc
        {
            return array[i];
        }
    }
 }
 else
 {
    public import core.simd;
    // GDC cannot implicitly convert a __vector from signed to unsigned, but LDC can.
    // GDC also sometimes needs those unsigned vector types for some intrinsics.
    // For internal use only.
    package alias ushort8 = Vector!(ushort[8]);
    package alias ubyte8  = Vector!(ubyte[8]);
    package alias ubyte16 = Vector!(ubyte[16]);
    static if (!AVXSizedVectorsAreEmulated)
    {
        package alias ushort16 = Vector!(ushort[16]);
        package alias ubyte32  = Vector!(ubyte[32]);
    }
 }
 // Emulate ldc.simd cmpMask and other masks.
 // Note: these should be deprecated on non-LDC, 
 // since it's slower to generate that code.
 version(LDC)
 {} 
 else
 {
    // TODO: deprecate these and write plain versions instead
    /// Yields the element (lane) type of vector type `V`, obtained as the type
    /// of `array[0]` on a default-initialized value of `V`.
    private template BaseType(V)
    {
        alias typeof( ( { V v; return v; }()).array[0]) BaseType;
    }
    /// Per-lane "true" value used by the comparison masks below, expressed in
    /// the element type of vector type `V`.
    private template TrueMask(V)
    {
        alias Elem = BaseType!V;
        static if (is(Elem == float))
        {
            // Reinterpret the 32-bit pattern 0xffffffff as a float.
            immutable uint m1 = 0xffffffff;
            enum Elem TrueMask = *cast(float*)(&m1);
        }
        else static if (is(Elem == double))
        {
            // Reinterpret the 64-bit pattern of all ones as a double.
            immutable ulong m1 = 0xffffffff_ffffffff;
            enum Elem TrueMask = *cast(double*)(&m1);
        }
        else // integer case
        {
            // -1 has every bit set in a signed integer element.
            enum Elem TrueMask = -1;
        }
    }
    /// Lane-wise equality mask: a lane of the result holds `TrueMask!Vec` where
    /// the corresponding lanes of `a` and `b` compare equal with `==`, and 0 otherwise.
    /// For floats, equivalent to an "oeq" comparison.
    Vec equalMask(Vec)(Vec a, Vec b) @trusted
    {
        enum size_t laneCount = Vec.array.length;
        Vec mask;
        foreach(int lane; 0..laneCount)
        {
            if (a.array[lane] == b.array[lane])
                mask.ptr[lane] = TrueMask!Vec;
            else
                mask.ptr[lane] = 0;
        }
        return mask;
    }
    /// Lane-wise greater-than mask: a lane of the result holds `TrueMask!Vec` where
    /// the lane of `a` is `>` the lane of `b`, and 0 otherwise.
    /// For floats, equivalent to an "ogt" comparison.
    Vec greaterMask(Vec)(Vec a, Vec b) @trusted
    {
        enum size_t laneCount = Vec.array.length;
        Vec mask;
        foreach(int lane; 0..laneCount)
        {
            if (a.array[lane] > b.array[lane])
                mask.ptr[lane] = TrueMask!Vec;
            else
                mask.ptr[lane] = 0;
        }
        return mask;
    }
 }
 // greaterMask on floats: only lanes 2 and 3 satisfy lhs > rhs, so only they are all-ones.
 unittest
 {
    float4 lhs = [1, 3, 5, 7];
    float4 rhs = [2, 3, 4, 5];
    static immutable int[4] expected = [0, 0, -1, -1];
    int4 mask = cast(int4)(greaterMask!float4(lhs, rhs));
    assert(mask.array == expected);
 }
 // 8-byte (MMX-sized) vector types. Depending on `MMXSizedVectorsAreEmulated` they are
 // either structs wrapping a static array plus `VectorOps`, or aliases of `core.simd` vectors.
 static if (MMXSizedVectorsAreEmulated)
 {
    /// MMX-like SIMD types
    /// 2 x float
    struct float2
    {
        float[2] array;
        mixin VectorOps!(float2, float[2]);
    }
    /// 8 x byte
    struct byte8
    {
        byte[8] array;
        mixin VectorOps!(byte8, byte[8]);
    }
    /// 4 x short
    struct short4
    {
        short[4] array;
        mixin VectorOps!(short4, short[4]);
    }
    /// 2 x int
    struct int2
    {
        int[2] array;
        mixin VectorOps!(int2, int[2]);
    }
    /// 1 x long
    struct long1
    {
        long[1] array;
        mixin VectorOps!(long1, long[1]);
    }
 }
 else
 {
    // For this compiler, defining MMX-sized vectors is working.
    public import core.simd;
    alias long1 = Vector!(long[1]);
    alias float2 = Vector!(float[2]);
    alias int2 = Vector!(int[2]);
    alias short4 = Vector!(short[4]);
    alias byte8 = Vector!(byte[8]);
 }
 static assert(float2.sizeof == 8);
 static assert(byte8.sizeof == 8);
 static assert(short4.sizeof == 8);
 static assert(int2.sizeof == 8);
 static assert(long1.sizeof == 8);
 // 16-byte (SSE-sized) vector types, declared here only when they must be emulated.
 // There is no `else` branch: otherwise the names are expected to come from the
 // SIMD module imported in the compiler-specific section of this file.
 static if (SSESizedVectorsAreEmulated)
 {
    /// SSE-like SIMD types
    /// 4 x float
    struct float4
    {
        float[4] array;
        mixin VectorOps!(float4, float[4]);
    }
    /// 16 x byte
    struct byte16
    {
        byte[16] array;
        mixin VectorOps!(byte16, byte[16]);
    }
    /// 8 x short
    struct short8
    {
        short[8] array;
        mixin VectorOps!(short8, short[8]);
    }
    /// 4 x int
    struct int4
    {
        int[4] array;
        mixin VectorOps!(int4, int[4]);
    }
    /// 2 x long
    struct long2
    {
        long[2] array;
        mixin VectorOps!(long2, long[2]);
    }
    /// 2 x double
    struct double2
    {
        double[2] array;
        mixin VectorOps!(double2, double[2]);
    }
 }
 static assert(float4.sizeof == 16);
 static assert(byte16.sizeof == 16);
 static assert(short8.sizeof == 16);
 static assert(int4.sizeof == 16);
 static assert(long2.sizeof == 16);
 static assert(double2.sizeof == 16);
 // 32-byte (AVX-sized) vector types: emulated structs when `AVXSizedVectorsAreEmulated`
 // is set, otherwise the names are taken from `core.simd`.
 static if (AVXSizedVectorsAreEmulated)
 {
    /// AVX-like SIMD types
    /// 8 x float
    struct float8
    {
        float[8] array;
        mixin VectorOps!(float8, float[8]);
    }
    /// 32 x byte
    struct byte32
    {
        byte[32] array;
        mixin VectorOps!(byte32, byte[32]);
    }
    /// 16 x short
    struct short16
    {
        short[16] array;
        mixin VectorOps!(short16, short[16]);
    }
    /// 8 x int
    struct int8
    {
        int[8] array;
        mixin VectorOps!(int8, int[8]);
    }
    /// 4 x long
    struct long4
    {
        long[4] array;
        mixin VectorOps!(long4, long[4]);
    }
    /// 4 x double
    struct double4
    {
        double[4] array;
        mixin VectorOps!(double4, double[4]);
    }
 }
 else
 {
    // Native AVX-sized vectors are available: use the compiler's own definitions.
    public import core.simd;    
 }
 static assert(float8.sizeof == 32);
 static assert(byte32.sizeof == 32);
 static assert(short16.sizeof == 32);
 static assert(int8.sizeof == 32);
 static assert(long4.sizeof == 32);
 static assert(double4.sizeof == 32);
 alias __m256 = float8;
 alias __m256i = long4; // long long __vector with ICC, GCC, and clang
 alias __m256d = double4;
 alias __m128 = float4;
 alias __m128i = int4;
 alias __m128d = double2;
 alias __m64 = long1; // like in Clang, __m64 is a vector of 1 long
 /// Packs two 1-bit lane selectors into a shuffle immediate:
 /// `x` lands in bit 1 and `y` in bit 0. Each selector must be 0 or 1.
 int _MM_SHUFFLE2(int x, int y) pure @safe
 {
    assert(0 <= x && x < 2);
    assert(0 <= y && y < 2);
    immutable int upper = x << 1;
    return upper | y;
 }
 /// Packs four 2-bit lane selectors into an 8-bit shuffle immediate:
 /// `z` occupies bits 7..6, `y` bits 5..4, `x` bits 3..2 and `w` bits 1..0.
 /// Each selector must lie in 0 .. 3 inclusive.
 int _MM_SHUFFLE(int z, int y, int x, int w) pure @safe
 {
    assert(0 <= x && x < 4);
    assert(0 <= y && y < 4);
    assert(0 <= z && z < 4);
    assert(0 <= w && w < 4);
    int imm8 = w;
    imm8 |= x << 2;
    imm8 |= y << 4;
    imm8 |= z << 6;
    return imm8;
 }
 // Initializing a vector from a single scalar must fill every lane with that value.
 unittest
 {
    static immutable float[4] wantFloats = [3.0f, 3.0f, 3.0f, 3.0f];
    static immutable int[2] wantInts = [42, 42];

    float4 splatFloat = 3.0f;
    assert(splatFloat.array == wantFloats);

    int2 splatInt = 42;
    assert(splatInt.array == wantInts);
 }
--- a/external/inteli/xmmintrin.d
+++ b/external/inteli/xmmintrin.d
--- a/src/gears/main.d
+++ b/src/gears/main.d
@ -14,22 +14,6 @@ void main()
 	r.Renderer rd = r.Init(&window);
 	scope(exit) r.Destroy(&rd);
 	/*
 	Vec4 f1 = Vec4(r: 2.0, a: 5.5);
 	Vec4 f2;
 	Vec4* f = &f1;
 	asm
 	{
 		mov R8, f;
 		movups XMM0, f1.r.offsetof[R8];
 		movups f2, XMM0;
 	}
 	writeln(f2);
 	*/
 	while (true)
 	{
 		p.HandleEvents(&window);
--- a/src/gears/renderer.d
+++ b/src/gears/renderer.d
@ -32,6 +32,7 @@ enum Format : VkFormat
 	RGBA_F32 = VK_FORMAT_R32G32B32A32_SFLOAT,
 	RGBA_UINT = VK_FORMAT_B8G8R8A8_UINT,
 	RGBA_UNORM = VK_FORMAT_R8G8B8A8_UNORM,
 	RGBA_SRGB = VK_FORMAT_R8G8B8A8_SRGB,
 }
 alias FMT = Format;
@ -101,9 +102,17 @@ struct Renderer
 	PushConst push_const;
 	Vec3 camera_pos = Vec3(0.0);
 	Model yoder;
 }
 struct Camera
 {
 	Vec3 pos = Vec3(0.0);
 	Vec3 target = Vec3(0.0);
 }
 struct GlobalUniforms
 {
 	Vec2 res;
@ -128,17 +137,6 @@ extern(C) struct Material
 	f32 shininess = 0.0;
 }
 static assert(Material.ambient.offsetof == 0, "ambient offset incorrect");
 static assert(Material.diffuse.offsetof == 16, "ambient offset incorrect");
 static assert(Material.specular.offsetof == 32, "ambient offset incorrect");
 static assert(Material.albedo_texture.offsetof == 48, "ambient offset incorrect");
 static assert(Material.ambient_texture.offsetof == 52, "ambient offset incorrect");
 static assert(Material.specular_texture.offsetof == 56, "ambient offset incorrect");
 static assert(Material.albedo_has_texture.offsetof == 60, "ambient offset incorrect");
 static assert(Material.ambient_has_texture.offsetof == 64, "ambient offset incorrect");
 static assert(Material.specular_has_texture.offsetof == 68, "ambient offset incorrect");
 static assert(Material.shininess.offsetof == 72, "ambient offset incorrect");
 struct UIVertex
 {
 	Vec2 p0;
@ -244,7 +242,7 @@ Cycle(Renderer* rd)
 	SetUniform(rd, &rd.globals);
-	DrawRect(rd, 150.0, 300.0, 500.0, 700.0, Vec4(r: 0.0, g: 0.0, b: 1.0, a: 1.0));
+	DrawRect(rd, 150.0, 300.0, 500.0, 700.0, Vec4(0.0, 0.0, 1.0, 1.0));
 	PrepComputeDrawImage(rd);
--- a/src/gears/vulkan.d
+++ b/src/gears/vulkan.d
@ -732,7 +732,7 @@ CreateImageView(Vulkan* vk, ImageView* view, u32 w, u32 h, u32 ch, u8[] data)
 		assert(Transfer(vk, &buf, data), "CreateImageView failure: Buffer Transfer error");
 		ImageView conv_view;
-		CreateImageView(vk, &conv_view, w, h, VK_FORMAT_R32G32B32A32_SFLOAT);
+		CreateImageView(vk, &conv_view, w, h, FMT.RGBA_F32);
 		WriteConvDescriptor(vk, &buf);
 		WriteConvDescriptor(vk, &conv_view);
@ -783,10 +783,12 @@ CreateImageView(Vulkan* vk, ImageView* view, u32 w, u32 h, u32 ch, u8[] data)
 		FinishComputePass(vk);
-		vkWaitForFences(vk.device, 1, &vk.comp_fence, VK_TRUE, 1000000000);
+		vkWaitForFences(vk.device, 1, &vk.comp_fence, VK_TRUE, u64.max);
-		//Destroy(vk, &buf);
+		vkQueueWaitIdle(vk.tfer_queue);
-		//Destroy(&conv_view, vk.device, vk.vma);
+
 		Destroy(vk, &buf);
 		Destroy(&conv_view, vk.device, vk.vma);
 	}
 }
@ -834,7 +836,7 @@ FinishComputePass(Vulkan* vk)
 }
 pragma(inline): void
-CreateImageView(Vulkan* vk, ImageView* view, u32 w, u32 h, VkFormat format = VK_FORMAT_R8G8B8A8_SRGB)
+CreateImageView(Vulkan* vk, ImageView* view, u32 w, u32 h, Format format = FMT.RGBA_UNORM)
 {
 	VmaAllocationCreateInfo alloc_info = {
 		usage: VMA_MEMORY_USAGE_GPU_ONLY,
@ -849,7 +851,7 @@ CreateImageView(Vulkan* vk, ImageView* view, u32 w, u32 h, VkFormat format = VK_
 		format: format,
 		tiling: VK_IMAGE_TILING_OPTIMAL,
 		initialLayout: VK_IMAGE_LAYOUT_UNDEFINED,
-		usage: format == VK_FORMAT_R8G8B8A8_SRGB ? (VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT) : (VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT),
+		usage: format == FMT.RGBA_F32 ? (VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT) : (VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT), 
 		samples: VK_SAMPLE_COUNT_1_BIT,
 		extent: {
 			width: w,
--- a/src/shared/aliases.d
+++ b/src/shared/aliases.d
@ -1,5 +1,6 @@
 import core.memory;
 import std.stdint;
 import dplug.math;
 debug
 {
@ -28,3 +29,10 @@ alias b32 = uint;
 alias intptr = intptr_t;
 alias uintptr = uintptr_t;
 alias Vec2 = vec2f;
 alias Vec3 = vec3f;
 alias Vec4 = vec4f;
 alias Mat2 = mat2f;
 alias Mat3 = mat3f;
 alias Mat4 = mat4f;
--- a/src/shared/util.d
+++ b/src/shared/util.d
@ -359,38 +359,3 @@ Hash(string str)
 	return xxh3_64bits_withSeed(str.ptr, str.length, HASH_SEED);
 }
 struct Matrix(T, int S)
 {
 	T[S][S] m;
 	alias m this;
 }
 alias Mat2 = Matrix!(f32, 2);
 alias Mat3 = Matrix!(f32, 3);
 alias Mat4 = Matrix!(f32, 4);
 struct Vector(T, int S)
 {
 	union
 	{
 		struct
 		{
 			T r = 0.0;
 			T g = 0.0;
 			static if (S > 2) T b = 0.0;
 			static if (S > 3) T a = 0.0;
 		};
 		struct
 		{
 			T x;
 			T y;
 			static if (S > 2) T z;
 			static if (S > 3) T w;
 		};
 		T[S] v;
 	}
 }
 alias Vec2 = Vector!(f32, 2);
 alias Vec3 = Vector!(f32, 3);
 alias Vec4 = Vector!(f32, 4);