diff --git a/dub.json b/dub.json index f97fe16..296c243 100644 --- a/dub.json +++ b/dub.json @@ -9,8 +9,8 @@ "targetPath": "build", "sourceFiles-linux": ["build/libvma.a", "build/libstb_image.a", "build/libm3d.a"], "sourceFiles-windows": [], - "importPaths": ["src/gears", "src/shared", "src/generated", "external/xxhash"], - "sourcePaths": ["src/gears", "src/shared", "src/generated", "external/xxhash"], + "importPaths": ["src/gears", "src/shared", "src/generated", "external/xxhash", "external/dplug/math", "external/inteli"], + "sourcePaths": ["src/gears", "src/shared", "src/generated", "external/xxhash", "external/dplug/math", "external/inteli"], "libs-linux": ["xcb", "X11", "X11-xcb", "vulkan", "stdc++"], "libs-windows": [], "preGenerateCommands-linux": ["./build-vma.sh", "build/Codegen", "dub main:packer"], @@ -22,8 +22,8 @@ "targetType": "executable", "targetPath": "build", "targetName": "Packer", - "importPaths": ["src/packer", "src/shared", "src/generated", "external/xxhash"], - "sourcePaths": ["src/packer", "src/shared", "src/generated", "external/xxhash"], + "importPaths": ["src/packer", "src/shared", "src/generated", "external/xxhash", "external/dplug/math", "external/inteli"], + "sourcePaths": ["src/packer", "src/shared", "src/generated", "external/xxhash", "external/dplug/math", "external/inteli"], "sourceFiles-linux": ["build/libstb_image.a", "build/libm3d.a"], "preGenerateCommands-linux": ["./build-vma.sh"], "postGenerateCommands-linux": ["build/Packer"], @@ -35,8 +35,8 @@ "targetType": "executable", "targetPath": "build", "targetName": "Codegen", - "importPaths": ["src/codegen", "src/shared", "external/xxhash"], - "sourcePaths": ["src/codegen", "src/shared", "external/xxhash"], + "importPaths": ["src/codegen", "src/shared", "external/xxhash", "external/dplug/math", "external/inteli"], + "sourcePaths": ["src/codegen", "src/shared", "external/xxhash", "external/dplug/math", "external/inteli"], "sourceFiles-linux": ["build/libstb_image.a"], "preGenerateCommands-linux": ["./build-vma.sh"], "preGenerateCommands-windows": [], diff --git a/external/dplug/math/box.d b/external/dplug/math/box.d new file mode 100644 index 0000000..b330c3f --- /dev/null +++ b/external/dplug/math/box.d @@ -0,0 +1,689 @@ +/** + * N-dimensional half-open interval [a, b[. + * + * Copyright: Copyright Guillaume Piolat 2015-2021. + * Copyright Ahmet Sait 2021. + * Copyright Ryan Roden-Corrent 2016. + * Copyright Nathan Sashihara 2018. + * Copyright Colden Cullen 2014. + * + * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) + */ +module dplug.math.box; + +import std.math, + std.traits; + +import dplug.math.vector; + +/// N-dimensional half-open interval [a, b[. +struct Box(T, int N) +{ + static assert(N > 0); + + public + { + alias bound_t = Vector!(T, N); + + bound_t min; // not enforced, the box can have negative volume + bound_t max; + + /// Construct a box which extends between 2 points. + /// Boundaries: min is inside the box, max is just outside. 
+ @nogc this(bound_t min_, bound_t max_) pure nothrow + { + min = min_; + max = max_; + } + + static if (N == 1) + { + @nogc this(T min_, T max_) pure nothrow + { + min.x = min_; + max.x = max_; + } + } + + static if (N == 2) + { + @nogc this(T min_x, T min_y, T max_x, T max_y) pure nothrow + { + min = bound_t(min_x, min_y); + max = bound_t(max_x, max_y); + } + } + + static if (N == 3) + { + @nogc this(T min_x, T min_y, T min_z, T max_x, T max_y, T max_z) pure nothrow + { + min = bound_t(min_x, min_y, min_z); + max = bound_t(max_x, max_y, max_z); + } + } + + @property + { + /// Returns: Dimensions of the box. + @nogc bound_t size() pure const nothrow + { + return max - min; + } + + /// Sets size of the box assuming min point is the pivot. + /// Returns: Dimensions of the box. + @nogc bound_t size(bound_t value) pure nothrow + { + max = min + value; + return value; + } + + /// Returns: Center of the box. + @nogc bound_t center() pure const nothrow + { + return (min + max) / 2; + } + + static if (N >= 1) + { + /// Returns: Width of the box, always applicable. + @nogc T width() pure const nothrow @property + { + return max.x - min.x; + } + + /// Sets width of the box assuming min point is the pivot. + /// Returns: Width of the box, always applicable. + @nogc T width(T value) pure nothrow @property + { + max.x = min.x + value; + return value; + } + } + + static if (N >= 2) + { + /// Returns: Height of the box, if applicable. + @nogc T height() pure const nothrow @property + { + return max.y - min.y; + } + + /// Sets height of the box assuming min point is the pivot. + /// Returns: Height of the box, if applicable. + @nogc T height(T value) pure nothrow @property + { + max.y = min.y + value; + return value; + } + } + + static if (N >= 3) + { + /// Returns: Depth of the box, if applicable. + @nogc T depth() pure const nothrow @property + { + return max.z - min.z; + } + + /// Sets depth of the box assuming min point is the pivot. + /// Returns: Depth of the box, if applicable. + @nogc T depth(T value) pure nothrow @property + { + max.z = min.z + value; + return value; + } + } + + /// Returns: Signed volume of the box. + @nogc T volume() pure const nothrow + { + T res = 1; + bound_t size = size(); + for(int i = 0; i < N; ++i) + res *= size[i]; + return res; + } + + /// Returns: true if empty. + @nogc bool empty() pure const nothrow + { + bound_t size = size(); + mixin(generateLoopCode!("if (min[@] == max[@]) return true;", N)()); + return false; + } + } + + /// Returns: true if it contains point. + @nogc bool contains(bound_t point) pure const nothrow + { + assert(isSorted()); + for(int i = 0; i < N; ++i) + if ( !(point[i] >= min[i] && point[i] < max[i]) ) + return false; + + return true; + } + + static if (N >= 2) + { + /// Returns: true if it contains point `x`, `y`. + @nogc bool contains(T x, T y) pure const nothrow + { + assert(isSorted()); + if ( !(x >= min.x && x < max.x) ) + return false; + if ( !(y >= min.y && y < max.y) ) + return false; + return true; + } + } + + static if (N >= 3) + { + /// Returns: true if it contains point `x`, `y`, `z`. + @nogc bool contains(T x, T y, T z) pure const nothrow + { + assert(isSorted()); + if ( !(x >= min.x && x < max.x) ) + return false; + if ( !(y >= min.y && y < max.y) ) + return false; + if ( !(z >= min.z && z < max.z) ) + return false; + return true; + } + } + + /// Returns: true if it contains box other. 
+ @nogc bool contains(Box other) pure const nothrow + { + assert(isSorted()); + assert(other.isSorted()); + + mixin(generateLoopCode!("if ( (other.min[@] < min[@]) || (other.max[@] > max[@]) ) return false;", N)()); + return true; + } + + /// Euclidean squared distance from a point. + /// See_also: Numerical Recipes Third Edition (2007) + @nogc real squaredDistance(bound_t point) pure const nothrow + { + assert(isSorted()); + real distanceSquared = 0; + for (int i = 0; i < N; ++i) + { + if (point[i] < min[i]) + distanceSquared += (point[i] - min[i]) ^^ 2; + + if (point[i] > max[i]) + distanceSquared += (point[i] - max[i]) ^^ 2; + } + return distanceSquared; + } + + /// Euclidean distance from a point. + /// See_also: squaredDistance. + @nogc real distance(bound_t point) pure const nothrow + { + return sqrt(squaredDistance(point)); + } + + /// Euclidean squared distance from another box. + /// See_also: Numerical Recipes Third Edition (2007) + @nogc real squaredDistance(Box o) pure const nothrow + { + assert(isSorted()); + assert(o.isSorted()); + real distanceSquared = 0; + for (int i = 0; i < N; ++i) + { + if (o.max[i] < min[i]) + distanceSquared += (o.max[i] - min[i]) ^^ 2; + + if (o.min[i] > max[i]) + distanceSquared += (o.min[i] - max[i]) ^^ 2; + } + return distanceSquared; + } + + /// Euclidean distance from another box. + /// See_also: squaredDistance. + @nogc real distance(Box o) pure const nothrow + { + return sqrt(squaredDistance(o)); + } + + /// Assumes sorted boxes. + /// This function deals with empty boxes correctly. + /// Returns: Intersection of two boxes. + @nogc Box intersection(Box o) pure const nothrow + { + assert(isSorted()); + assert(o.isSorted()); + + // Return an empty box if one of the boxes is empty + if (empty()) + return this; + + if (o.empty()) + return o; + + Box result = void; + for (int i = 0; i < N; ++i) + { + T maxOfMins = (min.v[i] > o.min.v[i]) ? min.v[i] : o.min.v[i]; + T minOfMaxs = (max.v[i] < o.max.v[i]) ? max.v[i] : o.max.v[i]; + result.min.v[i] = maxOfMins; + result.max.v[i] = minOfMaxs >= maxOfMins ? minOfMaxs : maxOfMins; + } + return result; + } + + /// Assumes sorted boxes. + /// This function deals with empty boxes correctly. + /// Returns: Intersection of two boxes. + @nogc bool intersects(Box other) pure const nothrow + { + Box inter = this.intersection(other); + return inter.isSorted() && !inter.empty(); + } + + /// Extends the area of this Box. + @nogc Box grow(bound_t space) pure const nothrow + { + Box res = this; + res.min -= space; + res.max += space; + return res; + } + + /// Shrink the area of this Box. The box might became unsorted. + @nogc Box shrink(bound_t space) pure const nothrow + { + return grow(-space); + } + + /// Extends the area of this Box. + @nogc Box grow(T space) pure const nothrow + { + return grow(bound_t(space)); + } + + /// Translate this Box. + @nogc Box translate(bound_t offset) pure const nothrow + { + return Box(min + offset, max + offset); + } + + /// Scale the box by factor `scale`, and round the result to integer if needed. 
+ @nogc Box scaleByFactor(float scale) const nothrow + { + Box res; + static if (isFloatingPoint!T) + { + res.min.x = min.x * scale; + res.min.y = min.y * scale; + res.max.x = max.x * scale; + res.max.y = max.y * scale; + } + else + { + res.min.x = cast(T)( round(min.x * scale) ); + res.min.y = cast(T)( round(min.y * scale) ); + res.max.x = cast(T)( round(max.x * scale) ); + res.max.y = cast(T)( round(max.y * scale) ); + } + return res; + } + + static if (N == 2) // useful for UI that have horizontal and vertical scale + { + /// Scale the box by factor `scaleX` horizontally and `scaleY` vetically. + /// Round the result to integer if needed. + @nogc Box scaleByFactor(float scaleX, float scaleY) const nothrow + { + Box res; + static if (isFloatingPoint!T) + { + res.min.x = min.x * scaleX; + res.min.y = min.y * scaleY; + res.max.x = max.x * scaleX; + res.max.y = max.y * scaleY; + } + else + { + res.min.x = cast(T)( round(min.x * scaleX) ); + res.min.y = cast(T)( round(min.y * scaleY) ); + res.max.x = cast(T)( round(max.x * scaleX) ); + res.max.y = cast(T)( round(max.y * scaleY) ); + } + return res; + } + } + + static if (N >= 2) + { + /// Translate this Box by `x`, `y`. + @nogc Box translate(T x, T y) pure const nothrow + { + Box res = this; + res.min.x += x; + res.min.y += y; + res.max.x += x; + res.max.y += y; + return res; + } + } + + static if (N >= 3) + { + /// Translate this Box by `x`, `y`. + @nogc Box translate(T x, T y, T z) pure const nothrow + { + Box res = this; + res.min.x += x; + res.min.y += y; + res.min.z += z; + res.max.x += x; + res.max.y += y; + res.max.z += z; + return res; + } + } + + /// Shrinks the area of this Box. + /// Returns: Shrinked box. + @nogc Box shrink(T space) pure const nothrow + { + return shrink(bound_t(space)); + } + + /// Expands the box to include point. + /// Returns: Expanded box. + @nogc Box expand(bound_t point) pure const nothrow + { + import vector = dplug.math.vector; + return Box(vector.minByElem(min, point), vector.maxByElem(max, point)); + } + + /// Expands the box to include another box. + /// This function deals with empty boxes correctly. + /// Returns: Expanded box. + @nogc Box expand(Box other) pure const nothrow + { + assert(isSorted()); + assert(other.isSorted()); + + // handle empty boxes + if (empty()) + return other; + if (other.empty()) + return this; + + Box result = void; + for (int i = 0; i < N; ++i) + { + T minOfMins = (min.v[i] < other.min.v[i]) ? min.v[i] : other.min.v[i]; + T maxOfMaxs = (max.v[i] > other.max.v[i]) ? max.v[i] : other.max.v[i]; + result.min.v[i] = minOfMins; + result.max.v[i] = maxOfMaxs; + } + return result; + } + + /// Returns: true if each dimension of the box is >= 0. + @nogc bool isSorted() pure const nothrow + { + for(int i = 0; i < N; ++i) + { + if (min[i] > max[i]) + return false; + } + return true; + } + + /// Returns: Absolute value of the Box to ensure each dimension of the + /// box is >= 0. + @nogc Box abs() pure const nothrow + { + Box!(T, N) s = this; + for (int i = 0; i < N; ++i) + { + if (s.min.v[i] > s.max.v[i]) + { + T tmp = s.min.v[i]; + s.min.v[i] = s.max.v[i]; + s.max.v[i] = tmp; + } + } + return s; + } + + /// Assign with another box. 
+ @nogc ref Box opAssign(U)(U x) nothrow if (isBox!U) + { + static if(is(U.element_t : T)) + { + static if(U._size == _size) + { + min = x.min; + max = x.max; + } + else + { + static assert(false, "no conversion between boxes with different dimensions"); + } + } + else + { + static assert(false, "no conversion from " ~ U.element_t.stringof ~ " to " ~ element_t.stringof); + } + return this; + } + + /// Returns: true if comparing equal boxes. + @nogc bool opEquals(U)(U other) pure const nothrow if (is(U : Box)) + { + return (min == other.min) && (max == other.max); + } + + /// Cast to other box types. + @nogc U opCast(U)() pure const nothrow if (isBox!U) + { + U b = void; + for(int i = 0; i < N; ++i) + { + b.min[i] = cast(U.element_t)(min[i]); + b.max[i] = cast(U.element_t)(max[i]); + } + return b; // return a box where each element has been casted + } + + static if (N == 2) + { + /// Helper function to create rectangle with a given point, width and height. + static @nogc Box rectangle(T x, T y, T width, T height) pure nothrow + { + return Box(x, y, x + width, y + height); + } + } + } + + private + { + enum _size = N; + alias T element_t; + } +} + +/// Instanciate to use a 2D box. +template box2(T) +{ + alias Box!(T, 2) box2; +} + +/// Instanciate to use a 3D box. +template box3(T) +{ + alias Box!(T, 3) box3; +} + + +alias box2!int box2i; /// 2D box with integer coordinates. +alias box3!int box3i; /// 3D box with integer coordinates. +alias box2!float box2f; /// 2D box with float coordinates. +alias box3!float box3f; /// 3D box with float coordinates. +alias box2!double box2d; /// 2D box with double coordinates. +alias box3!double box3d; /// 3D box with double coordinates. + +/// Returns: A 2D rectangle with point `x`,`y`, `width` and `height`. +box2i rectangle(int x, int y, int width, int height) pure nothrow @nogc +{ + return box2i(x, y, x + width, y + height); +} + +/// Returns: A 2D rectangle with point `x`,`y`, `width` and `height`. +box2f rectanglef(float x, float y, float width, float height) pure nothrow @nogc +{ + return box2f(x, y, x + width, y + height); +} + +/// Returns: A 2D rectangle with point `x`,`y`, `width` and `height`. 
+box2d rectangled(double x, double y, double width, double height) pure nothrow @nogc +{ + return box2d(x, y, x + width, y + height); +} + + +unittest +{ + box2i a = box2i(1, 2, 3, 4); + assert(a.width == 2); + assert(a.height == 2); + assert(a.volume == 4); + box2i b = box2i(vec2i(1, 2), vec2i(3, 4)); + assert(a == b); + + box3i q = box3i(-3, -2, -1, 0, 1, 2); + q.bound_t s = q.bound_t(11, 17, 19); + q.bound_t q_min = q.min; + assert((q.size = s) == s); + assert(q.size == s); + assert(q.min == q_min); + assert(q.max == q.min + s); + assert(q.max - q.min == s); + + assert((q.width = s.z) == s.z); + assert(q.width == s.z); + assert(q.min.x == q_min.x); + assert(q.max.x == q.min.x + s.z); + assert(q.max.x - q.min.x == s.z); + + assert((q.height = s.y) == s.y); + assert(q.height == s.y); + assert(q.min.y == q_min.y); + assert(q.max.y == q.min.y + s.y); + assert(q.max.y - q.min.y == s.y); + + assert((q.depth = s.x) == s.x); + assert(q.depth == s.x); + assert(q.min.z == q_min.z); + assert(q.max.z == q.min.z + s.x); + assert(q.max.z - q.min.z == s.x); + + assert(q.size == s.zyx); + + box3i n = box3i(2, 1, 0, -1, -2, -3); + assert(n.abs == box3i(-1, -2, -3, 2, 1, 0)); + + box2f bf = cast(box2f)b; + assert(bf == box2f(1.0f, 2.0f, 3.0f, 4.0f)); + + box3f qf = box3f(-0, 1f, 2.5f, 3.25f, 5.125f, 7.0625f); + qf.bound_t sf = qf.bound_t(-11.5f, -17.25f, -19.125f); + qf.bound_t qf_min = qf.min; + assert((qf.size = sf) == sf); + assert(qf.size == sf); + assert(qf.min == qf_min); + assert(qf.max == qf.min + sf); + assert(qf.max - qf.min == sf); + + assert((qf.width = sf.z) == sf.z); + assert(qf.width == sf.z); + assert(qf.min.x == qf_min.x); + assert(qf.max.x == qf.min.x + sf.z); + assert(qf.max.x - qf.min.x == sf.z); + + assert((qf.height = sf.y) == sf.y); + assert(qf.height == sf.y); + assert(qf.min.y == qf_min.y); + assert(qf.max.y == qf.min.y + sf.y); + assert(qf.max.y - qf.min.y == sf.y); + + assert((qf.depth = sf.x) == sf.x); + assert(qf.depth == sf.x); + assert(qf.min.z == qf_min.z); + assert(qf.max.z == qf.min.z + sf.x); + assert(qf.max.z - qf.min.z == sf.x); + + assert(qf.size == sf.zyx); + + box2i c = box2i(0, 0, 1,1); + assert(c.translate(vec2i(3, 3)) == box2i(3, 3, 4, 4)); + assert(c.translate(3, 3) == box2i(3, 3, 4, 4)); + assert(c.contains(vec2i(0, 0))); + assert(c.contains(0, 0)); + assert(!c.contains(vec2i(1, 1))); + assert(!c.contains(1, 1)); + assert(b.contains(b)); + box2i d = c.expand(vec2i(3, 3)); + assert(d.contains(vec2i(2, 2))); + + assert(d == d.expand(d)); + + assert(!box2i(0, 0, 4, 4).contains(box2i(2, 2, 6, 6))); + + assert(box2f(0, 0, 0, 0).empty()); + assert(!box2f(0, 2, 1, 1).empty()); + assert(!box2f(0, 0, 1, 1).empty()); + + assert(box2i(260, 100, 360, 200).intersection(box2i(100, 100, 200, 200)).empty()); + + // union with empty box is identity + assert(a.expand(box2i(10, 4, 10, 6)) == a); + + // intersection with empty box is empty + assert(a.intersection(box2i(10, 4, 10, 6)).empty); + + assert(box2i.rectangle(1, 2, 3, 4) == box2i(1, 2, 4, 6)); + assert(rectangle(1, 2, 3, 4) == box2i(1, 2, 4, 6)); + assert(rectanglef(1, 2, 3, 4) == box2f(1, 2, 4, 6)); + assert(rectangled(1, 2, 3, 4) == box2d(1, 2, 4, 6)); + + assert(rectangle(10, 10, 20, 20).scaleByFactor(1.5f) == rectangle(15, 15, 30, 30)); + assert(rectangle(10, 10, 20, 20).scaleByFactor(1.5f, 2.0f) == rectangle(15, 20, 30, 40)); +} + +/// True if `T` is a kind of Box +enum isBox(T) = is(T : Box!U, U...); + +unittest +{ + static assert( isBox!box2f); + static assert( isBox!box3d); + static assert( isBox!(Box!(real, 
2))); + static assert(!isBox!vec2f); +} + +/// Get the numeric type used to measure a box's dimensions. +alias DimensionType(T : Box!U, U...) = U[0]; + +/// +unittest +{ + static assert(is(DimensionType!box2f == float)); + static assert(is(DimensionType!box3d == double)); +} + diff --git a/external/dplug/math/matrix.d b/external/dplug/math/matrix.d new file mode 100644 index 0000000..6cdf82b --- /dev/null +++ b/external/dplug/math/matrix.d @@ -0,0 +1,852 @@ +/** + * Custom sized 2D Matrices. + * + * Copyright: Copyright Guillaume Piolat 2015-2021. + * Copyright Aleksandr Druzhinin 2016-2020. + * Copyright Nathan Sashihara 2018. + * Copyright Thibaut Charles 2018. + * + * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) + */ +module dplug.math.matrix; + +import std.math, + std.typetuple, + std.traits, + std.typecons; + +import dplug.math.vector; + +/// Generic non-resizeable matrix with R rows and C columns. +/// Intended for 3D use (size 3x3 and 4x4). +/// Important: Matrices here are in row-major order whereas OpenGL is column-major. +/// Params: +/// T = type of elements +/// R = number of rows +/// C = number of columns +struct Matrix(T, int R, int C) +{ + public + { + static assert(R >= 1 && C >= 1); + + alias Vector!(T, C) row_t; + alias Vector!(T, R) column_t; + + enum bool isSquare = (R == C); + + // fields definition + union + { + T[C*R] v; // all elements + row_t[R] rows; // all rows + T[C][R] c; // components + } + + @nogc this(U...)(U values) pure nothrow + { + static if ((U.length == C*R) && allSatisfy!(isTAssignable, U)) + { + // construct with components + foreach(int i, x; values) + v[i] = x; + } + else static if ((U.length == 1) && (isAssignable!(U[0])) && (!is(U[0] : Matrix))) + { + // construct with assignment + opAssign!(U[0])(values[0]); + } + else static assert(false, "cannot create a matrix from given arguments"); + } + + /// Construct a matrix from columns. + @nogc static Matrix fromColumns(column_t[] columns) pure nothrow + { + assert(columns.length == C); + Matrix res; + for (int i = 0; i < R; ++i) + for (int j = 0; j < C; ++j) + { + res.c[i][j] = columns[j][i]; + } + return res; + } + + /// Construct a matrix from rows. + @nogc static Matrix fromRows(row_t[] rows) pure nothrow + { + assert(rows.length == R); + Matrix res; + res.rows[] = rows[]; + return res; + } + + /// Construct matrix with a scalar. + @nogc this(U)(T x) pure nothrow + { + for (int i = 0; i < _N; ++i) + v[i] = x; + } + + /// Assign with a scalar. + @nogc ref Matrix opAssign(U : T)(U x) pure nothrow + { + for (int i = 0; i < R * C; ++i) + v[i] = x; + return this; + } + + /// Assign with a samey matrice. + @nogc ref Matrix opAssign(U : Matrix)(U x) pure nothrow + { + for (int i = 0; i < R * C; ++i) + v[i] = x.v[i]; + return this; + } + + /// Assign from other small matrices (same size, compatible type). + @nogc ref Matrix opAssign(U)(U x) pure nothrow + if (isMatrixInstantiation!U + && is(U._T : _T) + && (!is(U: Matrix)) + && (U._R == R) && (U._C == C)) + { + for (int i = 0; i < R * C; ++i) + v[i] = x.v[i]; + return this; + } + + /// Assign with a static array of size R * C. + @nogc ref Matrix opAssign(U)(U x) pure nothrow + if ((isStaticArray!U) + && is(typeof(x[0]) : T) + && (U.length == R * C)) + { + for (int i = 0; i < R * C; ++i) + v[i] = x[i]; + return this; + } + + /// Assign with a static array of shape (R, C). 
+ @nogc ref Matrix opAssign(U)(U x) pure nothrow + if ((isStaticArray!U) && isStaticArray!(typeof(x[0])) + && is(typeof(x[0][0]) : T) + && (U.length == R) + && (x[0].length == C)) + { + foreach (i; 0..R) + rows[i] = x[i]; + return this; + } + + /// Assign with a dynamic array of size R * C. + @nogc ref Matrix opAssign(U)(U x) pure nothrow + if ((isDynamicArray!U) + && is(typeof(x[0]) : T)) + { + assert(x.length == R * C); + for (int i = 0; i < R * C; ++i) + v[i] = x[i]; + return this; + } + + /// Assign with a dynamic array of shape (R, C). + @nogc ref Matrix opAssign(U)(U x) pure nothrow + if ((isDynamicArray!U) && isDynamicArray!(typeof(x[0])) + && is(typeof(x[0][0]) : T)) + { + assert(x.length == R); + foreach (i; 0..R) + { + assert(x[i].length == C); + rows[i] = x[i]; + } + return this; + } + + /// Return a pointer to content. + @nogc inout(T)* ptr() pure inout nothrow @property + { + return v.ptr; + } + + /// Returns a column as a vector + /// Returns: column j as a vector. + @nogc column_t column(int j) pure const nothrow + { + column_t res = void; + for (int i = 0; i < R; ++i) + res.v[i] = c[i][j]; + return res; + } + + /// Returns a row as a vector + /// Returns: row i as a vector. + @nogc row_t row(int i) pure const nothrow + { + return rows[i]; + } + + /// Matrix * scalar multiplication. + @nogc Matrix opBinary(string op)(T factor) pure const nothrow if (op == "*") + { + Matrix result = void; + + for (int i = 0; i < R; ++i) + { + for (int j = 0; j < C; ++j) + { + result.c[i][j] = c[i][j] * factor; + } + } + return result; + } + + /// Matrix * vector multiplication. + @nogc column_t opBinary(string op)(row_t x) pure const nothrow if (op == "*") + { + column_t res = void; + for (int i = 0; i < R; ++i) + { + T sum = 0; + for (int j = 0; j < C; ++j) + { + sum += c[i][j] * x.v[j]; + } + res.v[i] = sum; + } + return res; + } + + /// Matrix * matrix multiplication. + @nogc auto opBinary(string op, U)(U x) pure const nothrow + if (isMatrixInstantiation!U && (U._R == C) && (op == "*")) + { + Matrix!(T, R, U._C) result = void; + + for (int i = 0; i < R; ++i) + { + for (int j = 0; j < U._C; ++j) + { + T sum = 0; + for (int k = 0; k < C; ++k) + sum += c[i][k] * x.c[k][j]; + result.c[i][j] = sum; + } + } + return result; + } + + /// Matrix add and substraction. + @nogc Matrix opBinary(string op, U)(U other) pure const nothrow + if (is(U : Matrix) && (op == "+" || op == "-")) + { + Matrix result = void; + + for (int i = 0; i < R; ++i) + { + for (int j = 0; j < C; ++j) + { + mixin("result.c[i][j] = c[i][j] " ~ op ~ " other.c[i][j];"); + } + } + return result; + } + + // matrix *= scalar + @nogc ref Matrix opOpAssign(string op, U : T)(U x) pure nothrow if (op == "*") + { + for (int i = 0; i < R * C; ++i) + v[i] *= x; + return this; + } + + /// Assignment operator with another samey matrix. + @nogc ref Matrix opOpAssign(string op, U)(U operand) pure nothrow + if (is(U : Matrix) && (op == "*" || op == "+" || op == "-")) + { + mixin("Matrix result = this " ~ op ~ " operand;"); + return opAssign!Matrix(result); + } + + /// Matrix += + /// Matrix -= + @nogc ref Matrix opOpAssign(string op, U)(U operand) pure nothrow + if ((isConvertible!U) && (op == "*" || op == "+" || op == "-")) + { + Matrix conv = operand; + return opOpAssign!op(conv); + } + + /// Cast to other matrix types. + /// If the size are different, the resulting matrix is truncated + /// and/or filled with identity coefficients. 
+ @nogc U opCast(U)() pure const nothrow if (isMatrixInstantiation!U) + { + U res = U.identity(); + enum minR = R < U._R ? R : U._R; + enum minC = C < U._C ? C : U._C; + for (int i = 0; i < minR; ++i) + for (int j = 0; j < minC; ++j) + { + res.c[i][j] = cast(U._T)(c[i][j]); + } + return res; + } + + @nogc bool opEquals(U)(U other) pure const nothrow if (is(U : Matrix)) + { + for (int i = 0; i < R * C; ++i) + if (v[i] != other.v[i]) + return false; + return true; + } + + @nogc bool opEquals(U)(U other) pure const nothrow + if ((isAssignable!U) && (!is(U: Matrix))) + { + Matrix conv = other; + return opEquals(conv); + } + + // +matrix, -matrix, ~matrix, !matrix + @nogc Matrix opUnary(string op)() pure const nothrow if (op == "+" || op == "-" || op == "~" || op == "!") + { + Matrix res = void; + for (int i = 0; i < N; ++i) + mixin("res.v[i] = " ~ op ~ "v[i];"); + return res; + } + + static if (isSquare && isFloatingPoint!T && R == 1) + { + /// Returns an inverted copy of this matrix + /// Returns: inverse of matrix. + /// Note: Matrix inversion is provided for 1x1, 2x2, 3x3 and 4x4 floating point matrices. + @nogc Matrix inverse() pure const nothrow + { + assert(c[0][0] != 0); // Programming error if matrix is not invertible. + return Matrix( 1 / c[0][0]); + } + } + + static if (isSquare && isFloatingPoint!T && R == 2) + { + /// Returns an inverted copy of this matrix + /// Returns: inverse of matrix. + /// Note: Matrix inversion is provided for 1x1, 2x2, 3x3 and 4x4 floating point matrices. + @nogc Matrix inverse() pure const nothrow + { + T det = (c[0][0] * c[1][1] - c[0][1] * c[1][0]); + assert(det != 0); // Programming error if matrix is not invertible. + T invDet = 1 / det; + return Matrix( c[1][1] * invDet, -c[0][1] * invDet, + -c[1][0] * invDet, c[0][0] * invDet); + } + } + + static if (isSquare && isFloatingPoint!T && R == 3) + { + /// Returns an inverted copy of this matrix + /// Returns: inverse of matrix. + /// Note: Matrix inversion is provided for 1x1, 2x2, 3x3 and 4x4 floating point matrices. + @nogc Matrix inverse() pure const nothrow + { + T det = c[0][0] * (c[1][1] * c[2][2] - c[2][1] * c[1][2]) + - c[0][1] * (c[1][0] * c[2][2] - c[1][2] * c[2][0]) + + c[0][2] * (c[1][0] * c[2][1] - c[1][1] * c[2][0]); + assert(det != 0); // Programming error if matrix is not invertible. + T invDet = 1 / det; + + Matrix res = void; + res.c[0][0] = (c[1][1] * c[2][2] - c[2][1] * c[1][2]) * invDet; + res.c[0][1] = -(c[0][1] * c[2][2] - c[0][2] * c[2][1]) * invDet; + res.c[0][2] = (c[0][1] * c[1][2] - c[0][2] * c[1][1]) * invDet; + res.c[1][0] = -(c[1][0] * c[2][2] - c[1][2] * c[2][0]) * invDet; + res.c[1][1] = (c[0][0] * c[2][2] - c[0][2] * c[2][0]) * invDet; + res.c[1][2] = -(c[0][0] * c[1][2] - c[1][0] * c[0][2]) * invDet; + res.c[2][0] = (c[1][0] * c[2][1] - c[2][0] * c[1][1]) * invDet; + res.c[2][1] = -(c[0][0] * c[2][1] - c[2][0] * c[0][1]) * invDet; + res.c[2][2] = (c[0][0] * c[1][1] - c[1][0] * c[0][1]) * invDet; + return res; + } + } + + static if (isSquare && isFloatingPoint!T && R == 4) + { + /// Returns an inverted copy of this matrix + /// Returns: inverse of matrix. + /// Note: Matrix inversion is provided for 1x1, 2x2, 3x3 and 4x4 floating point matrices. 
+ @nogc Matrix inverse() pure const nothrow + { + T det2_01_01 = c[0][0] * c[1][1] - c[0][1] * c[1][0]; + T det2_01_02 = c[0][0] * c[1][2] - c[0][2] * c[1][0]; + T det2_01_03 = c[0][0] * c[1][3] - c[0][3] * c[1][0]; + T det2_01_12 = c[0][1] * c[1][2] - c[0][2] * c[1][1]; + T det2_01_13 = c[0][1] * c[1][3] - c[0][3] * c[1][1]; + T det2_01_23 = c[0][2] * c[1][3] - c[0][3] * c[1][2]; + + T det3_201_012 = c[2][0] * det2_01_12 - c[2][1] * det2_01_02 + c[2][2] * det2_01_01; + T det3_201_013 = c[2][0] * det2_01_13 - c[2][1] * det2_01_03 + c[2][3] * det2_01_01; + T det3_201_023 = c[2][0] * det2_01_23 - c[2][2] * det2_01_03 + c[2][3] * det2_01_02; + T det3_201_123 = c[2][1] * det2_01_23 - c[2][2] * det2_01_13 + c[2][3] * det2_01_12; + + T det = - det3_201_123 * c[3][0] + det3_201_023 * c[3][1] - det3_201_013 * c[3][2] + det3_201_012 * c[3][3]; + assert(det != 0); // Programming error if matrix is not invertible. + T invDet = 1 / det; + + T det2_03_01 = c[0][0] * c[3][1] - c[0][1] * c[3][0]; + T det2_03_02 = c[0][0] * c[3][2] - c[0][2] * c[3][0]; + T det2_03_03 = c[0][0] * c[3][3] - c[0][3] * c[3][0]; + T det2_03_12 = c[0][1] * c[3][2] - c[0][2] * c[3][1]; + T det2_03_13 = c[0][1] * c[3][3] - c[0][3] * c[3][1]; + T det2_03_23 = c[0][2] * c[3][3] - c[0][3] * c[3][2]; + T det2_13_01 = c[1][0] * c[3][1] - c[1][1] * c[3][0]; + T det2_13_02 = c[1][0] * c[3][2] - c[1][2] * c[3][0]; + T det2_13_03 = c[1][0] * c[3][3] - c[1][3] * c[3][0]; + T det2_13_12 = c[1][1] * c[3][2] - c[1][2] * c[3][1]; + T det2_13_13 = c[1][1] * c[3][3] - c[1][3] * c[3][1]; + T det2_13_23 = c[1][2] * c[3][3] - c[1][3] * c[3][2]; + + T det3_203_012 = c[2][0] * det2_03_12 - c[2][1] * det2_03_02 + c[2][2] * det2_03_01; + T det3_203_013 = c[2][0] * det2_03_13 - c[2][1] * det2_03_03 + c[2][3] * det2_03_01; + T det3_203_023 = c[2][0] * det2_03_23 - c[2][2] * det2_03_03 + c[2][3] * det2_03_02; + T det3_203_123 = c[2][1] * det2_03_23 - c[2][2] * det2_03_13 + c[2][3] * det2_03_12; + + T det3_213_012 = c[2][0] * det2_13_12 - c[2][1] * det2_13_02 + c[2][2] * det2_13_01; + T det3_213_013 = c[2][0] * det2_13_13 - c[2][1] * det2_13_03 + c[2][3] * det2_13_01; + T det3_213_023 = c[2][0] * det2_13_23 - c[2][2] * det2_13_03 + c[2][3] * det2_13_02; + T det3_213_123 = c[2][1] * det2_13_23 - c[2][2] * det2_13_13 + c[2][3] * det2_13_12; + + T det3_301_012 = c[3][0] * det2_01_12 - c[3][1] * det2_01_02 + c[3][2] * det2_01_01; + T det3_301_013 = c[3][0] * det2_01_13 - c[3][1] * det2_01_03 + c[3][3] * det2_01_01; + T det3_301_023 = c[3][0] * det2_01_23 - c[3][2] * det2_01_03 + c[3][3] * det2_01_02; + T det3_301_123 = c[3][1] * det2_01_23 - c[3][2] * det2_01_13 + c[3][3] * det2_01_12; + + Matrix res = void; + res.c[0][0] = - det3_213_123 * invDet; + res.c[1][0] = + det3_213_023 * invDet; + res.c[2][0] = - det3_213_013 * invDet; + res.c[3][0] = + det3_213_012 * invDet; + + res.c[0][1] = + det3_203_123 * invDet; + res.c[1][1] = - det3_203_023 * invDet; + res.c[2][1] = + det3_203_013 * invDet; + res.c[3][1] = - det3_203_012 * invDet; + + res.c[0][2] = + det3_301_123 * invDet; + res.c[1][2] = - det3_301_023 * invDet; + res.c[2][2] = + det3_301_013 * invDet; + res.c[3][2] = - det3_301_012 * invDet; + + res.c[0][3] = - det3_201_123 * invDet; + res.c[1][3] = + det3_201_023 * invDet; + res.c[2][3] = - det3_201_013 * invDet; + res.c[3][3] = + det3_201_012 * invDet; + return res; + } + } + + /// Returns a transposed copy of this matrix + /// Returns: transposed matrice. 
+ @nogc Matrix!(T, C, R) transposed() pure const nothrow + { + Matrix!(T, C, R) res; + for (int i = 0; i < C; ++i) + for (int j = 0; j < R; ++j) + res.c[i][j] = c[j][i]; + return res; + } + + static if (isSquare && R > 1) + { + /// Makes a diagonal matrix from a vector. + @nogc static Matrix diag(Vector!(T, R) v) pure nothrow + { + Matrix res = void; + for (int i = 0; i < R; ++i) + for (int j = 0; j < C; ++j) + res.c[i][j] = (i == j) ? v.v[i] : 0; + return res; + } + + /// In-place translate by (v, 1) + @nogc void translate(Vector!(T, R-1) v) pure nothrow + { + for (int i = 0; i < R; ++i) + { + T dot = 0; + for (int j = 0; j + 1 < C; ++j) + dot += v.v[j] * c[i][j]; + + c[i][C-1] += dot; + } + } + + /// Make a translation matrix. + @nogc static Matrix translation(Vector!(T, R-1) v) pure nothrow + { + Matrix res = identity(); + for (int i = 0; i + 1 < R; ++i) + res.c[i][C-1] += v.v[i]; + return res; + } + + /// In-place matrix scaling. + void scale(Vector!(T, R-1) v) pure nothrow + { + for (int i = 0; i < R; ++i) + for (int j = 0; j + 1 < C; ++j) + c[i][j] *= v.v[j]; + } + + /// Make a scaling matrix. + @nogc static Matrix scaling(Vector!(T, R-1) v) pure nothrow + { + Matrix res = identity(); + for (int i = 0; i + 1 < R; ++i) + res.c[i][i] = v.v[i]; + return res; + } + } + + // rotations are implemented for 3x3 and 4x4 matrices. + static if (isSquare && (R == 3 || R == 4) && isFloatingPoint!T) + { + @nogc public static Matrix rotateAxis(int i, int j)(T angle) pure nothrow + { + Matrix res = identity(); + const T cosa = cos(angle); + const T sina = sin(angle); + res.c[i][i] = cosa; + res.c[i][j] = -sina; + res.c[j][i] = sina; + res.c[j][j] = cosa; + return res; + } + + /// Rotate along X axis + /// Returns: rotation matrix along axis X + alias rotateAxis!(1, 2) rotateX; + + /// Rotate along Y axis + /// Returns: rotation matrix along axis Y + alias rotateAxis!(2, 0) rotateY; + + /// Rotate along Z axis + /// Returns: rotation matrix along axis Z + alias rotateAxis!(0, 1) rotateZ; + + /// Similar to the glRotate matrix, however the angle is expressed in radians + /// See_also: $(LINK http://www.cs.rutgers.edu/~decarlo/428/gl_man/rotate.html) + @nogc static Matrix rotation(T angle, vec3!T axis) pure nothrow + { + Matrix res = identity(); + const T c = cos(angle); + const oneMinusC = 1 - c; + const T s = sin(angle); + axis = axis.normalized(); + T x = axis.x, + y = axis.y, + z = axis.z; + T xy = x * y, + yz = y * z, + xz = x * z; + + res.c[0][0] = x * x * oneMinusC + c; + res.c[0][1] = x * y * oneMinusC - z * s; + res.c[0][2] = x * z * oneMinusC + y * s; + res.c[1][0] = y * x * oneMinusC + z * s; + res.c[1][1] = y * y * oneMinusC + c; + res.c[1][2] = y * z * oneMinusC - x * s; + res.c[2][0] = z * x * oneMinusC - y * s; + res.c[2][1] = z * y * oneMinusC + x * s; + res.c[2][2] = z * z * oneMinusC + c; + return res; + } + } + + // 4x4 specific transformations for 3D usage + static if (isSquare && R == 4 && isFloatingPoint!T) + { + /// Orthographic projection + /// Returns: orthographic projection. + @nogc static Matrix orthographic(T left, T right, T bottom, T top, T near, T far) pure nothrow + { + T dx = right - left, + dy = top - bottom, + dz = far - near; + + T tx = -(right + left) / dx; + T ty = -(top + bottom) / dy; + T tz = -(far + near) / dz; + + return Matrix(2 / dx, 0, 0, tx, + 0, 2 / dy, 0, ty, + 0, 0, -2 / dz, tz, + 0, 0, 0, 1); + } + + /// Perspective projection + /// Returns: perspective projection. 
+ @nogc static Matrix perspective(T FOVInRadians, T aspect, T zNear, T zFar) pure nothrow + { + T f = 1 / tan(FOVInRadians / 2); + T d = 1 / (zNear - zFar); + + return Matrix(f / aspect, 0, 0, 0, + 0, f, 0, 0, + 0, 0, (zFar + zNear) * d, 2 * d * zFar * zNear, + 0, 0, -1, 0); + } + + /// Look At projection + /// Returns: "lookAt" projection. + /// Thanks to vuaru for corrections. + @nogc static Matrix lookAt(vec3!T eye, vec3!T target, vec3!T up) pure nothrow + { + vec3!T Z = (eye - target).normalized(); + vec3!T X = cross(-up, Z).normalized(); + vec3!T Y = cross(Z, -X); + + return Matrix(-X.x, -X.y, -X.z, dot(X, eye), + Y.x, Y.y, Y.z, -dot(Y, eye), + Z.x, Z.y, Z.z, -dot(Z, eye), + 0, 0, 0, 1); + } + } + } + + package + { + alias T _T; + enum _R = R; + enum _C = C; + } + + private + { + template isAssignable(T) + { + enum bool isAssignable = std.traits.isAssignable!(Matrix, T); + } + + template isConvertible(T) + { + enum bool isConvertible = (!is(T : Matrix)) && isAssignable!T; + } + + template isTAssignable(U) + { + enum bool isTAssignable = std.traits.isAssignable!(T, U); + } + + template isRowConvertible(U) + { + enum bool isRowConvertible = is(U : row_t); + } + + template isColumnConvertible(U) + { + enum bool isColumnConvertible = is(U : column_t); + } + } + + public + { + /// Construct an identity matrix + /// Returns: an identity matrix. + /// Note: the identity matrix, while only meaningful for square matrices, + /// is also defined for non-square ones. + @nogc static Matrix identity() pure nothrow + { + Matrix res = void; + for (int i = 0; i < R; ++i) + for (int j = 0; j < C; ++j) + res.c[i][j] = (i == j) ? 1 : 0; + return res; + } + + /// Construct an constant matrix + /// Returns: a constant matrice. + @nogc static Matrix constant(U)(U x) pure nothrow + { + Matrix res = void; + + for (int i = 0; i < R * C; ++i) + res.v[i] = cast(T)x; + return res; + } + } +} + +template isMatrixInstantiation(U) +{ + private static void isMatrix(T, int R, int C)(Matrix!(T, R, C) x) + { + } + + enum bool isMatrixInstantiation = is(typeof(isMatrix(U.init))); +} + +// GLSL is a big inspiration here +// we defines types with more or less the same names + +/// +template mat2x2(T) { alias Matrix!(T, 2, 2) mat2x2; } +/// +template mat3x3(T) { alias Matrix!(T, 3, 3) mat3x3; } +/// +template mat4x4(T) { alias Matrix!(T, 4, 4) mat4x4; } + +// WARNING: in GLSL, first number is _columns_, second is rows +// It is the opposite here: first number is rows, second is columns +// With this convention mat2x3 * mat3x4 -> mat2x4. 
+ +/// +template mat2x3(T) { alias Matrix!(T, 2, 3) mat2x3; } +/// +template mat2x4(T) { alias Matrix!(T, 2, 4) mat2x4; } +/// +template mat3x2(T) { alias Matrix!(T, 3, 2) mat3x2; } +/// +template mat3x4(T) { alias Matrix!(T, 3, 4) mat3x4; } +/// +template mat4x2(T) { alias Matrix!(T, 4, 2) mat4x2; } +/// +template mat4x3(T) { alias Matrix!(T, 4, 3) mat4x3; } + +// shorter names for most common matrices +alias mat2x2 mat2;/// +alias mat3x3 mat3;/// +alias mat4x4 mat4;/// + +// Define a lot of type names +// Most useful are probably mat4f and mat4d + +alias mat2!float mat2f;/// +alias mat2!double mat2d;/// + +alias mat3!float mat3f;/// +alias mat3!double mat3d;/// + +alias mat4!float mat4f;/// +alias mat4!double mat4d;/// + +alias mat2x2!float mat2x2f;/// +alias mat2x2!double mat2x2d;/// + +alias mat3x3!float mat3x3f;/// +alias mat3x3!double mat3x3d;/// + +alias mat4x4!float mat4x4f;/// +alias mat4x4!double mat4x4d;/// + +unittest +{ + alias mat2i = mat2!int; + alias mat2x3f = mat2x3!float; + alias mat3x4f = mat3x4!float; + alias mat2x4f = mat2x4!float; + + mat2i x = mat2i(0, 1, + 2, 3); + assert(x.c[0][0] == 0 && x.c[0][1] == 1 && x.c[1][0] == 2 && x.c[1][1] == 3); + + vec2i[2] cols = [vec2i(0, 2), vec2i(1, 3)]; + mat2i y = mat2i.fromColumns(cols[]); + assert(y.c[0][0] == 0 && y.c[0][1] == 1 && y.c[1][0] == 2 && y.c[1][1] == 3); + y = mat2i.fromRows(cols[]); + assert(y.c[0][0] == 0 && y.c[1][0] == 1 && y.c[0][1] == 2 && y.c[1][1] == 3); + y = y.transposed(); + + assert(x == y); + x = [0, 1, 2, 3]; + assert(x == y); + + mat2i z = x * y; + assert(z == mat2i([2, 3, 6, 11])); + vec2i vz = z * vec2i(2, -1); + assert(vz == vec2i(1, 1)); + + mat2f a = z; + mat2d ad = a; + ad += a; + mat2f w = [4, 5, 6, 7]; + z = cast(mat2i)w; + assert(w == z); + + { + mat2x3f A; + mat3x4f B; + mat2x4f C = A * B; + } + + assert(mat2i.diag(vec2i(1, 2)) == mat2i(1, 0, + 0, 2)); + + // Construct with a single scalar + auto D = mat4f(1.0f); + assert(D.v[] == [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ]); + + { + double[4][3] starray = [ + [ 0, 1, 2, 3], + [ 4, 5, 6, 7,], + [ 8, 9, 10, 11,], + ]; + + // starray has the shape 3x4 + assert(starray.length == 3); + assert(starray[0].length == 4); + + auto m = mat3x4!double(starray); + assert(m.v[] == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, ]); + } + + { + auto dynarray = [ + [ 0, 1, 2, 3], + [ 4, 5, 6, 7,], + [ 8, 9, 10, 11,], + ]; + + // dynarray has the shape 3x4 + assert(dynarray.length == 3); + assert(dynarray[0].length == 4); + + auto m = mat3x4!double(dynarray); + assert(m.v[] == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, ]); + } +} + +// Issue #206 (matrix *= scalar) not yielding matrix * scalar but matrix * matrix(scalar) +unittest +{ + mat4f mvp = mat4f.identity; + mvp *= 2; + assert(mvp == mat4f(2, 0, 0, 0, + 0, 2, 0, 0, + 0, 0, 2, 0, + 0, 0, 0, 2)); + + mvp = mat4f.identity * 2; + assert(mvp == mat4f(2, 0, 0, 0, + 0, 2, 0, 0, + 0, 0, 2, 0, + 0, 0, 0, 2)); + + + mvp = mat4f(1) * mat4f(1); + assert(mvp == mat4f(4, 4, 4, 4, + 4, 4, 4, 4, + 4, 4, 4, 4, + 4, 4, 4, 4)); + + mvp = mat4f(1); + mvp *= mat4f(1); + assert(mvp == mat4f(4, 4, 4, 4, + 4, 4, 4, 4, + 4, 4, 4, 4, + 4, 4, 4, 4)); +} diff --git a/external/dplug/math/package.d b/external/dplug/math/package.d new file mode 100644 index 0000000..88c583e --- /dev/null +++ b/external/dplug/math/package.d @@ -0,0 +1,12 @@ +/** + * Math package: rectangles, vectors, matrices. + * + * Copyright: Copyright Guillaume Piolat 2021. 
+ * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) + * Note: this is part of the former gfm:math package, hence containing copyright from many GFM contributors. + */ +module dplug.math; + +public import dplug.math.vector, + dplug.math.box, + dplug.math.matrix; diff --git a/external/dplug/math/vector.d b/external/dplug/math/vector.d new file mode 100644 index 0000000..c620068 --- /dev/null +++ b/external/dplug/math/vector.d @@ -0,0 +1,823 @@ +/** + * N-dimensional small vector math. + * + * Copyright: Copyright Guillaume Piolat 2021. + * Copyright Chance Snow 2021. + * Copyright Aleksandr Druzhinin 2018. + * Copyright Nathan Sashihara 2018. + * Copyright Ryan Roden-Corrent 2016. + * Copyright Steven Dwy 2015. + * Copyright Martin Nowak 2015. + * Copyright Tanel Tagaväli 2015. + * + * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) + */ +module dplug.math.vector; + + +import std.traits, + std.math, + std.array; + +import inteli.emmintrin; + +/** + * Generic 1D small vector. + * Params: + * N = number of elements + * T = type of elements + */ +struct Vector(T, int N) +{ +nothrow: + public + { + static assert(N >= 1); + + // fields definition + union + { + T[N] v; + struct + { + static if (N >= 1) + { + T x; + alias x r; + } + static if (N >= 2) + { + T y; + alias y g; + } + static if (N >= 3) + { + T z; + alias z b; + } + static if (N >= 4) + { + T w; + alias w a; + } + } + } + + /// Construct a Vector with a `T[]` or the values as arguments + @nogc this(Args...)(Args args) pure nothrow + { + static if (args.length == 1) + { + // Construct a Vector from a single value. + opAssign!(Args[0])(args[0]); + } + else + { + // validate the total argument count across scalars and vectors + template argCount(T...) { + static if(T.length == 0) + enum argCount = 0; // done recursing + else static if(isVector!(T[0])) + enum argCount = T[0]._N + argCount!(T[1..$]); + else + enum argCount = 1 + argCount!(T[1..$]); + } + + static assert(argCount!Args <= N, "Too many arguments in vector constructor"); + + int index = 0; + foreach(arg; args) + { + static if (isAssignable!(T, typeof(arg))) + { + v[index] = arg; + index++; // has to be on its own line (DMD 2.068) + } + else static if (isVector!(typeof(arg)) && isAssignable!(T, arg._T)) + { + mixin(generateLoopCode!("v[index + @] = arg[@];", arg._N)()); + index += arg._N; + } + else + static assert(false, "Unrecognized argument in Vector constructor"); + } + assert(index == N, "Bad arguments in Vector constructor"); + } + } + + size_t toHash() const nothrow @safe + { + size_t hash = 0; + foreach (elem; v) { + hash = elem.hashOf(hash); + } + return hash; + } + + /// Assign a Vector from a compatible type. + @nogc ref Vector opAssign(U)(U x) pure nothrow if (isAssignable!(T, U)) + { + mixin(generateLoopCode!("v[@] = x;", N)()); // copy to each component + return this; + } + + /// Assign a Vector with a static array type. + @nogc ref Vector opAssign(U)(U arr) pure nothrow if ((isStaticArray!(U) && isAssignable!(T, typeof(arr[0])) && (arr.length == N))) + { + mixin(generateLoopCode!("v[@] = arr[@];", N)()); + return this; + } + + /// Assign with a dynamic array. + /// Size is checked in debug-mode. + @nogc ref Vector opAssign(U)(U arr) pure nothrow if (isDynamicArray!(U) && isAssignable!(T, typeof(arr[0]))) + { + assert(arr.length == N); + mixin(generateLoopCode!("v[@] = arr[@];", N)()); + return this; + } + + /// Assign from a samey Vector. 
+ @nogc ref Vector opAssign(U)(U u) pure nothrow if (is(U : Vector)) + { + v[] = u.v[]; + return this; + } + + /// Assign from other vectors types (same size, compatible type). + @nogc ref Vector opAssign(U)(U x) pure nothrow if (isVector!U + && isAssignable!(T, U._T) + && (!is(U: Vector)) + && (U._N == _N)) + { + mixin(generateLoopCode!("v[@] = x.v[@];", N)()); + return this; + } + + /// Returns: a pointer to content. + @nogc inout(T)* ptr() pure inout nothrow @property + { + return v.ptr; + } + + @nogc bool opEquals(U)(U other) pure const nothrow + if (is(U : Vector)) + { + for (int i = 0; i < N; ++i) + { + if (v[i] != other.v[i]) + { + return false; + } + } + return true; + } + + @nogc bool opEquals(U)(U other) pure const nothrow + if (isConvertible!U) + { + Vector conv = other; + return opEquals(conv); + } + + @nogc Vector opUnary(string op)() pure const nothrow + if (op == "+" || op == "-" || op == "~" || op == "!") + { + Vector res = void; + mixin(generateLoopCode!("res.v[@] = " ~ op ~ " v[@];", N)()); + return res; + } + + @nogc ref Vector opOpAssign(string op, U)(U operand) pure nothrow + if (is(U : Vector)) + { + mixin(generateLoopCode!("v[@] " ~ op ~ "= operand.v[@];", N)()); + return this; + } + + @nogc ref Vector opOpAssign(string op, U)(U operand) pure nothrow if (isConvertible!U) + { + Vector conv = operand; + return opOpAssign!op(conv); + } + + @nogc Vector opBinary(string op, U)(U operand) pure const nothrow + if (is(U: Vector) || (isConvertible!U)) + { + Vector result = void; + static if (is(U: T)) + mixin(generateLoopCode!("result.v[@] = cast(T)(v[@] " ~ op ~ " operand);", N)()); + else + { + Vector other = operand; + mixin(generateLoopCode!("result.v[@] = cast(T)(v[@] " ~ op ~ " other.v[@]);", N)()); + } + return result; + } + + @nogc Vector opBinaryRight(string op, U)(U operand) pure const nothrow if (isConvertible!U) + { + Vector result = void; + static if (is(U: T)) + mixin(generateLoopCode!("result.v[@] = cast(T)(operand " ~ op ~ " v[@]);", N)()); + else + { + Vector other = operand; + mixin(generateLoopCode!("result.v[@] = cast(T)(other.v[@] " ~ op ~ " v[@]);", N)()); + } + return result; + } + + @nogc ref T opIndex(size_t i) pure nothrow + { + return v[i]; + } + + @nogc ref const(T) opIndex(size_t i) pure const nothrow + { + return v[i]; + } + + @nogc T opIndexAssign(U : T)(U x, size_t i) pure nothrow + { + return v[i] = x; + } + + + /// Implements swizzling. + /// + /// Example: + /// --- + /// vec4i vi = [4, 1, 83, 10]; + /// assert(vi.zxxyw == [83, 4, 4, 1, 10]); + /// --- + @nogc @property auto opDispatch(string op, U = void)() pure const nothrow if (isValidSwizzle!(op)) + { + alias Vector!(T, op.length) returnType; + returnType res = void; + enum indexTuple = swizzleTuple!op; + foreach(i, index; indexTuple) + res.v[i] = v[index]; + return res; + } + + /// Support swizzling assignment like in shader languages. + /// + /// Example: + /// --- + /// vec3f v = [0, 1, 2]; + /// v.yz = v.zx; + /// assert(v == [0, 2, 0]); + /// --- + @nogc @property void opDispatch(string op, U)(U x) pure + if ((op.length >= 2) + && (isValidSwizzleUnique!op) // v.xyy will be rejected + && is(typeof(Vector!(T, op.length)(x)))) // can be converted to a small vector of the right size + { + Vector!(T, op.length) conv = x; + enum indexTuple = swizzleTuple!op; + foreach(i, index; indexTuple) + v[index] = conv[i]; + } + + /// Casting to small vectors of the same size. 
+ /// Example: + /// --- + /// vec4f vf; + /// vec4d vd = cast!(vec4d)vf; + /// --- + @nogc U opCast(U)() pure const nothrow if (isVector!U && (U._N == _N)) + { + U res = void; + mixin(generateLoopCode!("res.v[@] = cast(U._T)v[@];", N)()); + return res; + } + + /// Implement slices operator overloading. + /// Allows to go back to slice world. + /// Returns: length. + @nogc int opDollar() pure const nothrow + { + return N; + } + + /// Slice containing vector values + /// Returns: a slice which covers the whole Vector. + @nogc T[] opSlice() pure nothrow + { + return v[]; + } + + /// vec[a..b] + @nogc T[] opSlice(int a, int b) pure nothrow + { + return v[a..b]; + } + + /// Squared Euclidean length of the Vector + /// Returns: squared length. + @nogc T squaredMagnitude() pure const nothrow + { + T sumSquares = 0; + mixin(generateLoopCode!("sumSquares += v[@] * v[@];", N)()); + return sumSquares; + } + + /// Squared Euclidean distance between this vector and another one + /// Returns: squared Euclidean distance. + @nogc T squaredDistanceTo(Vector v) pure const nothrow + { + return (v - this).squaredMagnitude(); + } + + static if (isFloatingPoint!T) + { + /// Euclidean length of the vector + /// Returns: Euclidean length + @nogc T magnitude() pure const nothrow + { + return sqrt(squaredMagnitude()); + } + + /// Inverse Euclidean length of the vector + /// Returns: Inverse of Euclidean length. + @nogc T inverseMagnitude() pure const nothrow + { + return 1 / sqrt(squaredMagnitude()); + } + + alias fastInverseLength = fastInverseMagnitude; + /// Faster but less accurate inverse of Euclidean length. + /// Returns: Inverse of Euclidean length. + @nogc T fastInverseMagnitude() pure const nothrow + { + return inverseSqrt(squaredMagnitude()); + } + + /// Euclidean distance between this vector and another one + /// Returns: Euclidean distance between this and other. + @nogc T distanceTo(Vector other) pure const nothrow + { + return (other - this).magnitude(); + } + + /// In-place normalization. + @nogc void normalize() pure nothrow + { + auto invMag = inverseMagnitude(); + mixin(generateLoopCode!("v[@] *= invMag;", N)()); + } + + /// Returns a normalized copy of this Vector + /// Returns: Normalized vector. + @nogc Vector normalized() pure const nothrow + { + Vector res = this; + res.normalize(); + return res; + } + + /// Faster but less accurate in-place normalization. + @nogc void fastNormalize() pure nothrow + { + auto invLength = fastInverseMagnitude(); + mixin(generateLoopCode!("v[@] *= invLength;", N)()); + } + + /// Faster but less accurate vector normalization. + /// Returns: Normalized vector. + @nogc Vector fastNormalized() pure const nothrow + { + Vector res = this; + res.fastNormalize(); + return res; + } + + static if (N == 3) + { + /// Gets an orthogonal vector from a 3-dimensional vector. + /// Doesn’t normalize the output. + /// Authors: Sam Hocevar + /// See_also: Source at $(WEB lolengine.net/blog/2013/09/21/picking-orthogonal-vector-combing-coconuts). + @nogc Vector getOrthogonalVector() pure const nothrow + { + return abs(x) > abs(z) ? 
Vector(-y, x, 0.0) : Vector(0.0, -z, y); + } + } + } + } + + private + { + enum _N = N; + alias T _T; + + // define types that can be converted to this, but are not the same type + template isConvertible(T) + { + enum bool isConvertible = (!is(T : Vector)) + && is(typeof( + { + T x; + Vector v = x; + }())); + } + + // define types that can't be converted to this + template isForeign(T) + { + enum bool isForeign = (!isConvertible!T) && (!is(T: Vector)); + } + + template isValidSwizzle(string op, int lastSwizzleClass = -1) + { + static if (op.length == 0) + enum bool isValidSwizzle = true; + else + { + enum len = op.length; + enum int swizzleClass = swizzleClassify!(op[0]); + enum bool swizzleClassValid = (lastSwizzleClass == -1 || (swizzleClass == lastSwizzleClass)); + enum bool isValidSwizzle = (swizzleIndex!(op[0]) != -1) + && swizzleClassValid + && isValidSwizzle!(op[1..len], swizzleClass); + } + } + + template searchElement(char c, string s) + { + static if (s.length == 0) + { + enum bool result = false; + } + else + { + enum string tail = s[1..s.length]; + enum bool result = (s[0] == c) || searchElement!(c, tail).result; + } + } + + template hasNoDuplicates(string s) + { + static if (s.length == 1) + { + enum bool result = true; + } + else + { + enum tail = s[1..s.length]; + enum bool result = !(searchElement!(s[0], tail).result) && hasNoDuplicates!(tail).result; + } + } + + // true if the swizzle has at the maximum one time each letter + template isValidSwizzleUnique(string op) + { + static if (isValidSwizzle!op) + enum isValidSwizzleUnique = hasNoDuplicates!op.result; + else + enum bool isValidSwizzleUnique = false; + } + + template swizzleIndex(char c) + { + static if((c == 'x' || c == 'r') && N >= 1) + enum swizzleIndex = 0; + else static if((c == 'y' || c == 'g') && N >= 2) + enum swizzleIndex = 1; + else static if((c == 'z' || c == 'b') && N >= 3) + enum swizzleIndex = 2; + else static if ((c == 'w' || c == 'a') && N >= 4) + enum swizzleIndex = 3; + else + enum swizzleIndex = -1; + } + + template swizzleClassify(char c) + { + static if(c == 'x' || c == 'y' || c == 'z' || c == 'w') + enum swizzleClassify = 0; + else static if(c == 'r' || c == 'g' || c == 'b' || c == 'a') + enum swizzleClassify = 1; + else + enum swizzleClassify = -1; + } + + template swizzleTuple(string op) + { + enum opLength = op.length; + static if (op.length == 0) + enum swizzleTuple = []; + else + enum swizzleTuple = [ swizzleIndex!(op[0]) ] ~ swizzleTuple!(op[1..op.length]); + } + } +} + +/// True if `T` is some kind of `Vector` +enum isVector(T) = is(T : Vector!U, U...); + +/// +unittest +{ + static assert(isVector!vec2f); + static assert(isVector!vec3d); + static assert(isVector!(vec4!real)); + static assert(!isVector!float); +} + +/// Get the numeric type used to measure a vectors's coordinates. +alias DimensionType(T : Vector!U, U...) = U[0]; + +/// +unittest +{ + static assert(is(DimensionType!vec2f == float)); + static assert(is(DimensionType!vec3d == double)); +} + +/// +template vec2(T) { alias Vector!(T, 2) vec2; } +/// +template vec3(T) { alias Vector!(T, 3) vec3; } +/// +template vec4(T) { alias Vector!(T, 4) vec4; } + +alias vec2!int vec2i; /// +alias vec2!float vec2f; /// +alias vec2!double vec2d; /// + +alias vec3!int vec3i; /// +alias vec3!float vec3f; /// +alias vec3!double vec3d; /// + +alias vec4!int vec4i; /// +alias vec4!float vec4f; /// +alias vec4!double vec4d; /// + + +/// Element-wise minimum. 
+@nogc Vector!(T, N) minByElem(T, int N)(const Vector!(T, N) a, const Vector!(T, N) b) pure nothrow +{ + import std.algorithm: min; + Vector!(T, N) res = void; + mixin(generateLoopCode!("res.v[@] = min(a.v[@], b.v[@]);", N)()); + return res; +} + +/// Element-wise maximum. +@nogc Vector!(T, N) maxByElem(T, int N)(const Vector!(T, N) a, const Vector!(T, N) b) pure nothrow +{ + import std.algorithm: max; + Vector!(T, N) res = void; + mixin(generateLoopCode!("res.v[@] = max(a.v[@], b.v[@]);", N)()); + return res; +} + +/// Element-wise absolute value. +@nogc Vector!(T, N) absByElem(T, int N)(const Vector!(T, N) a) pure nothrow +{ + Vector!(T, N) res = void; + mixin(generateLoopCode!("res.v[@] = abs(a.v[@]);", N)()); + return res; +} + +/// Dot product of two vectors +/// Returns: Dot product. +@nogc T dot(T, int N)(const Vector!(T, N) a, const Vector!(T, N) b) pure nothrow +{ + T sum = 0; + mixin(generateLoopCode!("sum += a.v[@] * b.v[@];", N)()); + return sum; +} + +/// Cross product of two 3D vectors +/// Returns: 3D cross product. +/// Thanks to vuaru for corrections. +@nogc Vector!(T, 3) cross(T)(const Vector!(T, 3) a, const Vector!(T, 3) b) pure nothrow +{ + return Vector!(T, 3)(a.y * b.z - a.z * b.y, + a.z * b.x - a.x * b.z, + a.x * b.y - a.y * b.x); +} + +/// 3D reflect, like the GLSL function. +/// Returns: a reflected by normal b. +@nogc Vector!(T, N) reflect(T, int N)(const Vector!(T, N) a, const Vector!(T, N) b) pure nothrow +{ + return a - (2 * dot(b, a)) * b; +} + +/// +@nogc unittest +{ + // reflect a 2D vector across the x axis (the normal points along the y axis) + assert(vec2f(1,1).reflect(vec2f(0,1)) == vec2f(1,-1)); + assert(vec2f(1,1).reflect(vec2f(0,-1)) == vec2f(1,-1)); + + // note that the normal must be, well, normalized: + assert(vec2f(1,1).reflect(vec2f(0,20)) != vec2f(1,-1)); + + // think of this like a ball hitting a flat floor at an angle. + // the x and y components remain unchanged, and the z inverts + assert(vec3f(2,3,-0.5).reflect(vec3f(0,0,1)) == vec3f(2,3,0.5)); +} + +/// Angle between two vectors +/// Returns: angle between vectors. 
+/// See_also: "The Right Way to Calculate Stuff" at $(WEB www.plunk.org/~hatch/rightway.php) +@nogc T angleBetween(T, int N)(const Vector!(T, N) a, const Vector!(T, N) b) pure nothrow +{ + auto aN = a.normalized(); + auto bN = b.normalized(); + auto dp = dot(aN, bN); + + if (dp < 0) + return T(PI) - 2 * asin((-bN-aN).magnitude / 2); + else + return 2 * asin((bN-aN).magnitude / 2); +} + +static assert(vec2f.sizeof == 8); +static assert(vec3d.sizeof == 24); +static assert(vec4i.sizeof == 16); + +unittest +{ + static assert(vec2i.isValidSwizzle!"xyx"); + static assert(!vec2i.isValidSwizzle!"xyz"); + static assert(vec4i.isValidSwizzle!"brra"); + static assert(!vec4i.isValidSwizzle!"rgyz"); + static assert(vec2i.isValidSwizzleUnique!"xy"); + static assert(vec2i.isValidSwizzleUnique!"yx"); + static assert(!vec2i.isValidSwizzleUnique!"xx"); + + alias vec2l = vec2!long; + alias vec3ui = vec3!uint; + alias vec4ub = vec4!ubyte; + + assert(vec2l(0, 1) == vec2i(0, 1)); + + int[2] arr = [0, 1]; + int[] arr2 = new int[2]; + arr2[] = arr[]; + vec2i a = vec2i([0, 1]); + vec2i a2 = vec2i(0, 1); + immutable vec2i b = vec2i(0); + assert(b[0] == 0 && b[1] == 0); + vec2i c = arr; + vec2l d = arr2; + assert(a == a2); + assert(a == c); + assert(vec2l(a) == vec2l(a)); + assert(vec2l(a) == d); + + int[vec2i] hashMap; + hashMap[a] = (c - a).squaredMagnitude; + assert(hashMap[a] == (c - a).squaredMagnitude); + + vec4i x = [4, 5, 6, 7]; + assert(x == x); + --x[0]; + assert(x[0] == 3); + ++x[0]; + assert(x[0] == 4); + x[1] &= 1; + x[2] = 77 + x[2]; + x[3] += 3; + assert(x == [4, 1, 83, 10]); + assert(x.xxywz == [4, 4, 1, 10, 83]); + assert(x.xxxxxxx == [4, 4, 4, 4, 4, 4, 4]); + assert(x.abgr == [10, 83, 1, 4]); + assert(a != b); + x = vec4i(x.xyz, 166); + assert(x == [4, 1, 83, 166]); + + vec2l e = a; + vec2l f = a + b; + assert(f == vec2l(a)); + + vec3ui g = vec3i(78,9,4); + g ^= vec3i(78,9,4); + assert(g == vec3ui(0)); + //g[0..2] = 1u; + //assert(g == [2, 1, 0]); + + assert(vec2i(4, 5) + 1 == vec2i(5,6)); + assert(vec2i(4, 5) - 1 == vec2i(3,4)); + assert(1 + vec2i(4, 5) == vec2i(5,6)); + assert(vec3f(1,1,1) * 0 == 0); + assert(1.0 * vec3d(4,5,6) == vec3f(4,5.0f,6.0)); + + auto dx = vec2i(1,2); + auto dy = vec2i(4,5); + auto dp = dot(dx, dy); + assert(dp == 14 ); + + vec3i h = cast(vec3i)(vec3d(0.5, 1.1, -2.2)); + assert(h == [0, 1, -2]); + assert(h[] == [0, 1, -2]); + assert(h[1..3] == [1, -2]); + assert(h.zyx == [-2, 1, 0]); + + h.yx = vec2i(5, 2); // swizzle assignment + + assert(h.xy == [2, 5]); + assert(-h[1] == -5); + assert(++h[0] == 3); + + //assert(h == [-2, 1, 0]); + assert(!__traits(compiles, h.xx = h.yy)); + vec4ub j; + + // larger vectors + alias Vector!(float, 5) vec5f; + vec5f l = vec5f(1, 2.0f, 3.0, 4u, 5.0L); + l = vec5f(l.xyz, vec2i(1, 2)); + + // the ctor should not compile if given too many arguments + static assert(!is(typeof(vec2f(1, 2, 3)))); + static assert(!is(typeof(vec2f(vec2f(1, 2), 3)))); + static assert( is(typeof(vec3f(vec2f(1, 2), 3)))); + static assert( is(typeof(vec3f(1, 2, 3)))); + + assert(absByElem(vec3i(-1, 0, 2)) == vec3i(1, 0, 2)); +} + +private: + +/// SSE approximation of reciprocal square root. +@nogc T inverseSqrt(T)(T x) pure nothrow if (isFloatingPoint!T) +{ + static if (is(T == float)) + { + __m128 V = _mm_set_ss(x); + V = _mm_rsqrt_ss(V); + return _mm_cvtss_f32(V); + } + else + { + return 1 / sqrt(x); + } +} + + +package +{ + // This generates small loops for Vector, Matrix, and Box. 
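+    // For instance, generateLoopCode!("res.v[@] = a.v[@];", 2)() evaluates to the string
+    // "res.v[0] = a.v[0];res.v[1] = a.v[1];", which callers then mix in.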
+    // Time has shown this sort of manually unrolled code works best on both DMD and LDC.
+
+    static string generateLoopCode(string formatString, int N)() pure nothrow
+    {
+        string result;
+        for (int i = 0; i < N; ++i)
+        {
+            string index = ctIntToString(i);
+            // replace all @ by indices
+
+            int after = 0;
+            int cur = 0;
+            for (; cur < formatString.length; ++cur)
+            {
+                char ch = formatString[cur];
+                if (ch == '@')
+                {
+                    if (cur > after)
+                        result ~= formatString[after..cur];
+                    result ~= index;
+                    after = cur+1;
+                }
+            }
+            if (cur > after)
+                result ~= formatString[after..cur];
+        }
+        return result;
+    }
+
+    // Speed-up CTFE conversions, replacement for std.conv
+    // Doesn't do the negatives.
+    static string ctIntToString(int n) pure nothrow
+    {
+        static immutable string[10] table = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"];
+        if (n < 10)
+            return table[n];
+        else
+        {
+            char[10] r;
+            for (int k = 0; k < 10; ++k)
+            {
+                r[9-k] = cast(char)('0' + n % 10);
+                n /= 10;
+                if (n == 0)
+                    return r[9-k..$].idup;
+            }
+            return r.idup;
+        }
+    }
+}
+
+unittest
+{
+    assert(ctIntToString(132) == "132");
+    assert(ctIntToString(2147483647) == "2147483647");
+}
\ No newline at end of file
diff --git a/external/inteli/avx2intrin.d b/external/inteli/avx2intrin.d
new file mode 100644
index 0000000..f0e865a
--- /dev/null
+++ b/external/inteli/avx2intrin.d
@@ -0,0 +1,5011 @@
+/**
+* AVX2 intrinsics.
+* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX2
+*
+* Copyright: Guillaume Piolat 2022-2024.
+* Johan Engelen 2022.
+* cet 2024.
+* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
+*/
+module inteli.avx2intrin;
+
+// AVX2 instructions
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=AVX2
+// Note: this header will work whether you have AVX2 enabled or not.
+// With LDC, use "dflags-ldc": ["-mattr=+avx2"] or equivalent to actively
+// generate AVX2 instructions.
+// With GDC, use "dflags-gdc": ["-mavx2"] or equivalent to actively
+// generate AVX2 instructions.
+
+
+// Note: many special cases for GDC, because when supporting SIMD_COMPARISON_MASKS_32B but not having AVX2,
+// the replaced operators have terrible performance. Mostly a problem for -mavx on x86
+
+public import inteli.types;
+import inteli.internals;
+
+// Pull in all previous instruction set intrinsics.
+public import inteli.avxintrin;
+
+nothrow @nogc:
+
+/// Compute the absolute value of packed signed 16-bit integers in `a`.
+__m256i _mm256_abs_epi16 (__m256i a) @trusted
+{
+    // PERF DMD
+    version(LDC)
+        enum split = true; // always beneficial in LDC neon, ssse3, or even sse2
+    else
+        enum split = GDC_with_SSSE3;
+
+    static if (GDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_pabsw256(cast(short16)a);
+    }
+    else static if (__VERSION__ >= 2097 && LDC_with_AVX2)
+    {
+        // Before LDC 1.27 llvm.abs LLVM intrinsic didn't exist, and hence
+        // no good way to do abs(256-bit)
+        return cast(__m256i) inteli_llvm_abs!short16(cast(short16)a, false);
+    }
+    else static if (split)
+    {
+        __m128i a_lo = _mm256_extractf128_si256!0(a);
+        __m128i a_hi = _mm256_extractf128_si256!1(a);
+        __m128i r_lo = _mm_abs_epi16(a_lo);
+        __m128i r_hi = _mm_abs_epi16(a_hi);
+        return _mm256_set_m128i(r_hi, r_lo);
+    }
+    else
+    {
+        short16 sa = cast(short16)a;
+        for (int i = 0; i < 16; ++i)
+        {
+            short s = sa.array[i];
+            sa.ptr[i] = s >= 0 ?
s : cast(short)(-cast(int)(s)); + } + return cast(__m256i)sa; + } +} +unittest +{ + __m256i A = _mm256_setr_epi16(0, -1, -32768, 32767, 10, -10, 1000, -1000, + 1, -1, -32768, 32767, 12, -13, 1000, -1040); + short16 B = cast(short16) _mm256_abs_epi16(A); + short[16] correct = [0, 1, -32768, 32767, 10, 10, 1000, 1000, + 1, 1, -32768, 32767, 12, 13, 1000, 1040]; + assert(B.array == correct); +} + +/// Compute the absolute value of packed signed 32-bit integers in `a`. +__m256i _mm256_abs_epi32 (__m256i a) @trusted +{ + // PERF DMD + version(LDC) + enum split = true; // always beneficial in LDC neon, ssse3, or even sse2 + else + enum split = false; // GDC manages to split and use pabsd in SSSE3 without guidance + + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pabsd256(cast(int8)a); + } + else static if (__VERSION__ >= 2097 && LDC_with_AVX2) + { + // Before LDC 1.27 llvm.abs LLVM intrinsic didn't exist, and hence + // no good way to do abs(256-bit) + return cast(__m256i) inteli_llvm_abs!int8(cast(int8)a, false); + } + else static if (split) + { + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i r_lo = _mm_abs_epi32(a_lo); + __m128i r_hi = _mm_abs_epi32(a_hi); + return _mm256_set_m128i(r_hi, r_lo); + } + else + { + int8 sa = cast(int8)a; + for (int i = 0; i < 8; ++i) + { + int s = sa.array[i]; + sa.ptr[i] = (s >= 0 ? s : -s); + } + return cast(__m256i)sa; + } +} +unittest +{ + __m256i A = _mm256_setr_epi32(0, -1, -2_147_483_648, -2_147_483_647, -1, 0, -2_147_483_648, -2_147_483_646); + int8 B = cast(int8) _mm256_abs_epi32(A); + int[8] correct = [0, 1, -2_147_483_648, 2_147_483_647, 1, 0, -2_147_483_648, 2_147_483_646]; + assert(B.array == correct); +} + +/// Compute the absolute value of packed signed 8-bit integers in `a`. +__m256i _mm256_abs_epi8 (__m256i a) @trusted +{ + // PERF DMD + // PERF GDC in SSSE3 to AVX doesn't use pabsb and split is catastrophic because of _mm_min_epu8 + version(LDC) + enum split = true; // always beneficial in LDC neon, ssse3, sse2 + else + enum split = false; + + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pabsb256(cast(ubyte32)a); + } + else static if (__VERSION__ >= 2097 && LDC_with_AVX2) + { + // Before LDC 1.27 llvm.abs LLVM intrinsic didn't exist, and hence + // no good way to do abs(256-bit) + return cast(__m256i) inteli_llvm_abs!byte32(cast(byte32)a, false); + } + else static if (split) + { + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i r_lo = _mm_abs_epi8(a_lo); + __m128i r_hi = _mm_abs_epi8(a_hi); + return _mm256_set_m128i(r_hi, r_lo); + } + else + { + // Basically this loop is poison for LDC optimizer + byte32 sa = cast(byte32)a; + for (int i = 0; i < 32; ++i) + { + byte s = sa.array[i]; + sa.ptr[i] = s >= 0 ? s : cast(byte)(-cast(int)(s)); + } + return cast(__m256i)sa; + } +} +unittest +{ + __m256i A = _mm256_setr_epi8(0, -1, -128, -127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, -1, -128, -126, 127, -6, -5, -4, -3, -2, 0, 1, 2, 3, 4, 5); + byte32 B = cast(byte32) _mm256_abs_epi8(A); + byte[32] correct = [0, 1, -128, 127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, -128, 126, 127, 6, 5, 4, 3, 2, 0, 1, 2, 3, 4, 5]; + assert(B.array == correct); +} + +/// Add packed 16-bit integers in `a` and `b`. 
+__m256i _mm256_add_epi16 (__m256i a, __m256i b) pure @safe +{ + pragma(inline, true); + return cast(__m256i)(cast(short16)a + cast(short16)b); +} +unittest +{ + __m256i A = _mm256_setr_epi16( -7, -1, 0, 9, -100, 100, 234, 432, -32768, 32767, 0, -1, -20000, 0, 6, -2); + short16 R = cast(short16) _mm256_add_epi16(A, A); + short[16] correct = [ -14, -2, 0, 18, -200, 200, 468, 864, 0, -2, 0, -2, 25536, 0, 12, -4 ]; + assert(R.array == correct); +} + +/// Add packed 32-bit integers in `a` and `b`. +__m256i _mm256_add_epi32(__m256i a, __m256i b) pure @safe +{ + pragma(inline, true); + return cast(__m256i)(cast(int8)a + cast(int8)b); +} +unittest +{ + __m256i A = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432); + int8 R = cast(int8) _mm256_add_epi32(A, A); + int[8] correct = [ -14, -2, 0, 18, -200, 200, 468, 864 ]; + assert(R.array == correct); +} + +/// Add packed 64-bit integers in `a` and `b`. +__m256i _mm256_add_epi64 (__m256i a, __m256i b) pure @safe +{ + pragma(inline, true); + return a + b; +} +unittest +{ + __m256i A = _mm256_setr_epi64(-1, 0x8000_0000_0000_0000, 42, -12); + long4 R = cast(__m256i) _mm256_add_epi64(A, A); + long[4] correct = [ -2, 0, 84, -24 ]; + assert(R.array == correct); +} + +/// Add packed 8-bit integers in `a` and `b`. +__m256i _mm256_add_epi8 (__m256i a, __m256i b) pure @safe +{ + pragma(inline, true); + return cast(__m256i)(cast(byte32)a + cast(byte32)b); +} +unittest +{ + __m256i A = _mm256_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78, + 4, 9, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -2, 0, 10, 78); + byte32 R = cast(byte32) _mm256_add_epi8(A, A); + byte[32] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100, + 8, 18, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -4, 0, 20, -100]; + assert(R.array == correct); +} + +/// Add packed 16-bit signed integers in `a` and `b` using signed saturation. +__m256i _mm256_adds_epi16 (__m256i a, __m256i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_paddsw256(cast(short16)a, cast(short16)b); + } + else static if(LDC_with_saturated_intrinsics) + { + return cast(__m256i) inteli_llvm_adds!short16(cast(short16)a, cast(short16)b); + } + else + { + short16 r; + short16 sa = cast(short16)a; + short16 sb = cast(short16)b; + foreach(i; 0..16) + r.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]); + return cast(__m256i)r; + } +} +unittest +{ + short16 res = cast(short16) _mm256_adds_epi16(_mm256_setr_epi16( 7, 6, 5, -32768, 3, 3, 32767, 0, 7, 6, 5, -32768, 3, 3, 32767, 0), + _mm256_setr_epi16( 7, 6, 5, -30000, 3, 1, 1, -10, 7, 6, 5, -30000, 3, 1, 1, -10)); + static immutable short[16] correctResult = [14, 12, 10, -32768, 6, 4, 32767, -10, 14, 12, 10, -32768, 6, 4, 32767, -10]; + assert(res.array == correctResult); +} + +/// Add packed 8-bit signed integers in `a` and `b` using signed saturation. 
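+/// Example (illustrative): saturation clamps to the representable range instead of wrapping.
+/// ---
+/// __m256i R = _mm256_adds_epi8(_mm256_set1_epi8(100), _mm256_set1_epi8(100)); // every byte is 127, not -56
+/// ---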
+__m256i _mm256_adds_epi8 (__m256i a, __m256i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_paddsb256(cast(ubyte32)a, cast(ubyte32)b); + } + else static if(LDC_with_saturated_intrinsics) + { + return cast(__m256i) inteli_llvm_adds!byte32(cast(byte32)a, cast(byte32)b); + } + else + { + byte32 r; + byte32 sa = cast(byte32)a; + byte32 sb = cast(byte32)b; + foreach(i; 0..32) + r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]); + return cast(__m256i)r; + } +} +unittest +{ + byte32 res = cast(byte32) _mm256_adds_epi8(_mm256_setr_epi8(15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0, 15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0), + _mm256_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, -4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, -4, 3, 2, 1, 0)); + static immutable byte[32] correctResult = [30, 28, 26, 24, 22, 127,18,16,14,12,10, -128, 6, 4, 2, 0, 30, 28, 26, 24, 22, 127,18,16,14,12,10, -128, 6, 4, 2, 0]; + assert(res.array == correctResult); +} + +/// Add packed 16-bit unsigned integers in `a` and `b` using unsigned saturation. +__m256i _mm256_adds_epu16 (__m256i a, __m256i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_paddusw256(cast(short16)a, cast(short16)b); + } + else static if(LDC_with_saturated_intrinsics) + { + return cast(__m256i) inteli_llvm_addus!short16(cast(short16)a, cast(short16)b); + } + else + { + short16 r; + short16 sa = cast(short16)a; + short16 sb = cast(short16)b; + foreach(i; 0..16) + r.ptr[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i])); + return cast(__m256i)r; + } +} +unittest +{ + short16 res = cast(short16) _mm256_adds_epu16(_mm256_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0), + _mm256_set_epi16(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0)); + static immutable short[16] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6]; + assert(res.array == correctResult); +} + +/// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation. 
+__m256i _mm256_adds_epu8 (__m256i a, __m256i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_paddusb256(cast(ubyte32)a, cast(ubyte32)b); + } + else static if(LDC_with_saturated_intrinsics) + { + return cast(__m256i) inteli_llvm_addus!byte32(cast(byte32)a, cast(byte32)b); + } + else + { + byte32 r; + byte32 sa = cast(byte32)a; + byte32 sb = cast(byte32)b; + foreach(i; 0..32) + r.ptr[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i])); + return cast(__m256i)r; + } +} +unittest +{ + __m256i A = _mm256_setr_epi8(0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)255, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)136, 0, 0, 0, cast(byte)136, 0, 0, 0, 0, 0, 0); + __m256i B = _mm256_setr_epi8(0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)136, 0, 0, 0, 40, 0, 0, 0, 0, 0, 0); + byte32 R = cast(byte32) _mm256_adds_epu8(A, B); + static immutable byte[32] correct = [0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)255, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)255, 0, 0, 0, cast(byte)176, 0, 0, 0, 0, 0, 0]; + assert(R.array == correct); +} + +/// Concatenate pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary result, shift the +/// result right by `imm8` bytes, and return the low 16 bytes of that in each lane. +__m256i _mm256_alignr_epi8(ubyte count)(__m256i a, __m256i b) pure @trusted +{ + + // PERF DMD + static if (GDC_with_AVX2) + { + return cast(__m256i)__builtin_ia32_palignr256(a, b, count * 8); + } + else + { + // Note that palignr 256-bit does the same as palignr 128-bit by lane. Can split. + // With LDC 1.24 + avx2 feature + -02, that correctly gives a AVX2 vpalignr despite being split. + // I guess we could do it with a big 32-items shufflevector but not sure if best. + // 2 inst on ARM64 neon, which is optimal. + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_alignr_epi8!count(a_lo, b_lo); + __m128i r_hi = _mm_alignr_epi8!count(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m128i A = _mm_setr_epi8( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + __m128i B = _mm_setr_epi8(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + __m256i AA = _mm256_set_m128i(A, A); + __m256i BB = _mm256_set_m128i(B, B); + + { + byte32 C = cast(byte32) _mm256_alignr_epi8!0(AA, BB); + byte[32] correct = [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; + assert(C.array == correct); + } + { + byte32 C = cast(byte32) _mm256_alignr_epi8!20(AA, BB); + byte[32] correct = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0]; + assert(C.array == correct); + } + { + byte32 C = cast(byte32) _mm256_alignr_epi8!34(AA, BB); + byte[32] correct = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + assert(C.array == correct); + } +} + +/// Compute the bitwise AND of 256 bits (representing integer data) in `a` and `b`. 
+__m256i _mm256_and_si256 (__m256i a, __m256i b) pure @safe +{ + pragma(inline, true); + return a & b; +} +unittest +{ + __m256i A = _mm256_set1_epi32(7); + __m256i B = _mm256_set1_epi32(14); + int8 R = cast(int8) _mm256_and_si256(A, B); + int[8] correct = [6, 6, 6, 6, 6, 6, 6, 6]; + assert(R.array == correct); +} + +/// Compute the bitwise NOT of 256 bits (representing integer data) in `a` and then AND with `b`. +__m256i _mm256_andnot_si256 (__m256i a, __m256i b) pure @safe +{ + // See: https://issues.dlang.org/show_bug.cgi?id=24283, + // need workaround if we ever use DMD AVX codegen + + pragma(inline, true); + return (~a) & b; +} +unittest +{ + __m256i A = _mm256_setr_epi32(7, -2, 9, 54654, 7, -2, 9, 54654); + __m256i B = _mm256_setr_epi32(14, 78, 111, -256, 14, 78, 111, -256); + int8 R = cast(int8) _mm256_andnot_si256(A, B); + int[8] correct = [8, 0, 102, -54784, 8, 0, 102, -54784]; + assert(R.array == correct); +} + +/// Average packed unsigned 16-bit integers in `a` and `b`. +__m256i _mm256_avg_epu16 (__m256i a, __m256i b) pure @trusted +{ + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pavgw256(cast(short16)a, cast(short16)b); + } + else static if (LDC_with_AVX2 && __VERSION__ >= 2094) + { + return cast(__m256i) __builtin_ia32_pavgw256(cast(short16)a, cast(short16)b); + } + else + { + // Splitting is always beneficial here, except -O0 + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_avg_epu16(a_lo, b_lo); + __m128i r_hi = _mm_avg_epu16(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m256i A = _mm256_set1_epi16(31457); + __m256i B = _mm256_set1_epi16(cast(short)64000); + short16 avg = cast(short16)(_mm256_avg_epu16(A, B)); + foreach(i; 0..16) + assert(avg.array[i] == cast(short)47729); +} + +/// Average packed unsigned 8-bit integers in `a` and `b`. +__m256i _mm256_avg_epu8 (__m256i a, __m256i b) pure @trusted +{ + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pavgb256(cast(ubyte32)a, cast(ubyte32)b); + } + else static if (LDC_with_AVX2 && __VERSION__ >= 2094) + { + return cast(__m256i) __builtin_ia32_pavgb256(cast(byte32)a, cast(byte32)b); + } + else + { + // Splitting is always beneficial here, except -O0 + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_avg_epu8(a_lo, b_lo); + __m128i r_hi = _mm_avg_epu8(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m256i A = _mm256_set1_epi8(-1); + __m256i B = _mm256_set1_epi8(13); + byte32 avg = cast(byte32)(_mm256_avg_epu8(A, B)); + foreach(i; 0..32) + assert(avg.array[i] == cast(byte)134); +} + +/// Blend packed 16-bit integers from `a` and `b` within 128-bit lanes using 8-bit control +/// mask `imm8`, in each of the two lanes. +/// Note: this is functionally equivalent to two `_mm_blend_epi16`. 
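+/// Bit i of `imm8` selects element i of each 128-bit lane from `b` (a cleared bit keeps `a`),
+/// and the same 8-bit pattern applies to both lanes. Illustrative example:
+/// ---
+/// __m256i A = _mm256_set1_epi16(1);
+/// __m256i B = _mm256_set1_epi16(2);
+/// __m256i R = _mm256_blend_epi16!1(A, B); // element 0 of each lane comes from B, the rest from A
+/// ---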
+__m256i _mm256_blend_epi16(int imm8) (__m256i a, __m256i b) pure @trusted +{ + // PERF DMD + assert(imm8 >= 0 && imm8 < 256); + enum bool split = true; // makes things better, except on ARM32 which is no better than naive + + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pblendw256(cast(short16)a, cast(short16)b, imm8); + } + else static if (split) + { + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_blend_epi16!(imm8)(a_lo, b_lo); + __m128i r_hi = _mm_blend_epi16!(imm8)(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m256i A = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 0, -1, -2, -3, -4, -5, -6, -7); + __m256i B = _mm256_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15, -8, -9, -10, -11, -12, -13, -14, -15); + short16 C = cast(short16) _mm256_blend_epi16!147(A, B); // 10010011 10010011 + short[16] correct = [8, 9, 2, 3, 12, 5, 6, 15, -8, -9, -2, -3, -12, -5, -6, -15]; + assert(C.array == correct); +} + +/// Blend packed 32-bit integers from `a` and `b` using 4-bit control mask `imm8`. +__m128i _mm_blend_epi32(int imm8)(__m128i a, __m128i b) pure @trusted +{ + // This one is interesting, it is functionally equivalent to SSE4.1 blendps (_mm_blend_ps) + // So without AVX2 we can always fallback to _mm_blend_ps + // And indeed, a shufflevector!int4 doesn't even use vpblendd with LDC, and prefer + // blendps and shufps so why bother. + + // PERF DMD + static assert(imm8 >= 0 && imm8 < 16); + static if (GDC_with_AVX2) + { + return __builtin_ia32_pblendd128(a, b, imm8); + } + else + { + return cast(__m128i) _mm_blend_ps!imm8(cast(__m128)a, cast(__m128)b); + } +} +unittest +{ + __m128i A = _mm_setr_epi32(0, 1, 2, 3); + __m128i B = _mm_setr_epi32(8, 9, 10, 11); + int4 C = _mm_blend_epi32!13(A, B); // 1101 + int[4] correct = [8, 1, 10, 11]; + assert(C.array == correct); +} + +/// Blend packed 32-bit integers from `a` and `b` using 8-bit control mask `imm8`. +__m256i _mm256_blend_epi32(int imm8)(__m256i a, __m256i b) pure @trusted +{ + // This one is functionally equivalent to AVX _mm256_blend_ps, except with integers. + // With LDC, doing a shufflevector here would select the vblendps instruction anyway, + // so we might as well defer to _mm256_blend_ps. + + // PERF DMD + static assert(imm8 >= 0 && imm8 < 256); + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pblendd256 (cast(int8)a, cast(int8)b, imm8); + } + else + { + return cast(__m256i) _mm256_blend_ps!imm8(cast(__m256)a, cast(__m256)b); + } +} +unittest +{ + __m256i A = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + __m256i B = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 147, 15); + int8 C = cast(int8) _mm256_blend_epi32!0xe7(A, B); + int[8] correct = [8, 9, 10, 3, 4, 13, 147, 15]; + assert(C.array == correct); +} + +/// Blend packed 8-bit integers from `a` and `b` using `mask`. +/// Select from `b` if the high-order bit of the corresponding 8-bit element in `mask` is set, else select from `a`. 
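+/// A typical use feeds a comparison result as the mask; for instance a per-byte signed maximum
+/// (illustrative sketch, `maxSigned8` is not part of this module):
+/// ---
+/// __m256i maxSigned8(__m256i a, __m256i b)
+/// {
+///     return _mm256_blendv_epi8(a, b, _mm256_cmpgt_epi8(b, a)); // where b > a, take b
+/// }
+/// ---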
+ __m256i _mm256_blendv_epi8 (__m256i a, __m256i b, __m256i mask) pure @safe + { + static if (GDC_with_AVX2) + return cast(__m256i)__builtin_ia32_pblendvb256(cast(ubyte32)a, cast(ubyte32)b, cast(ubyte32)mask); + else static if (LDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pblendvb256(cast(byte32)a, cast(byte32)b, cast(byte32)mask); + } + else + { + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i m_lo = _mm256_extractf128_si256!0(mask); + __m128i m_hi = _mm256_extractf128_si256!1(mask); + __m128i r_lo = _mm_blendv_epi8(a_lo, b_lo, m_lo); + __m128i r_hi = _mm_blendv_epi8(a_hi, b_hi, m_hi); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31); + __m128i M = _mm_setr_epi8( 1, -1, 1, 1, -4, 1, -8, 127, + 1, 1, -1, -1, 4, 1, 8, -128); + __m256i AA = _mm256_set_m128i(A, A); + __m256i BB = _mm256_set_m128i(B, B); + __m256i MM = _mm256_set_m128i(M, M); + byte32 R = cast(byte32) _mm256_blendv_epi8(AA, BB, MM); + byte[32] correct = [ 0, 17, 2, 3, 20, 5, 22, 7, 8, 9, 26, 27, 12, 13, 14, 31, + 0, 17, 2, 3, 20, 5, 22, 7, 8, 9, 26, 27, 12, 13, 14, 31 ]; + assert(R.array == correct); +} + +/// Broadcast the low packed 8-bit integer from `a` to all elements of result. +__m128i _mm_broadcastb_epi8 (__m128i a) pure @safe +{ + byte16 ba = cast(byte16)a; + byte16 r; + r = ba.array[0]; + return cast(__m128i)r; +} +unittest +{ + byte16 A; + A.ptr[0] = 2; + byte16 B = cast(byte16) _mm_broadcastb_epi8(cast(__m128i)A); + byte[16] correct = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]; + assert(B.array == correct); +} + +/// Bro0adcast the low packed 8-bit integer from `a` to all elements of result. +__m256i _mm256_broadcastb_epi8(__m128i a) pure @safe +{ + byte16 ba = cast(byte16)a; + byte32 r; + r = ba.array[0]; + return cast(__m256i)r; +} +unittest +{ + byte16 A; + A.ptr[0] = 2; + byte32 B = cast(byte32) _mm256_broadcastb_epi8(cast(__m128i)A); + byte[32] correct = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]; + assert(B.array == correct); +} + +/// Broadcast the low packed 32-bit integer from `a` to all elements of result. +__m128i _mm_broadcastd_epi32 (__m128i a) pure @safe +{ + int4 ba = cast(int4)a; + int4 r; + r = ba.array[0]; + return cast(__m128i)r; +} +unittest +{ + int4 A; + A.ptr[0] = -2; + int4 B = cast(int4) _mm_broadcastd_epi32(cast(__m128i)A); + int[4] correct = [-2, -2, -2, -2]; + assert(B.array == correct); +} + +/// Broadcast the low packed 32-bit integer from `a` to all elements of result. +__m256i _mm256_broadcastd_epi32 (__m128i a) pure @safe +{ + int4 ba = cast(int4)a; + int8 r; + r = ba.array[0]; + return cast(__m256i)r; +} +unittest +{ + int4 A; + A.ptr[0] = -2; + int8 B = cast(int8) _mm256_broadcastd_epi32(cast(__m128i)A); + int[8] correct = [-2, -2, -2, -2, -2, -2, -2, -2]; + assert(B.array == correct); +} + +/// Broadcast the low packed 64-bit integer from `a` to all elements of result. 
+__m128i _mm_broadcastq_epi64 (__m128i a) pure @safe +{ + long2 ba = cast(long2)a; + long2 r; + r = ba.array[0]; + return cast(__m128i)r; +} +unittest +{ + long2 A; + A.ptr[0] = -2; + long2 B = cast(long2) _mm_broadcastq_epi64(cast(__m128i)A); + long[2] correct = [-2, -2]; + assert(B.array == correct); +} + +/// Broadcast the low packed 64-bit integer from `a` to all elements of result. +__m256i _mm256_broadcastq_epi64 (__m128i a) pure @safe +{ + long2 ba = cast(long2)a; + long4 r; + r = ba.array[0]; + return cast(__m256i)r; +} +unittest +{ + long2 A; + A.ptr[0] = -2; + long4 B = cast(long4) _mm256_broadcastq_epi64(cast(__m128i)A); + long[4] correct = [-2, -2, -2, -2]; + assert(B.array == correct); +} + +/// Broadcast the low double-precision (64-bit) floating-point element from `a` to all elements of result. +__m128d _mm_broadcastsd_pd (__m128d a) pure @safe +{ + double2 r; + r = a.array[0]; + return r; +} +unittest +{ + double2 A; + A.ptr[0] = 2; + double2 B = _mm_broadcastsd_pd(A); + double[2] correct = [2.0, 2.0]; + assert(B.array == correct); +} + +/// Broadcast the low double-precision (64-bit) floating-point element from `a` to all elements of result. +__m256d _mm256_broadcastsd_pd (__m128d a) pure @safe +{ + double4 r; + r = a.array[0]; + return r; +} +unittest +{ + double2 A; + A.ptr[0] = 3; + double4 B = _mm256_broadcastsd_pd(A); + double[4] correct = [3.0, 3, 3, 3]; + assert(B.array == correct); +} + +/// Broadcast 128 bits of integer data from ``a to all 128-bit lanes in result. +/// Note: also exist with name `_mm256_broadcastsi128_si256` which is identical. +__m256i _mm_broadcastsi128_si256 (__m128i a) pure @trusted +{ + // Note that GDC will prefer vinserti128 to vbroadcast, for some reason + // So in the end it's the same as naive code. + // For this reason, __builtin_ia32_vbroadcastsi256 isn't used + long2 ba = cast(long2)a; + long4 r; + r.ptr[0] = ba.array[0]; + r.ptr[1] = ba.array[1]; + r.ptr[2] = ba.array[0]; + r.ptr[3] = ba.array[1]; + return cast(__m256i)r; +} +unittest +{ + long2 A; + A.ptr[0] = 34; + A.ptr[1] = -56; + long4 B = cast(long4) _mm_broadcastsi128_si256(cast(__m128i)A); + long[4] correct = [34, -56, 34, -56]; + assert(B.array == correct); +} + +///ditto +alias _mm256_broadcastsi128_si256 = _mm_broadcastsi128_si256; // intrinsics is duplicated in the Guide, for some reason + +/// Broadcast the low single-precision (32-bit) floating-point element from `a` to all elements of result. +__m128 _mm_broadcastss_ps (__m128 a) pure @safe +{ + float4 r; + r = a.array[0]; + return r; +} +unittest +{ + float4 A; + A.ptr[0] = 2; + float4 B = _mm_broadcastss_ps(A); + float[4] correct = [2.0f, 2, 2, 2]; + assert(B.array == correct); +} + +/// Broadcast the low single-precision (32-bit) floating-point element from `a` to all elements of result. +__m256 _mm256_broadcastss_ps (__m128 a) pure @safe +{ + float8 r; + r = a.array[0]; + return r; +} +unittest +{ + float4 A; + A.ptr[0] = 2; + float8 B = _mm256_broadcastss_ps(A); + float[8] correct = [2.0f, 2, 2, 2, 2, 2, 2, 2]; + assert(B.array == correct); +} + +/// Broadcast the low packed 16-bit integer from `a` to all elements of result. 
+__m128i _mm_broadcastw_epi16 (__m128i a) pure @safe +{ + short8 ba = cast(short8)a; + short8 r; + r = ba.array[0]; + return cast(__m128i)r; +} +unittest +{ + short8 A; + A.ptr[0] = 13; + short8 B = cast(short8) _mm_broadcastw_epi16(cast(__m128i)A); + short[8] correct = [13, 13, 13, 13, 13, 13, 13, 13]; + assert(B.array == correct); +} + +/// Broadcast the low packed 16-bit integer from `a` to all elements of result. +__m256i _mm256_broadcastw_epi16 (__m128i a) pure @safe +{ + short8 ba = cast(short8)a; + short16 r; + r = ba.array[0]; + return cast(__m256i)r; +} +unittest +{ + short8 A; + A.ptr[0] = 13; + short16 B = cast(short16) _mm256_broadcastw_epi16(cast(__m128i)A); + short[16] correct = [13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13]; + assert(B.array == correct); +} + + +/// Shift 128-bit lanes in `a` left by `bytes` bytes while shifting in zeroes. +__m256i _mm256_bslli_epi128(ubyte bytes)(__m256i a) pure @trusted +{ + // Note: can't use __builtin_ia32_pslldqi256 with GDC, wants an immediate + // and even string mixin do not make it + // PERF: hence GDC AVX2 doesn't use the instruction, and nothing inlines very well in GDC either + static if (bytes >= 16) + { + return _mm256_setzero_si256(); + } + else static if (LDC_with_AVX2) + { + return cast(__m256i)__asm!(long4)("vpslldq $2, $1, $0", "=v,v,I", a, bytes); + } + else // split + { + __m128i lo = _mm_slli_si128!bytes(_mm256_extractf128_si256!0(a)); + __m128i hi = _mm_slli_si128!bytes(_mm256_extractf128_si256!1(a)); + return _mm256_set_m128i(hi, lo); + } +} +unittest +{ + __m256i a = _mm256_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + assert(_mm256_bslli_epi128!7(a).array == [72057594037927936, 650777868590383874, 1224979098644774912, 1808220633999610642]); +} + +/// Shift 128-bit lanes in `a` right by `bytes` bytes while shifting in zeroes. +__m256i _mm256_bsrli_epi128(ubyte bytes)(__m256i a) pure @trusted +{ + // Note: can't use __builtin_ia32_psrldqi256 with GDC, wants an immediate + // and even string mixin do not make it + // PERF: hence GDC AVX2 doesn't use the instruction, and nothing inlines very well in GDC either + static if (bytes >= 16) + { + return _mm256_setzero_si256(); + } + else static if (LDC_with_AVX2) + { + return cast(__m256i)__asm!(long4)("vpsrldq $2, $1, $0", "=v,v,I", a, bytes); + } + else // split + { + __m128i lo = _mm_srli_si128!bytes(_mm256_extractf128_si256!0(a)); + __m128i hi = _mm_srli_si128!bytes(_mm256_extractf128_si256!1(a)); + return _mm256_set_m128i(hi, lo); + } +} +unittest +{ + __m256i a = _mm256_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + assert(_mm256_bsrli_epi128!7(a).array == [1084818905618843912, 16, 2242261671028070680, 32]); +} + +/// Compare packed 16-bit integers in `a` and `b` for equality. 
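+/// Each result element is 0xFFFF where the inputs match and 0 elsewhere, so it can be used
+/// directly as a mask (illustrative):
+/// ---
+/// __m256i v   = _mm256_setr_epi16(1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4);
+/// __m256i key = _mm256_set1_epi16(3);
+/// __m256i onlyMatches = _mm256_and_si256(v, _mm256_cmpeq_epi16(v, key)); // keeps the 3s, zeroes the rest
+/// ---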
+__m256i _mm256_cmpeq_epi16 (__m256i a, __m256i b) pure @trusted +{ + // PERF: GDC without AVX + // PERF: DMD + static if (SIMD_COMPARISON_MASKS_32B) + { + // PERF: catastrophic in GDC without AVX2 + return cast(__m256i)(cast(short16)a == cast(short16)b); + } + else static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pcmpeqw256(cast(short16)a, cast(short16)b); + } + else version(LDC) + { + return cast(__m256i) equalMask!short16(cast(short16)a, cast(short16)b); + } + else + { + short16 sa = cast(short16)a; + short16 sb = cast(short16)b; + short16 sr; + for (int n = 0; n < 16; ++n) + { + bool cond = sa.array[n] == sb.array[n]; + sr.ptr[n] = cond ? -1 : 0; + } + return cast(__m256i) sr; + } +} +unittest +{ + short16 A = [-3, -2, -1, 0, 0, 1, 2, 3, -3, -2, -1, 0, 0, 1, 2, 3]; + short16 B = [ 4, 3, 2, 1, 0, -1, -2, -3, -3, 3, 2, 1, 0, -1, -2, -3]; + short[16] E = [ 0, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0]; + short16 R = cast(short16)(_mm256_cmpeq_epi16(cast(__m256i)A, cast(__m256i)B)); + assert(R.array == E); +} + +/// Compare packed 32-bit integers in `a` and `b` for equality. +__m256i _mm256_cmpeq_epi32 (__m256i a, __m256i b) pure @trusted +{ + // PERF: GDC without AVX + // PERF: DMD + static if (SIMD_COMPARISON_MASKS_32B) + { + // Quite bad in GDC -mavx (with no AVX2) + return cast(__m256i)(cast(int8)a == cast(int8)b); + } + else static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pcmpeqd256(cast(int8)a, cast(int8)b); + } + else version(LDC) + { + return cast(__m256i) equalMask!int8(cast(int8)a, cast(int8)b); + } + else + { + int8 ia = cast(int8)a; + int8 ib = cast(int8)b; + int8 ir; + for (int n = 0; n < 8; ++n) + { + bool cond = ia.array[n] == ib.array[n]; + ir.ptr[n] = cond ? -1 : 0; + } + return cast(__m256i) ir; + } +} +unittest +{ + int8 A = [-3, -2, -1, 0, -3, -2, -1, 0]; + int8 B = [ 4, -2, 2, 0, 4, -2, 2, 0]; + int[8] E = [ 0, -1, 0, -1, 0, -1, 0, -1]; + int8 R = cast(int8)(_mm256_cmpeq_epi32(cast(__m256i)A, cast(__m256i)B)); + assert(R.array == E); +} + +/// Compare packed 64-bit integers in `a` and `b` for equality. +__m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b) pure @trusted +{ + // PERF: GDC without AVX + // PERF: DMD + static if (SIMD_COMPARISON_MASKS_32B) + { + // Note: enabling this with DMD will probably lead to same bug as _mm_cmpeq_epi64 + return cast(__m256i)(cast(long4)a == cast(long4)b); + } + else static if (GDC_with_AVX2) + { + return cast(__m256i)__builtin_ia32_pcmpeqq256(cast(long4)a, cast(long4)b); + } + else version(LDC) + { + return cast(__m256i) equalMask!long4(cast(long4)a, cast(long4)b); + } + else + { + long4 la = cast(long4)a; + long4 lb = cast(long4)b; + long4 res; + res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0; + res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0; + res.ptr[2] = (la.array[2] == lb.array[2]) ? -1 : 0; + res.ptr[3] = (la.array[3] == lb.array[3]) ? -1 : 0; + return cast(__m256i)res; + } +} +unittest +{ + __m256i A = _mm256_setr_epi64(-1, -2, -1, -2); + __m256i B = _mm256_setr_epi64(-3, -2, -3, -3); + __m256i C = _mm256_setr_epi64(-1, -4, -1, -2); + long4 AB = cast(long4) _mm256_cmpeq_epi64(A, B); + long4 AC = cast(long4) _mm256_cmpeq_epi64(A, C); + long[4] correct1 = [ 0, -1, 0, 0]; + long[4] correct2 = [-1, 0, -1, -1]; + assert(AB.array == correct1); + assert(AC.array == correct2); +} + +/// Compare packed 8-bit integers in `a` and `b` for equality. 
+__m256i _mm256_cmpeq_epi8 (__m256i a, __m256i b) pure @trusted +{ + // PERF: GDC without AVX2, need split + // PERF: DMD + static if (SIMD_COMPARISON_MASKS_32B) + { + return cast(__m256i)(cast(byte32)a == cast(byte32)b); + } + else static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pcmpeqb256(cast(ubyte32)a, cast(ubyte32)b); + } + else version(LDC) + { + return cast(__m256i) equalMask!byte32(cast(byte32)a, cast(byte32)b); + } + else + { + byte32 ba = cast(byte32)a; + byte32 bb = cast(byte32)b; + byte32 br; + for (int n = 0; n < 32; ++n) + { + bool cond = ba.array[n] == bb.array[n]; + br.ptr[n] = cond ? -1 : 0; + } + return cast(__m256i) br; + } +} +unittest +{ + __m256i A = _mm256_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, + 1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 42); + __m256i B = _mm256_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1, + 2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1); + byte32 C = cast(byte32) _mm256_cmpeq_epi8(A, B); + byte[32] correct = [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1, + 0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, 0]; + assert(C.array == correct); +} + +/// Compare packed signed 16-bit integers in `a` and `b` for greater-than. +__m256i _mm256_cmpgt_epi16 (__m256i a, __m256i b) pure @safe +{ + version(GNU) + enum bool mayUseComparisonOperator = GDC_with_AVX2; // too slow in GDC without AVX2 + else + enum bool mayUseComparisonOperator = true; + + static if (SIMD_COMPARISON_MASKS_32B && mayUseComparisonOperator) + { + return cast(__m256i)(cast(short16)a > cast(short16)b); + } + else static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pcmpgtw256(cast(short16)a, cast(short16)b); + } + else // split + { + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_cmpgt_epi16(a_lo, b_lo); + __m128i r_hi = _mm_cmpgt_epi16(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + short16 A = [-3, -2, -1, 0, 0, 1, 2, 3, -3, -2, -1, 0, 0, 1, 2, 3]; + short16 B = [ 4, 3, 2, 1, 0, -1, -2, -3, 4, -3, 2, 1, 0, -1, -2, -3]; + short[16] E = [ 0, 0, 0, 0, 0, -1, -1, -1, 0, -1, 0, 0, 0, -1, -1, -1]; + short16 R = cast(short16)(_mm256_cmpgt_epi16(cast(__m256i)A, cast(__m256i)B)); + assert(R.array == E); +} + +/// Compare packed signed 32-bit integers in `a` and `b` for greater-than. 
+__m256i _mm256_cmpgt_epi32 (__m256i a, __m256i b) pure @safe
+{
+    version(GNU)
+        enum bool mayUseComparisonOperator = GDC_with_AVX2; // too slow in GDC otherwise
+    else
+        enum bool mayUseComparisonOperator = true;
+
+    static if (SIMD_COMPARISON_MASKS_32B && mayUseComparisonOperator)
+    {
+        return cast(__m256i)(cast(int8)a > cast(int8)b);
+    }
+    else static if (GDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_pcmpgtd256(cast(int8)a, cast(int8)b);
+    }
+    else // split
+    {
+        __m128i a_lo = _mm256_extractf128_si256!0(a);
+        __m128i a_hi = _mm256_extractf128_si256!1(a);
+        __m128i b_lo = _mm256_extractf128_si256!0(b);
+        __m128i b_hi = _mm256_extractf128_si256!1(b);
+        __m128i r_lo = _mm_cmpgt_epi32(a_lo, b_lo);
+        __m128i r_hi = _mm_cmpgt_epi32(a_hi, b_hi);
+        return _mm256_set_m128i(r_hi, r_lo);
+    }
+}
+unittest
+{
+    int8 A = [-3, 2, -1, 0, -3, 2, -1, 0];
+    int8 B = [ 4, -2, 2, 0, 4, -2, 2, 0];
+    int[8] E = [ 0, -1, 0, 0, 0, -1, 0, 0];
+    int8 R = cast(int8) _mm256_cmpgt_epi32(cast(__m256i)A, cast(__m256i)B);
+    assert(R.array == E);
+}
+
+/// Compare packed signed 64-bit integers in `a` and `b` for greater-than.
+__m256i _mm256_cmpgt_epi64 (__m256i a, __m256i b) pure @safe
+{
+    version(GNU)
+        enum bool mayUseComparisonOperator = GDC_with_AVX2; // too slow in GDC otherwise
+    else
+        enum bool mayUseComparisonOperator = true;
+
+    static if (SIMD_COMPARISON_MASKS_32B && mayUseComparisonOperator)
+    {
+        return cast(__m256i)(cast(long4)a > cast(long4)b);
+    }
+    else static if (GDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_pcmpgtq256(cast(long4)a, cast(long4)b);
+    }
+    else // split
+    {
+        __m128i a_lo = _mm256_extractf128_si256!0(a);
+        __m128i a_hi = _mm256_extractf128_si256!1(a);
+        __m128i b_lo = _mm256_extractf128_si256!0(b);
+        __m128i b_hi = _mm256_extractf128_si256!1(b);
+        __m128i r_lo = _mm_cmpgt_epi64(a_lo, b_lo);
+        __m128i r_hi = _mm_cmpgt_epi64(a_hi, b_hi);
+        return _mm256_set_m128i(r_hi, r_lo);
+    }
+}
+unittest
+{
+    __m256i A = _mm256_setr_epi64(-3, 2, 70, 2);
+    __m256i B = _mm256_setr_epi64 (4, -2, 4, -2);
+    long[4] correct = [ 0, -1, -1, -1 ];
+    long4 R = cast(long4)(_mm256_cmpgt_epi64(A, B));
+    assert(R.array == correct);
+}
+
+/// Compare packed signed 8-bit integers in `a` and `b` for greater-than.
+__m256i _mm256_cmpgt_epi8 (__m256i a, __m256i b) pure @safe
+{
+    version(GNU)
+        enum bool mayUseComparisonOperator = GDC_with_AVX2; // too slow in GDC without AVX2
+    else
+        enum bool mayUseComparisonOperator = true;
+
+    static if (SIMD_COMPARISON_MASKS_32B && mayUseComparisonOperator)
+    {
+        return cast(__m256i)(cast(byte32)a > cast(byte32)b);
+    }
+    else static if (GDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_pcmpgtb256(cast(ubyte32)a, cast(ubyte32)b);
+    }
+    else // split
+    {
+        __m128i a_lo = _mm256_extractf128_si256!0(a);
+        __m128i a_hi = _mm256_extractf128_si256!1(a);
+        __m128i b_lo = _mm256_extractf128_si256!0(b);
+        __m128i b_hi = _mm256_extractf128_si256!1(b);
+        __m128i r_lo = _mm_cmpgt_epi8(a_lo, b_lo);
+        __m128i r_hi = _mm_cmpgt_epi8(a_hi, b_hi);
+        return _mm256_set_m128i(r_hi, r_lo);
+    }
+}
+unittest
+{
+    __m256i A = _mm256_setr_epi8(1, 2, 3, 1, 127, -80, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1, 2, 3, 1, 127, -80, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
+    __m256i B = _mm256_setr_epi8(2, 2, 1, 2, -128, -42, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1, 2, 2, 1, 2, -128, -42, 2, 3, 2, 1, 0, 0, 1, 2, 1, 0);
+    byte32 C = cast(byte32) _mm256_cmpgt_epi8(A, B);
+    byte[32] correct = [0, 0,-1, 0, -1, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0, 0, 0,-1, 0, -1, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1,-1];
+    assert(C.array == correct);
+}
+
+
+/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers.
+__m256i _mm256_cvtepi16_epi32 (__m128i a) pure @trusted
+{
+    static if (GDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_pmovsxwd256(cast(short8)a);
+    }
+    else static if (LDC_with_optimizations)
+    {
+        enum ir = `
+            %r = sext <8 x i16> %0 to <8 x i32>
+            ret <8 x i32> %r`;
+        return cast(__m256i) LDCInlineIR!(ir, int8, short8)(cast(short8)a);
+    }
+    else
+    {
+        short8 sa = cast(short8)a;
+        int8 r;
+        r.ptr[0] = sa.array[0];
+        r.ptr[1] = sa.array[1];
+        r.ptr[2] = sa.array[2];
+        r.ptr[3] = sa.array[3];
+        r.ptr[4] = sa.array[4];
+        r.ptr[5] = sa.array[5];
+        r.ptr[6] = sa.array[6];
+        r.ptr[7] = sa.array[7];
+        return cast(__m256i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, -1, 0, -32768, 32767);
+    int8 C = cast(int8) _mm256_cvtepi16_epi32(A);
+    int[8] correct = [-1, 0, -32768, 32767, -1, 0, -32768, 32767];
+    assert(C.array == correct);
+}
+
+
+/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers.
+__m256i _mm256_cvtepi16_epi64 (__m128i a) pure @trusted
+{
+    static if (GDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_pmovsxwq256(cast(short8)a);
+    }
+    else static if (LDC_with_optimizations)
+    {
+        enum ir = `
+            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+            %r = sext <4 x i16> %v to <4 x i64>
+            ret <4 x i64> %r`;
+        return cast(__m256i) LDCInlineIR!(ir, long4, short8)(cast(short8)a);
+    }
+    else
+    {
+        // LDC x86 generates vpmovsxwq since LDC 1.12 -O1
+        short8 sa = cast(short8)a;
+        long4 r;
+        r.ptr[0] = sa.array[0];
+        r.ptr[1] = sa.array[1];
+        r.ptr[2] = sa.array[2];
+        r.ptr[3] = sa.array[3];
+        return cast(__m256i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi16(-1, 0, short.min, short.max, 2, 3, 4, 5);
+    long4 C = cast(long4) _mm256_cvtepi16_epi64(A);
+    long[4] correct = [-1, 0, short.min, short.max];
+    assert(C.array == correct);
+}
+
+/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers.
+__m256i _mm256_cvtepi32_epi64 (__m128i a) pure @trusted
+{
+    long4 r;
+    r.ptr[0] = a.array[0];
+    r.ptr[1] = a.array[1];
+    r.ptr[2] = a.array[2];
+    r.ptr[3] = a.array[3];
+    return cast(__m256i)r;
+}
+unittest
+{
+    __m128i A = _mm_setr_epi32(-1, 0, int.min, int.max);
+    long4 C = cast(long4) _mm256_cvtepi32_epi64(A);
+    long[4] correct = [-1, 0, int.min, int.max];
+    assert(C.array == correct);
+}
+
+/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers.
+__m256i _mm256_cvtepi8_epi16 (__m128i a) pure @trusted
+{
+    static if (GDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_pmovsxbw256(cast(ubyte16)a);
+    }
+    else static if (LDC_with_optimizations)
+    {
+        enum ir = `
+            %r = sext <16 x i8> %0 to <16 x i16>
+            ret <16 x i16> %r`;
+        return cast(__m256i) LDCInlineIR!(ir, short16, byte16)(cast(byte16)a);
+    }
+    else
+    {
+        short16 r;
+        byte16 ba = cast(byte16)a;
+        for (int n = 0; n < 16; ++n)
+        {
+            r.ptr[n] = ba.array[n];
+        }
+        return cast(__m256i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi8(-1, 0, byte.min, byte.max, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
+    short16 C = cast(short16) _mm256_cvtepi8_epi16(A);
+    short[16] correct = [-1, 0, byte.min, byte.max, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13];
+    assert(C.array == correct);
+}
+
+/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers.
+__m256i _mm256_cvtepi8_epi32 (__m128i a) pure @trusted
+{
+    static if (GDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_pmovsxbd256(cast(ubyte16)a);
+    }
+    else static if (LDC_with_optimizations)
+    {
+        enum ir = `
+            %v = shufflevector <16 x i8> %0,<16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+            %r = sext <8 x i8> %v to <8 x i32>
+            ret <8 x i32> %r`;
+        return cast(__m256i) LDCInlineIR!(ir, int8, byte16)(cast(byte16)a);
+    }
+    else
+    {
+        // PERF This is rather bad in GDC without AVX, or with DMD
+        // should split that
+        int8 r;
+        byte16 ba = cast(byte16)a;
+        for (int n = 0; n < 8; ++n)
+        {
+            r.ptr[n] = ba.array[n];
+        }
+        return cast(__m256i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi8(-1, 0, byte.min, byte.max, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
+    int8 C = cast(int8) _mm256_cvtepi8_epi32(A);
+    int[8] correct = [-1, 0, byte.min, byte.max, 2, 3, 4, 5];
+    assert(C.array == correct);
+}
+
+/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
+__m256i _mm256_cvtepi8_epi64 (__m128i a) pure @trusted
+{
+    // PERF This is rather bad in GDC without AVX
+    static if (GDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_pmovsxbq256(cast(ubyte16)a);
+    }
+    else static if (LDC_with_ARM64)
+    {
+        // 4 inst since LDC 1.22 -O2
+        return _mm256_cvtepi16_epi64(_mm_cvtepi8_epi16(a));
+    }
+    else static if (LDC_with_optimizations)
+    {
+        enum ir = `
+            %v = shufflevector <16 x i8> %0,<16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+            %r = sext <4 x i8> %v to <4 x i64>
+            ret <4 x i64> %r`;
+        return cast(__m256i) LDCInlineIR!(ir, long4, byte16)(cast(byte16)a);
+    }
+    else
+    {
+        long4 r;
+        byte16 ba = cast(byte16)a;
+        for (int n = 0; n < 4; ++n)
+        {
+            r.ptr[n] = ba.array[n];
+        }
+        return cast(__m256i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi8(-1, 0, byte.min, byte.max, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
+    long4 C = cast(long4) _mm256_cvtepi8_epi64(A);
+    long[4] correct = [-1, 0, byte.min, byte.max];
+    assert(C.array == correct);
+}
+
+/// Zero-extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
+__m256i _mm256_cvtepu16_epi32(__m128i a) pure @trusted
+{
+    static if (GDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_pmovzxwd256(cast(short8)a);
+    }
+    else
+    {
+        short8 sa = cast(short8)a;
+        int8 r;
+        r.ptr[0] = cast(ushort)sa.array[0];
+        r.ptr[1] = cast(ushort)sa.array[1];
+        r.ptr[2] = cast(ushort)sa.array[2];
+        r.ptr[3] = cast(ushort)sa.array[3];
+        r.ptr[4] = cast(ushort)sa.array[4];
+        r.ptr[5] = cast(ushort)sa.array[5];
+        r.ptr[6] = cast(ushort)sa.array[6];
+        r.ptr[7] = cast(ushort)sa.array[7];
+        return cast(__m256i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, -1, 0, -32768, 32767);
+    int8 C = cast(int8) _mm256_cvtepu16_epi32(A);
+    int[8] correct = [65535, 0, 32768, 32767, 65535, 0, 32768, 32767];
+    assert(C.array == correct);
+}
+
+/// Zero-extend packed unsigned 16-bit integers in `a` to packed 64-bit integers.
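+/// Unlike `_mm256_cvtepi16_epi64`, the source elements are treated as unsigned, so 0xFFFF widens
+/// to 65535 rather than -1 (illustrative):
+/// ---
+/// __m128i x = _mm_set1_epi16(-1);                  // every lane holds 0xFFFF
+/// long4 s = cast(long4) _mm256_cvtepi16_epi64(x);  // [-1, -1, -1, -1]
+/// long4 u = cast(long4) _mm256_cvtepu16_epi64(x);  // [65535, 65535, 65535, 65535]
+/// ---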
+__m256i _mm256_cvtepu16_epi64(__m128i a) pure @trusted
+{
+    static if (GDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_pmovzxwq256(cast(short8)a);
+    }
+    else static if (LDC_with_optimizations)
+    {
+        enum ir = `
+            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+            %r = zext <4 x i16> %v to <4 x i64>
+            ret <4 x i64> %r`;
+        return cast(__m256i) LDCInlineIR!(ir, long4, short8)(cast(short8)a);
+    }
+    else
+    {
+        short8 sa = cast(short8)a;
+        long4 r;
+        r.ptr[0] = cast(ushort)sa.array[0];
+        r.ptr[1] = cast(ushort)sa.array[1];
+        r.ptr[2] = cast(ushort)sa.array[2];
+        r.ptr[3] = cast(ushort)sa.array[3];
+        return cast(__m256i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 2, 3, 4, 5);
+    long4 C = cast(long4) _mm256_cvtepu16_epi64(A);
+    long[4] correct = [65535, 0, 32768, 32767];
+    assert(C.array == correct);
+}
+
+/// Zero-extend packed unsigned 32-bit integers in `a` to packed 64-bit integers.
+__m256i _mm256_cvtepu32_epi64 (__m128i a) pure @trusted
+{
+    static if (GDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_pmovzxdq256(cast(int4)a);
+    }
+    else static if (LDC_with_optimizations)
+    {
+        enum ir = `
+            %r = zext <4 x i32> %0 to <4 x i64>
+            ret <4 x i64> %r`;
+        return cast(__m256i) LDCInlineIR!(ir, long4, int4)(cast(int4)a);
+    }
+    else
+    {
+        long4 r;
+        r.ptr[0] = cast(uint)a.array[0];
+        r.ptr[1] = cast(uint)a.array[1];
+        r.ptr[2] = cast(uint)a.array[2];
+        r.ptr[3] = cast(uint)a.array[3];
+        return cast(__m256i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi32(-1, 0, int.min, int.max);
+    long4 C = cast(long4) _mm256_cvtepu32_epi64(A);
+    long[4] correct = [uint.max, 0, 2_147_483_648, int.max];
+    assert(C.array == correct);
+}
+
+/// Zero-extend packed unsigned 8-bit integers in `a` to packed 16-bit integers.
+__m256i _mm256_cvtepu8_epi16 (__m128i a) pure @trusted
+{
+    static if (GDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_pmovzxbw256(cast(ubyte16)a);
+    }
+    else static if (LDC_with_optimizations)
+    {
+        enum ir = `
+            %r = zext <16 x i8> %0 to <16 x i16>
+            ret <16 x i16> %r`;
+        return cast(__m256i) LDCInlineIR!(ir, short16, byte16)(cast(byte16)a);
+    }
+    else
+    {
+        short16 r;
+        byte16 ba = cast(byte16)a;
+        for (int n = 0; n < 16; ++n)
+        {
+            r.ptr[n] = cast(ubyte)ba.array[n];
+        }
+        return cast(__m256i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi8(-1, 0, -128, 127, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
+    short16 C = cast(short16) _mm256_cvtepu8_epi16(A);
+    short[16] correct = [255, 0, 128, 127, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13];
+    assert(C.array == correct);
+}
+
+/// Zero-extend packed unsigned 8-bit integers in `a` to packed 32-bit integers.
+__m256i _mm256_cvtepu8_epi32 (__m128i a) pure @trusted
+{
+    static if (GDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_pmovzxbd256(cast(ubyte16)a);
+    }
+    else static if (LDC_with_optimizations)
+    {
+        enum ir = `
+            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+            %r = zext <8 x i8> %v to <8 x i32>
+            ret <8 x i32> %r`;
+        return cast(__m256i) LDCInlineIR!(ir, int8, byte16)(cast(byte16)a);
+    }
+    else
+    {
+        int8 r;
+        byte16 ba = cast(byte16)a;
+        for (int n = 0; n < 8; ++n)
+        {
+            r.ptr[n] = cast(ubyte)ba.array[n];
+        }
+        return cast(__m256i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi8(-1, 0, -128, 127, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
+    int8 C = cast(int8) _mm256_cvtepu8_epi32(A);
+    int[8] correct = [255, 0, 128, 127, 2, 3, 4, 5];
+    assert(C.array == correct);
+}
+
+/// Zero-extend packed unsigned 8-bit integers in `a` to packed 64-bit integers.
+__m256i _mm256_cvtepu8_epi64 (__m128i a) pure @trusted
+{
+    // PERF ARM64+LDC, not awesome
+    static if (GDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_pmovzxbq256(cast(ubyte16)a);
+    }
+    else static if (LDC_with_optimizations)
+    {
+        enum ir = `
+            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+            %r = zext <4 x i8> %v to <4 x i64>
+            ret <4 x i64> %r`;
+        return cast(__m256i) LDCInlineIR!(ir, long4, byte16)(cast(byte16)a);
+    }
+    else
+    {
+        long4 r;
+        byte16 ba = cast(byte16)a;
+        for (int n = 0; n < 4; ++n)
+        {
+            r.ptr[n] = cast(ubyte)ba.array[n];
+        }
+        return cast(__m256i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi8(-1, 0, -128, 127, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
+    long4 C = cast(long4) _mm256_cvtepu8_epi64(A);
+    long[4] correct = [255, 0, 128, 127];
+    assert(C.array == correct);
+}
+
+/// Extract a 16-bit integer from `a`, selected with `index`.
+int _mm256_extract_epi16 (__m256i a, int index) pure @trusted
+{
+    short16 sa = cast(short16)a;
+    return sa.ptr[index & 15];
+}
+unittest
+{
+    short16 b;
+    b = 43;
+    assert(_mm256_extract_epi16(cast(__m256i)b, 7) == 43);
+}
+
+/// Extract an 8-bit integer from `a`, selected with `index`.
+int _mm256_extract_epi8 (__m256i a, int index) pure @trusted
+{
+    byte32 sa = cast(byte32)a;
+    return sa.ptr[index & 31];
+}
+unittest
+{
+    byte32 b;
+    b = -44;
+    assert(_mm256_extract_epi8(cast(__m256i)b, 5) == -44);
+    assert(_mm256_extract_epi8(cast(__m256i)b, 5 + 32) == -44);
+}
+
+/// Extract 128 bits (composed of integer data) from `a`, selected with `imm8`.
+__m128i _mm256_extracti128_si256(int imm8)(__m256i a) pure @trusted
+    if ( (imm8 == 0) || (imm8 == 1) )
+{
+    pragma(inline, true);
+
+    static if (GDC_with_AVX2)
+    {
+        return cast(__m128i) __builtin_ia32_extract128i256(a, imm8);
+    }
+    else static if (LDC_with_optimizations)
+    {
+        enum str = (imm8 == 1) ? " <i32 2, i32 3>" : " <i32 0, i32 1>";
+        enum ir = "%r = shufflevector <4 x i64> %0, <4 x i64> undef, <2 x i32>" ~ str ~ "\n" ~
+                  "ret <2 x i64> %r";
+        return cast(__m128i) LDCInlineIR!(ir, ulong2, ulong4)(cast(ulong4)a);
+    }
+    else
+    {
+        long4 al = cast(long4) a;
+        long2 ret;
+        ret.ptr[0] = (imm8==1) ? al.array[2] : al.array[0];
+        ret.ptr[1] = (imm8==1) ? al.array[3] : al.array[1];
+        return cast(__m128i) ret;
+    }
+}
+unittest
+{
+    __m256i A = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432 );
+    int[4] correct0 = [ -7, -1, 0, 9 ];
+    int[4] correct1 = [ -100, 100, 234, 432 ];
+    __m128i R0 = _mm256_extracti128_si256!(0)(A);
+    __m128i R1 = _mm256_extracti128_si256!(1)(A);
+    assert(R0.array == correct0);
+    assert(R1.array == correct1);
+}
+
+/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
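+/// Within each 128-bit lane, the result holds the four pair sums of `a` followed by the four pair
+/// sums of `b`; sums wrap on overflow. Illustrative example:
+/// ---
+/// __m256i A = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+/// short16 R = cast(short16) _mm256_hadd_epi16(A, A);
+/// // low lane: [3, 7, 11, 15, 3, 7, 11, 15], high lane: [19, 23, 27, 31, 19, 23, 27, 31]
+/// ---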
+__m256i _mm256_hadd_epi16 (__m256i a, __m256i b) pure @safe +{ + static if (GDC_or_LDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_phaddw256(cast(short16)a, cast(short16)b); + } + else + { + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_hadd_epi16(a_lo, b_lo); + __m128i r_hi = _mm_hadd_epi16(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m256i A = _mm256_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768, 1, -2, 4, 8, 16, 32, -1, -32768); + short16 C = cast(short16) _mm256_hadd_epi16(A, A); + short[16] correct = [ -1, 12, 48, 32767, -1, 12, 48, 32767, -1, 12, 48, 32767, -1, 12, 48, 32767]; + assert(C.array == correct); +} + +/// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results. +__m256i _mm256_hadd_epi32 (__m256i a, __m256i b) pure @safe +{ + static if (GDC_or_LDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_phaddd256(cast(int8)a, cast(int8)b); + } + else + { + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_hadd_epi32(a_lo, b_lo); + __m128i r_hi = _mm_hadd_epi32(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m256i A = _mm256_setr_epi32(1, -2, int.min, -1, 1, -2, int.min, -1); + __m256i B = _mm256_setr_epi32(1, int.max, 4, -4, 1, int.max, 4, -4); + int8 C = cast(int8) _mm256_hadd_epi32(A, B); + int[8] correct = [ -1, int.max, int.min, 0, -1, int.max, int.min, 0 ]; + assert(C.array == correct); +} + +/// Horizontally add adjacent pairs of signed 16-bit integers in `a` and `b` using saturation, and pack the signed 16-bit results. +__m256i _mm256_hadds_epi16 (__m256i a, __m256i b) pure @safe +{ + static if (GDC_or_LDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_phaddsw256(cast(short16)a, cast(short16)b); + } + else + { + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_hadds_epi16(a_lo, b_lo); + __m128i r_hi = _mm_hadds_epi16(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m256i A = _mm256_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768, 1, -2, 4, 8, 16, 32, -1, -32768); + short16 C = cast(short16) _mm256_hadds_epi16(A, A); + short[16] correct = [ -1, 12, 48, -32768, -1, 12, 48, -32768, -1, 12, 48, -32768, -1, 12, 48, -32768]; + assert(C.array == correct); +} + +/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results. 
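+/// For each adjacent pair, the second element is subtracted from the first (even index minus odd
+/// index), lane by lane, wrapping on overflow. Illustrative example:
+/// ---
+/// __m256i A = _mm256_setr_epi16(5, 1, 10, 4, 5, 1, 10, 4, 5, 1, 10, 4, 5, 1, 10, 4);
+/// short16 R = cast(short16) _mm256_hsub_epi16(A, A); // every element is 4 or 6
+/// ---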
+__m256i _mm256_hsub_epi16 (__m256i a, __m256i b) pure @safe
+{
+    static if (GDC_or_LDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_phsubw256(cast(short16)a, cast(short16)b);
+    }
+    else
+    {
+        __m128i a_lo = _mm256_extractf128_si256!0(a);
+        __m128i a_hi = _mm256_extractf128_si256!1(a);
+        __m128i b_lo = _mm256_extractf128_si256!0(b);
+        __m128i b_hi = _mm256_extractf128_si256!1(b);
+        __m128i r_lo = _mm_hsub_epi16(a_lo, b_lo);
+        __m128i r_hi = _mm_hsub_epi16(a_hi, b_hi);
+        return _mm256_set_m128i(r_hi, r_lo);
+    }
+}
+unittest
+{
+    __m256i A = _mm256_setr_epi16(1, 2, 4, 8, 16, 32, -1, -32768, 1, 2, 4, 8, 16, 32, -1, -32768);
+    short16 C = cast(short16) _mm256_hsub_epi16(A, A);
+    short[16] correct = [ -1, -4, -16, 32767, -1, -4, -16, 32767, -1, -4, -16, 32767, -1, -4, -16, 32767];
+    assert(C.array == correct);
+}
+
+/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
+__m256i _mm256_hsub_epi32 (__m256i a, __m256i b) pure @safe
+{
+    static if (GDC_or_LDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_phsubd256(cast(int8)a, cast(int8)b);
+    }
+    else
+    {
+        __m128i a_lo = _mm256_extractf128_si256!0(a);
+        __m128i a_hi = _mm256_extractf128_si256!1(a);
+        __m128i b_lo = _mm256_extractf128_si256!0(b);
+        __m128i b_hi = _mm256_extractf128_si256!1(b);
+        __m128i r_lo = _mm_hsub_epi32(a_lo, b_lo);
+        __m128i r_hi = _mm_hsub_epi32(a_hi, b_hi);
+        return _mm256_set_m128i(r_hi, r_lo);
+    }
+}
+unittest
+{
+    __m256i A = _mm256_setr_epi32(1, 2, int.min, 1, 1, 2, int.min, 1);
+    __m256i B = _mm256_setr_epi32(int.max, -1, 4, 4, int.max, -1, 4, 4);
+    int8 C = cast(int8) _mm256_hsub_epi32(A, B);
+    int[8] correct = [ -1, int.max, int.min, 0, -1, int.max, int.min, 0 ];
+    assert(C.array == correct);
+}
+
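+// Illustrative layout check (values arbitrary): the horizontal ops interleave `a`
+// and `b` per 128-bit lane, so the low lane holds the a-pairs then b-pairs of the
+// low halves, and the high lane the a-pairs then b-pairs of the high halves.
+unittest
+{
+    __m256i A = _mm256_setr_epi32(10, 1, 20, 2, 30, 3, 40, 4);
+    __m256i B = _mm256_setr_epi32(100, 1, 200, 2, 300, 3, 400, 4);
+    int8 R = cast(int8) _mm256_hsub_epi32(A, B);
+    int[8] correct = [9, 18, 99, 198, 27, 36, 297, 396];
+    assert(R.array == correct);
+}
+
+/// Horizontally subtract adjacent pairs of signed 16-bit integers in `a` and `b` using saturation, and pack the signed 16-bit results.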
+__m256i _mm256_hsubs_epi16 (__m256i a, __m256i b) pure @safe +{ + static if (GDC_or_LDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_phsubsw256(cast(short16)a, cast(short16)b); + } + else + { + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_hsubs_epi16(a_lo, b_lo); + __m128i r_hi = _mm_hsubs_epi16(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m256i A = _mm256_setr_epi16(1, -2, 4, 8, 32767, -1, -10, 32767, 1, -2, 4, 8, 32767, -1, -10, 32767); + short16 C = cast(short16) _mm256_hsubs_epi16(A, A); + short[16] correct = [ 3, -4, 32767, -32768, 3, -4, 32767, -32768, 3, -4, 32767, -32768, 3, -4, 32767, -32768 ]; + assert(C.array == correct); +} + + +// TODO __m128i _mm_i32gather_epi32 (int const* base_addr, __m128i vindex, const int scale) pure @safe +// TODO __m128i _mm_mask_i32gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale) pure @safe +// TODO __m256i _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale) pure @safe +// TODO __m256i _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale) pure @safe +// TODO __m128i _mm_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale) pure @safe +// TODO __m128i _mm_mask_i32gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale) pure @safe +// TODO __m256i _mm256_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale) pure @safe +// TODO __m256i _mm256_mask_i32gather_epi64 (__m256i src, __int64 const* base_addr, __m128i vindex, __m256i mask, const int scale) pure @safe +// TODO __m128d _mm_i32gather_pd (double const* base_addr, __m128i vindex, const int scale) pure @safe +// TODO __m128d _mm_mask_i32gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale) pure @safe +// TODO __m256d _mm256_i32gather_pd (double const* base_addr, __m128i vindex, const int scale) pure @safe +// TODO __m256d _mm256_mask_i32gather_pd (__m256d src, double const* base_addr, __m128i vindex, __m256d mask, const int scale) pure @safe +// TODO __m128 _mm_i32gather_ps (float const* base_addr, __m128i vindex, const int scale) pure @safe +// TODO __m128 _mm_mask_i32gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale) pure @safe +// TODO __m256 _mm256_i32gather_ps (float const* base_addr, __m256i vindex, const int scale) pure @safe +// TODO __m256 _mm256_mask_i32gather_ps (__m256 src, float const* base_addr, __m256i vindex, __m256 mask, const int scale) pure @safe +// TODO __m128i _mm_i64gather_epi32 (int const* base_addr, __m128i vindex, const int scale) pure @safe +// TODO __m128i _mm_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale) pure @safe +// TODO __m128i _mm256_i64gather_epi32 (int const* base_addr, __m256i vindex, const int scale) pure @safe +// TODO __m128i _mm256_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m256i vindex, __m128i mask, const int scale) pure @safe +// TODO __m128i _mm_i64gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale) pure @safe +// TODO __m128i _mm_mask_i64gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale) pure @safe +// TODO __m256i 
_mm256_i64gather_epi64 (__int64 const* base_addr, __m256i vindex, const int scale) pure @safe +// TODO __m256i _mm256_mask_i64gather_epi64 (__m256i src, __int64 const* base_addr, __m256i vindex, __m256i mask, const int scale) pure @safe +// TODO __m128d _mm_i64gather_pd (double const* base_addr, __m128i vindex, const int scale) pure @safe +// TODO __m128d _mm_mask_i64gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale) pure @safe +// TODO __m256d _mm256_i64gather_pd (double const* base_addr, __m256i vindex, const int scale) pure @safe +// TODO __m256d _mm256_mask_i64gather_pd (__m256d src, double const* base_addr, __m256i vindex, __m256d mask, const int scale) pure @safe +// TODO __m128 _mm_i64gather_ps (float const* base_addr, __m128i vindex, const int scale) pure @safe +// TODO __m128 _mm_mask_i64gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale) pure @safe +// TODO __m128 _mm256_i64gather_ps (float const* base_addr, __m256i vindex, const int scale) pure @safe +// TODO __m128 _mm256_mask_i64gather_ps (__m128 src, float const* base_addr, __m256i vindex, __m128 mask, const int scale) pure @safe + + +/// Copy `a` to result, then insert 128 bits from `b` into result at the location specified by +/// `imm8`. +__m256i _mm256_inserti128_si256 (__m256i a, __m128i b, const int imm8) pure @trusted +{ + long2 lb = cast(long2)b; + a.ptr[(imm8 & 1)*2 ] = lb.array[0]; + a.ptr[(imm8 & 1)*2+1] = lb.array[1]; + return a; +} +unittest +{ + __m256i A = [0, 1, 2, 3]; + long2 B = [4, 5]; + __m256i C = _mm256_inserti128_si256(A, cast(__m128i)B, 0 + 8); + __m256i D = _mm256_inserti128_si256(A, cast(__m128i)B, 1); + long[4] correctC = [4, 5, 2, 3]; + long[4] correctD = [0, 1, 4, 5]; + assert(C.array == correctC); + assert(D.array == correctD); +} + +/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate +/// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, +/// and pack the results in destination. +__m256i _mm256_madd_epi16 (__m256i a, __m256i b) pure @trusted +{ + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pmaddwd256(cast(short16)a, cast(short16)b); + } + else static if (LDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pmaddwd256(cast(short16)a, cast(short16)b); + } + else + { + // split is beneficial for ARM64, LDC and GDC without AVX2 + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_madd_epi16(a_lo, b_lo); + __m128i r_hi = _mm_madd_epi16(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + short16 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767, 0, 1, 2, 3, -32768, -32768, 32767, 32767]; + short16 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767, 0, 1, 2, 3, -32768, -32768, 32767, 32767]; + int8 R = cast(int8) _mm256_madd_epi16(cast(__m256i)A, cast(__m256i)B); + int[8] correct = [1, 13, -2147483648, 2*32767*32767, 1, 13, -2147483648, 2*32767*32767]; + assert(R.array == correct); +} + +/// Vertically multiply each unsigned 8-bit integer from `a` with the corresponding +/// signed 8-bit integer from `b`, producing intermediate signed 16-bit integers. +/// Horizontally add adjacent pairs of intermediate signed 16-bit integers, +/// and pack the saturated results. 
+__m256i _mm256_maddubs_epi16 (__m256i a, __m256i b) @safe +{ + static if (GDC_with_AVX2) + { + return cast(__m256i)__builtin_ia32_pmaddubsw256(cast(ubyte32)a, cast(ubyte32)b); + } + else static if (LDC_with_AVX2) + { + return cast(__m256i)__builtin_ia32_pmaddubsw256(cast(byte32)a, cast(byte32)b); + } + else + { + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_maddubs_epi16(a_lo, b_lo); + __m128i r_hi = _mm_maddubs_epi16(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m128i A = _mm_setr_epi8( -1, 10, 100, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); // u8 + __m128i B = _mm_setr_epi8(-128, -30, 100, 127, -1, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0); // i8 + __m256i AA = _mm256_set_m128i(A, A); + __m256i BB = _mm256_set_m128i(B, B); + short16 C = cast(short16) _mm256_maddubs_epi16(AA, BB); + short[16] correct = [ -32768, 26256, 0, 0, 0, 0, 0, 0, + -32768, 26256, 0, 0, 0, 0, 0, 0]; + assert(C.array == correct); +} + +version(DigitalMars) +{ + // this avoids a bug with DMD < 2.099 -a x86 -O + private enum bool maskLoadWorkaroundDMD = (__VERSION__ < 2099); +} +else +{ + private enum bool maskLoadWorkaroundDMD = false; +} + +/// Load packed 32-bit integers from memory using `mask` (elements are zeroed out when the highest +/// bit is not set in the corresponding element). +/// Warning: See "Note about mask load/store" to know why you must address valid memory only. +__m128i _mm_maskload_epi32 (const(int)* mem_addr, __m128i mask) /* pure */ @system +{ + // PERF DMD + static if (LDC_with_AVX2) + { + // MAYDO report that the builtin is impure + return __builtin_ia32_maskloadd(mem_addr, mask); + } + else static if (GDC_with_AVX2) + { + return __builtin_ia32_maskloadd(cast(__m128i*)mem_addr, mask); + } + else + { + return cast(__m128i) _mm_maskload_ps(cast(const(float)*)mem_addr, mask); + } +} +unittest +{ + static if (!maskLoadWorkaroundDMD) + { + int[4] A = [7, 1, 2, 3]; + int4 B = _mm_maskload_epi32(A.ptr, _mm_setr_epi32(1, -1, -1, 1)); // can NOT address invalid memory with mask load and writes! + int[4] correct = [0, 1, 2, 0]; + assert(B.array == correct); + } +} + +/// Load packed 32-bit integers from memory using `mask` (elements are zeroed out when the highest +/// bit is not set in the corresponding element). +/// Warning: See "Note about mask load/store" to know why you must address valid memory only. +__m256i _mm256_maskload_epi32 (const(int)* mem_addr, __m256i mask) /* pure */ @system +{ + static if (LDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_maskloadd256(mem_addr, cast(int8)mask); + } + else static if (GDC_with_AVX2) + { + return cast(__m256i)__builtin_ia32_maskloadd256(cast(__m256i*)mem_addr, cast(int8)mask); + } + else + { + return cast(__m256i) _mm256_maskload_ps(cast(const(float*)) mem_addr, mask); + } +} +unittest +{ + int[8] A = [7, 1, 2, 3, 8, -2, 4, 5]; + int8 B = cast(int8) _mm256_maskload_epi32(A.ptr, _mm256_setr_epi32(1, -1, -1, 1, -1, -1, 1, 1)); + int[8] correct = [0, 1, 2, 0, 8, -2, 0, 0]; + assert(B.array == correct); +} + +/// Load packed 64-bit integers from memory using `mask` (elements are zeroed out when the highest +/// bit is not set in the corresponding element). +/// Warning: See "Note about mask load/store" to know why you must address valid memory only. 
+__m128i _mm_maskload_epi64 (const(long)* mem_addr, __m128i mask) @system
+{
+    // PERF DMD
+    static if (LDC_with_AVX2)
+    {
+        return cast(__m128i) __builtin_ia32_maskloadq(mem_addr, cast(long2) mask);
+    }
+    else static if (GDC_with_AVX2)
+    {
+        return cast(__m128i) __builtin_ia32_maskloadq(cast(long2*)mem_addr, cast(long2) mask);
+    }
+    else
+    {
+        return cast(__m128i) _mm_maskload_pd(cast(const(double)*)mem_addr, mask);
+    }
+}
+unittest
+{
+    static if (!maskLoadWorkaroundDMD)
+    {
+        long[2] A = [-7, -8];
+        long2 B = cast(long2) _mm_maskload_epi64(A.ptr, _mm_setr_epi64(1, -1));
+        long[2] correct = [0, -8];
+        assert(B.array == correct);
+    }
+}
+
+/// Load packed 64-bit integers from memory using `mask` (elements are zeroed out when the highest
+/// bit is not set in the corresponding element).
+/// Warning: See "Note about mask load/store" to know why you must address valid memory only.
+__m256i _mm256_maskload_epi64 (const(long)* mem_addr, __m256i mask) /* pure */ @system
+{
+    static if (LDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_maskloadq256(mem_addr, cast(long4)mask);
+    }
+    else static if (GDC_with_AVX2)
+    {
+        return cast(__m256i)__builtin_ia32_maskloadq256(cast(__m256i*)mem_addr, cast(long4)mask);
+    }
+    else
+    {
+        return cast(__m256i) _mm256_maskload_pd(cast(const(double*)) mem_addr, mask);
+    }
+}
+unittest
+{
+    long[4] A = [ 8, -2, 4, 5];
+    long4 B = cast(long4) _mm256_maskload_epi64(A.ptr, _mm256_setr_epi64(1, -1, -1, 1));
+    long[4] correct = [0, -2, 4, 0];
+    assert(B.array == correct);
+}
+
+/// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values.
+__m256i _mm256_max_epi16 (__m256i a, __m256i b) pure @safe
+{
+    // PERF D_SIMD
+    version(GNU)
+        enum bool split = true;
+    else static if (SIMD_COMPARISON_MASKS_32B)
+        enum bool split = false;
+    else
+        enum bool split = true;
+
+    static if (GDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_pmaxsw256(cast(short16)a, cast(short16)b);
+    }
+    else static if (split)
+    {
+        // split
+        __m128i a_lo = _mm256_extractf128_si256!0(a);
+        __m128i a_hi = _mm256_extractf128_si256!1(a);
+        __m128i b_lo = _mm256_extractf128_si256!0(b);
+        __m128i b_hi = _mm256_extractf128_si256!1(b);
+        __m128i r_lo = _mm_max_epi16(a_lo, b_lo);
+        __m128i r_hi = _mm_max_epi16(a_hi, b_hi);
+        return _mm256_set_m128i(r_hi, r_lo);
+    }
+    else static if (SIMD_COMPARISON_MASKS_32B)
+    {
+        // catastrophic with GDC x86 for some reason. Sad.
+        short16 sa = cast(short16)a;
+        short16 sb = cast(short16)b;
+        short16 greater = sa > sb;
+        return cast(__m256i)( (greater & sa) | (~greater & sb) );
+    }
+    else
+        static assert(0);
+}
+unittest
+{
+    short16 R = cast(short16) _mm256_max_epi16(_mm256_setr_epi16(32767, 1, -4, -8, 9, 7, 0,-57, 1, 0, 0, 0, 1, 0, 0, 0),
+                                               _mm256_setr_epi16(   -4,-8,  9,  7, 0,-32768, 0, 0, 0, 2, 0, 4, 2, 1, 2, -4));
+    short[16] correct =                                         [32767, 1,  9,  7, 9, 7, 0, 0, 1, 2, 0, 4, 2, 1, 2, 0];
+    assert(R.array == correct);
+}
+
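+// Illustrative cross-check (values arbitrary): _mm256_max_epi16 matches a scalar
+// per-lane maximum.
+unittest
+{
+    short16 a = cast(short16) _mm256_setr_epi16(-5, 4, 3, -2, 1, 0, -1, 2, 7, -7, 8, -8, 9, -9, 10, -10);
+    short16 b = cast(short16) _mm256_setr_epi16( 5,-4,-3,  2,-1, 0,  1,-2,-7,  7,-8,  8,-9,  9,-10, 10);
+    short16 r = cast(short16) _mm256_max_epi16(cast(__m256i)a, cast(__m256i)b);
+    foreach (i; 0..16)
+    {
+        int expected = (a.array[i] > b.array[i]) ? a.array[i] : b.array[i];
+        assert(r.array[i] == expected);
+    }
+}
+
+/// Compare packed signed 32-bit integers in `a` and `b`, and return packed maximum values.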
+__m256i _mm256_max_epi32 (__m256i a, __m256i b) pure @safe +{ + // PERF D_SIMD + version(GNU) + enum bool split = true; + else static if (SIMD_COMPARISON_MASKS_32B) + enum bool split = false; + else + enum bool split = true; + + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pmaxsd256(cast(int8)a, cast(int8)b); + } + else static if (split) + { + // split + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_max_epi32(a_lo, b_lo); + __m128i r_hi = _mm_max_epi32(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } + else static if (SIMD_COMPARISON_MASKS_32B) + { + // catastrophic with GDC x86 for some reason, like for 16-bit numbers. + int8 sa = cast(int8)a; + int8 sb = cast(int8)b; + int8 greater = sa > sb; + return cast(__m256i)( (greater & sa) | (~greater & sb) ); + } + else + static assert(0); +} +unittest +{ + int8 R = cast(int8) _mm256_max_epi32(_mm256_setr_epi32(0x7fffffff, 1, -4, 7, 0x7fffffff, 2, -4, 7), + _mm256_setr_epi32( -4,-8, 9, -8,-0x80000000,-8, 9, -8)); + int[8] correct = [0x7fffffff, 1, 9, 7, 0x7fffffff, 2, 9, 7]; + assert(R.array == correct); +} + +/// Compare packed signed 8-bit integers in `a` and `b`, and return packed maximum values. +__m256i _mm256_max_epi8 (__m256i a, __m256i b) pure @trusted +{ + // PERF D_SIMD + version(GNU) + enum bool split = true; + else static if (SIMD_COMPARISON_MASKS_32B) + enum bool split = false; + else + enum bool split = true; + static if (GDC_with_AVX2) + { + // Strangely, GDC asks for unsigned ubyte32 + return cast(__m256i) __builtin_ia32_pmaxsb256(cast(ubyte32)a, cast(ubyte32)b); + } + else static if (split) + { + // split + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_max_epi8(a_lo, b_lo); + __m128i r_hi = _mm_max_epi8(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } + else static if (SIMD_COMPARISON_MASKS_32B) + { + // This is real bad with GDC, again + byte32 sa = cast(byte32)a; + byte32 sb = cast(byte32)b; + byte32 greater = cast(byte32)(sa > sb); + return cast(__m256i)( (greater & sa) | (~greater & sb) ); + } + else + static assert(false); +} +unittest +{ + __m256i A = _mm256_setr_epi8(127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0, 127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0); + __m256i B = _mm256_setr_epi8( 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0); + byte32 R = cast(byte32) _mm256_max_epi8(A, B); + byte[32] correct = [127, 1, 9, -7, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0, 127, 1, 9, -7, 9, 7, 0, 57, 0, 0, 0, 0, 0, 4, 0, 0]; + assert(R.array == correct); +} + +/// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed maximum values. 
+__m256i _mm256_max_epu16 (__m256i a, __m256i b) pure @trusted +{ + // PERF D_SIMD + version(GNU) + enum bool split = true; + else static if (SIMD_COMPARISON_MASKS_32B) + enum bool split = false; + else + enum bool split = true; + + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pmaxuw256(cast(short16)a, cast(short16)b); + } + else static if (split) + { + // split + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_max_epu16(a_lo, b_lo); + __m128i r_hi = _mm_max_epu16(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } + else static if (SIMD_COMPARISON_MASKS_32B) + { + // catastrophic with GDC x86_64, good with LDC + short16 sa = cast(short16)a; + short16 sb = cast(short16)b; + short16 greater = cast(short16)(cast(ushort16)sa > cast(ushort16)sb); + return cast(__m256i)( (greater & sa) | (~greater & sb) ); + } + else + static assert(false); +} +unittest +{ + short16 R = cast(short16) _mm256_max_epu16(_mm256_setr_epi16(32767, 1, -4, -8, 9, 7, 0,-57, 1, 0, 0, 0, 1, 0, 0, -6), + _mm256_setr_epi16( -4,-8, 9, 7, 0,-32768, 0, 0, 0, 2, 0, 4, 2, 1, 2, -4)); + short[16] correct = [-4,-8, -4, -8, 9,-32768, 0,-57, 1, 2, 0, 4, 2, 1, 2, -4]; + assert(R.array == correct); +} + +/// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed maximum values. +__m256i _mm256_max_epu32 (__m256i a, __m256i b) pure @safe +{ + // PERF D_SIMD + version(GNU) + enum bool split = true; + else static if (SIMD_COMPARISON_MASKS_32B) + enum bool split = false; + else + enum bool split = true; + + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pmaxud256(cast(int8)a, cast(int8)b); + } + else static if (split) + { + // split + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_max_epu32(a_lo, b_lo); + __m128i r_hi = _mm_max_epu32(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } + else static if (SIMD_COMPARISON_MASKS_32B) + { + // catastrophic with GDC x86 for some reason, like for 16-bit numbers. + uint8 sa = cast(uint8)a; + uint8 sb = cast(uint8)b; + uint8 greater = sa > sb; + return cast(__m256i)( (greater & sa) | (~greater & sb) ); + } + else + static assert(0); +} +unittest +{ + int8 R = cast(int8) _mm256_max_epu32(_mm256_setr_epi32(0x7fffffff, 1, 4, -7, 0x7fffffff, 1, 11, -7), + _mm256_setr_epi32( -4,-8, 9, -8, -4,-8, 9, -8)); + int[8] correct = [ -4,-8, 9, -7, -4,-8, 11, -7]; + assert(R.array == correct); +} + +/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values. 
+__m256i _mm256_max_epu8 (__m256i a, __m256i b) pure @safe
+{
+    // PERF D_SIMD
+    version(GNU)
+        enum bool split = true;
+    else static if (SIMD_COMPARISON_MASKS_32B)
+        enum bool split = false;
+    else
+        enum bool split = true;
+    static if (GDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_pmaxub256(cast(ubyte32)a, cast(ubyte32)b);
+    }
+    else static if (split)
+    {
+        // split
+        __m128i a_lo = _mm256_extractf128_si256!0(a);
+        __m128i a_hi = _mm256_extractf128_si256!1(a);
+        __m128i b_lo = _mm256_extractf128_si256!0(b);
+        __m128i b_hi = _mm256_extractf128_si256!1(b);
+        __m128i r_lo = _mm_max_epu8(a_lo, b_lo);
+        __m128i r_hi = _mm_max_epu8(a_hi, b_hi);
+        return _mm256_set_m128i(r_hi, r_lo);
+    }
+    else static if (SIMD_COMPARISON_MASKS_32B)
+    {
+        // This is real bad with GDC, again
+        ubyte32 sa = cast(ubyte32)a;
+        ubyte32 sb = cast(ubyte32)b;
+        ubyte32 greater = cast(ubyte32)(sa > sb);
+        return cast(__m256i)( (greater & sa) | (~greater & sb) );
+    }
+    else
+        static assert(false);
+}
+unittest
+{
+    byte32 R = cast(byte32) _mm256_max_epu8(_mm256_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0),
+                                            _mm256_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57));
+    byte[32] correct =                                      [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
+    assert(R.array == correct);
+}
+
+/// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
+__m256i _mm256_min_epi16 (__m256i a, __m256i b) pure @safe
+{
+    // PERF D_SIMD
+    version(GNU)
+        enum bool split = true;
+    else static if (SIMD_COMPARISON_MASKS_32B)
+        enum bool split = false;
+    else
+        enum bool split = true;
+
+    static if (GDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_pminsw256(cast(short16)a, cast(short16)b);
+    }
+    else static if (split)
+    {
+        // split
+        __m128i a_lo = _mm256_extractf128_si256!0(a);
+        __m128i a_hi = _mm256_extractf128_si256!1(a);
+        __m128i b_lo = _mm256_extractf128_si256!0(b);
+        __m128i b_hi = _mm256_extractf128_si256!1(b);
+        __m128i r_lo = _mm_min_epi16(a_lo, b_lo);
+        __m128i r_hi = _mm_min_epi16(a_hi, b_hi);
+        return _mm256_set_m128i(r_hi, r_lo);
+    }
+    else static if (SIMD_COMPARISON_MASKS_32B)
+    {
+        // same as _mm256_max_epi16, this is catastrophic with GDC -mavx
+        short16 sa = cast(short16)a;
+        short16 sb = cast(short16)b;
+        short16 greater = sa > sb;
+        return cast(__m256i)( (~greater & sa) | (greater & sb) );
+    }
+    else
+        static assert(0);
+}
+unittest
+{
+    short16 R = cast(short16) _mm256_min_epi16(_mm256_setr_epi16(32767, 1, -4, -8, 9, 7, 0,-57, 1, 0, 0, 0, 1, 0, 0, 0),
+                                               _mm256_setr_epi16(   -4,-8,  9,  7, 0,-32768, 0, 0, 0, 2, 0, 4, 2, 1, 2, -4));
+    short[16] correct =                                         [   -4,-8, -4, -8, 0,-32768, 0,-57, 0, 0, 0, 0, 1, 0, 0, -4];
+    assert(R.array == correct);
+}
+
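+// Illustrative identity (values arbitrary): per lane, min and max are just a
+// permutation of (a, b), so their sum equals a + b.
+unittest
+{
+    __m256i A = _mm256_setr_epi16(3, -3, 7, -7, 0, 1, -1, 32767, 5, -5, 11, -11, 2, -2, 0, -32768);
+    __m256i B = _mm256_setr_epi16(-3, 3, -7, 7, 1, 0, 1, -32768, -5, 5, -11, 11, -2, 2, 1, 32767);
+    short16 lo  = cast(short16) _mm256_min_epi16(A, B);
+    short16 hi  = cast(short16) _mm256_max_epi16(A, B);
+    short16 sum = cast(short16)A + cast(short16)B;
+    foreach (i; 0..16)
+        assert(cast(short)(lo.array[i] + hi.array[i]) == sum.array[i]);
+}
+
+/// Compare packed signed 32-bit integers in `a` and `b`, and return packed minimum values.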
+__m256i _mm256_min_epi32 (__m256i a, __m256i b) pure @safe +{ + // PERF D_SIMD + version(GNU) + enum bool split = true; + else static if (SIMD_COMPARISON_MASKS_32B) + enum bool split = false; + else + enum bool split = true; + + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pminsd256(cast(int8)a, cast(int8)b); + } + else static if (split) + { + // split + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_min_epi32(a_lo, b_lo); + __m128i r_hi = _mm_min_epi32(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } + else static if (SIMD_COMPARISON_MASKS_32B) + { + // Not checked this one, probably same badness issue with GDC + int8 sa = cast(int8)a; + int8 sb = cast(int8)b; + int8 greater = sa > sb; + return cast(__m256i)( (~greater & sa) | (greater & sb) ); + } + else + static assert(0); +} +unittest +{ + int8 R = cast(int8) _mm256_min_epi32(_mm256_setr_epi32(0x7fffffff, 1, -4, 7, 0x7fffffff, 2, -4, 7), + _mm256_setr_epi32( -4,-8, 9, -8,-0x80000000,-8, 9, -8)); + int[8] correct = [ - 4,-8, -4, -8,-0x80000000,-8, -4, -8]; + assert(R.array == correct); +} + + +/// Compare packed signed 8-bit integers in `a` and `b`, and return packed minimum values. +__m256i _mm256_min_epi8 (__m256i a, __m256i b) pure @trusted +{ + // PERF D_SIMD + version(GNU) + enum bool split = true; + else static if (SIMD_COMPARISON_MASKS_32B) + enum bool split = false; + else + enum bool split = true; + static if (GDC_with_AVX2) + { + // Strangely, GDC asks for unsigned ubyte32 + return cast(__m256i) __builtin_ia32_pminsb256(cast(ubyte32)a, cast(ubyte32)b); + } + else static if (split) + { + // split + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_min_epi8(a_lo, b_lo); + __m128i r_hi = _mm_min_epi8(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } + else static if (SIMD_COMPARISON_MASKS_32B) + { + // This is real bad with GDC, again + byte32 sa = cast(byte32)a; + byte32 sb = cast(byte32)b; + byte32 greater = cast(byte32)(sa > sb); + return cast(__m256i)( (~greater & sa) | (greater & sb) ); + } + else + static assert(false); +} +unittest +{ + __m256i A = _mm256_setr_epi8(127, 1, -4, -8, 9, 7, 0, -57, 0, 0, 0, 0, 0, 0, 0, 0, 127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0); + __m256i B = _mm256_setr_epi8( 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, -4, 0, 0); + byte32 R = cast(byte32) _mm256_min_epi8(A, B); + byte[32] correct = [ 4, -8, -4, -8, 0, -128, 0, -57, 0, 0, 0, 0, 0, 0, 0, 0, 4, -8, -4, -8, 0, -128, 0, 0, 0, 0, 0, 0, 0, -4, 0, 0]; + assert(R.array == correct); +} + +/// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed minimum values. 
+__m256i _mm256_min_epu16 (__m256i a, __m256i b) pure @trusted +{ + // PERF D_SIMD + version(GNU) + enum bool split = true; + else static if (SIMD_COMPARISON_MASKS_32B) + enum bool split = false; + else + enum bool split = true; + + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pminuw256(cast(short16)a, cast(short16)b); + } + else static if (split) + { + // split + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_min_epu16(a_lo, b_lo); + __m128i r_hi = _mm_min_epu16(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } + else static if (SIMD_COMPARISON_MASKS_32B) + { + // catastrophic with GDC x86_64 + short16 sa = cast(short16)a; + short16 sb = cast(short16)b; + short16 greater = cast(short16)(cast(ushort16)sa > cast(ushort16)sb); + return cast(__m256i)( (~greater & sa) | (greater & sb) ); + } + else + static assert(false); +} +unittest +{ + short16 R = cast(short16) _mm256_min_epu16(_mm256_setr_epi16(32767, 1, -4, -8, 9, 7, 0,-57, 1, 0, 0, 0, 1, 0, 0, -6), + _mm256_setr_epi16( -4, -8, 9, 7, 0,-32768, 0, 0, 0, 2, 0, 4, 2, 1, 2, -4)); + short[16] correct = [32767, 1, 9, 7, 0, 7, 0, 0, 0, 0, 0, 0, 1, 0, 0, -6]; + assert(R.array == correct); +} + +/// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed minimum values. +__m256i _mm256_min_epu32 (__m256i a, __m256i b) pure @safe +{ + // PERF D_SIMD + version(GNU) + enum bool split = true; + else static if (SIMD_COMPARISON_MASKS_32B) + enum bool split = false; + else + enum bool split = true; + + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pminud256(cast(int8)a, cast(int8)b); + } + else static if (split) + { + // split + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_min_epu32(a_lo, b_lo); + __m128i r_hi = _mm_min_epu32(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } + else static if (SIMD_COMPARISON_MASKS_32B) + { + // catastrophic with GDC, so in this case split instead + uint8 sa = cast(uint8)a; + uint8 sb = cast(uint8)b; + uint8 greater = sa > sb; + return cast(__m256i)( (greater & sb) | (~greater & sa) ); + } + else + static assert(0); +} +unittest +{ + int8 R = cast(int8) _mm256_min_epu32(_mm256_setr_epi32(0x7fffffff, 1, 4, -7, 0x7fffffff, 1, 11, -7), + _mm256_setr_epi32( -4,-8, 9, -8, -4,-8, 9, -8)); + int[8] correct = [0x7fffffff, 1, 4, -8, 0x7fffffff, 1, 9, -8]; + assert(R.array == correct); +} + +/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values. 
+__m256i _mm256_min_epu8 (__m256i a, __m256i b) pure @safe +{ + // PERF D_SIMD + version(GNU) + enum bool split = true; + else static if (SIMD_COMPARISON_MASKS_32B) + enum bool split = false; + else + enum bool split = true; + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pminub256(cast(ubyte32)a, cast(ubyte32)b); + } + else static if (split) + { + // split + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_min_epu8(a_lo, b_lo); + __m128i r_hi = _mm_min_epu8(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } + else static if (SIMD_COMPARISON_MASKS_32B) + { + ubyte32 sa = cast(ubyte32)a; + ubyte32 sb = cast(ubyte32)b; + ubyte32 greater = cast(ubyte32)(sa > sb); + return cast(__m256i)( (~greater & sa) | (greater & sb) ); + } + else + static assert(false); +} +unittest +{ + byte32 R = cast(byte32) _mm256_min_epu8(_mm256_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), + _mm256_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); + byte[32] correct = [45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0]; + assert(R.array == correct); +} + +/// Create mask from the most significant bit of each 8-bit element in `a`. +int _mm256_movemask_epi8 (__m256i a) pure @trusted +{ + static if (GDC_with_AVX2) + { + return __builtin_ia32_pmovmskb256(cast(ubyte32)a); + } + else static if (LDC_with_AVX2) + { + return __builtin_ia32_pmovmskb256(cast(byte32)a); + } + else + { + // ARM64 splitting makes it 33 inst instead of 48 for naive version. + // PERF not sure if there is something better, sounds likely + // Otherwise, beneficial for every case. + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + return (_mm_movemask_epi8(a_hi) << 16) | _mm_movemask_epi8(a_lo); + } +} +unittest +{ + assert(0x9D37_9C36 == _mm256_movemask_epi8(_mm256_set_epi8(-1, 1, 2, -3, -1, -1, 4,-8, 127, 0, -1, -1, 0, -1, -1, -1, + -1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0))); +} + +// TODO __m256i _mm256_mpsadbw_epu8 (__m256i a, __m256i b, const int imm8) pure @safe + +/// Multiply the low signed 32-bit integers from each packed 64-bit element in `a` and `b`, and +/// return the signed 64-bit results. +__m256i _mm256_mul_epi32 (__m256i a, __m256i b) pure @trusted +{ + // PERF LDC + SSE2 to SSSE3. I don't quite see what to do, same problem in _mm_mul_epi32. 
+    static if (GDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_pmuldq256(cast(int8)a, cast(int8)b);
+    }
+    else static if ( (LDC_with_SSE41 || LDC_with_AVX2) && LDC_with_optimizations)
+    {
+        // good with LDC + SSE4.1 to AVX2, else need to split
+        enum ir = `
+            %ia = shufflevector <8 x i32> %0,<8 x i32> %0, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+            %ib = shufflevector <8 x i32> %1,<8 x i32> %1, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+            %la = sext <4 x i32> %ia to <4 x i64>
+            %lb = sext <4 x i32> %ib to <4 x i64>
+            %r = mul <4 x i64> %la, %lb
+            ret <4 x i64> %r`;
+        return cast(__m256i) LDCInlineIR!(ir, long4, int8, int8)(cast(int8)a, cast(int8)b);
+    }
+    else
+    {
+        // split, very beneficial with LDC+ARM64
+        __m128i a_lo = _mm256_extractf128_si256!0(a);
+        __m128i a_hi = _mm256_extractf128_si256!1(a);
+        __m128i b_lo = _mm256_extractf128_si256!0(b);
+        __m128i b_hi = _mm256_extractf128_si256!1(b);
+        __m128i r_lo = _mm_mul_epi32(a_lo, b_lo);
+        __m128i r_hi = _mm_mul_epi32(a_hi, b_hi);
+        return _mm256_set_m128i(r_hi, r_lo);
+    }
+}
+unittest
+{
+    __m256i A = _mm256_setr_epi32(61616461, 1915324654, 4564061, 3, 61616466, 1915324654, 4564061, 3);
+    __m256i B = _mm256_setr_epi32(49716422, -915616216, -121144, 0, 49716422, -915616216, -121145, 0);
+    long4 R = cast(long4) _mm256_mul_epi32(A, B);
+    long[4] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144, cast(long)61616466 * 49716422, cast(long)4564061 * -121145];
+    assert(R.array == correct);
+}
+
+/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`, and
+/// return the unsigned 64-bit results.
+__m256i _mm256_mul_epu32 (__m256i a, __m256i b) pure @trusted
+{
+    // PERF DMD
+    static if (GDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_pmuludq256(cast(int8)a, cast(int8)b);
+    }
+    else version(GNU)
+    {
+        // explicit split needed for GDC without avx2
+        __m128i a_lo = _mm256_extractf128_si256!0(a);
+        __m128i a_hi = _mm256_extractf128_si256!1(a);
+        __m128i b_lo = _mm256_extractf128_si256!0(b);
+        __m128i b_hi = _mm256_extractf128_si256!1(b);
+        __m128i r_lo = _mm_mul_epu32(a_lo, b_lo);
+        __m128i r_hi = _mm_mul_epu32(a_hi, b_hi);
+        return _mm256_set_m128i(r_hi, r_lo);
+    }
+    else
+    {
+        // Works well in all LDC cases, surprisingly.
+        int8 ia = cast(int8)a;
+        int8 ib = cast(int8)b;
+        long4 r;
+        r.ptr[0] = cast(long)cast(uint)ia.array[0] * cast(long)cast(uint)ib.array[0];
+        r.ptr[1] = cast(long)cast(uint)ia.array[2] * cast(long)cast(uint)ib.array[2];
+        r.ptr[2] = cast(long)cast(uint)ia.array[4] * cast(long)cast(uint)ib.array[4];
+        r.ptr[3] = cast(long)cast(uint)ia.array[6] * cast(long)cast(uint)ib.array[6];
+        return cast(__m256i)r;
+    }
+}
+unittest
+{
+    __m256i A = _mm256_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff, 42, 0xDEADBEEF, 42, 0xffffffff);
+    __m256i B = _mm256_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff, 42, 0xCAFEBABE, 42, 0xffffffff);
+    __m256i C = _mm256_mul_epu32(A, B);
+    long4 LC = cast(long4)C;
+    long[4] correct = [18446744065119617025uL, 12723420444339690338uL, 18446744065119617025uL, 12723420444339690338uL];
+    assert(LC.array == correct);
+}
+
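+// Illustrative check (values arbitrary): only the even-indexed 32-bit elements
+// (0, 2, 4, 6) take part in _mm256_mul_epi32; the odd ones are ignored.
+unittest
+{
+    __m256i A = _mm256_setr_epi32(6, 123, -7, 456,  10, 789, -3, 1011);
+    __m256i B = _mm256_setr_epi32(7, 999,  5, 888, -11, 777,  9, 666);
+    long4 R = cast(long4) _mm256_mul_epi32(A, B);
+    long[4] correct = [42, -35, -110, -27];
+    assert(R.array == correct);
+}
+
+/// Multiply the packed signed 16-bit integers in `a` and `b`,
+/// producing intermediate 32-bit integers, and return the high
+/// 16 bits of the intermediate integers.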
+__m256i _mm256_mulhi_epi16 (__m256i a, __m256i b) pure @safe +{ + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pmulhw256(cast(short16)a, cast(short16)b); + } + else static if (LDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pmulhw256(cast(short16)a, cast(short16)b); + } + else + { + // split + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_mulhi_epi16(a_lo, b_lo); + __m128i r_hi = _mm_mulhi_epi16(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m256i A = _mm256_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7, 0, -16, 2, 3, 4, 8, 16, 8); + __m256i B = _mm256_set1_epi16(16384); + short16 R = cast(short16)_mm256_mulhi_epi16(A, B); + short[16] correct = [0, -4, 0, 0, 1, 2, 4, 1, 0, -4, 0, 0, 1, 2, 4, 2]; + assert(R.array == correct); +} + +/// Multiply the packed unsigned 16-bit integers in `a` and `b`, +/// producing intermediate 32-bit integers, and return the high +/// 16 bits of the intermediate integers. +__m256i _mm256_mulhi_epu16 (__m256i a, __m256i b) pure @safe +{ + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pmulhuw256(cast(short16)a, cast(short16)b); + } + else static if (LDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pmulhuw256(cast(short16)a, cast(short16)b); + } + else + { + // split + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_mulhi_epu16(a_lo, b_lo); + __m128i r_hi = _mm_mulhi_epu16(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } +} + +// TODO __m256i _mm256_mulhrs_epi16 (__m256i a, __m256i b) pure @safe + +/// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, +/// and return the low 16 bits of the intermediate integers. +__m256i _mm256_mullo_epi16 (__m256i a, __m256i b) pure @safe +{ + // PERF D_SIMD + static if (GDC_with_AVX) + { + return cast(__m256i)(cast(short16)a * cast(short16)b); + } + else version(LDC) + { + return cast(__m256i)(cast(short16)a * cast(short16)b); + } + else + { + // split + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_mullo_epi16(a_lo, b_lo); + __m128i r_hi = _mm_mullo_epi16(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m256i A = _mm256_setr_epi16(16384, -16, 0, 3, 4, 1, 16, 7, 16384, -16, 0, 3, 4, 1, 16, 7); + __m256i B = _mm256_set1_epi16(16384); + short16 R = cast(short16)_mm256_mullo_epi16(A, B); + short[16] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384, 0, 0, 0, -16384, 0, 16384, 0, -16384]; + assert(R.array == correct); +} + +/// Multiply the packed signed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers, +/// and store the low 32 bits of the intermediate integer. 
+__m256i _mm256_mullo_epi32 (__m256i a, __m256i b) pure @safe +{ + // PERF D_SIMD + static if (GDC_with_AVX) + { + return cast(__m256i)(cast(int8)a * cast(int8)b); + } + else version(LDC) + { + return cast(__m256i)(cast(int8)a * cast(int8)b); + } + else + { + // split + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_mullo_epi32(a_lo, b_lo); + __m128i r_hi = _mm_mullo_epi32(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m256i A = _mm256_setr_epi32(61616461, 1915324654, 4564061, 3, 61616461, 1915324654, 4564061, 3); + __m256i B = _mm256_setr_epi32(49716422, -915616216, -121144, 0, 49716422, -915616216, -121144, 1); + int8 R = cast(int8) _mm256_mullo_epi32(A, B); + int[8] correct = [cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 0, + cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 3]; + assert(R.array == correct); +} + +/// Compute the bitwise OR of 256 bits (representing integer data) in `a` and `b`. +__m256i _mm256_or_si256 (__m256i a, __m256i b) pure @safe +{ + return a | b; +} +unittest +{ + long A = 0x55555555_55555555; + long B = 0xAAAAAAAA_AAAAAAAA; + __m256i vA = _mm256_set_epi64(A, B, A, B); + __m256i vB = _mm256_set_epi64(B, A, 0, B); + __m256i R = _mm256_or_si256(vA, vB); + long[4] correct = [B, A, -1, -1]; + assert(R.array == correct); +} + +/// Convert packed signed 16-bit integers from `a` and `b `to packed 8-bit integers using signed saturation. +/// Warning: `a` and `b` are interleaved per-lane. +/// Result has: `a` lane 0, `b` lane 0, `a` lane 1, `b` lane 1. +__m256i _mm256_packs_epi16 (__m256i a, __m256i b) pure @safe +{ + // PERF D_SIMD + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_packsswb256(cast(short16)a, cast(short16)b); + } + else static if (LDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_packsswb256(cast(short16)a, cast(short16)b); + } + else + { + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_packs_epi16(a_lo, b_lo); + __m128i r_hi = _mm_packs_epi16(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m256i A = _mm256_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0, + -1000, -1000, 1000, 0, 256, -129, 254, 0); + byte32 R = cast(byte32) _mm256_packs_epi16(A, A); + byte[32] correct = [127, -128, 127, 0, 127, -128, 127, 0, + 127, -128, 127, 0, 127, -128, 127, 0, + -128, -128, 127, 0, 127, -128, 127, 0, + -128, -128, 127, 0, 127, -128, 127, 0]; + assert(R.array == correct); +} + +/// Convert packed signed 32-bit integers from `a` and `b `to packed 16-bit integers using signed saturation. +/// Warning: `a` and `b` are interleaved per-lane. +/// Result has: `a` lane 0, `b` lane 0, `a` lane 1, `b` lane 1. 
+__m256i _mm256_packs_epi32 (__m256i a, __m256i b) pure @safe +{ + // PERF D_SIMD + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_packssdw256(cast(int8)a, cast(int8)b); + } + else static if (LDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_packssdw256(cast(int8)a, cast(int8)b); + } + else + { + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_packs_epi32(a_lo, b_lo); + __m128i r_hi = _mm_packs_epi32(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m256i A = _mm256_setr_epi32(100000, -100000, 1000, 0, 4, 5, -100000, 7); + short16 R = cast(short16) _mm256_packs_epi32(A, A); + short[16] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0, 4, 5, -32768, 7, 4, 5, -32768, 7]; + assert(R.array == correct); +} + + +/// Convert packed signed 16-bit integers from `a` and `b `to packed 8-bit integers using unsigned saturation. +/// Warning: `a` and `b` are interleaved per-lane. +/// Result has: `a` lane 0, `b` lane 0, `a` lane 1, `b` lane 1. +__m256i _mm256_packus_epi16 (__m256i a, __m256i b) pure @trusted +{ + // PERF D_SIMD + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_packuswb256(cast(short16)a, cast(short16)b); + } + else static if (LDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_packuswb256(cast(short16)a, cast(short16)b); + } + else + { + // Always beneficial with LDC. + // arm64: 4 inst with LDC -O1 + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_packus_epi16(a_lo, b_lo); + __m128i r_hi = _mm_packus_epi16(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m256i A = _mm256_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0, -10, 400, 0, 256, -32768, 2, 1, 0); + __m256i B = _mm256_setr_epi16( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + byte32 R = cast(byte32) _mm256_packus_epi16(A, B); + align(32) static immutable byte[32] correctResult = [0, -1, 0, -1, -1, 2, 1, 0, 0, 1, 2, 3, 4, 5, 6, 7, + 0, -1, 0, -1, 0 , 2, 1, 0, 8, 9, 10, 11, 12, 13, 14, 15]; + assert(R.array == correctResult); +} + +/// Convert packed signed 32-bit integers from `a` and `b `to packed 16-bit integers using unsigned saturation. +/// Warning: `a` and `b` are interleaved per-lane. +/// Result has: `a` lane 0, `b` lane 0, `a` lane 1, `b` lane 1. 
+__m256i _mm256_packus_epi32 (__m256i a, __m256i b) pure @safe +{ + // PERF D_SIMD + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_packusdw256(cast(int8)a, cast(int8)b); + } + else static if (LDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_packusdw256(cast(int8)a, cast(int8)b); + } + else + { + // 8 inst in arm64 since LDC 1.22 -O2, + // sounds a bit underperforming maybe + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_packus_epi32(a_lo, b_lo); + __m128i r_hi = _mm_packus_epi32(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m256i A = _mm256_setr_epi32(100000, -100000, 1000, 0, 100000, -100000, 1000, 1); + short16 R = cast(short16) _mm256_packus_epi32(A, A); + short[16] correct = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0, + cast(short)65535, 0, 1000, 1, cast(short)65535, 0, 1000, 1]; + assert(R.array == correct); +} + + + +// TODO __m256i _mm256_permute2x128_si256 (__m256i a, __m256i b, const int imm8) pure @safe + +/// Shuffle 64-bit integers in `a` across lanes using the control in `imm8`. +__m256i _mm256_permute4x64_epi64(int imm8)(__m256i a) pure @trusted +{ + static if (GDC_with_AVX2) + return cast(__m256i) __builtin_ia32_permdi256(a, imm8); + else static if (LDC_with_optimizations) + { + return shufflevector!(long4, (imm8 >> 0) & 3, + (imm8 >> 2) & 3, + (imm8 >> 4) & 3, + (imm8 >> 6) & 3)(a, a); + } + else + { + __m256i b = a; + static foreach (i; 0..4) + a[i] = b[(imm8 & (0b00000011 << (i * 2))) >> (i * 2)]; + return a; + } +} +unittest +{ + __m256i A = _mm256_setr_epi64x(1, 2, 3, 4); + static immutable long[4] correct = [ 4, 3, 2, 1 ]; + assert(_mm256_permute4x64_epi64!(0b00011011)(A).array == correct); + + A = _mm256_setr_epi64x(1, 2, 3, 4); + static immutable long[4] correct2 = [ 1, 4, 1, 1 ]; + assert(_mm256_permute4x64_epi64!(0b00001100)(A).array == correct2); +} + + +// TODO __m256d _mm256_permute4x64_pd (__m256d a, const int imm8) pure @safe +// TODO __m256i _mm256_permutevar8x32_epi32 (__m256i a, __m256i idx) pure @safe +// TODO __m256 _mm256_permutevar8x32_ps (__m256 a, __m256i idx) pure @safe + +/// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each +/// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the +/// low 16 bits of 64-bit elements in result. 
+__m256i _mm256_sad_epu8 (__m256i a, __m256i b) pure @trusted +{ + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_psadbw256(cast(ubyte32)a, cast(ubyte32)b); + } + else static if (LDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_psadbw256(cast(byte32)a, cast(byte32)b); + } + else + { + // split is beneficial for ARM64, LDC and GDC without AVX2 + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_sad_epu8(a_lo, b_lo); + __m128i r_hi = _mm_sad_epu8(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m256i A = _mm256_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54, + 3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1 + __m256i B = _mm256_set1_epi8(1); + int8 R = cast(int8) _mm256_sad_epu8(A, B); + int[8] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19, + 0, + 23 + 29 + 31 + 37 + 41 + 43 + 47 + 53, + 0, + 2 + 3 + 5 + 7 + 11 + 13 + 17 + 19, + 0, + 23 + 29 + 31 + 37 + 41 + 43 + 47 + 53, + 0]; + assert(R.array == correct); +} + +/// Shuffle 32-bit integers in `a` within 128-bit lanes using the control in `imm8`, and return the results. +__m256i _mm256_shuffle_epi32(int imm8)(__m256i a) pure @trusted +{ + static if (GDC_with_AVX2) + return cast(__m256i)__builtin_ia32_pshufd256(cast(int8)a, imm8); + else static if (LDC_with_AVX2) + { + return cast(__m256i)shufflevectorLDC!(int8, + (imm8 >> 0) & 3, + (imm8 >> 2) & 3, + (imm8 >> 4) & 3, + (imm8 >> 6) & 3, + ((imm8 >> 0) & 3) + 4, + ((imm8 >> 2) & 3) + 4, + ((imm8 >> 4) & 3) + 4, + ((imm8 >> 6) & 3) + 4)(cast(int8)a, cast(int8)a); + } + else + { + auto hi = _mm_shuffle_epi32!imm8(_mm256_extractf128_si256!0(a)); + auto lo = _mm_shuffle_epi32!imm8(_mm256_extractf128_si256!1(a)); + return _mm256_setr_m128i(hi, lo); + } +} +unittest +{ + __m256i a = _mm256_set_epi32(32, 31, 30, 29, 28, 27, 26, 25); + assert(_mm256_shuffle_epi32!255(a).array == [120259084316L, 120259084316, 137438953504, 137438953504]); +} + +/// Shuffle 8-bit integers in `a` within 128-bit lanes according to shuffle control mask in the +/// corresponding 8-bit element of `b`. +__m256i _mm256_shuffle_epi8(__m256i a, __m256i b) pure @trusted +{ + static if (GDC_with_AVX2) + return cast(__m256i)__builtin_ia32_pshufb256(cast(ubyte32)a, cast(ubyte32)b); + else static if (LDC_with_AVX2) + return cast(__m256i)__builtin_ia32_pshufb256(cast(byte32)a, cast(byte32)b); + else + { + auto hi = _mm_shuffle_epi8(_mm256_extractf128_si256!0(a), _mm256_extractf128_si256!0(b)); + auto lo = _mm_shuffle_epi8(_mm256_extractf128_si256!1(a), _mm256_extractf128_si256!1(b)); + return _mm256_setr_m128i(hi, lo); + } +} +unittest +{ + __m256i a = _mm256_set_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + __m256i b = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); + + __m256i expected = _mm256_setr_epi8( + 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, + 18, 18, 18, 18, 18, 18, 18, 18, + 17, 17, 17, 17, 17, 17, 17, 17 + ); + + assert(_mm256_shuffle_epi8(a, b).array == expected.array); +} + +/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of `a` using +/// the control in `imm8`. 
Store the results in the high 64 bits of 128-bit lanes
+/// of result, with the low 64 bits of 128-bit lanes being copied from `a`.
+/// See also: `_MM_SHUFFLE`.
+__m256i _mm256_shufflehi_epi16(int imm8)(__m256i a) pure @safe
+{
+    static if (GDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_pshufhw256(cast(short16)a, imm8);
+    }
+    else static if (LDC_with_optimizations)
+    {
+        return cast(__m256i) shufflevectorLDC!(short16,
+            0, 1, 2, 3,
+            4 + ( (imm8 >> 0) & 3 ),
+            4 + ( (imm8 >> 2) & 3 ),
+            4 + ( (imm8 >> 4) & 3 ),
+            4 + ( (imm8 >> 6) & 3 ),
+            8, 9, 10, 11,
+            12 + ( (imm8 >> 0) & 3 ),
+            12 + ( (imm8 >> 2) & 3 ),
+            12 + ( (imm8 >> 4) & 3 ),
+            12 + ( (imm8 >> 6) & 3 ))
+            (cast(short16)a, cast(short16)a);
+    }
+    else
+    {
+        __m128i a_lo = _mm256_extractf128_si256!0(a);
+        __m128i a_hi = _mm256_extractf128_si256!1(a);
+        __m128i r_lo = _mm_shufflehi_epi16!imm8(a_lo);
+        __m128i r_hi = _mm_shufflehi_epi16!imm8(a_hi);
+        return _mm256_set_m128i(r_hi, r_lo);
+    }
+}
+unittest
+{
+    __m256i A = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
+    short16 B = cast(short16) _mm256_shufflehi_epi16!SHUFFLE(A);
+    short[16] expectedB = [ 0, 1, 2, 3, 7, 6, 5, 4, 8, 9, 10, 11, 15, 14, 13, 12 ];
+    assert(B.array == expectedB);
+}
+
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
+/// the control in `imm8`. Store the results in the low 64 bits of 128-bit lanes
+/// of result, with the high 64 bits of 128-bit lanes being copied from `a`.
+/// See also: `_MM_SHUFFLE`.
+__m256i _mm256_shufflelo_epi16(int imm8)(__m256i a) pure @safe
+{
+    static if (GDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_pshuflw256(cast(short16)a, imm8);
+    }
+    else static if (LDC_with_optimizations)
+    {
+        return cast(__m256i) shufflevectorLDC!(short16,
+            ( (imm8 >> 0) & 3 ),
+            ( (imm8 >> 2) & 3 ),
+            ( (imm8 >> 4) & 3 ),
+            ( (imm8 >> 6) & 3 ),
+            4, 5, 6, 7,
+            ( (imm8 >> 0) & 3 ) + 8,
+            ( (imm8 >> 2) & 3 ) + 8,
+            ( (imm8 >> 4) & 3 ) + 8,
+            ( (imm8 >> 6) & 3 ) + 8,
+            12, 13, 14, 15)
+            (cast(short16)a, cast(short16)a);
+    }
+    else
+    {
+        __m128i a_lo = _mm256_extractf128_si256!0(a);
+        __m128i a_hi = _mm256_extractf128_si256!1(a);
+        __m128i r_lo = _mm_shufflelo_epi16!imm8(a_lo);
+        __m128i r_hi = _mm_shufflelo_epi16!imm8(a_hi);
+        return _mm256_set_m128i(r_hi, r_lo);
+    }
+}
+unittest
+{
+    __m256i A = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
+    short16 B = cast(short16) _mm256_shufflelo_epi16!SHUFFLE(A);
+    short[16] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7, 11, 10, 9, 8, 12, 13, 14, 15 ];
+    assert(B.array == expectedB);
+}
+
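+// Illustrative check: _MM_SHUFFLE(3, 2, 1, 0) is the identity control, so applying
+// both shuffles with it leaves the vector unchanged.
+unittest
+{
+    __m256i A = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+    enum int ID = _MM_SHUFFLE(3, 2, 1, 0);
+    __m256i B = _mm256_shufflelo_epi16!ID(_mm256_shufflehi_epi16!ID(A));
+    assert(B.array == A.array);
+}
+
+/// Negate packed signed 16-bit integers in `a` when the corresponding signed 16-bit integer in `b` is negative.
+/// Elements in result are zeroed out when the corresponding element in `b` is zero.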
+__m256i _mm256_sign_epi16 (__m256i a, __m256i b) pure @safe
+{
+    // PERF DMD
+    static if (GDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_psignw256(cast(short16)a, cast(short16)b);
+    }
+    else static if (LDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_psignw256(cast(short16)a, cast(short16)b);
+    }
+    else // split
+    {
+        __m128i a_lo = _mm256_extractf128_si256!0(a);
+        __m128i a_hi = _mm256_extractf128_si256!1(a);
+        __m128i b_lo = _mm256_extractf128_si256!0(b);
+        __m128i b_hi = _mm256_extractf128_si256!1(b);
+        __m128i r_lo = _mm_sign_epi16(a_lo, b_lo);
+        __m128i r_hi = _mm_sign_epi16(a_hi, b_hi);
+        return _mm256_set_m128i(r_hi, r_lo);
+    }
+    // PERF: not optimal in AVX without AVX2
+}
+unittest
+{
+    __m128i A = _mm_setr_epi16(-2, -1, 0, 1, 2, short.min, short.min, short.min);
+    __m128i B = _mm_setr_epi16(-1,  0,-1, 1, -2, -50, 0, 50);
+    __m256i AA = _mm256_set_m128i(A, A);
+    __m256i BB = _mm256_set_m128i(B, B);
+    short16 C = cast(short16) _mm256_sign_epi16(AA, BB);
+    short[16] correct = [ 2, 0, 0, 1, -2, short.min, 0, short.min, 2, 0, 0, 1, -2, short.min, 0, short.min];
+    assert(C.array == correct);
+}
+
+/// Negate packed signed 32-bit integers in `a` when the corresponding signed 32-bit integer in `b` is negative.
+/// Elements in result are zeroed out when the corresponding element in `b` is zero.
+__m256i _mm256_sign_epi32 (__m256i a, __m256i b) pure @safe
+{
+    // PERF DMD
+    static if (GDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_psignd256(cast(int8)a, cast(int8)b);
+    }
+    else static if (LDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_psignd256(cast(int8)a, cast(int8)b);
+    }
+    else // split
+    {
+        __m128i a_lo = _mm256_extractf128_si256!0(a);
+        __m128i a_hi = _mm256_extractf128_si256!1(a);
+        __m128i b_lo = _mm256_extractf128_si256!0(b);
+        __m128i b_hi = _mm256_extractf128_si256!1(b);
+        __m128i r_lo = _mm_sign_epi32(a_lo, b_lo);
+        __m128i r_hi = _mm_sign_epi32(a_hi, b_hi);
+        return _mm256_set_m128i(r_hi, r_lo);
+    }
+    // PERF: not optimal in AVX without AVX2
+}
+unittest
+{
+    __m256i A = _mm256_setr_epi32(-2, -1, 0, int.max, -2, -1, 0, int.max);
+    __m256i B = _mm256_setr_epi32(-1, 0, -1, 1, -1, 0, -1, 1);
+    int8 C = cast(int8) _mm256_sign_epi32(A, B);
+    int[8] correct = [ 2, 0, 0, int.max, 2, 0, 0, int.max];
+    assert(C.array == correct);
+}
+
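+// Illustrative check (values arbitrary): with a uniform control vector, the sign
+// operation reduces to negation (all negative), a plain copy (all positive), or zero.
+unittest
+{
+    __m256i A = _mm256_setr_epi32(1, -2, 3, -4, 5, -6, 7, -8);
+    __m256i allNeg  = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, -1, -1);
+    __m256i allPos  = _mm256_setr_epi32( 1,  1,  1,  1,  1,  1,  1,  1);
+    __m256i allZero = _mm256_setr_epi32( 0,  0,  0,  0,  0,  0,  0,  0);
+    int8 neg  = cast(int8) _mm256_sign_epi32(A, allNeg);
+    int8 same = cast(int8) _mm256_sign_epi32(A, allPos);
+    int8 zero = cast(int8) _mm256_sign_epi32(A, allZero);
+    int[8] correctNeg  = [-1, 2, -3, 4, -5, 6, -7, 8];
+    int[8] correctZero = [0, 0, 0, 0, 0, 0, 0, 0];
+    assert(neg.array  == correctNeg);
+    assert(same.array == (cast(int8)A).array);
+    assert(zero.array == correctZero);
+}
+
+/// Negate packed signed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative.
+/// Elements in result are zeroed out when the corresponding element in `b` is zero.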
+__m256i _mm256_sign_epi8 (__m256i a, __m256i b) pure @safe +{ + // PERF DMD + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_psignb256(cast(ubyte32)a, cast(ubyte32)b); + } + else static if (LDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_psignb256(cast(byte32)a, cast(byte32)b); + } + else // split + { + // LDC arm64, 10 inst since LDC 1.32.1 -O1 + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_sign_epi8(a_lo, b_lo); + __m128i r_hi = _mm_sign_epi8(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } + // PERF: not optimal in AVX without AVX2 +} +unittest +{ + __m256i A = _mm256_setr_epi8( 1, 1, 1, 1, 1, 1, -2, 1, 0, 1, 0, 0, 0, 0, -2, 1, + -2, -1, 0, 1, 2, byte.min, byte.min, byte.min, -1, 0,-1, 1, -2, -50, 0, 50); + __m256i B = _mm256_setr_epi8(-1, 0,-1, 1, -2, -50, 0, 50, -1, 0,-1, 1, -2, -50, 0, 50, + -1, 0,-1, 1, -2, -50, 0, 50, -2, -1, 0, 1, 2, byte.min, byte.min, byte.min); + byte32 C = cast(byte32) _mm256_sign_epi8(A, B); + byte[32] correct = [ -1, 0,-1, 1, -1, -1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, + 2, 0, 0, 1, -2, byte.min, 0, byte.min, 1, 0, 0, 1, -2, 50, 0, -50]; + assert(C.array == correct); +} + +/// Shift packed 16-bit integers in `a` left by `count` while shifting in zeroes. +/// Bit-shift is a single value in the low-order 64-bit of `count`. +/// If bit-shift > 15, result is defined to be all zeroes. +/// Note: prefer `_mm256_slli_epi16`, less of a trap. +__m256i _mm256_sll_epi16 (__m256i a, __m128i count) pure @trusted +{ + // PERF ARM64 + static if (GDC_or_LDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_psllw256(cast(short16)a, cast(short8)count); + } + else + { + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i r_lo = _mm_sll_epi16(a_lo, count); + __m128i r_hi = _mm_sll_epi16(a_hi, count); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m128i shift0 = _mm_setzero_si128(); + __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift + __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5); + __m256i A = _mm256_setr_epi16(4, -8, 11, -32768, 4, -8, 11, -32768, 4, -8, 11, -32768, 4, -8, 11, -32768); + short[16] correct0 = (cast(short16)A).array; + short[16] correctX = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + short[16] correct2 = [16, -32, 44, 0, 16, -32, 44, 0, 16, -32, 44, 0, 16, -32, 44, 0]; + short16 B0 = cast(short16) _mm256_sll_epi16(A, shift0); + short16 BX = cast(short16) _mm256_sll_epi16(A, shiftX); + short16 B2 = cast(short16) _mm256_sll_epi16(A, shift2); + assert(B0.array == correct0); + assert(BX.array == correctX); + assert(B2.array == correct2); +} + +/// Shift packed 32-bit integers in `a` left by `count` while shifting in zeroes. +/// Bit-shift is a single value in the low-order 64-bit of `count`. +/// If bit-shift > 31, result is defined to be all zeroes. +/// Note: prefer `_mm256_slli_epi32`, less of a trap. 
+__m256i _mm256_sll_epi32 (__m256i a, __m128i count) pure @trusted
+{
+    // PERF ARM64
+    static if (GDC_or_LDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_pslld256(cast(int8)a, count);
+    }
+    else
+    {
+        __m128i a_lo = _mm256_extractf128_si256!0(a);
+        __m128i a_hi = _mm256_extractf128_si256!1(a);
+        __m128i r_lo = _mm_sll_epi32(a_lo, count);
+        __m128i r_hi = _mm_sll_epi32(a_hi, count);
+        return _mm256_set_m128i(r_hi, r_lo);
+    }
+}
+unittest
+{
+    __m128i shift0 = _mm_setzero_si128();
+    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
+    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5);
+    __m256i A = _mm256_setr_epi32(4, -9, 11, -2147483648, 2, -9, 11, -2147483648);
+    int[8] correct0 = (cast(int8)A).array;
+    int[8] correctX = [0, 0, 0, 0, 0, 0, 0, 0];
+    int[8] correct2 = [16, -36, 44, 0, 8, -36, 44, 0];
+    int8 B0 = cast(int8) _mm256_sll_epi32(A, shift0);
+    int8 BX = cast(int8) _mm256_sll_epi32(A, shiftX);
+    int8 B2 = cast(int8) _mm256_sll_epi32(A, shift2);
+    assert(B0.array == correct0);
+    assert(BX.array == correctX);
+    assert(B2.array == correct2);
+}
+
+/// Shift packed 64-bit integers in `a` left by `count` while shifting in zeroes.
+/// Bit-shift is a single value in the low-order 64-bit of `count`.
+/// If bit-shift > 63, result is defined to be all zeroes.
+/// Note: prefer `_mm256_slli_epi64`, less of a trap.
+__m256i _mm256_sll_epi64 (__m256i a, __m128i count) pure @trusted
+{
+    // PERF ARM64
+    static if (GDC_or_LDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_psllq256(cast(long4)a, cast(long2)count);
+    }
+    else
+    {
+        __m128i a_lo = _mm256_extractf128_si256!0(a);
+        __m128i a_hi = _mm256_extractf128_si256!1(a);
+        __m128i r_lo = _mm_sll_epi64(a_lo, count);
+        __m128i r_hi = _mm_sll_epi64(a_hi, count);
+        return _mm256_set_m128i(r_hi, r_lo);
+    }
+}
+unittest
+{
+    __m128i shift0 = _mm_setzero_si128();
+    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
+    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5);
+    __m256i A = _mm256_setr_epi64(4, -9, 5, -8);
+    long[4] correct0 = [ 4, -9, 5, -8];
+    long[4] correctX = [ 0,  0, 0,  0];
+    long[4] correct2 = [16, -36, 20, -32];
+    long4 B0 = cast(long4) _mm256_sll_epi64(A, shift0);
+    long4 BX = cast(long4) _mm256_sll_epi64(A, shiftX);
+    long4 B2 = cast(long4) _mm256_sll_epi64(A, shift2);
+    assert(B0.array == correct0);
+    assert(BX.array == correctX);
+    assert(B2.array == correct2);
+}
+
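+// Minimal sketch of why the `count`-in-a-vector variants are "more of a trap": only the
+// low 64 bits of `count` are read, as a single shift amount applied to every lane.
+// When the amount is a compile-time constant, the `slli` form says the same thing directly.
+unittest
+{
+    __m256i a = _mm256_setr_epi64(1, 2, 3, 4);
+    __m128i shiftBy3 = _mm_setr_epi32(3, 0, 0, 0);   // shift amount lives in the low 64 bits
+    long4 viaVector    = cast(long4) _mm256_sll_epi64(a, shiftBy3);
+    long4 viaImmediate = cast(long4) _mm256_slli_epi64(a, 3);
+    assert(viaVector.array == viaImmediate.array);   // both give [8, 16, 24, 32]
+}
+
+/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.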
+__m256i _mm256_slli_epi16(__m256i a, int imm8) pure @safe +{ + static if (GDC_or_LDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_psllwi256(cast(short16)a, cast(ubyte)imm8); + } + else // split + { + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i r_lo = _mm_slli_epi16(a_lo, imm8); + __m128i r_hi = _mm_slli_epi16(a_hi, imm8); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m256i A = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7); + short16 B = cast(short16)( _mm256_slli_epi16(A, 1) ); + short16 B2 = cast(short16)( _mm256_slli_epi16(A, 1 + 256) ); + short[16] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14, 0, 2, 4, 6, -8, -10, 12, 14 ]; + assert(B.array == expectedB); + assert(B2.array == expectedB); + + short16 C = cast(short16)( _mm256_slli_epi16(A, 16) ); + short[16] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ]; + assert(C.array == expectedC); +} + +/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros. +__m256i _mm256_slli_epi32 (__m256i a, int imm8) pure @safe +{ + static if (GDC_or_LDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pslldi256(cast(int8)a, cast(ubyte)imm8); + } + else + { + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i r_lo = _mm_slli_epi32(a_lo, imm8); + __m128i r_hi = _mm_slli_epi32(a_hi, imm8); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -9); + int8 B = cast(int8) _mm256_slli_epi32(A, 1); + int8 B2 = cast(int8) _mm256_slli_epi32(A, 1 + 256); + int[8] expectedB = [ 0, 4, 6, -8, 0, 4, 6, -18 ]; + assert(B.array == expectedB); + assert(B2.array == expectedB); + + int8 C = cast(int8) _mm256_slli_epi32(A, 0); + int[8] expectedC = [ 0, 2, 3, -4, 0, 2, 3, -9 ]; + assert(C.array == expectedC); + + int8 D = cast(int8) _mm256_slli_epi32(A, 65); + int[8] expectedD = [ 0, 0, 0, 0, 0, 0, 0, 0 ]; + assert(D.array == expectedD); +} + +/// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros. +__m256i _mm256_slli_epi64 (__m256i a, int imm8) pure @safe +{ + static if (GDC_or_LDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_psllqi256(cast(long4)a, cast(ubyte)imm8); + } + else + { + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i r_lo = _mm_slli_epi64(a_lo, imm8); + __m128i r_hi = _mm_slli_epi64(a_hi, imm8); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m256i A = _mm256_setr_epi64(23, -4, 1, long.max); + long4 B = cast(long4) _mm256_slli_epi64(A, 1); + long4 B2 = cast(long4) _mm256_slli_epi64(A, 1 + 256); + + long[4] expectedB = [ 46, -8, 2, -2]; + assert(B.array == expectedB); + assert(B2.array == expectedB); + + long4 C = cast(long4) _mm256_slli_epi64(A, 0); + long[4] expectedC = [ 23, -4, 1, long.max ]; + assert(C.array == expectedC); + + long4 D = cast(long4) _mm256_slli_epi64(A, 65); + long[4] expectedD = [ 0, 0, 0, 0 ]; + assert(D.array == expectedD); +} + +/// Shift 128-bit lanes in `a` left by `bytes` bytes while shifting in zeroes. +alias _mm256_slli_si256 = _mm256_bslli_epi128; + +/// Shift packed 32-bit integers in `a` left by the amount specified by the corresponding element in `count` while shifting in zeroes. 
+__m128i _mm_sllv_epi32(__m128i a, __m128i count) pure @trusted
+{
+    static if (GDC_with_AVX2 || LDC_with_AVX2)
+        return cast(__m128i)__builtin_ia32_psllv4si(cast(int4)a, cast(int4)count);
+    else
+    {
+        // UB if count[n] >= 32, masked out below
+        __m128i R = _mm_setr_epi32(a.array[0] << count.array[0],
+                                   a.array[1] << count.array[1],
+                                   a.array[2] << count.array[2],
+                                   a.array[3] << count.array[3]);
+
+        // Map large and negative shifts to 32
+        __m128i mm32 = _mm_set1_epi32(32);
+        __m128i shift = _mm_min_epu32(count, mm32);
+
+        // Set to 0 where the shift is >= 32
+        R = R & _mm_cmplt_epi32(shift, mm32);
+        return R;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi32(-1, 1, 4, -4);
+    __m128i shift = _mm_setr_epi32( 2, -6, 1, 32);
+    int4 R = cast(int4) _mm_sllv_epi32(A, shift);
+    int[4] expected = [ -4, 0, 8, 0 ];
+    assert(R.array == expected);
+}
+
+/// Shift packed 32-bit integers in `a` left by the amount specified by the corresponding element in `count` while shifting in zeroes.
+__m256i _mm256_sllv_epi32 (__m256i a, __m256i count) pure @safe
+{
+    static if (GDC_with_AVX2 || LDC_with_AVX2)
+        return cast(__m256i)__builtin_ia32_psllv8si(cast(int8)a, cast(int8)count);
+    else
+    {
+        // split
+        __m128i a_lo = _mm256_extractf128_si256!0(a);
+        __m128i a_hi = _mm256_extractf128_si256!1(a);
+        __m128i c_lo = _mm256_extractf128_si256!0(count);
+        __m128i c_hi = _mm256_extractf128_si256!1(count);
+        __m128i r_lo = _mm_sllv_epi32(a_lo, c_lo);
+        __m128i r_hi = _mm_sllv_epi32(a_hi, c_hi);
+        return _mm256_set_m128i(r_hi, r_lo);
+    }
+}
+unittest
+{
+    __m256i A = _mm256_setr_epi32(-1, 1, 4, -4, -1, 1, 4, -4);
+    __m256i shift = _mm256_setr_epi32( 2, -6, 1, 32, 2, -6, 33, 32);
+    int8 R = cast(int8) _mm256_sllv_epi32(A, shift);
+    int[8] expected = [ -4, 0, 8, 0, -4, 0, 0, 0 ];
+    assert(R.array == expected);
+}
+
+
+/// Shift packed 64-bit integers in `a` left by the amount specified by the corresponding element in `count` while shifting in zeros.
+__m128i _mm_sllv_epi64(__m128i a, __m128i count) pure @trusted
+{
+    static if (GDC_with_AVX2 || LDC_with_AVX2)
+    {
+        return cast(__m128i)__builtin_ia32_psllv2di(cast(long2)a, cast(long2)count);
+    }
+    else
+    {
+        // PERF arm64
+        // LDC: x86, it's not good, but at least it's branchless
+        long2 la = cast(long2)a;
+        long2 lb = cast(long2)count;
+        long2 R;
+        R.ptr[0] = cast(uint)(lb.array[0]) < 64 ? (la.array[0] << lb.array[0]) : 0;
+        R.ptr[1] = cast(uint)(lb.array[1]) < 64 ? (la.array[1] << lb.array[1]) : 0;
+        return cast(__m128i)R;
+    }
+}
+unittest
+{
+    __m128i A  = _mm_setr_epi64( -4, 6);
+    __m128i B1 = _mm_setr_epi64(  2, 0);
+    __m128i B2 = _mm_setr_epi64(-12, 64);
+    long2 R1 = cast(long2) _mm_sllv_epi64(A, B1);
+    long2 R2 = cast(long2) _mm_sllv_epi64(A, B2);
+    long[2] correct1 = [-16, 6];
+    long[2] correct2 = [  0, 0];
+    assert(R1.array == correct1);
+    assert(R2.array == correct2);
+}
+
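+// Minimal sketch of what the "variable" shifts buy over `_mm_slli_epi32`: each lane gets
+// its own shift amount, so one call can scale every lane by a different power of two.
+unittest
+{
+    __m128i values  = _mm_setr_epi32(1, 1, 1, 1);
+    __m128i amounts = _mm_setr_epi32(0, 1, 2, 3);      // per-lane shift counts
+    int4 powers = cast(int4) _mm_sllv_epi32(values, amounts);
+    int[4] expected = [1, 2, 4, 8];                     // 1 << n for n = 0..3
+    assert(powers.array == expected);
+}
+
+/// Shift packed 64-bit integers in `a` left by the amount specified by the corresponding element in `count` while shifting in zeroes.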
+__m256i _mm256_sllv_epi64 (__m256i a, __m256i count) pure @safe
+{
+    static if (GDC_with_AVX2 || LDC_with_AVX2)
+        return cast(__m256i)__builtin_ia32_psllv4di(cast(long4)a, cast(long4)count);
+    else
+    {
+        // split
+        __m128i a_lo = _mm256_extractf128_si256!0(a);
+        __m128i a_hi = _mm256_extractf128_si256!1(a);
+        __m128i c_lo = _mm256_extractf128_si256!0(count);
+        __m128i c_hi = _mm256_extractf128_si256!1(count);
+        __m128i r_lo = _mm_sllv_epi64(a_lo, c_lo);
+        __m128i r_hi = _mm_sllv_epi64(a_hi, c_hi);
+        return _mm256_set_m128i(r_hi, r_lo);
+    }
+}
+unittest
+{
+    __m256i A  = _mm256_setr_epi64( -4,  6, -1,  6);
+    __m256i B1 = _mm256_setr_epi64(  2,  0,  3,  1);
+    __m256i B2 = _mm256_setr_epi64(-12, 64, 63, 64);
+    long4 R1 = cast(long4) _mm256_sllv_epi64(A, B1);
+    long4 R2 = cast(long4) _mm256_sllv_epi64(A, B2);
+    long[4] correct1 = [-16, 6, -8, 12];
+    long[4] correct2 = [  0, 0, long.min, 0];
+    assert(R1.array == correct1);
+    assert(R2.array == correct2);
+}
+
+
+/// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits.
+/// Bit-shift is a single value in the low-order 64-bit of `count`.
+/// If bit-shift > 15, result is defined to be all sign bits.
+/// Warning: prefer `_mm256_srai_epi16`, less of a trap.
+__m256i _mm256_sra_epi16 (__m256i a, __m128i count) pure @trusted
+{
+    static if (GDC_or_LDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_psraw256(cast(short16)a, cast(short8)count);
+    }
+    else
+    {
+        // split
+        __m128i a_lo = _mm256_extractf128_si256!0(a);
+        __m128i a_hi = _mm256_extractf128_si256!1(a);
+        __m128i r_lo = _mm_sra_epi16(a_lo, count);
+        __m128i r_hi = _mm_sra_epi16(a_hi, count);
+        return _mm256_set_m128i(r_hi, r_lo);
+    }
+}
+unittest
+{
+    __m128i shift0 = _mm_setzero_si128();
+    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
+    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5);
+    __m256i A = _mm256_setr_epi16(4, -9, 11, -32768, 4, -8, 11, -32768,
+                                  4, -9, 11, -32768, 4, -8, 11, -32768);
+    short[16] correct0 = (cast(short16)A).array;
+    short[16] correctX = [0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1];
+    short[16] correct2 = [1, -3, 2, -8192, 1, -2, 2, -8192, 1, -3, 2, -8192, 1, -2, 2, -8192];
+    short16 B0 = cast(short16) _mm256_sra_epi16(A, shift0);
+    short16 BX = cast(short16) _mm256_sra_epi16(A, shiftX);
+    short16 B2 = cast(short16) _mm256_sra_epi16(A, shift2);
+    assert(B0.array == correct0);
+    assert(BX.array == correctX);
+    assert(B2.array == correct2);
+}
+
+/// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits.
+/// Bit-shift is a single value in the low-order 64-bit of `count`.
+/// If bit-shift > 31, result is defined to be all sign bits.
+/// Warning: prefer `_mm256_srai_epi32`, less of a trap.
+__m256i _mm256_sra_epi32 (__m256i a, __m128i count) pure @trusted +{ + static if (GDC_or_LDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_psrad256(cast(int8)a, cast(int4)count); + } + else + { + // split + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i r_lo = _mm_sra_epi32(a_lo, count); + __m128i r_hi = _mm_sra_epi32(a_hi, count); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m128i shift0 = _mm_setzero_si128(); + __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift + __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5); + __m256i A = _mm256_setr_epi32(4, -9, 11, -2147483648, 8, -9, 11, -2147483648); + int[8] correct0 = (cast(int8)A).array; + int[8] correctX = [0, -1, 0, -1, 0, -1, 0, -1]; + int[8] correct2 = [1, -3, 2, -536870912, 2, -3, 2, -536870912]; + int8 B0 = cast(int8) _mm256_sra_epi32(A, shift0); + int8 BX = cast(int8) _mm256_sra_epi32(A, shiftX); + int8 B2 = cast(int8) _mm256_sra_epi32(A, shift2); + assert(B0.array == correct0); + assert(BX.array == correctX); + assert(B2.array == correct2); +} + +/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits. +__m256i _mm256_srai_epi16 (__m256i a, int imm8) pure @safe +{ + static if (GDC_or_LDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_psrawi256(cast(short16)a, cast(ubyte)imm8); + } + else + { + // split + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i r_lo = _mm_srai_epi16(a_lo, imm8); + __m128i r_hi = _mm_srai_epi16(a_hi, imm8); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m256i A = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, short.min, short.max, 2, 3, -4, -5, 6, 7); + short16 B = cast(short16)( _mm256_srai_epi16(A, 1) ); + short16 B2 = cast(short16)( _mm256_srai_epi16(A, 1 + 256) ); + short[16] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3, -16384, 16383, 1, 1, -2, -3, 3, 3 ]; + assert(B.array == expectedB); + assert(B2.array == expectedB); + + short16 C = cast(short16)( _mm256_srai_epi16(A, 18) ); + short[16] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0, + -1, 0, 0, 0, -1, -1, 0, 0 ]; + assert(C.array == expectedC); +} + +/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits. 
+__m256i _mm256_srai_epi32 (__m256i a, int imm8) pure @safe
+{
+    static if (GDC_or_LDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_psradi256(cast(int8)a, cast(ubyte)imm8);
+    }
+    else // split
+    {
+        __m128i a_lo = _mm256_extractf128_si256!0(a);
+        __m128i a_hi = _mm256_extractf128_si256!1(a);
+        __m128i r_lo = _mm_srai_epi32(a_lo, imm8);
+        __m128i r_hi = _mm_srai_epi32(a_hi, imm8);
+        return _mm256_set_m128i(r_hi, r_lo);
+    }
+}
+unittest
+{
+    __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -4);
+    int8 B = cast(int8) _mm256_srai_epi32(A, 1);
+    int8 B2 = cast(int8) _mm256_srai_epi32(A, 1 + 256);
+    int[8] expectedB = [ 0, 1, 1, -2, 0, 1, 1, -2];
+    assert(B.array == expectedB);
+    assert(B2.array == expectedB);
+
+    int8 C = cast(int8) _mm256_srai_epi32(A, 32);
+    int[8] expectedC = [ 0, 0, 0, -1, 0, 0, 0, -1];
+    assert(C.array == expectedC);
+
+    int8 D = cast(int8) _mm256_srai_epi32(A, 0);
+    int[8] expectedD = [ 0, 2, 3, -4, 0, 2, 3, -4];
+    assert(D.array == expectedD);
+}
+
+/// Shift packed 32-bit integers in `a` right by the amount specified by the corresponding element in `count` while shifting in sign bits.
+__m128i _mm_srav_epi32(__m128i a, __m128i count) pure @trusted
+{
+    static if (GDC_with_AVX2 || LDC_with_AVX2)
+        return cast(__m128i)__builtin_ia32_psrav4si(cast(int4)a, cast(int4)count);
+    else
+    {
+        __m128i R = _mm_setr_epi32(a.array[0] >> count.array[0],
+                                   a.array[1] >> count.array[1],
+                                   a.array[2] >> count.array[2],
+                                   a.array[3] >> count.array[3]);
+
+        // Map large and negative shifts to all sign bits
+        __m128i signbits = _mm_srai_epi32(a, 31);
+        __m128i mm32 = _mm_set1_epi32(32);
+        __m128i shift = _mm_min_epu32(count, mm32);
+
+        // Use sign bits where the shift is >= 32
+        __m128i lower = _mm_cmplt_epi32(shift, mm32);
+
+        R = (R & lower) | (signbits & ~lower);
+        return R;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi32(-1, 1, -4, -4);
+    __m128i shift = _mm_setr_epi32( 2, -6, 31, 32);
+    int4 R = cast(int4) _mm_srav_epi32(A, shift);
+    int[4] expected = [ -1, 0, -1, -1 ];
+    assert(R.array == expected);
+}
+
+/// Shift packed 32-bit integers in `a` right by the amount specified by the corresponding element in `count` while shifting in sign bits.
+__m256i _mm256_srav_epi32 (__m256i a, __m256i count) pure @safe
+{
+    static if (GDC_or_LDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_psrav8si(cast(int8)a, cast(int8)count);
+    }
+    else // split
+    {
+        __m128i a_lo = _mm256_extractf128_si256!0(a);
+        __m128i a_hi = _mm256_extractf128_si256!1(a);
+        __m128i c_lo = _mm256_extractf128_si256!0(count);
+        __m128i c_hi = _mm256_extractf128_si256!1(count);
+        __m128i r_lo = _mm_srav_epi32(a_lo, c_lo);
+        __m128i r_hi = _mm_srav_epi32(a_hi, c_hi);
+        return _mm256_set_m128i(r_hi, r_lo);
+    }
+}
+unittest
+{
+    __m256i A = _mm256_setr_epi32(-1, 1, -4, -4, -1, 1, -4, -4);
+    __m256i shift = _mm256_setr_epi32( 2, -6, 31, 32, 2, -6, 31, 32);
+    int8 R = cast(int8) _mm256_srav_epi32(A, shift);
+    int[8] expected = [ -1, 0, -1, -1, -1, 0, -1, -1 ];
+    assert(R.array == expected);
+}
+
+/// Shift packed 16-bit integers in `a` right by `count` while shifting in zeroes.
+/// Bit-shift is a single value in the low-order 64-bit of `count`.
+/// If bit-shift > 15, result is defined to be all zeroes.
+/// Note: prefer `_mm256_srli_epi16`, less of a trap.
+__m256i _mm256_srl_epi16 (__m256i a, __m128i count) pure @trusted +{ + // PERF ARM64 + static if (GDC_or_LDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_psrlw256(cast(short16)a, cast(short8)count); + } + else + { + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i r_lo = _mm_srl_epi16(a_lo, count); + __m128i r_hi = _mm_srl_epi16(a_hi, count); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m128i shift0 = _mm_setzero_si128(); + __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift + __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5); + __m256i A = _mm256_setr_epi16(4, -8, 11, -32768, 4, -8, 11, -32768, 4, -8, 11, -32768, 4, -8, 11, -32768); + short[16] correct0 = (cast(short16)A).array; + short[16] correctX = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + short[16] correct2 = [1, 16382, 2, 8192, 1, 16382, 2, 8192, 1, 16382, 2, 8192, 1, 16382, 2, 8192]; + short16 B0 = cast(short16) _mm256_srl_epi16(A, shift0); + short16 BX = cast(short16) _mm256_srl_epi16(A, shiftX); + short16 B2 = cast(short16) _mm256_srl_epi16(A, shift2); + assert(B0.array == correct0); + assert(BX.array == correctX); + assert(B2.array == correct2); +} + +/// Shift packed 32-bit integers in `a` right by `count` while shifting in zeroes. +/// Bit-shift is a single value in the low-order 64-bit of `count`. +/// If bit-shift > 31, result is defined to be all zeroes. +/// Note: prefer `_mm256_srli_epi32`, less of a trap. +__m256i _mm256_srl_epi32 (__m256i a, __m128i count) pure @trusted +{ + // PERF ARM64 + static if (GDC_or_LDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_psrld256(cast(int8)a, count); + } + else + { + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i r_lo = _mm_srl_epi32(a_lo, count); + __m128i r_hi = _mm_srl_epi32(a_hi, count); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m128i shift0 = _mm_setzero_si128(); + __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift + __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5); + __m256i A = _mm256_setr_epi32(4, -8, 11, -0x80000000, 0, 1, -11, 0x7fffffff); + int[8] correct0 = (cast(int8)A).array; + int[8] correctX = [0, 0, 0, 0, 0, 0, 0, 0]; + int[8] correct2 = [1, 1073741822, 2, 536870912, 0, 0, 1073741821, 0x1fffffff]; + int8 B0 = cast(int8) _mm256_srl_epi32(A, shift0); + int8 BX = cast(int8) _mm256_srl_epi32(A, shiftX); + int8 B2 = cast(int8) _mm256_srl_epi32(A, shift2); + assert(B0.array == correct0); + assert(BX.array == correctX); + assert(B2.array == correct2); +} + +/// Shift packed 64-bit integers in `a` right by `count` while shifting in zeroes. +/// Bit-shift is a single value in the low-order 64-bit of `count`. +/// If bit-shift > 63, result is defined to be all zeroes. +/// Note: prefer `_mm256_srli_epi64`, less of a trap. 
+__m256i _mm256_srl_epi64 (__m256i a, __m128i count) pure @trusted +{ + // PERF ARM64 + /* + static if (LDC_with_ARM64) + { + long bs = (cast(long2)count).array[0]; + if (bs > 63) + return long4(0); + else + { + a <<= long4(bs); + return a; + } + } + else*/ static if (GDC_or_LDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_psrlq256(cast(long4)a, cast(long2)count); + } + else + { + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i r_lo = _mm_srl_epi64(a_lo, count); + __m128i r_hi = _mm_srl_epi64(a_hi, count); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m128i shift0 = _mm_setzero_si128(); + __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift + __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5); + __m256i A = _mm256_setr_epi64(4, -9, 8, -9); + long[4] correct0 = [ 4, -9, 8, -9]; + long[4] correctX = [ 0, 0, 0, 0]; + long[4] correct2 = [ 1, 4611686018427387901, 2, 4611686018427387901]; + long4 B0 = cast(long4) _mm256_srl_epi64(A, shift0); + long4 BX = cast(long4) _mm256_srl_epi64(A, shiftX); + long4 B2 = cast(long4) _mm256_srl_epi64(A, shift2); + assert(B0.array == correct0); + assert(BX.array == correctX); + assert(B2.array == correct2); +} + +/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros. +__m256i _mm256_srli_epi16 (__m256i a, int imm8) pure @trusted +{ + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_psrlwi256(cast(short16)a, cast(ubyte)imm8); + } + else static if (LDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_psrlwi256(cast(short16)a, cast(ubyte)imm8); + } + else + { + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i r_lo = _mm_srli_epi16(a_lo, imm8); + __m128i r_hi = _mm_srli_epi16(a_hi, imm8); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m256i A = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7); + short16 B = cast(short16) _mm256_srli_epi16(A, 1); + short16 B2 = cast(short16) _mm256_srli_epi16(A, 1 + 256); + short[16] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3, 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ]; + assert(B.array == expectedB); + assert(B2.array == expectedB); + + short16 C = cast(short16) _mm256_srli_epi16(A, 16); + short[16] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ]; + assert(C.array == expectedC); + + short16 D = cast(short16) _mm256_srli_epi16(A, 0); + short[16] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7 ]; + assert(D.array == expectedD); +} + +/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros. 
+__m256i _mm256_srli_epi32 (__m256i a, int imm8) pure @trusted
+{
+    static if (GDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_psrldi256(cast(int8)a, cast(ubyte)imm8);
+    }
+    else static if (LDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_psrldi256(cast(int8)a, cast(ubyte)imm8);
+    }
+    else
+    {
+        // split
+        __m128i a_lo = _mm256_extractf128_si256!0(a);
+        __m128i a_hi = _mm256_extractf128_si256!1(a);
+        __m128i r_lo = _mm_srli_epi32(a_lo, imm8);
+        __m128i r_hi = _mm_srli_epi32(a_hi, imm8);
+        return _mm256_set_m128i(r_hi, r_lo);
+    }
+}
+unittest
+{
+    __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -4);
+    int8 B = cast(int8) _mm256_srli_epi32(A, 1);
+    int8 B2 = cast(int8) _mm256_srli_epi32(A, 1 + 256);
+    int[8] expectedB = [ 0, 1, 1, 0x7FFFFFFE, 0, 1, 1, 0x7FFFFFFE];
+    assert(B.array == expectedB);
+    assert(B2.array == expectedB);
+
+    int8 C = cast(int8) _mm256_srli_epi32(A, 255);
+    int[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
+    assert(C.array == expectedC);
+}
+
+/// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
+__m256i _mm256_srli_epi64 (__m256i a, int imm8) pure @safe
+{
+    static if (GDC_or_LDC_with_AVX2)
+    {
+        return cast(__m256i) __builtin_ia32_psrlqi256(cast(long4)a, cast(ubyte)imm8);
+    }
+    else
+    {
+        // split
+        __m128i a_lo = _mm256_extractf128_si256!0(a);
+        __m128i a_hi = _mm256_extractf128_si256!1(a);
+        __m128i r_lo = _mm_srli_epi64(a_lo, imm8);
+        __m128i r_hi = _mm_srli_epi64(a_hi, imm8);
+        return _mm256_set_m128i(r_hi, r_lo);
+    }
+}
+unittest
+{
+    __m256i A = _mm256_setr_epi64(8, -4, 16, -8);
+    long4 B = cast(long4) _mm256_srli_epi64(A, 1);
+    long4 B2 = cast(long4) _mm256_srli_epi64(A, 1 + 512);
+    long[4] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE, 8, 0x7FFFFFFFFFFFFFFC];
+    assert(B.array == expectedB);
+    assert(B2.array == expectedB);
+
+    long4 C = cast(long4) _mm256_srli_epi64(A, 64);
+    long[4] expectedC = [ 0, 0, 0, 0 ];
+    assert(C.array == expectedC);
+}
+
+/// Shift 128-bit lanes in `a` right by `bytes` bytes while shifting in zeroes.
+alias _mm256_srli_si256 = _mm256_bsrli_epi128;
+
+/// Shift packed 32-bit integers in `a` right by the amount specified by the corresponding element in `count` while shifting in zeroes.
+__m128i _mm_srlv_epi32(__m128i a, __m128i count) pure @trusted
+{
+    static if (GDC_with_AVX2 || LDC_with_AVX2)
+        return cast(__m128i)__builtin_ia32_psrlv4si(cast(int4)a, cast(int4)count);
+    else
+    {
+        __m128i R = _mm_setr_epi32(a.array[0] >>> count.array[0],
+                                   a.array[1] >>> count.array[1],
+                                   a.array[2] >>> count.array[2],
+                                   a.array[3] >>> count.array[3]);
+
+        // Map large and negative shifts to 32
+        __m128i mm32 = _mm_set1_epi32(32);
+        __m128i shift = _mm_min_epu32(count, mm32);
+
+        // Set to 0 where the shift is >= 32
+        R = R & _mm_cmplt_epi32(shift, mm32);
+        return R;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi32(-1, 1, 4, -4);
+    __m128i shift = _mm_setr_epi32( 2, -6, 1, 32);
+    int4 R = cast(int4) _mm_srlv_epi32(A, shift);
+    int[4] expected = [ 1073741823, 0, 2, 0 ];
+    assert(R.array == expected);
+}
+
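+// Minimal sketch of why both `_mm256_srli_epi32` (logical) and `_mm256_srai_epi32`
+// (arithmetic) exist: on a negative lane the logical shift pulls in zero bits and treats
+// the bit pattern as unsigned, while the arithmetic shift replicates the sign bit.
+unittest
+{
+    __m256i x = _mm256_setr_epi32(-8, -8, -8, -8, -8, -8, -8, -8);
+    int8 logical    = cast(int8) _mm256_srli_epi32(x, 2); // 0xFFFFFFF8 >>> 2
+    int8 arithmetic = cast(int8) _mm256_srai_epi32(x, 2); // -8 >> 2
+    assert(logical.array[0]    == 0x3FFFFFFE);
+    assert(arithmetic.array[0] == -2);
+}
+
+/// Shift packed 32-bit integers in `a` right by the amount specified by the corresponding element in `count` while shifting in zeroes.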
+__m256i _mm256_srlv_epi32 (__m256i a, __m256i count) pure @trusted
+{
+    static if (GDC_with_AVX2 || LDC_with_AVX2)
+        return cast(__m256i)__builtin_ia32_psrlv8si(cast(int8)a, cast(int8)count);
+    else
+    {
+        // split
+        __m128i a_lo = _mm256_extractf128_si256!0(a);
+        __m128i a_hi = _mm256_extractf128_si256!1(a);
+        __m128i c_lo = _mm256_extractf128_si256!0(count);
+        __m128i c_hi = _mm256_extractf128_si256!1(count);
+        __m128i r_lo = _mm_srlv_epi32(a_lo, c_lo);
+        __m128i r_hi = _mm_srlv_epi32(a_hi, c_hi);
+        return _mm256_set_m128i(r_hi, r_lo);
+    }
+}
+unittest
+{
+    __m256i A = _mm256_setr_epi32(-1, 1, 4, -4, -1, 1, 4, -4);
+    __m256i shift = _mm256_setr_epi32( 2, -6, 1, 32, 33, 2, -6, 1);
+    int8 R = cast(int8) _mm256_srlv_epi32(A, shift);
+    int[8] expected = [ 1073741823, 0, 2, 0, 0, 0, 0, 2147483646 ];
+    assert(R.array == expected);
+}
+
+/// Shift packed 64-bit integers in `a` right by the amount specified by the corresponding element in `count` while shifting in zeroes.
+__m128i _mm_srlv_epi64(__m128i a, __m128i count) pure @trusted
+{
+    static if (GDC_or_LDC_with_AVX2)
+    {
+        return cast(__m128i)__builtin_ia32_psrlv2di(cast(long2)a, cast(long2)count);
+    }
+    else
+    {
+        // Note: arm64 rather bad for LDC < 1.34
+        // after that, perfect.
+        // LDC: x86, it's not good, but at least it's branchless
+        long2 la = cast(long2)a;
+        long2 lb = cast(long2)count;
+        long2 R;
+        R.ptr[0] = cast(ulong)(lb.array[0]) < 64 ? (la.array[0] >>> lb.array[0]) : 0;
+        R.ptr[1] = cast(ulong)(lb.array[1]) < 64 ? (la.array[1] >>> lb.array[1]) : 0;
+        return cast(__m128i)R;
+    }
+}
+unittest
+{
+    __m128i A  = _mm_setr_epi64( -4, 6);
+    __m128i B1 = _mm_setr_epi64(  2, 0);
+    __m128i B2 = _mm_setr_epi64(-12, 64);
+    long2 R1 = cast(long2) _mm_srlv_epi64(A, B1);
+    long2 R2 = cast(long2) _mm_srlv_epi64(A, B2);
+    long[2] correct1 = [ 4611686018427387903, 6];
+    long[2] correct2 = [ 0, 0];
+    assert(R1.array == correct1);
+    assert(R2.array == correct2);
+}
+
+/// Shift packed 64-bit integers in `a` right by the amount specified by the corresponding element in `count` while shifting in zeroes.
+__m256i _mm256_srlv_epi64 (__m256i a, __m256i count) pure @trusted
+{
+    // PERF: rather lame in non-AVX2 x86
+    static if (GDC_with_AVX2 || LDC_with_AVX2)
+        return cast(__m256i)__builtin_ia32_psrlv4di(cast(long4)a, cast(long4)count);
+    else
+    {
+        // split
+        __m128i a_lo = _mm256_extractf128_si256!0(a);
+        __m128i a_hi = _mm256_extractf128_si256!1(a);
+        __m128i c_lo = _mm256_extractf128_si256!0(count);
+        __m128i c_hi = _mm256_extractf128_si256!1(count);
+        __m128i r_lo = _mm_srlv_epi64(a_lo, c_lo);
+        __m128i r_hi = _mm_srlv_epi64(a_hi, c_hi);
+        return _mm256_set_m128i(r_hi, r_lo);
+    }
+}
+unittest
+{
+    __m256i A  = _mm256_setr_epi64( -4,  6, -4,  6);
+    __m256i B1 = _mm256_setr_epi64(  2,  0,  2,  0);
+    __m256i B2 = _mm256_setr_epi64(-12, 64, -12, 64);
+    long4 R1 = cast(long4) _mm256_srlv_epi64(A, B1);
+    long4 R2 = cast(long4) _mm256_srlv_epi64(A, B2);
+    long[4] correct1 = [ 4611686018427387903, 6, 4611686018427387903, 6];
+    long[4] correct2 = [ 0, 0, 0, 0];
+    assert(R1.array == correct1);
+    assert(R2.array == correct2);
+}
+
+/// Load 256-bits of integer data from memory using a non-temporal memory hint.
+/// `mem_addr` must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+__m256i _mm256_stream_load_si256 (const(__m256i)* mem_addr) pure @trusted +{ + // PERF DMD D_SIMD + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_movntdqa256(cast(__m256i*)mem_addr); // const_cast + } + else static if (LDC_with_InlineIREx && LDC_with_optimizations) + { + enum prefix = `!0 = !{ i32 1 }`; + enum ir = ` + %r = load <4 x i64>, <4 x i64>* %0, !nontemporal !0 + ret <4 x i64> %r`; + return cast(__m256i) LDCInlineIREx!(prefix, ir, "", long4, const(long4)*)(mem_addr); + } + else + { + return *mem_addr; // regular move instead + } +} +unittest +{ + align(32) static immutable int[8] correct = [1, 2, 3, 4, 5, 6, 7, 8]; + __m256i A = _mm256_stream_load_si256(cast(__m256i*)correct.ptr); + _mm_mfence(); + assert((cast(int8)A).array == correct); +} + +/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`. +__m256i _mm256_sub_epi16 (__m256i a, __m256i b) pure @safe +{ + pragma(inline, true); + return cast(__m256i)(cast(short16)a - cast(short16)b); +} +unittest +{ + __m256i A = _mm256_setr_epi16( -7, -1, 0, 9, -100, 100, 234, 432, -32768, 32767, 0, -1, -20000, 0, 6, -2); + short16 R = cast(short16) _mm256_sub_epi16(A, A); + short[16] correct = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]; + assert(R.array == correct); +} + +/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. +__m256i _mm256_sub_epi32(__m256i a, __m256i b) pure @safe +{ + pragma(inline, true); + return cast(__m256i)(cast(int8)a - cast(int8)b); +} +unittest +{ + __m256i A = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432); + int8 R = cast(int8) _mm256_sub_epi32(A, A); + int[8] correct = [ 0, 0, 0, 0, 0, 0, 0, 0]; + assert(R.array == correct); +} + +/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`. +__m256i _mm256_sub_epi64 (__m256i a, __m256i b) pure @safe +{ + pragma(inline, true); + return a - b; +} +unittest +{ + __m256i A = _mm256_setr_epi64(-1, 0x8000_0000_0000_0000, 42, -12); + long4 R = cast(__m256i) _mm256_sub_epi64(A, A); + long[4] correct = [ 0, 0, 0, 0 ]; + assert(R.array == correct); +} + +/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`. +__m256i _mm256_sub_epi8 (__m256i a, __m256i b) pure @safe +{ + pragma(inline, true); + return cast(__m256i)(cast(byte32)a - cast(byte32)b); +} +unittest +{ + __m256i A = _mm256_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78, + 4, 9, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -2, 0, 10, 78); + byte32 R = cast(byte32) _mm256_sub_epi8(A, A); + byte[32] correct; // zero initialized + assert(R.array == correct); +} + +/// Subtract packed signed 16-bit integers in `b` from packed 16-bit integers in `a` using +/// saturation. 
+__m256i _mm256_subs_epi16 (__m256i a, __m256i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_psubsw256(cast(short16)a, cast(short16)b); + } + else static if(LDC_with_saturated_intrinsics) + { + return cast(__m256i) inteli_llvm_subs!short16(cast(short16)a, cast(short16)b); + } + else + { + short16 r; + short16 sa = cast(short16)a; + short16 sb = cast(short16)b; + foreach(i; 0..16) + r.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]); + return cast(__m256i)r; + } +} +unittest +{ + short16 res = cast(short16) _mm256_subs_epi16(_mm256_setr_epi16( 7, 6, 5, -32768, 3, 3, 32766, 0, 7, 6, 5, -32750, 3, 3, 32767, 0), + _mm256_setr_epi16( 7, 6, 5, -30000, 3, 1, -2, -10, 7, 6, 5, 100, 3, 1, 1, -10)); + static immutable short[16] correctResult = [ 0, 0, 0, -2768, 0, 2, 32767, 10, 0, 0, 0, -32768, 0, 2, 32766, 10]; + assert(res.array == correctResult); +} + + +/// Subtract packed signed 8-bit integers in `b` from packed 8-bit integers in `a` using +/// saturation. +__m256i _mm256_subs_epi8 (__m256i a, __m256i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_psubsb256(cast(ubyte32)a, cast(ubyte32)b); + } + else static if(LDC_with_saturated_intrinsics) + { + return cast(__m256i) inteli_llvm_subs!byte32(cast(byte32)a, cast(byte32)b); + } + else + { + byte32 r; + byte32 sa = cast(byte32)a; + byte32 sb = cast(byte32)b; + foreach(i; 0..32) + r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]); + return cast(__m256i)r; + } +} +unittest +{ + byte32 R = cast(byte32) _mm256_subs_epi8(_mm256_setr_epi8(15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0, 15, 14, 13, 12, 11, 126, 9, 8, 7, 6, 5, -127, 3, 2, 1, 0), + _mm256_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, -10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + static immutable byte[32] correct = [ 0, 0, 0, 0, 0, 117, 0, 0, 0, 0, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127, 0, 0, 0, 0, 0, -128, 0, 0, 0, 0]; + assert(R.array == correct); +} + +/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit integers in `a` +/// using saturation. +__m256i _mm256_subs_epu16 (__m256i a, __m256i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_psubusw256(cast(short16)a, cast(short16)b); + } + else static if(LDC_with_saturated_intrinsics) + { + return cast(__m256i) inteli_llvm_subus!short16(cast(short16)a, cast(short16)b); + } + else + { + short16 r; + short16 sa = cast(short16)a; + short16 sb = cast(short16)b; + foreach(i; 0..16) + r.ptr[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i])); + return cast(__m256i)r; + } +} +unittest +{ + short16 R = cast(short16) _mm256_subs_epu16(_mm256_setr_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0, 3, 2, cast(short)65534, 0), + _mm256_setr_epi16(3, 4, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 20, cast(short)65535, 0)); + static immutable short[16] correct = [0, 0, cast(short)65534, 0, 0, 0, cast(short)65534, 0, 0, 0, cast(short)65534, 0, 0, 0, 0, 0]; + assert(R.array == correct); +} + +/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit integers in `a` using +/// saturation. 
+__m256i _mm256_subs_epu8 (__m256i a, __m256i b) pure @trusted +{ + // PERF DMD + // PERF GDC without AVX2 + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_psubusb256(cast(ubyte32)a, cast(ubyte32)b); + } + else static if(LDC_with_saturated_intrinsics) + { + return cast(__m256i) inteli_llvm_subus!byte32(cast(byte32)a, cast(byte32)b); + } + else + { + byte32 r; + byte32 sa = cast(byte32)a; + byte32 sb = cast(byte32)b; + foreach(i; 0..32) + r.ptr[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i])); + return cast(__m256i)r; + } +} +unittest +{ + __m256i A = _mm256_setr_epi8(0, 0, 5, 4, 5, 0, 0, 0, 0, 0, 0, 0, cast(byte)255, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)136, 0, 0, 0, cast(byte)136, 0, 0, 0, 0, 0, 0); + __m256i B = _mm256_setr_epi8(0, 0, 4, 5, 5, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)137, 0, 0, 0, 40, 0, 0, 0, 0, 0, 0); + byte32 R = cast(byte32) _mm256_subs_epu8(A, B); + static immutable byte[32] correct = [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)254, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)0, 0, 0, 0, cast(byte) 96, 0, 0, 0, 0, 0, 0]; + assert(R.array == correct); +} + +/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in `a` and `b`. +__m256i _mm256_unpackhi_epi16 (__m256i a, __m256i b) pure @safe +{ + static if (GDC_with_AVX2) + { + return cast(long4) __builtin_ia32_punpckhwd256(cast(short16)a, cast(short16)b); + } + else static if (LDC_with_optimizations) + { + enum ir = `%r = shufflevector <16 x i16> %0, <16 x i16> %1, <16 x i32> + ret <16 x i16> %r`; + return cast(__m256i)LDCInlineIR!(ir, short16, short16, short16)(cast(short16)a, cast(short16)b); + } + else + { + // Better for arm64, GDC without AVX2 + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_unpackhi_epi16(a_lo, b_lo); + __m128i r_hi = _mm_unpackhi_epi16(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m256i A = _mm256_setr_epi16( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + __m256i B = _mm256_setr_epi16(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + short16 C = cast(short16) _mm256_unpackhi_epi16(A, B); + short[16] correct = [4, 20, 5, 21, 6, 22, 7, 23, + 12, 28, 13, 29, 14, 30, 15, 31]; + assert(C.array == correct); +} + +/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in `a` and `b`. 
+__m256i _mm256_unpackhi_epi32 (__m256i a, __m256i b) pure @trusted +{ + static if (GDC_with_AVX2) + enum bool split = false; + else version(GNU) + enum bool split = true; + else + enum bool split = false; + + static if (GDC_with_AVX2) + { + return cast(long4) __builtin_ia32_punpckhdq256(cast(int8)a, cast(int8)b); + } + else static if (LDC_with_optimizations) + { + // LDC AVX2: Suprisingly, this start using vunpckhps in LDC 1.31 -O2 + enum ir = `%r = shufflevector <8 x i32> %0, <8 x i32> %1, <8 x i32> + ret <8 x i32> %r`; + return cast(__m256i)LDCInlineIR!(ir, int8, int8, int8)(cast(int8)a, cast(int8)b); + } + else static if (split) + { + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_unpackhi_epi32(a_lo, b_lo); + __m128i r_hi = _mm_unpackhi_epi32(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } + else + { + int8 R; + int8 ai = cast(int8)a; + int8 bi = cast(int8)b; + R.ptr[0] = ai.array[2]; + R.ptr[1] = bi.array[2]; + R.ptr[2] = ai.array[3]; + R.ptr[3] = bi.array[3]; + R.ptr[4] = ai.array[6]; + R.ptr[5] = bi.array[6]; + R.ptr[6] = ai.array[7]; + R.ptr[7] = bi.array[7]; + return cast(__m256i) R; + } +} +unittest +{ + __m256i A = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + __m256i B = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15); + int8 C = cast(int8) _mm256_unpackhi_epi32(A, B); + int[8] correct = [2, 10, 3, 11, 6, 14, 7, 15]; + assert(C.array == correct); +} + +/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in `a` and `b`, +__m256i _mm256_unpackhi_epi8 (__m256i a, __m256i b) pure @trusted +{ + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_punpckhbw256(cast(ubyte32)a, cast(ubyte32)b); + } + else static if (LDC_with_optimizations) + { + enum ir = `%r = shufflevector <32 x i8> %0, <32 x i8> %1, <32 x i32> + ret <32 x i8> %r`; + return cast(__m256i)LDCInlineIR!(ir, byte32, byte32, byte32)(cast(byte32)a, cast(byte32)b); + } + else + { + // Splitting always beneficial + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_unpackhi_epi8(a_lo, b_lo); + __m128i r_hi = _mm_unpackhi_epi8(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m256i A = _mm256_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + __m256i B = _mm256_setr_epi8( 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + byte32 C = cast(byte32) _mm256_unpackhi_epi8(A, B); + byte[32] correct = [ 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, + 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 ]; + assert(C.array == correct); +} + +/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in `a` and `b`. 
+__m256i _mm256_unpackhi_epi64 (__m256i a, __m256i b) pure @trusted +{ + version(GNU) + enum split = true; // Benefits GDC in non-AVX2 + else + enum split = false; + + static if (GDC_with_AVX2) + { + return __builtin_ia32_punpckhqdq256(a, b); + } + else static if (LDC_with_optimizations) + { + enum ir = `%r = shufflevector <4 x i64> %0, <4 x i64> %1, <4 x i32> + ret <4 x i64> %r`; + return cast(__m256i)LDCInlineIR!(ir, long4, long4, long4)(a, b); + } + else static if (split) + { + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_unpackhi_epi64(a_lo, b_lo); + __m128i r_hi = _mm_unpackhi_epi64(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } + else + { + long4 R; + R.ptr[0] = a.array[1]; + R.ptr[1] = b.array[1]; + R.ptr[2] = a.array[3]; + R.ptr[3] = b.array[3]; + return R; + } +} +unittest +{ + __m256i A = _mm256_setr_epi64(0x22222222_22222222, 0x33333333_33333333, 2, 3); + __m256i B = _mm256_setr_epi64(0x44444444_44444444, 0x55555555_55555555, 4, 5); + long4 C = _mm256_unpackhi_epi64(A, B); + long[4] correct = [0x33333333_33333333, 0x55555555_55555555, 3, 5]; + assert(C.array == correct); +} + +/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in `a` and `b`. +__m256i _mm256_unpacklo_epi16 (__m256i a, __m256i b) pure @safe +{ + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_punpcklwd256(cast(short16)a, cast(short16)b); + } + else static if (LDC_with_optimizations) + { + enum ir = `%r = shufflevector <16 x i16> %0, <16 x i16> %1, <16 x i32> + ret <16 x i16> %r`; + return cast(__m256i)LDCInlineIR!(ir, short16, short16, short16)(cast(short16)a, cast(short16)b); + } + else + { + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_unpacklo_epi16(a_lo, b_lo); + __m128i r_hi = _mm_unpacklo_epi16(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m256i A = _mm256_setr_epi16( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + __m256i B = _mm256_setr_epi16(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + short16 C = cast(short16) _mm256_unpacklo_epi16(A, B); + short[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, + 8, 24, 9, 25, 10, 26, 11, 27]; + assert(C.array == correct); +} + +/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in `a` and `b`. 
+__m256i _mm256_unpacklo_epi32 (__m256i a, __m256i b) pure @trusted +{ + static if (GDC_with_AVX2) + enum bool split = false; + else version(GNU) + enum bool split = true; + else + enum bool split = false; + + static if (GDC_with_AVX2) + { + return cast(long4) __builtin_ia32_punpckldq256(cast(int8)a, cast(int8)b); + } + else static if (LDC_with_optimizations) + { + // LDC AVX2: Suprisingly, this start using vunpcklps in LDC 1.31 -O1 + enum ir = `%r = shufflevector <8 x i32> %0, <8 x i32> %1, <8 x i32> + ret <8 x i32> %r`; + return cast(__m256i)LDCInlineIR!(ir, int8, int8, int8)(cast(int8)a, cast(int8)b); + } + else static if (split) + { + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_unpacklo_epi32(a_lo, b_lo); + __m128i r_hi = _mm_unpacklo_epi32(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } + else + { + int8 R; + int8 ai = cast(int8)a; + int8 bi = cast(int8)b; + R.ptr[0] = ai.array[0]; + R.ptr[1] = bi.array[0]; + R.ptr[2] = ai.array[1]; + R.ptr[3] = bi.array[1]; + R.ptr[4] = ai.array[4]; + R.ptr[5] = bi.array[4]; + R.ptr[6] = ai.array[5]; + R.ptr[7] = bi.array[5]; + return cast(__m256i) R; + } +} +unittest +{ + __m256i A = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + __m256i B = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15); + int8 C = cast(int8) _mm256_unpacklo_epi32(A, B); + int[8] correct = [0, 8, 1, 9, 4, 12, 5, 13]; + assert(C.array == correct); +} + +/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in `a` and `b`. +__m256i _mm256_unpacklo_epi64 (__m256i a, __m256i b) pure @trusted +{ + version(GNU) + enum split = true; // Benefits GDC in non-AVX2 + else + enum split = false; + + static if (GDC_with_AVX2) + { + return __builtin_ia32_punpcklqdq256(a, b); + } + else static if (LDC_with_optimizations) + { + enum ir = `%r = shufflevector <4 x i64> %0, <4 x i64> %1, <4 x i32> + ret <4 x i64> %r`; + return cast(__m256i)LDCInlineIR!(ir, long4, long4, long4)(a, b); + } + else static if (split) + { + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_unpacklo_epi64(a_lo, b_lo); + __m128i r_hi = _mm_unpacklo_epi64(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } + else + { + long4 R; + R.ptr[0] = a.array[0]; + R.ptr[1] = b.array[0]; + R.ptr[2] = a.array[2]; + R.ptr[3] = b.array[2]; + return R; + } +} +unittest +{ + __m256i A = _mm256_setr_epi64(0x22222222_22222222, 0x33333333_33333333, 2, 3); + __m256i B = _mm256_setr_epi64(0x44444444_44444444, 0x55555555_55555555, 4, 5); + long4 C = _mm256_unpacklo_epi64(A, B); + long[4] correct = [0x22222222_22222222, 0x44444444_44444444, 2, 4]; + assert(C.array == correct); +} + +/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in `a` and `b`. 
+__m256i _mm256_unpacklo_epi8 (__m256i a, __m256i b) pure @trusted +{ + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_punpcklbw256(cast(ubyte32)a, cast(ubyte32)b); + } + else static if (LDC_with_optimizations) + { + enum ir = `%r = shufflevector <32 x i8> %0, <32 x i8> %1, <32 x i32> + ret <32 x i8> %r`; + return cast(__m256i)LDCInlineIR!(ir, byte32, byte32, byte32)(cast(byte32)a, cast(byte32)b); + } + else + { + // Splitting always beneficial + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_unpacklo_epi8(a_lo, b_lo); + __m128i r_hi = _mm_unpacklo_epi8(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } +} +unittest +{ + __m256i A = _mm256_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + __m256i B = _mm256_setr_epi8( 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + byte32 C = cast(byte32) _mm256_unpacklo_epi8(A, B); + byte[32] correct = [ 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, + 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55 ]; + assert(C.array == correct); +} + +/// Compute the bitwise XOR of 256 bits (representing integer data) in `a` and `b`. +__m256i _mm256_xor_si256 (__m256i a, __m256i b) pure @safe +{ + return a ^ b; +} +unittest +{ + __m256i A = _mm256_setr_epi64(975394, 619809709, -1, 54); + __m256i B = _mm256_setr_epi64(-920275025, -6, 85873, 96644); + long4 R = cast(long4) _mm256_xor_si256(A, B); + long[4] correct = [975394 ^ (-920275025L), 619809709L ^ -6, (-1) ^ 85873, 54 ^ 96644]; + assert(R.array == correct); +} + + +/+ + +pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.d") +int4 __builtin_ia32_gatherd_d(int4, const void*, int4, int4, byte); + +pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.d.256") +int8 __builtin_ia32_gatherd_d256(int8, const void*, int8, int8, byte); + +pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.pd") +double2 __builtin_ia32_gatherd_pd(double2, const void*, int4, double2, byte); + +pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.pd.256") +double4 __builtin_ia32_gatherd_pd256(double4, const void*, int4, double4, byte); + +pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.ps") +float4 __builtin_ia32_gatherd_ps(float4, const void*, int4, float4, byte); + +pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.ps.256") +float8 __builtin_ia32_gatherd_ps256(float8, const void*, int8, float8, byte); + +pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.q") +long2 __builtin_ia32_gatherd_q(long2, const void*, int4, long2, byte); + +pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.q.256") +long4 __builtin_ia32_gatherd_q256(long4, const void*, int4, long4, byte); + +pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.d") +int4 __builtin_ia32_gatherq_d(int4, const void*, long2, int4, byte); + +pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.d.256") +int4 __builtin_ia32_gatherq_d256(int4, const void*, long4, int4, byte); + +pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.pd") +double2 __builtin_ia32_gatherq_pd(double2, const void*, long2, double2, byte); + +pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.pd.256") +double4 __builtin_ia32_gatherq_pd256(double4, const void*, long4, double4, byte); + +pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.ps") +float4 __builtin_ia32_gatherq_ps(float4, const void*, long2, float4, 
byte); + +pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.ps.256") +float4 __builtin_ia32_gatherq_ps256(float4, const void*, long4, float4, byte); + +pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.q") +long2 __builtin_ia32_gatherq_q(long2, const void*, long2, long2, byte); + +pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.q.256") +long4 __builtin_ia32_gatherq_q256(long4, const void*, long4, long4, byte); + +pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.d") +void __builtin_ia32_maskstored(void*, int4, int4); + +pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.d.256") +void __builtin_ia32_maskstored256(void*, int8, int8); + +pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.q") +void __builtin_ia32_maskstoreq(void*, long2, long2); + +pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.q.256") +void __builtin_ia32_maskstoreq256(void*, long4, long4); + +pragma(LDC_intrinsic, "llvm.x86.avx2.mpsadbw") +short16 __builtin_ia32_mpsadbw256(byte32, byte32, byte) pure @safe; + + +pragma(LDC_intrinsic, "llvm.x86.avx2.pblendvb") +byte32 __builtin_ia32_pblendvb256(byte32, byte32, byte32) pure @safe; + +pragma(LDC_intrinsic, "llvm.x86.avx2.permd") +int8 __builtin_ia32_permvarsi256(int8, int8) pure @safe; + +pragma(LDC_intrinsic, "llvm.x86.avx2.permps") +float8 __builtin_ia32_permvarsf256(float8, int8) pure @safe; + + + +pragma(LDC_intrinsic, "llvm.x86.avx2.pmadd.ub.sw") +short16 __builtin_ia32_pmaddubsw256(byte32, byte32) pure @safe; + + +pragma(LDC_intrinsic, "llvm.x86.avx2.pmovmskb") +int __builtin_ia32_pmovmskb256(byte32) pure @safe; + +pragma(LDC_intrinsic, "llvm.x86.avx2.pmul.hr.sw") +short16 __builtin_ia32_pmulhrsw256(short16, short16) pure @safe; + +pragma(LDC_intrinsic, "llvm.x86.avx2.pmulh.w") +short16 __builtin_ia32_pmulhw256(short16, short16) pure @safe; + +pragma(LDC_intrinsic, "llvm.x86.avx2.pmulhu.w") +short16 __builtin_ia32_pmulhuw256(short16, short16) pure @safe; + +pragma(LDC_intrinsic, "llvm.x86.avx2.psad.bw") +long4 __builtin_ia32_psadbw256(byte32, byte32) pure @safe; + +pragma(LDC_intrinsic, "llvm.x86.avx2.pshuf.b") +byte32 __builtin_ia32_pshufb256(byte32, byte32) pure @safe; + +pragma(LDC_intrinsic, "llvm.x86.avx2.psll.q") +long4 __builtin_ia32_psllq256(long4, long2) pure @safe; + +pragma(LDC_intrinsic, "llvm.x86.avx2.psll.w") +short16 __builtin_ia32_psllw256(short16, short8) pure @safe; + +pragma(LDC_intrinsic, "llvm.x86.avx2.pslli.d") +int8 __builtin_ia32_pslldi256(int8, int) pure @safe; + +pragma(LDC_intrinsic, "llvm.x86.avx2.pslli.q") +long4 __builtin_ia32_psllqi256(long4, int) pure @safe; + +pragma(LDC_intrinsic, "llvm.x86.avx2.pslli.w") +short16 __builtin_ia32_psllwi256(short16, int) pure @safe; + +pragma(LDC_intrinsic, "llvm.x86.avx2.psllv.d") +int4 __builtin_ia32_psllv4si(int4, int4) pure @safe; + +pragma(LDC_intrinsic, "llvm.x86.avx2.psllv.d.256") +int8 __builtin_ia32_psllv8si(int8, int8) pure @safe; + +pragma(LDC_intrinsic, "llvm.x86.avx2.psllv.q") +long2 __builtin_ia32_psllv2di(long2, long2) pure @safe; + +pragma(LDC_intrinsic, "llvm.x86.avx2.psllv.q.256") +long4 __builtin_ia32_psllv4di(long4, long4) pure @safe; + +pragma(LDC_intrinsic, "llvm.x86.avx2.psra.d") +int8 __builtin_ia32_psrad256(int8, int4) pure @safe; + +pragma(LDC_intrinsic, "llvm.x86.avx2.psra.w") +short16 __builtin_ia32_psraw256(short16, short8) pure @safe; + +pragma(LDC_intrinsic, "llvm.x86.avx2.psrai.d") +int8 __builtin_ia32_psradi256(int8, int) pure @safe; + +pragma(LDC_intrinsic, "llvm.x86.avx2.psrai.w") +short16 __builtin_ia32_psrawi256(short16, int) pure @safe; + +pragma(LDC_intrinsic, "llvm.x86.avx2.psrav.d") +int4 
__builtin_ia32_psrav4si(int4, int4) pure @safe;
+
+pragma(LDC_intrinsic, "llvm.x86.avx2.psrav.d.256")
+int8 __builtin_ia32_psrav8si(int8, int8) pure @safe;
+
+pragma(LDC_intrinsic, "llvm.x86.avx2.psrl.d")
+int8 __builtin_ia32_psrld256(int8, int4) pure @safe;
+
+pragma(LDC_intrinsic, "llvm.x86.avx2.psrl.q")
+long4 __builtin_ia32_psrlq256(long4, long2) pure @safe;
+
+pragma(LDC_intrinsic, "llvm.x86.avx2.psrl.w")
+short16 __builtin_ia32_psrlw256(short16, short8) pure @safe;
+
+pragma(LDC_intrinsic, "llvm.x86.avx2.psrli.d")
+int8 __builtin_ia32_psrldi256(int8, int) pure @safe;
+
+pragma(LDC_intrinsic, "llvm.x86.avx2.psrli.q")
+long4 __builtin_ia32_psrlqi256(long4, int) pure @safe;
+
+pragma(LDC_intrinsic, "llvm.x86.avx2.psrli.w")
+short16 __builtin_ia32_psrlwi256(short16, int) pure @safe;
+
+pragma(LDC_intrinsic, "llvm.x86.avx2.psrlv.d")
+int4 __builtin_ia32_psrlv4si(int4, int4) pure @safe;
+
+pragma(LDC_intrinsic, "llvm.x86.avx2.psrlv.d.256")
+int8 __builtin_ia32_psrlv8si(int8, int8) pure @safe;
+
+pragma(LDC_intrinsic, "llvm.x86.avx2.psrlv.q")
+long2 __builtin_ia32_psrlv2di(long2, long2) pure @safe;
+
+pragma(LDC_intrinsic, "llvm.x86.avx2.psrlv.q.256")
+long4 __builtin_ia32_psrlv4di(long4, long4) pure @safe;
+
++/
diff --git a/external/inteli/avxintrin.d b/external/inteli/avxintrin.d
new file mode 100644
index 0000000..a82b46c
--- /dev/null
+++ b/external/inteli/avxintrin.d
@@ -0,0 +1,4990 @@
+/**
+* AVX and FP16C intrinsics.
+* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX
+*
+* Copyright: Guillaume Piolat 2022.
+*            Johan Engelen 2022.
+*            cet 2024.
+* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
+*/
+module inteli.avxintrin;
+
+// AVX instructions
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avxnewtechs=AVX
+// Note: this header will work whether you have AVX enabled or not.
+// With LDC, use "dflags-ldc": ["-mattr=+avx"] or equivalent to actively
+// generate AVX instructions.
+// With GDC, use "dflags-gdc": ["-mavx"] or equivalent to actively
+// generate AVX instructions.
+
+// This header also implements FP16C intrinsics.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avxnewtechs=F16C
+// With LDC, use "dflags-ldc": ["-mattr=+f16c"] or equivalent to actively
+// generate F16C instructions.
+// With GDC, use "dflags-gdc": ["-mf16c"] or equivalent to actively
+// generate F16C instructions.
+
+/// IMPORTANT NOTE ABOUT MASK LOAD/STORE:
+///
+/// In theory, masked load/store can address unaddressable memory provided the mask is zero.
+/// In practice, that is not the case for the following reasons:
+///
+/// - AMD manual says:
+///   "Exception and trap behavior for elements not selected for loading or storing from/to memory
+///   is implementation dependent. For instance, a given implementation may signal a data
+///   breakpoint or a page fault for doublewords that are zero-masked and not actually written."
+///
+/// - Intel fetches the whole cacheline anyway:
+///   https://erik.science/2019/06/21/AVX-fun.html
+///   "Even if the mask is stored in the special mask registers, it will still first fetch the data
+///   before checking the mask."
+///
+/// So intel-intrinsics adopted the tightened semantics of only addressing fully addressable memory
+/// with masked loads and stores.
+
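+// A small sketch of that tightened model, assuming the usual `_mm256_maskload_ps`
+// convention (the MSB of each 32-bit mask lane selects whether the lane is loaded):
+// the buffer below is fully addressable, and deselected lanes simply read as zero.
+//
+//     float[8] src = [1.0f, 2, 3, 4, 5, 6, 7, 8];
+//     __m256i mask = _mm256_setr_epi32(-1, -1, 0, 0, -1, 0, 0, -1);
+//     __m256 r = _mm256_maskload_ps(src.ptr, mask); // r = [1, 2, 0, 0, 5, 0, 0, 8]
+
+
+/// Some AVX intrinsics take a float comparison constant.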
+/// When labelled "ordered" it means "AND ordered" +/// When labelled "unordered" it means "OR unordered" +alias _CMP_EQ = int; +///ditto +enum : _CMP_EQ +{ + _CMP_EQ_OQ = 0x00, // Equal (ordered, non-signaling) + _CMP_LT_OS = 0x01, // Less-than (ordered, signaling) + _CMP_LE_OS = 0x02, // Less-than-or-equal (ordered, signaling) + _CMP_UNORD_Q = 0x03, // Unordered (non-signaling) + _CMP_NEQ_UQ = 0x04, // Not-equal (unordered, non-signaling) + _CMP_NLT_US = 0x05, // Not-less-than (unordered, signaling) + _CMP_NLE_US = 0x06, // Not-less-than-or-equal (unordered, signaling) + _CMP_ORD_Q = 0x07, // Ordered (nonsignaling) + _CMP_EQ_UQ = 0x08, // Equal (unordered, non-signaling) + _CMP_NGE_US = 0x09, // Not-greater-than-or-equal (unordered, signaling) + _CMP_NGT_US = 0x0a, // Not-greater-than (unordered, signaling) + _CMP_FALSE_OQ = 0x0b, // False (ordered, non-signaling) + _CMP_NEQ_OQ = 0x0c, // Not-equal (ordered, non-signaling) + _CMP_GE_OS = 0x0d, // Greater-than-or-equal (ordered, signaling) + _CMP_GT_OS = 0x0e, // Greater-than (ordered, signaling) + _CMP_TRUE_UQ = 0x0f, // True (unordered, non-signaling) + _CMP_EQ_OS = 0x10, // Equal (ordered, signaling) + _CMP_LT_OQ = 0x11, // Less-than (ordered, non-signaling) + _CMP_LE_OQ = 0x12, // Less-than-or-equal (ordered, non-signaling) + _CMP_UNORD_S = 0x13, // Unordered (signaling) + _CMP_NEQ_US = 0x14, // Not-equal (unordered, signaling) + _CMP_NLT_UQ = 0x15, // Not-less-than (unordered, non-signaling) + _CMP_NLE_UQ = 0x16, // Not-less-than-or-equal (unordered, non-signaling) + _CMP_ORD_S = 0x17, // Ordered (signaling) + _CMP_EQ_US = 0x18, // Equal (unordered, signaling) + _CMP_NGE_UQ = 0x19, // Not-greater-than-or-equal (unordered, non-signaling) + _CMP_NGT_UQ = 0x1a, // Not-greater-than (unordered, non-signaling) + _CMP_FALSE_OS = 0x1b, // False (ordered, signaling) + _CMP_NEQ_OS = 0x1c, // Not-equal (ordered, signaling) + _CMP_GE_OQ = 0x1d, // Greater-than-or-equal (ordered, non-signaling) + _CMP_GT_OQ = 0x1e, // Greater-than (ordered, non-signaling) + _CMP_TRUE_US = 0x1f // (unordered, signaling) +} + +public import inteli.types; +import inteli.internals; + +// Pull in all previous instruction set intrinsics. +public import inteli.smmintrin; +public import inteli.tmmintrin; +public import inteli.nmmintrin; + + + +// In x86, LDC earlier version may have trouble preserving the stack pointer when an unsupported +// 256-bit vector type is passed, and AVX is disabled. +// This leads to disabling some intrinsics in this particular situation, since they are not safe for +// the caller. +version(LDC) +{ + version(X86) + { + enum llvm256BitStackWorkaroundIn32BitX86 = __VERSION__ < 2099; + } + else + enum llvm256BitStackWorkaroundIn32BitX86 = false; +} +else + enum llvm256BitStackWorkaroundIn32BitX86 = false; + + + + +nothrow @nogc: + +/// Add packed double-precision (64-bit) floating-point elements in `a` and `b`. +__m256d _mm256_add_pd (__m256d a, __m256d b) pure @trusted +{ + return a + b; +} +unittest +{ + align(32) double[4] A = [-1, 2, -3, 40000]; + align(32) double[4] B = [ 9, -7, 8, -0.5]; + __m256d R = _mm256_add_pd(_mm256_load_pd(A.ptr), _mm256_load_pd(B.ptr)); + double[4] correct = [8, -5, 5, 39999.5]; + assert(R.array == correct); +} + +/// Add packed single-precision (32-bit) floating-point elements in `a` and `b`. 
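+// Illustrative usage sketch (hypothetical vectors a and b): this lowers to a plain vector
+// add and compiles with or without AVX; enabling AVX as described above simply lets the
+// backend emit the 256-bit vaddps form.
+//   __m256 s = _mm256_add_ps(a, b); // elementwise sum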
+__m256 _mm256_add_ps (__m256 a, __m256 b) pure @trusted +{ + return a + b; +} +unittest +{ + align(32) float[8] A = [-1.0f, 2, -3, 40000, 0, 3, 5, 6]; + align(32) float[8] B = [ 9.0f, -7, 8, -0.5, 8, 7, 3, -1]; + __m256 R = _mm256_add_ps(_mm256_load_ps(A.ptr), _mm256_load_ps(B.ptr)); + float[8] correct = [8, -5, 5, 39999.5, 8, 10, 8, 5]; + assert(R.array == correct); +} + +/// Alternatively add and subtract packed double-precision (64-bit) floating-point +/// elements in `a` to/from packed elements in `b`. +__m256d _mm256_addsub_pd (__m256d a, __m256d b) pure @trusted +{ + // PERF DMD + static if (GDC_or_LDC_with_AVX) + { + return __builtin_ia32_addsubpd256(a, b); + } + else + { + //// Note: GDC x86 generates addsubpd since GDC 11.1 with -O3 + //// LDC x86 generates addsubpd since LDC 1.18 with -O2 + //// LDC ARM: not fantastic, ok since LDC 1.18 -O2 + a.ptr[0] = a.array[0] + (-b.array[0]); + a.ptr[1] = a.array[1] + b.array[1]; + a.ptr[2] = a.array[2] + (-b.array[2]); + a.ptr[3] = a.array[3] + b.array[3]; + return a; + } +} +unittest +{ + align(32) double[4] A = [-1, 2, -3, 40000]; + align(32) double[4] B = [ 9, -7, 8, -0.5]; + __m256d R = _mm256_addsub_pd(_mm256_load_pd(A.ptr), _mm256_load_pd(B.ptr)); + double[4] correct = [-10, -5, -11, 39999.5]; + assert(R.array == correct); +} + +/// Alternatively add and subtract packed single-precision (32-bit) floating-point elements +/// in `a` to/from packed elements in `b`. +__m256 _mm256_addsub_ps (__m256 a, __m256 b) pure @trusted +{ + // PERF DMD + static if (GDC_or_LDC_with_AVX) + { + return __builtin_ia32_addsubps256(a, b); + } + else + { + // Note: GDC x86 generates addsubps since GDC 11 -O3 + // and in absence of AVX, a pair of SSE3 addsubps since GDC 12 -O2 + // LDC x86 generates addsubps since LDC 1.18 -O2 + // and in absence of AVX, a pair of SSE3 addsubps since LDC 1.1 -O1 + // LDC ARM: neat output since LDC 1.21 -O2 + + a.ptr[0] = a.array[0] + (-b.array[0]); + a.ptr[1] = a.array[1] + b.array[1]; + a.ptr[2] = a.array[2] + (-b.array[2]); + a.ptr[3] = a.array[3] + b.array[3]; + a.ptr[4] = a.array[4] + (-b.array[4]); + a.ptr[5] = a.array[5] + b.array[5]; + a.ptr[6] = a.array[6] + (-b.array[6]); + a.ptr[7] = a.array[7] + b.array[7]; + return a; + } +} +unittest +{ + align(32) float[8] A = [-1.0f, 2, -3, 40000, 0, 3, 5, 6]; + align(32) float[8] B = [ 9.0f, -7, 8, -0.5, 8, 7, 3, -1]; + __m256 R = _mm256_addsub_ps(_mm256_load_ps(A.ptr), _mm256_load_ps(B.ptr)); + float[8] correct = [ -10, -5, -11, 39999.5, -8, 10, 2, 5]; + assert(R.array == correct); +} + +/// Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in `a` and `b`. +__m256d _mm256_and_pd (__m256d a, __m256d b) pure @trusted +{ + // Note: GCC avxintrin.h uses the builtins for AND NOTAND OR of _ps and _pd, + // but those do not seem needed at any optimization level. + return cast(__m256d)(cast(__m256i)a & cast(__m256i)b); +} +unittest +{ + double a = 4.32; + double b = -78.99; + long correct = (*cast(long*)(&a)) & (*cast(long*)(&b)); + __m256d A = _mm256_set_pd(a, b, a, b); + __m256d B = _mm256_set_pd(b, a, b, a); + long4 R = cast(long4)( _mm256_and_pd(A, B) ); + assert(R.array[0] == correct); + assert(R.array[1] == correct); + assert(R.array[2] == correct); + assert(R.array[3] == correct); +} + +/// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`. 
+__m256 _mm256_and_ps (__m256 a, __m256 b) pure @trusted +{ + return cast(__m256)(cast(__m256i)a & cast(__m256i)b); +} +unittest +{ + float a = 4.32f; + float b = -78.99f; + int correct = (*cast(int*)(&a)) & (*cast(int*)(&b)); + __m256 A = _mm256_set_ps(a, b, a, b, a, b, a, b); + __m256 B = _mm256_set_ps(b, a, b, a, b, a, b, a); + int8 R = cast(int8)( _mm256_and_ps(A, B) ); + foreach(i; 0..8) + assert(R.array[i] == correct); +} + +/// Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in `a` +/// and then AND with b. +__m256d _mm256_andnot_pd (__m256d a, __m256d b) pure @trusted +{ + // PERF DMD + __m256i notA = _mm256_not_si256(cast(__m256i)a); + __m256i ib = cast(__m256i)b; + __m256i ab = notA & ib; + return cast(__m256d)ab; +} +unittest +{ + double a = 4.32; + double b = -78.99; + long notA = ~ ( *cast(long*)(&a) ); + long correct = notA & (*cast(long*)(&b)); + __m256d A = _mm256_set_pd(a, a, a, a); + __m256d B = _mm256_set_pd(b, b, b, b); + long4 R = cast(long4)( _mm256_andnot_pd(A, B) ); + foreach(i; 0..4) + assert(R.array[i] == correct); +} + +/// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a` +/// and then AND with b. +__m256 _mm256_andnot_ps (__m256 a, __m256 b) pure @trusted +{ + // PERF DMD + __m256i notA = _mm256_not_si256(cast(__m256i)a); + __m256i ib = cast(__m256i)b; + __m256i ab = notA & ib; + return cast(__m256)ab; +} +unittest +{ + float a = 4.32f; + float b = -78.99f; + int notA = ~ ( *cast(int*)(&a) ); + int correct = notA & (*cast(int*)(&b)); + __m256 A = _mm256_set1_ps(a); + __m256 B = _mm256_set1_ps(b); + int8 R = cast(int8)( _mm256_andnot_ps(A, B) ); + foreach(i; 0..8) + assert(R.array[i] == correct); +} + +/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control +/// mask `imm8`. +__m256d _mm256_blend_pd(int imm8)(__m256d a, __m256d b) +{ + static assert(imm8 >= 0 && imm8 < 16); + + // PERF DMD + static if (GDC_with_AVX) + { + return __builtin_ia32_blendpd256 (a, b, imm8); + } + else + { + // Works great with LDC. + double4 r; + for (int n = 0; n < 4; ++n) + { + r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n]; + } + return r; + } +} +unittest +{ + __m256d A = _mm256_setr_pd(0, 1, 2, 3); + __m256d B = _mm256_setr_pd(8, 9, 10, 11); + double4 C = _mm256_blend_pd!0x06(A, B); + double[4] correct = [0, 9, 10, 3]; + assert(C.array == correct); +} + +/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control +/// mask `imm8`. +__m256 _mm256_blend_ps(int imm8)(__m256 a, __m256 b) pure @trusted +{ + static assert(imm8 >= 0 && imm8 < 256); + // PERF DMD + static if (GDC_with_AVX) + { + return __builtin_ia32_blendps256 (a, b, imm8); + } + else version(LDC) + { + // LDC x86: generates a vblendps since LDC 1.1 -O0 + // arm64: pretty good, four instructions worst case + return shufflevectorLDC!(float8, (imm8 & 1) ? 8 : 0, + (imm8 & 2) ? 9 : 1, + (imm8 & 4) ? 10 : 2, + (imm8 & 8) ? 11 : 3, + (imm8 & 16) ? 12 : 4, + (imm8 & 32) ? 13 : 5, + (imm8 & 64) ? 14 : 6, + (imm8 & 128) ? 15 : 7)(a, b); + } + else + { + // LDC x86: vblendps generated since LDC 1.27 -O1 + float8 r; + for (int n = 0; n < 8; ++n) + { + r.ptr[n] = (imm8 & (1 << n)) ? 
b.array[n] : a.array[n]; + } + return r; + } +} +unittest +{ + __m256 A = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7); + __m256 B = _mm256_setr_ps(8, 9, 10, 11, 12, 13, 14, 15); + float8 C = _mm256_blend_ps!0xe7(A, B); + float[8] correct = [8, 9, 10, 3, 4, 13, 14, 15]; + assert(C.array == correct); +} + +/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using mask. +__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask) @trusted +{ + // PERF DMD + static if (GDC_with_AVX) + { + // Amazingly enough, GCC/GDC generates the vblendvpd instruction + // with -mavx2 but not -mavx. + // Not sure what is the reason, and there is a replacement sequence. + // Sounds like a bug, similar to _mm_blendv_pd + // or maybe the instruction in unsafe? + return __builtin_ia32_blendvpd256(a, b, mask); + } + else static if (LDC_with_AVX) + { + return __builtin_ia32_blendvpd256(a, b, mask); + } + else + { + // LDC x86: vblendvpd since LDC 1.27 -O2 + // arm64: only 4 instructions, since LDC 1.27 -O2 + __m256d r; + long4 lmask = cast(long4)mask; + for (int n = 0; n < 4; ++n) + { + r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n]; + } + return r; + } +} +unittest +{ + __m256d A = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0); + __m256d B = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0); + __m256d M = _mm256_setr_pd(-3.0, 2.0, 1.0, -4.0); + __m256d R = _mm256_blendv_pd(A, B, M); + double[4] correct1 = [5.0, 2.0, 3.0, 8.0]; + assert(R.array == correct1); +} + +/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` +/// using `mask`. +__m256 _mm256_blendv_ps (__m256 a, __m256 b, __m256 mask) @trusted +{ + // PERF DMD + static if (GDC_or_LDC_with_AVX) + { + return __builtin_ia32_blendvps256(a, b, mask); + } + else static if (LDC_with_ARM64) + { + int8 shift; + shift = 31; + int8 lmask = cast(int8)mask >> shift; + int8 ia = cast(int8)a; + int8 ib = cast(int8)b; + return cast(__m256)(ia ^ ((ia ^ ib) & lmask)); + } + else + { + // In both LDC and GDC with SSE4.1, this generates blendvps as fallback + __m256 r; + int8 lmask = cast(int8)mask; + for (int n = 0; n < 8; ++n) + { + r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n]; + } + return r; + } +} +unittest +{ + __m256 A = _mm256_setr_ps(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); + __m256 B = _mm256_setr_ps(5.0f, 6.0f, 7.0f, 8.0f, 5.0f, 6.0f, 7.0f, 8.0f); + __m256 M = _mm256_setr_ps(-3.0f, 2.0f, 1.0f, -4.0f, -3.0f, 2.0f, 1.0f, -4.0f); + __m256 R = _mm256_blendv_ps(A, B, M); + float[8] correct1 = [5.0f, 2.0f, 3.0f, 8.0f, 5.0f, 2.0f, 3.0f, 8.0f]; + assert(R.array == correct1); +} + +/// Broadcast 128 bits from memory (composed of 2 packed double-precision (64-bit) +/// floating-point elements) to all elements. +/// This effectively duplicates the 128-bit vector. +__m256d _mm256_broadcast_pd (const(__m128d)* mem_addr) pure @trusted +{ + // PERF DMD + static if (GDC_with_AVX) + { + return __builtin_ia32_vbroadcastf128_pd256(cast(float4*)mem_addr); + } + else + { + const(double)* p = cast(const(double)*) mem_addr; + __m256d r; + r.ptr[0] = p[0]; + r.ptr[1] = p[1]; + r.ptr[2] = p[0]; + r.ptr[3] = p[1]; + return r; + } +} +unittest +{ + __m128d A = _mm_setr_pd(3, -4); + __m256d B = _mm256_broadcast_pd(&A); + double[4] correct = [3, -4, 3, -4]; + assert(B.array == correct); +} + +/// Broadcast 128 bits from memory (composed of 4 packed single-precision (32-bit) +/// floating-point elements) to all elements. +/// This effectively duplicates the 128-bit vector. 
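+// Illustrative usage sketch (hypothetical values): handy for tiling one 128-bit constant
+// across both 128-bit lanes of a 256-bit vector.
+//   __m128 quad  = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
+//   __m256 tiled = _mm256_broadcast_ps(&quad); // [1, 2, 3, 4, 1, 2, 3, 4]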
+__m256 _mm256_broadcast_ps (const(__m128)* mem_addr) pure @trusted
+{
+ // PERF DMD
+ static if (GDC_with_AVX)
+ {
+ return __builtin_ia32_vbroadcastf128_ps256(cast(float4*)mem_addr);
+ }
+ else
+ {
+ const(float)* p = cast(const(float)*)mem_addr;
+ __m256 r;
+ r.ptr[0] = p[0];
+ r.ptr[1] = p[1];
+ r.ptr[2] = p[2];
+ r.ptr[3] = p[3];
+ r.ptr[4] = p[0];
+ r.ptr[5] = p[1];
+ r.ptr[6] = p[2];
+ r.ptr[7] = p[3];
+ return r;
+ }
+}
+unittest
+{
+ __m128 A = _mm_setr_ps(1, 2, 3, -4);
+ __m256 B = _mm256_broadcast_ps(&A);
+ float[8] correct = [1.0f, 2, 3, -4, 1, 2, 3, -4];
+ assert(B.array == correct);
+}
+
+/// Broadcast a double-precision (64-bit) floating-point element from memory to all elements.
+__m256d _mm256_broadcast_sd (const(double)* mem_addr) pure @trusted
+{
+ static if (GDC_with_AVX)
+ {
+ return __builtin_ia32_vbroadcastsd256(mem_addr);
+ }
+ else
+ {
+ double a = *mem_addr;
+ __m256d r;
+ r.ptr[0] = a;
+ r.ptr[1] = a;
+ r.ptr[2] = a;
+ r.ptr[3] = a;
+ return r;
+ }
+}
+unittest
+{
+ double t = 7.5f;
+ __m256d A = _mm256_broadcast_sd(&t);
+ double[4] correct = [7.5, 7.5, 7.5, 7.5];
+ assert(A.array == correct);
+}
+
+/// Broadcast a single-precision (32-bit) floating-point element from memory to all elements.
+__m128 _mm_broadcast_ss (const(float)* mem_addr) pure @trusted
+{
+ // PERF DMD
+ static if (GDC_with_AVX)
+ {
+ return __builtin_ia32_vbroadcastss(mem_addr);
+ }
+ else
+ {
+ float a = *mem_addr;
+ __m128 r;
+ r.ptr[0] = a;
+ r.ptr[1] = a;
+ r.ptr[2] = a;
+ r.ptr[3] = a;
+ return r;
+ }
+}
+unittest
+{
+ float t = 7.5f;
+ __m128 A = _mm_broadcast_ss(&t);
+ float[4] correct = [7.5f, 7.5f, 7.5f, 7.5f];
+ assert(A.array == correct);
+}
+
+/// Broadcast a single-precision (32-bit) floating-point element from memory to all elements.
+__m256 _mm256_broadcast_ss (const(float)* mem_addr)
+{
+ // PERF DMD
+ static if (GDC_with_AVX)
+ {
+ return __builtin_ia32_vbroadcastss256 (mem_addr);
+ }
+ else
+ {
+ float a = *mem_addr;
+ __m256 r = __m256(a);
+ return r;
+ }
+}
+unittest
+{
+ float t = 7.5f;
+ __m256 A = _mm256_broadcast_ss(&t);
+ float[8] correct = [7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f];
+ assert(A.array == correct);
+}
+
+/// Cast vector of type `__m256d` to type `__m256`.
+__m256 _mm256_castpd_ps (__m256d a) pure @safe
+{
+ return cast(__m256)a;
+}
+
+/// Cast vector of type `__m256d` to type `__m256i`.
+__m256i _mm256_castpd_si256 (__m256d a) pure @safe
+{
+ return cast(__m256i)a;
+}
+
+/// Cast vector of type `__m128d` to type `__m256d`; the upper 128 bits of the result are undefined.
+__m256d _mm256_castpd128_pd256 (__m128d a) pure @trusted
+{
+ static if (GDC_with_AVX)
+ {
+ return __builtin_ia32_pd256_pd(a);
+ }
+ else
+ {
+ __m256d r = void;
+ r.ptr[0] = a.array[0];
+ r.ptr[1] = a.array[1];
+ return r;
+ }
+}
+unittest
+{
+ __m128d A = _mm_setr_pd(4.0, -6.125);
+ __m256d B = _mm256_castpd128_pd256(A);
+ assert(B.array[0] == 4.0);
+ assert(B.array[1] == -6.125);
+}
+
+/// Cast vector of type `__m256d` to type `__m128d`; the upper 128 bits of `a` are lost.
+__m128d _mm256_castpd256_pd128 (__m256d a) pure @trusted
+{
+ static if (GDC_with_AVX)
+ {
+ return __builtin_ia32_pd_pd256(a);
+ }
+ else
+ {
+ __m128d r;
+ r.ptr[0] = a.array[0];
+ r.ptr[1] = a.array[1];
+ return r;
+ }
+}
+unittest
+{
+ __m256d A = _mm256_set_pd(1, 2, -6.25, 4.0);
+ __m128d B = _mm256_castpd256_pd128(A);
+ assert(B.array[0] == 4.0);
+ assert(B.array[1] == -6.25);
+}
+
+/// Cast vector of type `__m256` to type `__m256d`.
+__m256d _mm256_castps_pd (__m256 a) pure @safe
+{
+ return cast(__m256d)a;
+}
+
+/// Cast vector of type `__m256` to type `__m256i`.
+__m256i _mm256_castps_si256 (__m256 a) pure @safe +{ + return cast(__m256i)a; +} + +/// Cast vector of type `__m128` to type `__m256`; the upper 128 bits of the result are undefined. +__m256 _mm256_castps128_ps256 (__m128 a) pure @trusted +{ + static if (GDC_with_AVX) + { + return __builtin_ia32_ps256_ps(a); + } + else + { + __m256 r = void; + r.ptr[0] = a.array[0]; + r.ptr[1] = a.array[1]; + r.ptr[2] = a.array[2]; + r.ptr[3] = a.array[3]; + return r; + } +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2, 3, 4); + __m256 B = _mm256_castps128_ps256(A); + float[4] correct = [1.0f, 2, 3, 4]; + assert(B.array[0..4] == correct); +} + +/// Cast vector of type `__m256` to type `__m128`. The upper 128-bit of `a` are lost. +__m128 _mm256_castps256_ps128 (__m256 a) pure @trusted +{ + return *cast(const(__m128)*)(&a); +} +unittest +{ + __m256 A = _mm256_setr_ps(1.0f, 2, 3, 4, 5, 6, 7, 8); + __m128 B = _mm256_castps256_ps128(A); + float[4] correct = [1.0f, 2, 3, 4]; + assert(B.array == correct); +} + +/// Cast vector of type `__m128i` to type `__m256i`; the upper 128 bits of the result are undefined. +__m256i _mm256_castsi128_si256 (__m128i a) pure @trusted +{ + long2 la = cast(long2)a; + long4 r = void; + r.ptr[0] = la.array[0]; + r.ptr[1] = la.array[1]; + return r; +} +unittest +{ + __m128i A = _mm_setr_epi64(-1, 42); + __m256i B = _mm256_castsi128_si256(A); + long[2] correct = [-1, 42]; + assert(B.array[0..2] == correct); +} + +/// Cast vector of type `__m256i` to type `__m256d`. +__m256d _mm256_castsi256_pd (__m256i a) pure @safe +{ + return cast(__m256d)a; +} + +/// Cast vector of type `__m256i` to type `__m256`. +__m256 _mm256_castsi256_ps (__m256i a) pure @safe +{ + return cast(__m256)a; +} + +/// Cast vector of type `__m256i` to type `__m128i`. The upper 128-bit of `a` are lost. +__m128i _mm256_castsi256_si128 (__m256i a) pure @trusted +{ + long2 r = void; + r.ptr[0] = a.array[0]; + r.ptr[1] = a.array[1]; + return cast(__m128i)r; +} +unittest +{ + long4 A; + A.ptr[0] = -1; + A.ptr[1] = 42; + long2 B = cast(long2)(_mm256_castsi256_si128(A)); + long[2] correct = [-1, 42]; + assert(B.array[0..2] == correct); +} + +/// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer +/// value, and store the results as packed double-precision floating-point elements. +__m256d _mm256_ceil_pd (__m256d a) @safe +{ + static if (LDC_with_ARM64) + { + __m128d lo = _mm256_extractf128_pd!0(a); + __m128d hi = _mm256_extractf128_pd!1(a); + __m128d ilo = _mm_ceil_pd(lo); + __m128d ihi = _mm_ceil_pd(hi); + return _mm256_set_m128d(ihi, ilo); + } + else + { + return _mm256_round_pd!2(a); + } +} +unittest +{ + __m256d A = _mm256_setr_pd(1.3f, -2.12f, 53.6f, -2.7f); + A = _mm256_ceil_pd(A); + double[4] correct = [2.0, -2.0, 54.0, -2.0]; + assert(A.array == correct); +} + +/// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer +/// value, and store the results as packed single-precision floating-point elements. 
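+// Illustrative note (hypothetical vector v): the rounding direction here is encoded in the
+// instruction itself (via _mm256_round_ps!2 in the fallback), so unlike _mm256_cvtps_epi32
+// further below the result does not depend on the current MXCSR rounding mode.
+//   __m256 up = _mm256_ceil_ps(v); // each lane rounded toward +infinity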
+__m256 _mm256_ceil_ps (__m256 a) @safe
+{
+ static if (LDC_with_ARM64)
+ {
+ __m128 lo = _mm256_extractf128_ps!0(a);
+ __m128 hi = _mm256_extractf128_ps!1(a);
+ __m128 ilo = _mm_ceil_ps(lo);
+ __m128 ihi = _mm_ceil_ps(hi);
+ return _mm256_set_m128(ihi, ilo);
+ }
+ else
+ {
+ return _mm256_round_ps!2(a);
+ }
+}
+unittest
+{
+ __m256 A = _mm256_setr_ps(1.3f, -2.12f, 53.6f, -2.7f, -1.3f, 2.12f, -53.6f, 2.7f);
+ __m256 C = _mm256_ceil_ps(A);
+ float[8] correct = [2.0f, -2.0f, 54.0f, -2.0f, -1, 3, -53, 3];
+ assert(C.array == correct);
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b` based on the
+/// comparison operand specified by `imm8`.
+__m128d _mm_cmp_pd(int imm8)(__m128d a, __m128d b) pure @safe
+{
+ enum comparison = mapAVXFPComparison(imm8);
+ return cast(__m128d) cmppd!comparison(a, b);
+}
+unittest
+{
+ __m128d A = _mm_setr_pd(double.infinity, double.nan);
+ __m128d B = _mm_setr_pd(3.0, 4.0);
+ long2 R = cast(long2) _mm_cmp_pd!_CMP_GT_OS(A, B);
+ long[2] correct = [-1, 0];
+ assert(R.array == correct);
+
+ long2 R2 = cast(long2) _mm_cmp_pd!_CMP_NLE_UQ(A, B);
+ long[2] correct2 = [-1, -1];
+ assert(R2.array == correct2);
+}
+
+///ditto
+__m256d _mm256_cmp_pd(int imm8)(__m256d a, __m256d b) pure @safe
+{
+ enum comparison = mapAVXFPComparison(imm8);
+ return cast(__m256d) cmppd256!comparison(a, b);
+}
+unittest
+{
+ __m256d A = _mm256_setr_pd(1.0, 2.0, 3.0, double.nan);
+ __m256d B = _mm256_setr_pd(3.0, 2.0, 1.0, double.nan);
+ __m256i R = cast(__m256i) _mm256_cmp_pd!_CMP_LT_OS(A, B);
+ long[4] correct = [-1, 0, 0, 0];
+ assert(R.array == correct);
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` based on the
+/// comparison operand specified by `imm8`.
+__m128 _mm_cmp_ps(int imm8)(__m128 a, __m128 b) pure @safe
+{
+ enum comparison = mapAVXFPComparison(imm8);
+ return cast(__m128) cmpps!comparison(a, b);
+}
+
+///ditto
+__m256 _mm256_cmp_ps(int imm8)(__m256 a, __m256 b) pure @safe
+{
+ enum comparison = mapAVXFPComparison(imm8);
+ return cast(__m256) cmpps256!comparison(a, b);
+}
+
+/// Compare the lower double-precision (64-bit) floating-point element in `a` and `b` based on the
+/// comparison operand specified by `imm8`, store the result in the lower element of result, and
+/// copy the upper element from `a` to the upper element of result.
+__m128d _mm_cmp_sd(int imm8)(__m128d a, __m128d b) pure @safe
+{
+ enum comparison = mapAVXFPComparison(imm8);
+ return cast(__m128d) cmpsd!comparison(a, b);
+}
+
+/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` based on the
+/// comparison operand specified by `imm8`, store the result in the lower element of result, and
+/// copy the upper 3 packed elements from `a` to the upper elements of result.
+__m128 _mm_cmp_ss(int imm8)(__m128 a, __m128 b) pure @safe
+{
+ enum comparison = mapAVXFPComparison(imm8);
+ return cast(__m128) cmpss!comparison(a, b);
+}
+
+/// Convert packed signed 32-bit integers in `a` to packed double-precision (64-bit) floating-point
+/// elements.
+__m256d _mm256_cvtepi32_pd (__m128i a) pure @trusted +{ + static if (LDC_with_optimizations) + { + enum ir = ` + %r = sitofp <4 x i32> %0 to <4 x double> + ret <4 x double> %r`; + return LDCInlineIR!(ir, double4, __m128i)(a); + } + else static if (GDC_with_AVX) + { + return __builtin_ia32_cvtdq2pd256(a); + } + else + { + double4 r; + r.ptr[0] = a.array[0]; + r.ptr[1] = a.array[1]; + r.ptr[2] = a.array[2]; + r.ptr[3] = a.array[3]; + return r; + } +} +unittest +{ + __m256d R = _mm256_cvtepi32_pd(_mm_set1_epi32(54)); + double[4] correct = [54.0, 54, 54, 54]; + assert(R.array == correct); +} + +/// Convert packed signed 32-bit integers in `a` to packed single-precision (32-bit) floating-point +/// elements. +__m256 _mm256_cvtepi32_ps (__m256i a) pure @trusted +{ + static if (LDC_with_optimizations) + { + enum ir = ` + %r = sitofp <8 x i32> %0 to <8 x float> + ret <8 x float> %r`; + return LDCInlineIR!(ir, float8, int8)(cast(int8)a); + } + else static if (GDC_with_AVX) + { + return __builtin_ia32_cvtdq2ps256(cast(int8)a); + } + else + { + int8 ia = cast(int8)a; + __m256 r; + r.ptr[0] = ia.array[0]; + r.ptr[1] = ia.array[1]; + r.ptr[2] = ia.array[2]; + r.ptr[3] = ia.array[3]; + r.ptr[4] = ia.array[4]; + r.ptr[5] = ia.array[5]; + r.ptr[6] = ia.array[6]; + r.ptr[7] = ia.array[7]; + return r; + } +} +unittest +{ + __m256 R = _mm256_cvtepi32_ps(_mm256_set1_epi32(5)); + float[8] correct = [5.0f, 5, 5, 5, 5, 5, 5, 5]; + assert(R.array == correct); +} + +/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit +/// integers. Follows the current rounding mode. +__m128i _mm256_cvtpd_epi32 (__m256d a) @safe +{ + static if (GDC_or_LDC_with_AVX) + { + return __builtin_ia32_cvtpd2dq256(a); + } + else + { + __m128d lo = _mm256_extractf128_pd!0(a); + __m128d hi = _mm256_extractf128_pd!1(a); + __m128i ilo = _mm_cvtpd_epi32(lo); // Only lower 64-bit contains significant values + __m128i ihi = _mm_cvtpd_epi32(hi); + return _mm_unpacklo_epi64(ilo, ihi); + } +} +unittest +{ + int4 A = _mm256_cvtpd_epi32(_mm256_setr_pd(61.0, 55.0, -100, 1_000_000)); + int[4] correct = [61, 55, -100, 1_000_000]; + assert(A.array == correct); +} + +/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed single-precision (32-bit) +/// floating-point elements. +__m128 _mm256_cvtpd_ps (__m256d a) pure @trusted +{ + // PERF DMD + static if (GDC_or_LDC_with_AVX) + { + return __builtin_ia32_cvtpd2ps256(a); + } + else + { + __m128 r; + r.ptr[0] = a.array[0]; + r.ptr[1] = a.array[1]; + r.ptr[2] = a.array[2]; + r.ptr[3] = a.array[3]; + return r; + } +} +unittest +{ + __m256d A = _mm256_setr_pd(1.0, 2, 3, 5); + __m128 R = _mm256_cvtpd_ps(A); + float[4] correct = [1.0f, 2, 3, 5]; + assert(R.array == correct); +} + +/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit +/// integers, using the current rounding mode. 
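+// Illustrative note (hypothetical vector v): this conversion honours the MXCSR rounding mode
+// (see the unittest below), while the _mm256_cvtt* variants further down always truncate
+// toward zero.
+//   __m256i rounded   = _mm256_cvtps_epi32(v);
+//   __m256i truncated = _mm256_cvttps_epi32(v);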
+__m256i _mm256_cvtps_epi32 (__m256 a) @trusted
+{
+ static if (GDC_or_LDC_with_AVX)
+ {
+ return cast(__m256i) __builtin_ia32_cvtps2dq256(a);
+ }
+ else
+ {
+ __m128 lo = _mm256_extractf128_ps!0(a);
+ __m128 hi = _mm256_extractf128_ps!1(a);
+ __m128i ilo = _mm_cvtps_epi32(lo);
+ __m128i ihi = _mm_cvtps_epi32(hi);
+ return _mm256_set_m128i(ihi, ilo);
+ }
+}
+unittest
+{
+ uint savedRounding = _MM_GET_ROUNDING_MODE();
+
+ _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
+ __m256i A = _mm256_cvtps_epi32(_mm256_setr_ps(1.4f, -2.1f, 53.5f, -2.9f, -1.4f, 2.1f, -53.5f, 2.9f));
+ assert( (cast(int8)A).array == [1, -2, 54, -3, -1, 2, -54, 3]);
+
+ _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
+ A = _mm256_cvtps_epi32(_mm256_setr_ps(1.3f, -2.11f, 53.4f, -2.8f, -1.3f, 2.11f, -53.4f, 2.8f));
+ assert( (cast(int8)A).array == [1, -3, 53, -3, -2, 2, -54, 2]);
+
+ _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
+ A = _mm256_cvtps_epi32(_mm256_setr_ps(1.3f, -2.12f, 53.6f, -2.7f, -1.3f, 2.12f, -53.6f, 2.7f));
+ assert( (cast(int8)A).array == [2, -2, 54, -2, -1, 3, -53, 3]);
+
+ _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
+ A = _mm256_cvtps_epi32(_mm256_setr_ps(1.4f, -2.17f, 53.8f, -2.91f, -1.4f, 2.17f, -53.8f, 2.91f));
+ assert( (cast(int8)A).array == [1, -2, 53, -2, -1, 2, -53, 2]);
+
+ _MM_SET_ROUNDING_MODE(savedRounding);
+}
+
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed double-precision
+/// (64-bit) floating-point elements.
+__m256d _mm256_cvtps_pd (__m128 a) pure @trusted
+{
+ // PERF DMD
+ static if (GDC_with_AVX)
+ {
+ return __builtin_ia32_cvtps2pd256(a); // LDC doesn't have the builtin
+ }
+ else
+ {
+ // LDC: x86, needs -O2 to generate cvtps2pd since LDC 1.2.0
+ __m256d r;
+ r.ptr[0] = a.array[0];
+ r.ptr[1] = a.array[1];
+ r.ptr[2] = a.array[2];
+ r.ptr[3] = a.array[3];
+ return r;
+ }
+}
+unittest
+{
+ __m128 A = _mm_setr_ps(1.0f, 2, 3, 5);
+ __m256d R = _mm256_cvtps_pd(A);
+ double[4] correct = [1.0, 2, 3, 5];
+ assert(R.array == correct);
+}
+
+/// Return the lower double-precision (64-bit) floating-point element of `a`.
+double _mm256_cvtsd_f64 (__m256d a) pure @safe
+{
+ return a.array[0];
+}
+
+/// Return the lower 32-bit integer in `a`.
+int _mm256_cvtsi256_si32 (__m256i a) pure @safe
+{
+ return (cast(int8)a).array[0];
+}
+
+/// Return the lower single-precision (32-bit) floating-point element of `a`.
+float _mm256_cvtss_f32 (__m256 a) pure @safe
+{
+ return a.array[0];
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit
+/// integers with truncation.
+__m128i _mm256_cvttpd_epi32 (__m256d a) pure @trusted
+{
+ // PERF DMD
+ static if (GDC_or_LDC_with_AVX)
+ {
+ return cast(__m128i)__builtin_ia32_cvttpd2dq256(a);
+ }
+ else
+ {
+ __m128i r;
+ r.ptr[0] = cast(int)a.array[0];
+ r.ptr[1] = cast(int)a.array[1];
+ r.ptr[2] = cast(int)a.array[2];
+ r.ptr[3] = cast(int)a.array[3];
+ return r;
+ }
+}
+unittest
+{
+ __m256d A = _mm256_set_pd(4.7, -1000.9, -7.1, 3.1);
+ __m128i R = _mm256_cvttpd_epi32(A);
+ int[4] correct = [3, -7, -1000, 4];
+ assert(R.array == correct);
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
+__m256i _mm256_cvttps_epi32 (__m256 a) pure @trusted +{ + // PERF DMD + static if (GDC_or_LDC_with_AVX) + { + return cast(__m256i)__builtin_ia32_cvttps2dq256(a); + } + else + { + int8 r; + r.ptr[0] = cast(int)a.array[0]; + r.ptr[1] = cast(int)a.array[1]; + r.ptr[2] = cast(int)a.array[2]; + r.ptr[3] = cast(int)a.array[3]; + r.ptr[4] = cast(int)a.array[4]; + r.ptr[5] = cast(int)a.array[5]; + r.ptr[6] = cast(int)a.array[6]; + r.ptr[7] = cast(int)a.array[7]; + return cast(__m256i)r; + } +} +unittest +{ + __m256 A = _mm256_set_ps(4.7, -1000.9, -7.1, 3.1, 1.4, 2.9, -2.9, 0); + int8 R = cast(int8) _mm256_cvttps_epi32(A); + int[8] correct = [0, -2, 2, 1, 3, -7, -1000, 4]; + assert(R.array == correct); +} + +/// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`. +__m256d _mm256_div_pd (__m256d a, __m256d b) pure @safe +{ + return a / b; +} +unittest +{ + __m256d a = [1.5, -2.0, 3.0, 1.0]; + a = _mm256_div_pd(a, a); + double[4] correct = [1.0, 1.0, 1.0, 1.0]; + assert(a.array == correct); +} + +/// Divide packed single-precision (32-bit) floating-point elements in `a` by packed elements in `b`. +__m256 _mm256_div_ps (__m256 a, __m256 b) pure @safe +{ + return a / b; +} +unittest +{ + __m256 a = [1.5f, -2.0f, 3.0f, 1.0f, 4.5f, -5.0f, 6.0f, 7.0f]; + a = _mm256_div_ps(a, a); + float[8] correct = [1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f]; + assert(a.array == correct); +} + +/// Conditionally multiply the packed single-precision (32-bit) floating-point elements in `a` and +/// `b` using the high 4 bits in `imm8`, sum the four products, and conditionally store the sum +/// using the low 4 bits of `imm8`. +__m256 _mm256_dp_ps(int imm8)(__m256 a, __m256 b) +{ + // PERF DMD + static if (GDC_or_LDC_with_AVX) + { + return __builtin_ia32_dpps256(a, b, cast(ubyte)imm8); + } + else + { + // Note: in LDC with SSE4.1 but no AVX, we _could_ increase perf a bit by using two + // _mm_dp_ps. + __m256 zero = _mm256_setzero_ps(); + enum ubyte op = (imm8 >>> 4) & 15; + __m256 temp = _mm256_blend_ps!( op | (op << 4) )(zero, a * b); + float lo = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3]; + float hi = temp.array[4] + temp.array[5] + temp.array[6] + temp.array[7]; + __m256 r = _mm256_set_m128(_mm_set1_ps(hi), _mm_set1_ps(lo)); + enum ubyte op2 = (imm8 & 15); + return _mm256_blend_ps!(op2 | (op2 << 4))(zero, r); + } +} +unittest +{ + // Products: 9 14 20 24 6 16 12 -24 + __m256 A = _mm256_setr_ps(1.0f, 2.0f, 4.0f, 8.0f, 1.0f, 2.0f, 4.0f, 8.0f); + __m256 B = _mm256_setr_ps(9.0f, 7.0f, 5.0f, 3.0f, 6.0f, 8.0f, 3.0f,-3.0f); + float8 R1 = _mm256_dp_ps!(0xf0 + 0xf)(A, B); + float8 R2 = _mm256_dp_ps!(0x30 + 0x5)(A, B); + float8 R3 = _mm256_dp_ps!(0x50 + 0xa)(A, B); + float[8] correct1 = [67.0f, 67.0f, 67.0f,67.0f, 10, 10, 10, 10]; + float[8] correct2 = [23.0f, 0.0f, 23.0f, 0.0f, 22, 0, 22, 0]; + float[8] correct3 = [0.0f, 29.0f, 0.0f, 29.0f, 0, 18, 0, 18]; + assert(R1.array == correct1); + assert(R2.array == correct2); + assert(R3.array == correct3); +} + +/// Extract a 32-bit integer from `a`, selected with `imm8`. 
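+// Illustrative note (hypothetical vector v): the selector is taken modulo 8 (imm8 & 7),
+// so out-of-range indices wrap around instead of being rejected.
+//   int lane1 = _mm256_extract_epi32(v, 9); // same as extracting lane 1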
+int _mm256_extract_epi32 (__m256i a, const int imm8) pure @trusted +{ + return (cast(int8)a).array[imm8 & 7]; +} +unittest +{ + align(16) int[8] data = [-1, 2, -3, 4, 9, -7, 8, -6]; + auto A = _mm256_loadu_si256(cast(__m256i*) data.ptr); + assert(_mm256_extract_epi32(A, 0) == -1); + assert(_mm256_extract_epi32(A, 1 + 8) == 2); + assert(_mm256_extract_epi32(A, 3 + 16) == 4); + assert(_mm256_extract_epi32(A, 7 + 32) == -6); +} + +/// Extract a 64-bit integer from `a`, selected with `index`. +long _mm256_extract_epi64 (__m256i a, const int index) pure @safe +{ + return a.array[index & 3]; +} +unittest +{ + __m256i A = _mm256_setr_epi64x(-7, 6, 42, 0); + assert(_mm256_extract_epi64(A, -8) == -7); + assert(_mm256_extract_epi64(A, 1) == 6); + assert(_mm256_extract_epi64(A, 2 + 4) == 42); +} + +/// Extract a 128-bits lane from `a`, selected with `index` (0 or 1). +/// Note: `_mm256_extractf128_pd!0` is equivalent to `_mm256_castpd256_pd128`. +__m128d _mm256_extractf128_pd(ubyte imm8)(__m256d a) pure @trusted +{ + version(GNU) pragma(inline, true); // else GDC has trouble inlining this + + // PERF DMD + static if (GDC_with_AVX) + { + // Note: needs to be a template intrinsics because of this builtin. + return __builtin_ia32_vextractf128_pd256(a, imm8 & 1); + } + else + { + double2 r = void; + enum int index = 2*(imm8 & 1); + r.ptr[0] = a.array[index+0]; + r.ptr[1] = a.array[index+1]; + return r; + } +} +unittest +{ + __m256d A = _mm256_setr_pd(1.0, 2, 3, 4); + double[4] correct = [1.0, 2, 3, 4]; + __m128d l0 = _mm256_extractf128_pd!18(A); + __m128d l1 = _mm256_extractf128_pd!55(A); + assert(l0.array == correct[0..2]); + assert(l1.array == correct[2..4]); +} + +///ditto +__m128 _mm256_extractf128_ps(ubyte imm8)(__m256 a) pure @trusted +{ + version(GNU) pragma(inline, true); // else GDC has trouble inlining this + + // PERF DMD + static if (GDC_with_AVX) + { + return __builtin_ia32_vextractf128_ps256(a, imm8 & 1); + } + else + { + float4 r = void; // Optimize well since LDC 1.1 -O1 + enum int index = 4*(imm8 & 1); + r.ptr[0] = a.array[index+0]; + r.ptr[1] = a.array[index+1]; + r.ptr[2] = a.array[index+2]; + r.ptr[3] = a.array[index+3]; + return r; + } +} +unittest +{ + __m256 A = _mm256_setr_ps(1.0, 2, 3, 4, 5, 6, 7, 8); + float[8] correct = [1.0, 2, 3, 4, 5, 6, 7, 8]; + __m128 l0 = _mm256_extractf128_ps!8(A); + __m128 l1 = _mm256_extractf128_ps!255(A); + assert(l0.array == correct[0..4]); + assert(l1.array == correct[4..8]); +} + +///ditto +__m128i _mm256_extractf128_si256(ubyte imm8)(__m256i a) pure @trusted +{ + version(GNU) pragma(inline, true); // else GDC has trouble inlining this + + // PERF DMD + static if (GDC_with_AVX) + { + // Note: if it weren't for this GDC intrinsic, _mm256_extractf128_si256 + // could be a non-template, however, this wins in -O0. 
+ // Same story for _mm256_extractf128_ps and _mm256_extractf128_pd + return __builtin_ia32_vextractf128_si256(cast(int8)a, imm8 & 1); + } + else + { + long2 r = void; + enum int index = 2*(imm8 & 1); + r.ptr[0] = a.array[index+0]; + r.ptr[1] = a.array[index+1]; + return cast(__m128i)r; + } +} +unittest +{ + __m256i A = _mm256_setr_epi32(9, 2, 3, 4, 5, 6, 7, 8); + int[8] correct = [9, 2, 3, 4, 5, 6, 7, 8]; + __m128i l0 = _mm256_extractf128_si256!0(A); + __m128i l1 = _mm256_extractf128_si256!1(A); + assert(l0.array == correct[0..4]); + assert(l1.array == correct[4..8]); +} + +/// Round the packed double-precision (64-bit) floating-point elements in `a` down to an integer +/// value, and store the results as packed double-precision floating-point elements. +__m256d _mm256_floor_pd (__m256d a) @safe +{ + static if (LDC_with_ARM64) + { + __m128d lo = _mm256_extractf128_pd!0(a); + __m128d hi = _mm256_extractf128_pd!1(a); + __m128d ilo = _mm_floor_pd(lo); + __m128d ihi = _mm_floor_pd(hi); + return _mm256_set_m128d(ihi, ilo); + } + else + { + return _mm256_round_pd!1(a); + } +} +unittest +{ + __m256d A = _mm256_setr_pd(1.3f, -2.12f, 53.6f, -2.7f); + A = _mm256_floor_pd(A); + double[4] correct = [1.0, -3.0, 53.0, -3.0]; + assert(A.array == correct); +} + +/// Round the packed single-precision (32-bit) floating-point elements in `a` down to an integer +/// value, and store the results as packed single-precision floating-point elements. +__m256 _mm256_floor_ps (__m256 a) @safe +{ + static if (LDC_with_ARM64) + { + __m128 lo = _mm256_extractf128_ps!0(a); + __m128 hi = _mm256_extractf128_ps!1(a); + __m128 ilo = _mm_floor_ps(lo); + __m128 ihi = _mm_floor_ps(hi); + return _mm256_set_m128(ihi, ilo); + } + else + { + return _mm256_round_ps!1(a); + } +} +unittest +{ + __m256 A = _mm256_setr_ps(1.3f, -2.12f, 53.6f, -2.7f, -1.3f, 2.12f, -53.6f, 2.7f); + __m256 C = _mm256_floor_ps(A); + float[8] correct = [1.0f, -3.0f, 53.0f, -3.0f, -2, 2, -54, 2]; + assert(C.array == correct); +} + +/// Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in `a` +/// and `b`. +__m256d _mm256_hadd_pd (__m256d a, __m256d b) pure @trusted +{ + static if (GDC_or_LDC_with_AVX) + { + return __builtin_ia32_haddpd256(a, b); + } + else + { + __m256d res; + res.ptr[0] = a.array[1] + a.array[0]; + res.ptr[1] = b.array[1] + b.array[0]; + res.ptr[2] = a.array[3] + a.array[2]; + res.ptr[3] = b.array[3] + b.array[2]; + return res; + } +} +unittest +{ + __m256d A =_mm256_setr_pd(1.5, 2.0, 21.0, 9.0); + __m256d B =_mm256_setr_pd(1.0, 7.0, 100.0, 14.0); + __m256d C = _mm256_hadd_pd(A, B); + double[4] correct = [3.5, 8.0, 30.0, 114.0]; + assert(C.array == correct); +} + +/// Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in `a` and +/// `b`. 
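+// Illustrative note (hypothetical vector x): like their SSE3 counterparts, the hadd
+// intrinsics form pairwise sums within each 128-bit half, interleaving results from `a` and
+// `b` per half rather than across the whole 256-bit vector.
+//   __m256d s = _mm256_hadd_pd(x, x); // s = [x0+x1, x0+x1, x2+x3, x2+x3]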
+__m256 _mm256_hadd_ps (__m256 a, __m256 b) pure @trusted
+{
+ // PERF DMD
+ static if (GDC_or_LDC_with_AVX)
+ {
+ return __builtin_ia32_haddps256(a, b);
+ }
+ else static if (LDC_with_ARM64)
+ {
+ __m128 a_hi = _mm256_extractf128_ps!1(a);
+ __m128 a_lo = _mm256_extractf128_ps!0(a);
+ __m128 b_hi = _mm256_extractf128_ps!1(b);
+ __m128 b_lo = _mm256_extractf128_ps!0(b);
+ __m128 hi = vpaddq_f32(a_hi, b_hi);
+ __m128 lo = vpaddq_f32(a_lo, b_lo);
+ return _mm256_set_m128(hi, lo);
+ }
+ else
+ {
+ __m256 res;
+ res.ptr[0] = a.array[1] + a.array[0];
+ res.ptr[1] = a.array[3] + a.array[2];
+ res.ptr[2] = b.array[1] + b.array[0];
+ res.ptr[3] = b.array[3] + b.array[2];
+ res.ptr[4] = a.array[5] + a.array[4];
+ res.ptr[5] = a.array[7] + a.array[6];
+ res.ptr[6] = b.array[5] + b.array[4];
+ res.ptr[7] = b.array[7] + b.array[6];
+ return res;
+ }
+}
+unittest
+{
+ __m256 A =_mm256_setr_ps(1.0f, 2.0f, 3.0f, 5.0f, 1.0f, 2.0f, 3.0f, 5.0f);
+ __m256 B =_mm256_setr_ps(1.5f, 2.0f, 3.5f, 4.0f, 1.5f, 2.0f, 3.5f, 5.0f);
+ __m256 R = _mm256_hadd_ps(A, B);
+ float[8] correct = [3.0f, 8.0f, 3.5f, 7.5f, 3.0f, 8.0f, 3.5f, 8.5f];
+ assert(R.array == correct);
+}
+
+/// Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in
+/// `a` and `b`.
+__m256d _mm256_hsub_pd (__m256d a, __m256d b) pure @trusted
+{
+ static if (GDC_or_LDC_with_AVX)
+ {
+ return __builtin_ia32_hsubpd256(a, b);
+ }
+ else
+ {
+ // 2 zip1, 2 zip2, 2 fsub... I don't think there is better in arm64
+ __m256d res;
+ res.ptr[0] = a.array[0] - a.array[1];
+ res.ptr[1] = b.array[0] - b.array[1];
+ res.ptr[2] = a.array[2] - a.array[3];
+ res.ptr[3] = b.array[2] - b.array[3];
+ return res;
+ }
+}
+unittest
+{
+ __m256d A =_mm256_setr_pd(1.5, 2.0, 21.0, 9.0);
+ __m256d B =_mm256_setr_pd(1.0, 7.0, 100.0, 14.0);
+ __m256d C = _mm256_hsub_pd(A, B);
+ double[4] correct = [-0.5, -6.0, 12.0, 86.0];
+ assert(C.array == correct);
+}
+
+/// Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements in
+/// `a` and `b`.
+__m256 _mm256_hsub_ps (__m256 a, __m256 b) pure @trusted
+{
+ // PERF DMD
+ static if (GDC_or_LDC_with_AVX)
+ {
+ return __builtin_ia32_hsubps256(a, b);
+ }
+ else
+ {
+ __m128 a_hi = _mm256_extractf128_ps!1(a);
+ __m128 a_lo = _mm256_extractf128_ps!0(a);
+ __m128 b_hi = _mm256_extractf128_ps!1(b);
+ __m128 b_lo = _mm256_extractf128_ps!0(b);
+ __m128 hi = _mm_hsub_ps(a_hi, b_hi);
+ __m128 lo = _mm_hsub_ps(a_lo, b_lo);
+ return _mm256_set_m128(hi, lo);
+ }
+}
+unittest
+{
+ __m256 A =_mm256_setr_ps(1.0f, 2.0f, 3.0f, 5.0f, 1.0f, 2.0f, 3.0f, 5.0f);
+ __m256 B =_mm256_setr_ps(1.5f, 2.0f, 3.5f, 4.0f, 1.5f, 2.0f, 3.5f, 5.0f);
+ __m256 R = _mm256_hsub_ps(A, B);
+ float[8] correct = [-1.0f, -2.0f, -0.5f, -0.5f, -1.0f, -2.0f, -0.5f, -1.5f];
+ assert(R.array == correct);
+}
+
+/// Copy `a`, and insert the 16-bit integer `i` into the result at the location specified by
+/// `index & 15`.
+__m256i _mm256_insert_epi16 (__m256i a, short i, const int index) pure @trusted
+{
+ short16 sa = cast(short16)a;
+ sa.ptr[index & 15] = i;
+ return cast(__m256i)sa;
+}
+unittest
+{
+ __m256i A = _mm256_set1_epi16(1);
+ short16 R = cast(short16) _mm256_insert_epi16(A, 2, 16 + 16 + 7);
+ short[16] correct = [1, 1, 1, 1, 1, 1, 1, 2,
+ 1, 1, 1, 1, 1, 1, 1, 1 ];
+ assert(R.array == correct);
+}
+
+/// Copy `a`, and insert the 32-bit integer `i` into the result at the location specified by
+/// `index & 7`.
+__m256i _mm256_insert_epi32 (__m256i a, int i, const int index) pure @trusted +{ + int8 ia = cast(int8)a; + ia.ptr[index & 7] = i; + return cast(__m256i)ia; +} +unittest +{ + __m256i A = _mm256_set1_epi32(1); + int8 R = cast(int8) _mm256_insert_epi32(A, -2, 8 + 8 + 1); + int[8] correct = [1, -2, 1, 1, 1, 1, 1, 1]; + assert(R.array == correct); +} + +/// Copy `a`, and insert the 64-bit integer `i` into the result at the location specified by +/// `index & 3`. +__m256i _mm256_insert_epi64(__m256i a, long i, const int index) pure @trusted +{ + a.ptr[index & 3] = i; + return a; +} +unittest +{ + __m256i A = _mm256_set1_epi64(1); + long4 R = cast(long4) _mm256_insert_epi64(A, -2, 2 - 4 - 4); + long[4] correct = [1, 1, -2, 1]; + assert(R.array == correct); +} + +/// Copy `a`, and insert the 8-bit integer `i` into the result at the location specified by +/// `index & 31`. +__m256i _mm256_insert_epi8(__m256i a, byte i, const int index) pure @trusted +{ + byte32 ba = cast(byte32)a; + ba.ptr[index & 31] = i; + return cast(__m256i)ba; +} +unittest +{ + __m256i A = _mm256_set1_epi8(1); + byte32 R = cast(byte32) _mm256_insert_epi8(A, -2, 7 - 32 - 32); + byte[32] correct = [1, 1, 1, 1, 1, 1, 1,-2, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]; + assert(R.array == correct); +} + +/// Copy `a`, then insert 128 bits (composed of 2 packed double-precision (64-bit) +/// floating-point elements) from `b` at the location specified by `imm8`. +__m256d _mm256_insertf128_pd(int imm8)(__m256d a, __m128d b) pure @trusted +{ + static if (GDC_with_AVX) + { + enum ubyte lane = imm8 & 1; + return __builtin_ia32_vinsertf128_pd256(a, b, lane); + } + else + { + __m256d r = a; + enum int index = (imm8 & 1) ? 2 : 0; + r.ptr[index] = b.array[0]; + r.ptr[index+1] = b.array[1]; + return r; + } +} + +/// Copy `a` then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point +/// elements) from `b`, at the location specified by `imm8`. +__m256 _mm256_insertf128_ps(int imm8)(__m256 a, __m128 b) pure @trusted +{ + static if (GDC_with_AVX) + { + enum ubyte lane = imm8 & 1; + return __builtin_ia32_vinsertf128_ps256(a, b, lane); + } + else + { + __m256 r = a; + enum int index = (imm8 & 1) ? 4 : 0; + r.ptr[index] = b.array[0]; + r.ptr[index+1] = b.array[1]; + r.ptr[index+2] = b.array[2]; + r.ptr[index+3] = b.array[3]; + return r; + } +} + +/// Copy `a`, then insert 128 bits from `b` at the location specified by `imm8`. +__m256i _mm256_insertf128_si256(int imm8)(__m256i a, __m128i b) pure @trusted +{ + static if (GDC_with_AVX) + { + enum ubyte lane = imm8 & 1; + return cast(__m256i) __builtin_ia32_vinsertf128_si256 (cast(int8)a, b, lane); + } + else + { + long2 lb = cast(long2)b; + __m256i r = a; + enum int index = (imm8 & 1) ? 2 : 0; + r.ptr[index] = lb.array[0]; + r.ptr[index+1] = lb.array[1]; + return r; + } +} + +/// Load 256-bits of integer data from unaligned memory into dst. +/// This intrinsic may run better than `_mm256_loadu_si256` when the data crosses a cache +/// line boundary. 
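+// Illustrative note (hypothetical pointer p): when the AVX builtin is unavailable this simply
+// forwards to _mm256_loadu_si256, so the two can be treated as interchangeable unaligned loads.
+//   __m256i v = _mm256_lddqu_si256(cast(const(__m256i)*) p);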
+__m256i _mm256_lddqu_si256(const(__m256i)* mem_addr) @trusted +{ + // PERF DMD + static if (GDC_or_LDC_with_AVX) + { + return cast(__m256i) __builtin_ia32_lddqu256(cast(const(char)*)mem_addr); + } + else + return _mm256_loadu_si256(mem_addr); +} +unittest +{ + int[10] correct = [0, -1, 2, -3, 4, 9, -7, 8, -6, 34]; + int8 A = cast(int8) _mm256_lddqu_si256(cast(__m256i*) &correct[1]); + assert(A.array == correct[1..9]); +} + +/// Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) +/// from memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection +/// exception may be generated. +__m256d _mm256_load_pd (const(double)* mem_addr) pure @trusted +{ + return *cast(__m256d*)mem_addr; +} +unittest +{ + static immutable align(32) double[4] correct = [1.0, 2.0, 3.5, -42.0]; + __m256d A = _mm256_load_pd(correct.ptr); + assert(A.array == correct); +} + +/// Load 256-bits (composed of 8 packed single-precision (32-bit) +/// floating-point elements) from memory. +/// `mem_addr` must be aligned on a 32-byte boundary or a +/// general-protection exception may be generated. +__m256 _mm256_load_ps (const(float)* mem_addr) pure @trusted +{ + return *cast(__m256*)mem_addr; +} +unittest +{ + static immutable align(32) float[8] correct = + [1.0, 2.0, 3.5, -42.0, 7.43f, 0.0f, 3, 2]; + __m256 A = _mm256_load_ps(correct.ptr); + assert(A.array == correct); +} + +/// Load 256-bits of integer data from memory. `mem_addr` does not need to be aligned on +/// any particular boundary. +// See this dlang forum post => https://forum.dlang.org/thread/vymrsngsfibkmqsqffce@forum.dlang.org +__m256i _mm256_loadu_si256 (const(__m256i)* mem_addr) pure @trusted +{ + // PERF DMD + static if (GDC_with_AVX) + { + return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) mem_addr); + } + else static if (LDC_with_optimizations) + { + return loadUnaligned!(__m256i)(cast(long*)mem_addr); + } + else + { + const(long)* p = cast(const(long)*)mem_addr; + long4 r; + r.ptr[0] = p[0]; + r.ptr[1] = p[1]; + r.ptr[2] = p[2]; + r.ptr[3] = p[3]; + return r; + } +} +unittest +{ + align(16) int[8] correct = [-1, 2, -3, 4, 9, -7, 8, -6]; + int8 A = cast(int8) _mm256_loadu_si256(cast(__m256i*) correct.ptr); + assert(A.array == correct); +} + +/// Load 256-bits of integer data from memory. `mem_addr` must be aligned on a +/// 32-byte boundary or a general-protection exception may be generated. +__m256i _mm256_load_si256 (const(void)* mem_addr) pure @system +{ + return *cast(__m256i*)mem_addr; +} +unittest +{ + static immutable align(64) long[4] correct = [1, -2, long.min, long.max]; + __m256i A = _mm256_load_si256(correct.ptr); + assert(A.array == correct); +} + +/// Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) +/// from memory. `mem_addr` does not need to be aligned on any particular boundary. 
+__m256d _mm256_loadu_pd (const(void)* mem_addr) pure @system +{ + // PERF DMD + static if (GDC_with_AVX) + { + return __builtin_ia32_loadupd256 ( cast(const(double)*) mem_addr); + } + else static if (LDC_with_optimizations) + { + return loadUnaligned!(__m256d)(cast(double*)mem_addr); + } + else + { + const(double)* p = cast(const(double)*)mem_addr; + double4 r; + r.ptr[0] = p[0]; + r.ptr[1] = p[1]; + r.ptr[2] = p[2]; + r.ptr[3] = p[3]; + return r; + } +} +unittest +{ + double[4] correct = [1.0, -2.0, 0.0, 768.5]; + __m256d A = _mm256_loadu_pd(correct.ptr); + assert(A.array == correct); +} + +/// Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory. +/// `mem_addr` does not need to be aligned on any particular boundary. +__m256 _mm256_loadu_ps (const(float)* mem_addr) pure @system +{ + // PERF DMD + static if (GDC_with_AVX) + { + return __builtin_ia32_loadups256 ( cast(const(float)*) mem_addr); + } + else static if (LDC_with_optimizations) + { + return loadUnaligned!(__m256)(cast(float*)mem_addr); + } + else + { + const(float)* p = cast(const(float)*)mem_addr; + float8 r = void; + r.ptr[0] = p[0]; + r.ptr[1] = p[1]; + r.ptr[2] = p[2]; + r.ptr[3] = p[3]; + r.ptr[4] = p[4]; + r.ptr[5] = p[5]; + r.ptr[6] = p[6]; + r.ptr[7] = p[7]; + return r; + } +} +unittest +{ + align(32) float[10] correct = [0.0f, 1, 2, 3, 4, 5, 6, 7, 8, 9]; + __m256 A = _mm256_loadu_ps(&correct[1]); + assert(A.array == correct[1..9]); +} + +/// Load two 128-bit values (composed of 4 packed single-precision (32-bit) floating-point +/// elements) from memory, and combine them into a 256-bit value. +/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. +__m256 _mm256_loadu2_m128 (const(float)* hiaddr, const(float)* loaddr) pure @system +{ + // Note: no particular instruction for this in x86. + return _mm256_set_m128(_mm_loadu_ps(hiaddr), _mm_loadu_ps(loaddr)); +} +unittest +{ + align(32) float[6] A = [4.5f, 2, 8, 97, -1, 3]; + align(32) float[6] B = [6.5f, 3, 9, 98, -2, 4]; + __m256 R = _mm256_loadu2_m128(&B[1], &A[1]); + float[8] correct = [2.0f, 8, 97, -1, 3, 9, 98, -2]; + assert(R.array == correct); +} + +/// Load two 128-bit values (composed of 2 packed double-precision (64-bit) floating-point +/// elements) from memory, and combine them into a 256-bit value. +/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. +__m256d _mm256_loadu2_m128d (const(double)* hiaddr, const(double)* loaddr) pure @system +{ + // Note: no particular instruction for this in x86. + return _mm256_set_m128d(_mm_loadu_pd(hiaddr), _mm_loadu_pd(loaddr)); +} +unittest +{ + align(32) double[4] A = [4.5f, 2, 8, 97]; + align(32) double[4] B = [6.5f, 3, 9, 98]; + __m256d R = _mm256_loadu2_m128d(&B[1], &A[1]); + double[4] correct = [2.0, 8, 3, 9]; + assert(R.array == correct); +} + +/// Load two 128-bit values (composed of integer data) from memory, and combine them into a +/// 256-bit value. `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. +__m256i _mm256_loadu2_m128i (const(__m128i)* hiaddr, const(__m128i)* loaddr) pure @trusted +{ + // Note: no particular instruction for this in x86. 
+ return _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr)); +} +unittest +{ + align(32) long[4] A = [5, 2, 8, 97]; + align(32) long[4] B = [6, 3, 9, 98]; + __m256i R = _mm256_loadu2_m128i(cast(const(__m128i)*) &B[1], cast(const(__m128i)*) &A[1]); + long[4] correct = [2, 8, 3, 9]; + assert(R.array == correct); +} + +version(DigitalMars) +{ + // this avoids a bug with DMD < 2.099 -a x86 -O + private enum bool maskLoadWorkaroundDMD = (__VERSION__ < 2099); +} +else +{ + private enum bool maskLoadWorkaroundDMD = false; +} + +/// Load packed double-precision (64-bit) floating-point elements from memory using `mask` +/// (elements are zeroed out when the high bit of the corresponding element is not set). +/// Note: emulating that instruction isn't efficient, since it needs to perform memory access +/// only when needed. +/// See: "Note about mask load/store" to know why you must address valid memory only. +__m128d _mm_maskload_pd (const(double)* mem_addr, __m128i mask) /* pure */ @system +{ + // PERF DMD + static if (LDC_with_AVX) + { + // MAYDO report that the builtin is impure + return __builtin_ia32_maskloadpd(mem_addr, cast(long2)mask); + } + else static if (GDC_with_AVX) + { + return __builtin_ia32_maskloadpd(cast(double2*)mem_addr, cast(long2)mask); + } + else + { + __m128d a = _mm_loadu_pd(mem_addr); + __m128d zero = _mm_setzero_pd(); + return _mm_blendv_pd(zero, a, cast(double2)mask); + } +} +unittest +{ + static if (!maskLoadWorkaroundDMD) + { + double[2] A = [7.5, 1]; + double2 B = _mm_maskload_pd(A.ptr, _mm_setr_epi64(-1, 1)); + double[2] correct = [7.5, 0]; + assert(B.array == correct); + } +} + +/// Load packed double-precision (64-bit) floating-point elements from memory using `mask` +/// (elements are zeroed out when the high bit of the corresponding element is not set). +/// See: "Note about mask load/store" to know why you must address valid memory only. +__m256d _mm256_maskload_pd (const(double)* mem_addr, __m256i mask) /*pure*/ @system +{ + // PERF DMD + static if (LDC_with_AVX) + { + // MAYDO that the builtin is impure + return __builtin_ia32_maskloadpd256(mem_addr, mask); + } + else static if (GDC_with_AVX) + { + return __builtin_ia32_maskloadpd256(cast(double4*)mem_addr, mask); + } + else + { + __m256d a = _mm256_loadu_pd(mem_addr); + __m256d zero = _mm256_setzero_pd(); + return _mm256_blendv_pd(zero, a, cast(double4)mask); + } +} +unittest +{ + static if (!maskLoadWorkaroundDMD) + { + double[4] A = [7.5, 1, 2, 3]; + double4 B = _mm256_maskload_pd(A.ptr, _mm256_setr_epi64(1, -1, -1, 1)); + double[4] correct = [0.0, 1, 2, 0]; + assert(B.array == correct); + } +} + +/// Load packed single-precision (32-bit) floating-point elements from memory using mask (elements +/// are zeroed out when the high bit of the corresponding element is not set). +/// Warning: See "Note about mask load/store" to know why you must address valid memory only. 
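+// Illustrative usage sketch (hypothetical float* p): a lane is loaded only when the high bit
+// of its mask element is set, so -1 keeps a lane and 0 zeroes it; the memory must still be
+// addressable, per the mask load/store note at the top of this file.
+//   __m128 v = _mm_maskload_ps(p, _mm_setr_epi32(-1, -1, 0, 0)); // loads lanes 0 and 1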
+__m128 _mm_maskload_ps (const(float)* mem_addr, __m128i mask) /* pure */ @system +{ + // PERF DMD + static if (LDC_with_AVX) + { + // MAYDO report that the builtin is impure + return __builtin_ia32_maskloadps(mem_addr, mask); + } + else static if (GDC_with_AVX) + { + return __builtin_ia32_maskloadps(cast(float4*)mem_addr, mask); + } + else + { + __m128 a = _mm_loadu_ps(mem_addr); + __m128 zero = _mm_setzero_ps(); + return _mm_blendv_ps(zero, a, cast(float4)mask); + } +} +unittest +{ + static if (!maskLoadWorkaroundDMD) + { + float[4] A = [7.5f, 1, 2, 3]; + float4 B = _mm_maskload_ps(A.ptr, _mm_setr_epi32(1, -1, -1, 1)); // can NOT address invalid memory! + float[4] correct = [0.0f, 1, 2, 0]; + assert(B.array == correct); + } +} + +/// Load packed single-precision (32-bit) floating-point elements from memory using `mask` +/// (elements are zeroed out when the high bit of the corresponding element is not set). +/// Note: emulating that instruction isn't efficient, since it needs to perform memory access +/// only when needed. +/// See: "Note about mask load/store" to know why you must address valid memory only. +__m256 _mm256_maskload_ps (const(float)* mem_addr, __m256i mask) /*pure*/ @system +{ + // PERF DMD + static if (LDC_with_AVX) + { + // MAYDO report that the builtin is impure + return __builtin_ia32_maskloadps256(mem_addr, cast(int8)mask); + } + else static if (GDC_with_AVX) + { + return __builtin_ia32_maskloadps256(cast(float8*)mem_addr, cast(int8)mask); + } + else + { + __m256 a = _mm256_loadu_ps(mem_addr); + __m256 zero = _mm256_setzero_ps(); + return _mm256_blendv_ps(zero, a, cast(float8)mask); + } +} +unittest +{ + float[8] A = [1, 7.5f, 1, 2, 3, 4, 5, 6]; + __m256i M = _mm256_setr_epi32(1, -1, 1, -1, 1, -1, -1, 1); + float8 B = _mm256_maskload_ps(A.ptr, M); + float[8] correct = [0.0f, 7.5f, 0, 2, 0, 4, 5, 0]; + assert(B.array == correct); +} + +/// Store packed double-precision (64-bit) floating-point elements from `a` into memory using `mask`. +/// Note: emulating that instruction isn't efficient, since it needs to perform memory access +/// only when needed. +/// See: "Note about mask load/store" to know why you must address valid memory only. +void _mm_maskstore_pd (double * mem_addr, __m128i mask, __m128d a) /* pure */ @system +{ + // PERF DMD + static if (LDC_with_AVX) + { + // MAYDO that the builtin is impure + __builtin_ia32_maskstorepd(mem_addr, cast(long2)mask, a); + } + else static if (GDC_with_AVX) + { + __builtin_ia32_maskstorepd(cast(double2*)mem_addr, cast(long2)mask, a); + } + else + { + __m128d source = _mm_loadu_pd(mem_addr); + __m128d r = _mm_blendv_pd(source, a, cast(double2) mask); + _mm_storeu_pd(mem_addr, r); + } +} +unittest +{ + double[2] A = [0.0, 1.0]; + __m128i M = _mm_setr_epi64(-1, 0); + __m128d B = _mm_setr_pd(2.0, 3.0); + _mm_maskstore_pd(A.ptr, M, B); + double[2] correct = [2.0, 1.0]; + assert(A == correct); +} + + +/// Store packed double-precision (64-bit) floating-point elements from `a` into memory using `mask`. +/// See: "Note about mask load/store" to know why you must address valid memory only. 
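+// Illustrative usage sketch (hypothetical double* p and vector v): the store-side counterpart
+// writes only the lanes whose mask high bit is set and leaves the other destination lanes
+// untouched.
+//   _mm256_maskstore_pd(p, _mm256_setr_epi64x(-1, -1, 0, 0), v); // stores lanes 0 and 1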
+static if (!llvm256BitStackWorkaroundIn32BitX86) +{ + void _mm256_maskstore_pd (double * mem_addr, __m256i mask, __m256d a) /* pure */ @system + { + // PERF DMD + static if (LDC_with_AVX) + { + // MAYDO that the builtin is impure + __builtin_ia32_maskstorepd256(mem_addr, cast(long4)mask, a); + } + else static if (GDC_with_AVX) + { + __builtin_ia32_maskstorepd256(cast(double4*)mem_addr, cast(long4)mask, a); + } + else + { + __m256d source = _mm256_loadu_pd(mem_addr); + __m256d r = _mm256_blendv_pd(source, a, cast(double4) mask); + _mm256_storeu_pd(mem_addr, r); + } + } + unittest + { + double[4] A = [0.0, 1, 2, 3]; + __m256i M = _mm256_setr_epi64x(-9, 0, -1, 0); + __m256d B = _mm256_setr_pd(2, 3, 4, 5); + _mm256_maskstore_pd(A.ptr, M, B); + double[4] correct = [2.0, 1, 4, 3]; + assert(A == correct); + } +} + +/// Store packed single-precision (32-bit) floating-point elements from `a` into memory using `mask`. +/// Note: emulating that instruction isn't efficient, since it needs to perform memory access +/// only when needed. +/// See: "Note about mask load/store" to know why you must address valid memory only. +void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a) /* pure */ @system +{ + // PERF DMD + static if (LDC_with_AVX) + { + // MAYDO report that the builtin is impure + __builtin_ia32_maskstoreps(mem_addr, mask, a); + } + else static if (GDC_with_AVX) + { + __builtin_ia32_maskstoreps(cast(float4*)mem_addr, mask, a); + } + else + { + __m128 source = _mm_loadu_ps(mem_addr); + __m128 r = _mm_blendv_ps(source, a, cast(float4) mask); + _mm_storeu_ps(mem_addr, r); + } +} +unittest +{ + float[4] A = [0.0f, 1, 2, 6]; + __m128i M = _mm_setr_epi32(-1, 0, -1, 0); + __m128 B = _mm_setr_ps(2, 3, 4, 5); + _mm_maskstore_ps(A.ptr, M, B); + float[4] correct = [2.0f, 1, 4, 6]; + assert(A == correct); +} + +static if (!llvm256BitStackWorkaroundIn32BitX86) +{ + /// Store packed single-precision (32-bit) floating-point elements from `a` into memory using `mask`. + /// See: "Note about mask load/store" to know why you must address valid memory only. + void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a) /* pure */ @system + { + // PERF DMD + static if (LDC_with_AVX) + { + // MAYDO report that the builtin is impure + __builtin_ia32_maskstoreps256(mem_addr, cast(int8)mask, a); + } + else static if (GDC_with_AVX) + { + __builtin_ia32_maskstoreps256(cast(float8*)mem_addr, cast(int8)mask, a); + } + else + { + __m256 source = _mm256_loadu_ps(mem_addr); + __m256 r = _mm256_blendv_ps(source, a, cast(float8) mask); + _mm256_storeu_ps(mem_addr, r); + } + } + unittest + { + float[8] A = [0.0f, 0, 1, 2, 3, 4, 5, 7]; + __m256i M = _mm256_setr_epi32( 0, -1, 0, -1, 0, -1, -1, 0); + __m256 B = _mm256_set1_ps(6.0f); + _mm256_maskstore_ps(A.ptr, M, B); + float[8] correct = [0.0f, 6, 1, 6, 3, 6, 6, 7]; + assert(A == correct); + } +} + +/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return +/// packed maximum values. +__m256d _mm256_max_pd (__m256d a, __m256d b) pure @trusted +{ + // PERF DMD + static if (GDC_or_LDC_with_AVX) + { + return __builtin_ia32_maxpd256(a, b); + } + else + { + // LDC: becomes good in -O2 + // PERF: GDC without AVX + a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0]; + a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1]; + a.ptr[2] = (a.array[2] > b.array[2]) ? a.array[2] : b.array[2]; + a.ptr[3] = (a.array[3] > b.array[3]) ? 
a.array[3] : b.array[3];
+ return a;
+ }
+}
+unittest
+{
+ __m256d A = _mm256_setr_pd(4.0, 1.0, -9.0, double.infinity);
+ __m256d B = _mm256_setr_pd(1.0, 8.0, 0.0, 100000.0);
+ __m256d M = _mm256_max_pd(A, B);
+ double[4] correct = [4.0, 8.0, 0.0, double.infinity];
+ assert(M.array == correct);
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return
+/// packed maximum values.
+__m256 _mm256_max_ps (__m256 a, __m256 b) pure @trusted
+{
+ // PERF DMD
+ static if (GDC_or_LDC_with_AVX)
+ {
+ return __builtin_ia32_maxps256(a, b);
+ }
+ else
+ {
+ // LDC: becomes good in -O2, but looks brittle.
+ // PERF GDC without AVX
+ a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
+ a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
+ a.ptr[2] = (a.array[2] > b.array[2]) ? a.array[2] : b.array[2];
+ a.ptr[3] = (a.array[3] > b.array[3]) ? a.array[3] : b.array[3];
+ a.ptr[4] = (a.array[4] > b.array[4]) ? a.array[4] : b.array[4];
+ a.ptr[5] = (a.array[5] > b.array[5]) ? a.array[5] : b.array[5];
+ a.ptr[6] = (a.array[6] > b.array[6]) ? a.array[6] : b.array[6];
+ a.ptr[7] = (a.array[7] > b.array[7]) ? a.array[7] : b.array[7];
+ return a;
+ }
+}
+unittest
+{
+ __m256 A = _mm256_setr_ps(4.0, 1.0, -9.0, float.infinity, 1, 2, 3, 4);
+ __m256 B = _mm256_setr_ps(1.0, 8.0, 0.0, 100000.0f , 4, 3, 2, 1);
+ __m256 M = _mm256_max_ps(A, B);
+ float[8] correct = [4.0, 8.0, 0.0, float.infinity , 4, 3, 3, 4];
+ assert(M.array == correct);
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return
+/// packed minimum values.
+__m256d _mm256_min_pd (__m256d a, __m256d b) pure @trusted
+{
+ // PERF DMD
+ static if (GDC_or_LDC_with_AVX)
+ {
+ return __builtin_ia32_minpd256(a, b);
+ }
+ else
+ {
+ // LDC: becomes good in -O2
+ // PERF: GDC without AVX
+ a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
+ a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
+ a.ptr[2] = (a.array[2] < b.array[2]) ? a.array[2] : b.array[2];
+ a.ptr[3] = (a.array[3] < b.array[3]) ? a.array[3] : b.array[3];
+ return a;
+ }
+}
+unittest
+{
+ __m256d A = _mm256_setr_pd(4.0, 1.0, -9.0, double.infinity);
+ __m256d B = _mm256_setr_pd(1.0, 8.0, 0.0, 100000.0);
+ __m256d M = _mm256_min_pd(A, B);
+ double[4] correct = [1.0, 1.0, -9.0, 100000.0];
+ assert(M.array == correct);
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return
+/// packed minimum values.
+__m256 _mm256_min_ps (__m256 a, __m256 b) pure @trusted
+{
+ // PERF DMD
+ static if (GDC_or_LDC_with_AVX)
+ {
+ return __builtin_ia32_minps256(a, b);
+ }
+ else
+ {
+ // LDC: becomes good in -O2, but looks brittle.
+ // PERF GDC without AVX
+ a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
+ a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
+ a.ptr[2] = (a.array[2] < b.array[2]) ? a.array[2] : b.array[2];
+ a.ptr[3] = (a.array[3] < b.array[3]) ? a.array[3] : b.array[3];
+ a.ptr[4] = (a.array[4] < b.array[4]) ? a.array[4] : b.array[4];
+ a.ptr[5] = (a.array[5] < b.array[5]) ? a.array[5] : b.array[5];
+ a.ptr[6] = (a.array[6] < b.array[6]) ? a.array[6] : b.array[6];
+ a.ptr[7] = (a.array[7] < b.array[7]) ? a.array[7] : b.array[7];
+ return a;
+ }
+}
+unittest
+{
+ __m256 A = _mm256_setr_ps(4.0, 1.0, -9.0, float.infinity, 1, 2, 3, 4);
+ __m256 B = _mm256_setr_ps(1.0, 8.0, 0.0, 100000.0f , 4, 3, 2, 1);
+ __m256 M = _mm256_min_ps(A, B);
+ float[8] correct = [1.0, 1.0, -9.0, 100000.0f , 1, 2, 2, 1];
+ assert(M.array == correct);
+}
+
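+// Both the vmaxpd/vminpd builtin paths above and the scalar fallbacks resolve an unordered
+// comparison the same way: when an element of `a` is NaN, the element from `b` is returned,
+// mirroring the hardware rule of returning the second source operand. A minimal sketch of
+// that convention, assuming NaN only appears in the first operand:
+unittest
+{
+ __m256d A = _mm256_setr_pd(double.nan, -4.0, double.nan, 7.0);
+ __m256d B = _mm256_setr_pd(1.0, -8.0, -3.0, 2.0);
+ __m256d M = _mm256_max_pd(A, B);
+ double[4] correctMax = [1.0, -4.0, -3.0, 7.0]; // NaN lanes take the value from B
+ assert(M.array == correctMax);
+ __m256d N = _mm256_min_pd(A, B);
+ double[4] correctMin = [1.0, -8.0, -3.0, 2.0];
+ assert(N.array == correctMin);
+}
+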
+/// Duplicate even-indexed double-precision (64-bit) floating-point elements from `a`.
+__m256d _mm256_movedup_pd (__m256d a) @trusted
+{
+ // PERF DMD
+ static if (GDC_with_AVX)
+ {
+ return __builtin_ia32_movddup256 (a);
+ }
+ else
+ {
+ a.ptr[1] = a.array[0];
+ a.ptr[3] = a.array[2];
+ return a;
+ }
+}
+unittest
+{
+ __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
+ A = _mm256_movedup_pd(A);
+ double[4] correct = [1.0, 1, 3, 3];
+ assert(A.array == correct);
+}
+
+/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from `a`.
+__m256 _mm256_movehdup_ps (__m256 a) @trusted
+{
+ // PERF DMD
+ static if (GDC_with_AVX)
+ {
+ return __builtin_ia32_movshdup256 (a);
+ }
+ else
+ {
+ a.ptr[0] = a.array[1];
+ a.ptr[2] = a.array[3];
+ a.ptr[4] = a.array[5];
+ a.ptr[6] = a.array[7];
+ return a;
+ }
+}
+unittest
+{
+ __m256 A = _mm256_setr_ps(1.0f, 2, 3, 4, 5, 6, 7, 8);
+ A = _mm256_movehdup_ps(A);
+ float[8] correct = [2.0, 2, 4, 4, 6, 6, 8, 8];
+ assert(A.array == correct);
+}
+
+/// Duplicate even-indexed single-precision (32-bit) floating-point elements from `a`.
+__m256 _mm256_moveldup_ps (__m256 a) @trusted
+{
+ // PERF DMD
+ static if (GDC_with_AVX)
+ {
+ return __builtin_ia32_movsldup256 (a);
+ }
+ else
+ {
+ a.ptr[1] = a.array[0];
+ a.ptr[3] = a.array[2];
+ a.ptr[5] = a.array[4];
+ a.ptr[7] = a.array[6];
+ return a;
+ }
+}
+unittest
+{
+ __m256 A = _mm256_setr_ps(1.0f, 2, 3, 4, 5, 6, 7, 8);
+ A = _mm256_moveldup_ps(A);
+ float[8] correct = [1.0, 1, 3, 3, 5, 5, 7, 7];
+ assert(A.array == correct);
+}
+
+/// Set each bit of result mask based on the most significant bit of the corresponding packed
+/// double-precision (64-bit) floating-point element in `a`.
+int _mm256_movemask_pd (__m256d a) @safe
+{
+ // PERF DMD
+ static if (GDC_or_LDC_with_AVX)
+ {
+ return __builtin_ia32_movmskpd256(a);
+ }
+ else static if (LDC_with_SSE2)
+ {
+ // this doesn't benefit GDC, and not clear for arm64.
+ __m128d A_lo = _mm256_extractf128_pd!0(a);
+ __m128d A_hi = _mm256_extractf128_pd!1(a);
+
+ return (_mm_movemask_pd(A_hi) << 2) | _mm_movemask_pd(A_lo);
+ }
+ else
+ {
+ // Fortunately, branchless on arm64
+ long4 lv = cast(long4)a;
+ int r = 0;
+ if (lv.array[0] < 0) r += 1;
+ if (lv.array[1] < 0) r += 2;
+ if (lv.array[2] < 0) r += 4;
+ if (lv.array[3] < 0) r += 8;
+ return r;
+ }
+}
+unittest
+{
+ __m256d A = _mm256_setr_pd(-1, -double.infinity, 0, -1);
+ assert(_mm256_movemask_pd(A) == 1 + 2 + 8);
+}
+
+/// Set each bit of mask result based on the most significant bit of the corresponding packed
+/// single-precision (32-bit) floating-point element in `a`.
+int _mm256_movemask_ps (__m256 a) @system +{ + // PERF DMD + // PERF GDC without AVX + static if (GDC_or_LDC_with_AVX) + { + return __builtin_ia32_movmskps256(a); + } + else version(LDC) + { + // this doesn't benefit GDC (unable to inline), but benefits both LDC with SSE2 and ARM64 + __m128 A_lo = _mm256_extractf128_ps!0(a); + __m128 A_hi = _mm256_extractf128_ps!1(a); + return (_mm_movemask_ps(A_hi) << 4) | _mm_movemask_ps(A_lo); + } + else + { + int8 lv = cast(int8)a; + int r = 0; + if (lv.array[0] < 0) r += 1; + if (lv.array[1] < 0) r += 2; + if (lv.array[2] < 0) r += 4; + if (lv.array[3] < 0) r += 8; + if (lv.array[4] < 0) r += 16; + if (lv.array[5] < 0) r += 32; + if (lv.array[6] < 0) r += 64; + if (lv.array[7] < 0) r += 128; + return r; + } +} +unittest +{ + __m256 A = _mm256_setr_ps(-1, -double.infinity, 0, -1, 1, double.infinity, -2, double.nan); + assert(_mm256_movemask_ps(A) == 1 + 2 + 8 + 64); +} + +/// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`. +__m256d _mm256_mul_pd (__m256d a, __m256d b) pure @safe +{ + return a * b; +} +unittest +{ + __m256d a = [-2.0, 1.5, -2.0, 1.5]; + a = _mm256_mul_pd(a, a); + assert(a.array == [4.0, 2.25, 4.0, 2.25]); +} + +/// Multiply packed single-precision (32-bit) floating-point elements in `a` and `b`. +__m256 _mm256_mul_ps (__m256 a, __m256 b) pure @safe +{ + return a * b; +} +unittest +{ + __m256 a = [1.5f, -2.0f, 3.0f, 1.0f, 1.5f, -2.0f, 3.0f, 1.0f]; + a = _mm256_mul_ps(a, a); + float[8] correct = [2.25f, 4.0f, 9.0f, 1.0f, 2.25f, 4.0f, 9.0f, 1.0f]; + assert(a.array == correct); +} + + +/// Compute the bitwise NOT of 256 bits in `a`. #BONUS +__m256i _mm256_not_si256 (__m256i a) pure @safe +{ + return ~a; +} +unittest +{ + __m256i A = _mm256_set1_epi64x(-748); + long4 notA = cast(long4) _mm256_not_si256(A); + int[4] correct = [747, 747, 747, 747]; + assert(notA.array == correct); +} + +/// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`. +__m256d _mm256_or_pd (__m256d a, __m256d b) pure @safe +{ + return cast(__m256d)( cast(__m256i)a | cast(__m256i)b ); +} + +/// Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in `a` and `b`. +__m256 _mm256_or_ps (__m256 a, __m256 b) pure @safe +{ + return cast(__m256)( cast(__m256i)a | cast(__m256i)b ); +} + +/// Shuffle double-precision (64-bit) floating-point elements in `a` using the control in `imm8`. 
+__m128d _mm_permute_pd(int imm8)(__m128d a) pure @trusted +{ + static if (GDC_with_AVX) + { + return __builtin_ia32_vpermilpd(a, imm8 & 3); + } + else + { + // Shufflevector not particularly better for LDC here + __m128d r; + r.ptr[0] = a.array[imm8 & 1]; + r.ptr[1] = a.array[(imm8 >> 1) & 1]; + return r; + } +} +unittest +{ + __m128d A = _mm_setr_pd(5, 6); + __m128d B = _mm_permute_pd!1(A); + __m128d C = _mm_permute_pd!3(A); + double[2] RB = [6, 5]; + double[2] RC = [6, 6]; + assert(B.array == RB); + assert(C.array == RC); +} + +///ditto +__m256d _mm256_permute_pd(int imm8)(__m256d a) pure @trusted +{ + // PERF DMD + static if (GDC_with_AVX) + { + return __builtin_ia32_vpermilpd256(a, imm8 & 15); + } + else version(LDC) + { + return shufflevectorLDC!(double4, + (imm8 >> 0) & 1, + ( (imm8 >> 1) & 1), + 2 + ( (imm8 >> 2) & 1), + 2 + ( (imm8 >> 3) & 1) )(a, a); + } + else + { + __m256d r; + r.ptr[0] = a.array[ imm8 & 1]; + r.ptr[1] = a.array[(imm8 >> 1) & 1]; + r.ptr[2] = a.array[2 + ((imm8 >> 2) & 1)]; + r.ptr[3] = a.array[2 + ((imm8 >> 3) & 1)]; + return r; + } +} +unittest +{ + __m256d A = _mm256_setr_pd(0.0, 1, 2, 3); + __m256d R = _mm256_permute_pd!(1 + 4)(A); + double[4] correct = [1.0, 0, 3, 2]; + assert(R.array == correct); +} + +/// Shuffle single-precision (32-bit) floating-point elements in `a` using the control in `imm8`. +__m128 _mm_permute_ps(int imm8)(__m128 a) pure @trusted +{ + // PERF DMD + static if (GDC_with_AVX) + { + return __builtin_ia32_vpermilps(a, cast(ubyte)imm8); + } + else version(LDC) + { + return shufflevectorLDC!(float4, (imm8 >> 0) & 3, (imm8 >> 2) & 3, (imm8 >> 4) & 3, + (imm8 >> 6) & 3)(a, a); + } + else + { + // PERF: could use _mm_shuffle_ps which is a super set + // when AVX isn't available + __m128 r; + r.ptr[0] = a.array[(imm8 >> 0) & 3]; + r.ptr[1] = a.array[(imm8 >> 2) & 3]; + r.ptr[2] = a.array[(imm8 >> 4) & 3]; + r.ptr[3] = a.array[(imm8 >> 6) & 3]; + return r; + } +} +unittest +{ + __m128 A = _mm_setr_ps(0.0f, 1, 2, 3); + __m128 R = _mm_permute_ps!(1 + 4 * 3 + 16 * 0 + 64 * 2)(A); + float[4] correct = [1.0f, 3, 0, 2]; + assert(R.array == correct); +} + +/// Shuffle single-precision (32-bit) floating-point elements in `a` within 128-bit lanes using +/// the control in `imm8`. The same shuffle is applied in lower and higher 128-bit lane. +__m256 _mm256_permute_ps(int imm8)(__m256 a,) pure @trusted +{ + // PERF DMD + static if (GDC_with_AVX) + { + return __builtin_ia32_vpermilps256(a, cast(ubyte)imm8); + } + else version(LDC) + { + return shufflevectorLDC!(float8, + (imm8 >> 0) & 3, (imm8 >> 2) & 3, (imm8 >> 4) & 3, (imm8 >> 6) & 3, + 4 + ((imm8 >> 0) & 3), 4 + ((imm8 >> 2) & 3), 4 + ((imm8 >> 4) & 3), + 4 + ((imm8 >> 6) & 3))(a, a); + } + else + { + __m256 r; + r.ptr[0] = a.array[(imm8 >> 0) & 3]; + r.ptr[1] = a.array[(imm8 >> 2) & 3]; + r.ptr[2] = a.array[(imm8 >> 4) & 3]; + r.ptr[3] = a.array[(imm8 >> 6) & 3]; + r.ptr[4] = a.array[4 + ((imm8 >> 0) & 3)]; + r.ptr[5] = a.array[4 + ((imm8 >> 2) & 3)]; + r.ptr[6] = a.array[4 + ((imm8 >> 4) & 3)]; + r.ptr[7] = a.array[4 + ((imm8 >> 6) & 3)]; + return r; + } +} +unittest +{ + __m256 A = _mm256_setr_ps(0.0f, 1, 2, 3, 4, 5, 6, 7); + __m256 R = _mm256_permute_ps!(1 + 4 * 3 + 16 * 0 + 64 * 2)(A); + float[8] correct = [1.0f, 3, 0, 2, 5, 7, 4, 6]; + assert(R.array == correct); +} + +/// Shuffle 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) +/// selected by `imm8` from `a` and `b`. 
+__m256d _mm256_permute2f128_pd(int imm8)(__m256d a, __m256d b) pure @safe +{ + return cast(__m256d) _mm256_permute2f128_si256!imm8(cast(__m256i)a, cast(__m256i)b); +} +///ditto +__m256 _mm256_permute2f128_ps(int imm8)(__m256 a, __m256 b) pure @safe +{ + return cast(__m256) _mm256_permute2f128_si256!imm8(cast(__m256i)a, cast(__m256i)b); +} +///ditto +__m256i _mm256_permute2f128_si256(int imm8)(__m256i a, __m256i b) pure @trusted +{ + static if (GDC_with_AVX) + { + return cast(__m256i) __builtin_ia32_vperm2f128_si256(cast(int8)a, cast(int8)b, cast(ubyte)imm8); + } + else + { + static __m128i SELECT4(int imm4)(__m256i a, __m256i b) pure @trusted + { + static assert(imm4 >= 0 && imm4 <= 15); + static if (imm4 & 8) + { + return _mm_setzero_si128(); + } + else static if ((imm4 & 2) == 0) + { + long2 r; + enum int index = 2*(imm4 & 1); + r.ptr[0] = a.array[index+0]; + r.ptr[1] = a.array[index+1]; + return cast(__m128i)r; + } + else + { + static assert( (imm4 & 2) != 0); + long2 r; + enum int index = 2*(imm4 & 1); + r.ptr[0] = b.array[index+0]; + r.ptr[1] = b.array[index+1]; + return cast(__m128i)r; + } + } + + long4 r; + __m128i lo = SELECT4!(imm8 & 15)(a, b); + __m128i hi = SELECT4!((imm8 >> 4) & 15)(a, b); + return _mm256_set_m128i(hi, lo); + } +} +unittest +{ + __m256d A = _mm256_setr_pd(8.0, 1, 2, 3); + __m256d B = _mm256_setr_pd(4.0, 5, 6, 7); + __m256d R = _mm256_permute2f128_pd!(128 + 2)(A, B); + double[4] correct = [4.0, 5.0, 0.0, 0.0]; + assert(R.array == correct); + + __m256d R2 = _mm256_permute2f128_pd!(3*16 + 1)(A, B); + double[4] correct2 = [2.0, 3.0, 6.0, 7.0]; + assert(R2.array == correct2); +} + +/// Shuffle double-precision (64-bit) floating-point elements in `a` using the control in `b`. +/// Warning: the selector is in bit 1, not bit 0, of each 64-bit element! +/// This is really not intuitive. +__m128d _mm_permutevar_pd(__m128d a, __m128i b) pure @trusted +{ + enum bool implementWithByteShuffle = GDC_with_SSSE3 || LDC_with_SSSE3 || LDC_with_ARM64; + + static if (GDC_or_LDC_with_AVX) + { + return cast(__m128d) __builtin_ia32_vpermilvarpd(a, cast(long2)b); + } + else static if (implementWithByteShuffle) + { + align(16) static immutable byte[16] mmAddBase_u8 = [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7]; + align(16) static immutable byte[16] mmBroadcast_u8 = [0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8]; + int4 bi = cast(int4)b; + long2 two; + two = 2; + bi = _mm_slli_epi64(cast(__m128i)( (cast(long2)bi) & two), 2); + bi = _mm_shuffle_epi8(bi, *cast(__m128i*)mmBroadcast_u8.ptr); + // bi is now [ind0 ind0 ind0 ind0 ind0 ind0 ind0 ind0 ind1 ind1 ind1 ind1 ind1 ind1 ind1 ind1 ] + byte16 bytesIndices = cast(byte16)bi; + bytesIndices = bytesIndices + *cast(byte16*)mmAddBase_u8.ptr; + + // which allows us to make a single _mm_shuffle_epi8 + return cast(__m128d) _mm_shuffle_epi8(cast(__m128i)a, cast(__m128i)bytesIndices); + } + else + { + // This isn't great in ARM64, TBL or TBX instructions can't do that. + // that could fit the bill, if it had 64-bit operands. But it only has 8-bit operands. + // SVE2 could do it with svtbx[_f64] probably. 
+ long2 bl = cast(long2)b; + __m128d r; + r.ptr[0] = a.array[ (bl.array[0] & 2) >> 1]; + r.ptr[1] = a.array[ (bl.array[1] & 2) >> 1]; + return r; + } +} +unittest +{ + __m128d A = _mm_setr_pd(5, 6); + __m128d B = _mm_permutevar_pd(A, _mm_setr_epi64(2, 1)); + __m128d C = _mm_permutevar_pd(A, _mm_setr_epi64(1 + 2 + 4, 2)); + // yup, this is super strange, it's actually taking bit 1 and not bit 0 of each 64-bit element + double[2] RB = [6, 5]; + double[2] RC = [6, 6]; + assert(B.array == RB); + assert(C.array == RC); +} + +///ditto +__m256d _mm256_permutevar_pd (__m256d a, __m256i b) pure @trusted +{ + // Worth it: for GDC, in SSSE3+ + // for LDC, all the time + version(LDC) + enum bool implementWithByteShuffle = true; + else + enum bool implementWithByteShuffle = GDC_with_SSSE3; + + // PERF DMD + static if (GDC_or_LDC_with_AVX) + { + return cast(__m256d) __builtin_ia32_vpermilvarpd256(a, cast(long4)b); + } + else static if (implementWithByteShuffle) + { + // because we don't have 256-bit vectors, split and use _mm_permutevar_ps + __m128d a_lo = _mm256_extractf128_pd!0(a); + __m128d a_hi = _mm256_extractf128_pd!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128d r_lo = _mm_permutevar_pd(a_lo, b_lo); + __m128d r_hi = _mm_permutevar_pd(a_hi, b_hi); + return _mm256_set_m128d(r_hi, r_lo); + } + else + { + long4 bl = cast(long4)b; + __m256d r; + r.ptr[0] = a.array[ (bl.array[0] & 2) >> 1]; + r.ptr[1] = a.array[ (bl.array[1] & 2) >> 1]; + r.ptr[2] = a.array[2 + ((bl.array[2] & 2) >> 1)]; + r.ptr[3] = a.array[2 + ((bl.array[3] & 2) >> 1)]; + return r; + } +} +unittest +{ + __m256d A = _mm256_setr_pd(5, 6, 7, 8); + __m256d B = _mm256_permutevar_pd(A, _mm256_setr_epi64(2, 1, 0, 2)); + __m256d C = _mm256_permutevar_pd(A, _mm256_setr_epi64(1 + 2 + 4, 2, 2, 0)); + // yup, this is super strange, it's actually taking bit 1 and not bit 0 of each 64-bit element + double[4] RB = [6, 5, 7, 8]; + double[4] RC = [6, 6, 8, 7]; + assert(B.array == RB); + assert(C.array == RC); +} + +/// Shuffle single-precision (32-bit) floating-point elements in `a` using the control in `b`. 
+__m128 _mm_permutevar_ps (__m128 a, __m128i b) @trusted +{ + // PERF DMD + + enum bool implementWithByteShuffle = GDC_with_SSSE3 || LDC_with_SSSE3 || LDC_with_ARM64; + + static if (GDC_or_LDC_with_AVX) + { + return cast(__m128) __builtin_ia32_vpermilvarps(a, cast(int4)b); + } + else static if (implementWithByteShuffle) + { + // This workaround is worth it: in GDC with SSSE3, in LDC with SSSE3, in ARM64 (neon) + int4 bi = cast(int4)b; + int4 three; + three = 3; + bi = _mm_slli_epi32(bi & three, 2); + // bi is [ind0 0 0 0 ind1 0 0 0 ind2 0 0 0 ind3 0 0 0] + bi = bi | _mm_slli_si128!1(bi); + bi = bi | _mm_slli_si128!2(bi); + // bi is now [ind0 ind0 ind0 ind0 ind1 ind1 ind1 ind1 ind2 ind2 ind2 ind2 ind3 ind3 ind3 ind3] + byte16 bytesIndices = cast(byte16)bi; + align(16) static immutable byte[16] mmAddBase_u8 = [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]; + bytesIndices = bytesIndices + *cast(byte16*)mmAddBase_u8.ptr; + + // which allows us to make a single _mm_shuffle_epi8 + return cast(__m128) _mm_shuffle_epi8(cast(__m128i)a, cast(__m128i)bytesIndices); + } + else + { + + int4 bi = cast(int4)b; + __m128 r; + r.ptr[0] = a.array[ (bi.array[0] & 3) ]; + r.ptr[1] = a.array[ (bi.array[1] & 3) ]; + r.ptr[2] = a.array[ (bi.array[2] & 3) ]; + r.ptr[3] = a.array[ (bi.array[3] & 3) ]; + return r; + } +} +unittest +{ + __m128 A = _mm_setr_ps(5, 6, 7, 8); + __m128 B = _mm_permutevar_ps(A, _mm_setr_epi32(2, 1, 0, 2 + 4)); + __m128 C = _mm_permutevar_ps(A, _mm_setr_epi32(2, 3 + 8, 1, 0)); + float[4] RB = [7, 6, 5, 7]; + float[4] RC = [7, 8, 6, 5]; + assert(B.array == RB); + assert(C.array == RC); +} + +///ditto +__m256 _mm256_permutevar_ps (__m256 a, __m256i b) @trusted +{ + // In order to do those two, it is necessary to use _mm_shuffle_epi8 and reconstruct the integers afterwards. + enum bool implementWithByteShuffle = GDC_with_SSSE3 || LDC_with_SSSE3 || LDC_with_ARM64; + + static if (GDC_or_LDC_with_AVX) + { + return __builtin_ia32_vpermilvarps256(a, cast(int8)b); + } + else static if (implementWithByteShuffle) + { + // because we don't have 256-bit vectors, split and use _mm_permutevar_ps + __m128 a_lo = _mm256_extractf128_ps!0(a); + __m128 a_hi = _mm256_extractf128_ps!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128 r_lo = _mm_permutevar_ps(a_lo, b_lo); + __m128 r_hi = _mm_permutevar_ps(a_hi, b_hi); + return _mm256_set_m128(r_hi, r_lo); + } + else + { + int8 bi = cast(int8)b; + __m256 r; + r.ptr[0] = a.array[ (bi.array[0] & 3) ]; + r.ptr[1] = a.array[ (bi.array[1] & 3) ]; + r.ptr[2] = a.array[ (bi.array[2] & 3) ]; + r.ptr[3] = a.array[ (bi.array[3] & 3) ]; + r.ptr[4] = a.array[ 4 + (bi.array[4] & 3) ]; + r.ptr[5] = a.array[ 4 + (bi.array[5] & 3) ]; + r.ptr[6] = a.array[ 4 + (bi.array[6] & 3) ]; + r.ptr[7] = a.array[ 4 + (bi.array[7] & 3) ]; + return r; + } +} +unittest +{ + __m256 A = _mm256_setr_ps(1, 2, 3, 4, 5, 6, 7, 8); + __m256 B = _mm256_permutevar_ps(A, _mm256_setr_epi32(2, 1, 0, 2, 3, 2, 1, 0)); + __m256 C = _mm256_permutevar_ps(A, _mm256_setr_epi32(2, 3 + 8, 1, 0, 2, 3, 0, 1)); + float[8] RB = [3.0f, 2, 1, 3, 8, 7, 6, 5]; + float[8] RC = [3.0f, 4, 2, 1, 7, 8, 5, 6]; + assert(B.array == RB); + assert(C.array == RC); +} + +/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements +/// in `a`. The maximum relative error for this approximation is less than 1.5*2^-12. 
+__m256 _mm256_rcp_ps (__m256 a) pure @trusted +{ + // PERF DMD + static if (GDC_or_LDC_with_AVX) + { + return __builtin_ia32_rcpps256(a); + } + else + { + a.ptr[0] = 1.0f / a.array[0]; + a.ptr[1] = 1.0f / a.array[1]; + a.ptr[2] = 1.0f / a.array[2]; + a.ptr[3] = 1.0f / a.array[3]; + a.ptr[4] = 1.0f / a.array[4]; + a.ptr[5] = 1.0f / a.array[5]; + a.ptr[6] = 1.0f / a.array[6]; + a.ptr[7] = 1.0f / a.array[7]; + return a; + } +} +unittest +{ + __m256 A = _mm256_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f, 9, -46, 1869816, 55583); + __m256 groundTruth = _mm256_set1_ps(1.0f) / A; + __m256 result = _mm256_rcp_ps(A); + foreach(i; 0..8) + { + double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1; + assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093 + } +} + +/// Round the packed double-precision (64-bit) floating-point elements in `a` using the +/// rounding parameter, and store the results as packed double-precision floating-point elements. +/// Rounding is done according to the rounding[3:0] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +__m256d _mm256_round_pd(int rounding)(__m256d a) @trusted +{ + // PERF DMD + static if (GDC_with_AVX) + { + return __builtin_ia32_roundpd256(a, rounding); + } + else static if (LDC_with_AVX) + { + return __builtin_ia32_roundpd256(a, rounding); + } + else + { + static if (rounding & _MM_FROUND_CUR_DIRECTION) + { + // PERF: non-AVX x86, would probably be faster to convert those double at once to int64 + + __m128d A_lo = _mm256_extractf128_pd!0(a); + __m128d A_hi = _mm256_extractf128_pd!1(a); + + // Convert to 64-bit integers one by one + long x0 = _mm_cvtsd_si64(A_lo); + long x2 = _mm_cvtsd_si64(A_hi); + A_lo.ptr[0] = A_lo.array[1]; + A_hi.ptr[0] = A_hi.array[1]; + long x1 = _mm_cvtsd_si64(A_lo); + long x3 = _mm_cvtsd_si64(A_hi); + + return _mm256_setr_pd(x0, x1, x2, x3); + } + else + { + version(GNU) pragma(inline, false); // this was required for SSE4.1 rounding, let it here + + uint old = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE((rounding & 3) << 13); + + __m128d A_lo = _mm256_extractf128_pd!0(a); + __m128d A_hi = _mm256_extractf128_pd!1(a); + + // Convert to 64-bit integers one by one + long x0 = _mm_cvtsd_si64(A_lo); + long x2 = _mm_cvtsd_si64(A_hi); + A_lo.ptr[0] = A_lo.array[1]; + A_hi.ptr[0] = A_hi.array[1]; + long x1 = _mm_cvtsd_si64(A_lo); + long x3 = _mm_cvtsd_si64(A_hi); + + // Convert back to double to achieve the rounding + // The problem is that a 64-bit double can't represent all the values + // a 64-bit integer can (and vice-versa). So this function won't work for + // large values. (FUTURE: what range exactly?) + _MM_SET_ROUNDING_MODE(old); + return _mm256_setr_pd(x0, x1, x2, x3); + } + } +} +unittest +{ + // tested in other intrinsics +} + +/// Round the packed single-precision (32-bit) floating-point elements in `a` using the +/// rounding parameter, and store the results as packed single-precision floating-point elements. 
+/// Rounding is done according to the rounding[3:0] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +__m256 _mm256_round_ps(int rounding)(__m256 a) @trusted +{ + // PERF DMD + static if (GDC_or_LDC_with_AVX) + { + return __builtin_ia32_roundps256(a, rounding); + } + else static if (GDC_or_LDC_with_SSE41) + { + // we can use _mm_round_ps + __m128 lo = _mm256_extractf128_ps!0(a); + __m128 hi = _mm256_extractf128_ps!1(a); + __m128 ilo = _mm_round_ps!rounding(lo); // unfortunately _mm_round_ps isn't fast for arm64, so we avoid that in that case + __m128 ihi = _mm_round_ps!rounding(hi); + return _mm256_set_m128(ihi, ilo); + } + else + { + static if (rounding & _MM_FROUND_CUR_DIRECTION) + { + __m256i integers = _mm256_cvtps_epi32(a); + return _mm256_cvtepi32_ps(integers); + } + else + { + version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 gets shuffled + uint old = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE((rounding & 3) << 13); + scope(exit) _MM_SET_ROUNDING_MODE(old); + + // Convert to 32-bit integers + __m256i integers = _mm256_cvtps_epi32(a); + + // Convert back to float to achieve the rounding + // The problem is that a 32-float can't represent all the values + // a 32-bit integer can (and vice-versa). So this function won't work for + // large values. (FUTURE: what range exactly?) + __m256 result = _mm256_cvtepi32_ps(integers); + + return result; + } + } +} +unittest +{ + // tested in other intrinsics +} + + +/// Compute the approximate reciprocal square root of packed single-precision (32-bit) +/// floating-point elements in `a`. The maximum relative error for this approximation is less than +/// 1.5*2^-12. +__m256 _mm256_rsqrt_ps (__m256 a) pure @trusted +{ + static if (GDC_or_LDC_with_AVX) + { + return __builtin_ia32_rsqrtps256(a); + } + else version(LDC) + { + a[0] = 1.0f / llvm_sqrt(a[0]); + a[1] = 1.0f / llvm_sqrt(a[1]); + a[2] = 1.0f / llvm_sqrt(a[2]); + a[3] = 1.0f / llvm_sqrt(a[3]); + a[4] = 1.0f / llvm_sqrt(a[4]); + a[5] = 1.0f / llvm_sqrt(a[5]); + a[6] = 1.0f / llvm_sqrt(a[6]); + a[7] = 1.0f / llvm_sqrt(a[7]); + return a; + } + else + { + a.ptr[0] = 1.0f / sqrt(a.array[0]); + a.ptr[1] = 1.0f / sqrt(a.array[1]); + a.ptr[2] = 1.0f / sqrt(a.array[2]); + a.ptr[3] = 1.0f / sqrt(a.array[3]); + a.ptr[4] = 1.0f / sqrt(a.array[4]); + a.ptr[5] = 1.0f / sqrt(a.array[5]); + a.ptr[6] = 1.0f / sqrt(a.array[6]); + a.ptr[7] = 1.0f / sqrt(a.array[7]); + return a; + } +} +unittest +{ + __m256 A = _mm256_setr_ps(2.34f, 70000.0f, 0.00001f, 345.5f, 2.34f, 70000.0f, 0.00001f, 345.5f); + __m256 groundTruth = _mm256_setr_ps(0.65372045f, 0.00377964473f, 316.227766f, 0.05379921937f, + 0.65372045f, 0.00377964473f, 316.227766f, 0.05379921937f); + __m256 result = _mm256_rsqrt_ps(A); + foreach(i; 0..8) + { + double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1; + assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093 + } +} + +/// Set packed 16-bit integers with the supplied values. 
+__m256i _mm256_set_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted +{ + short16 r; // Note: = void would prevent GDC from inlining a constant short16... + r.ptr[0] = e0; + r.ptr[1] = e1; + r.ptr[2] = e2; + r.ptr[3] = e3; + r.ptr[4] = e4; + r.ptr[5] = e5; + r.ptr[6] = e6; + r.ptr[7] = e7; + r.ptr[8] = e8; + r.ptr[9] = e9; + r.ptr[10] = e10; + r.ptr[11] = e11; + r.ptr[12] = e12; + r.ptr[13] = e13; + r.ptr[14] = e14; + r.ptr[15] = e15; + return cast(__m256i) r; +} +unittest +{ + short16 A = cast(short16) _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0); + foreach(i; 0..16) + assert(A.array[i] == i); +} + +/// Set packed 32-bit integers with the supplied values. +__m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) pure @trusted +{ + // Inlines a constant with GCC -O1, LDC -O2 + int8 r; // = void would prevent GCC from inlining a constant call + r.ptr[0] = e0; + r.ptr[1] = e1; + r.ptr[2] = e2; + r.ptr[3] = e3; + r.ptr[4] = e4; + r.ptr[5] = e5; + r.ptr[6] = e6; + r.ptr[7] = e7; + return cast(__m256i)r; +} +unittest +{ + int8 A = cast(int8) _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + foreach(i; 0..8) + assert(A.array[i] == i); +} + +/// Set packed 64-bit integers with the supplied values. +__m256i _mm256_set_epi64x (long e3, long e2, long e1, long e0) pure @trusted +{ + long4 r = void; + r.ptr[0] = e0; + r.ptr[1] = e1; + r.ptr[2] = e2; + r.ptr[3] = e3; + return r; +} +unittest +{ + __m256i A = _mm256_set_epi64x(-1, 42, long.min, long.max); + long[4] correct = [long.max, long.min, 42, -1]; + assert(A.array == correct); +} + +///ditto +alias _mm256_set_epi64 = _mm256_set_epi64x; // #BONUS, not sure why this isn't in Intel Intrinsics API. + +/// Set packed 8-bit integers with the supplied values. +__m256i _mm256_set_epi8 (byte e31, byte e30, byte e29, byte e28, byte e27, byte e26, byte e25, byte e24, + byte e23, byte e22, byte e21, byte e20, byte e19, byte e18, byte e17, byte e16, + byte e15, byte e14, byte e13, byte e12, byte e11, byte e10, byte e9, byte e8, + byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) +{ + // Inline a constant call in GDC -O1 and LDC -O2 + align(32) byte[32] result = [ e0, e1, e2, e3, e4, e5, e6, e7, + e8, e9, e10, e11, e12, e13, e14, e15, + e16, e17, e18, e19, e20, e21, e22, e23, + e24, e25, e26, e27, e28, e29, e30, e31 ]; + return *cast(__m256i*)(result.ptr); +} +unittest +{ + byte32 R = cast(byte32) _mm256_set_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7); + byte[32] correct = [7, 6, 5, 4, 7, 6, 5, 4, 3, 2, 1, 0, 3, 2, 1, 0, + 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, -128, 127, 56, 0, -1]; + assert(R.array == correct); +} + +/// Set packed `__m256d` vector with the supplied values. 
+__m256 _mm256_set_m128 (__m128 hi, __m128 lo) pure @trusted +{ + // DMD PERF + static if (GDC_with_AVX) + { + __m256 r = __builtin_ia32_ps256_ps(lo); + return __builtin_ia32_vinsertf128_ps256(r, hi, 1); + } + else + { + __m256 r = void; + r.ptr[0] = lo.array[0]; + r.ptr[1] = lo.array[1]; + r.ptr[2] = lo.array[2]; + r.ptr[3] = lo.array[3]; + r.ptr[4] = hi.array[0]; + r.ptr[5] = hi.array[1]; + r.ptr[6] = hi.array[2]; + r.ptr[7] = hi.array[3]; + return r; + } + + /* + // BUG, doesn't work if AVX vector is emulated, but SSE vector is not + // See issue #108 + __m256 r = void; + __m128* p = cast(__m128*)(&r); + p[0] = lo; + p[1] = hi; + return r; + */ +} +unittest +{ + __m128 lo = _mm_setr_ps(1.0f, 2, 3, 4); + __m128 hi = _mm_setr_ps(3.0f, 4, 5, 6); + __m256 R = _mm256_set_m128(hi, lo); + float[8] correct = [1.0f, 2, 3, 4, 3, 4, 5, 6]; + assert(R.array == correct); +} + +/// Set packed `__m256d` vector with the supplied values. +__m256d _mm256_set_m128d (__m128d hi, __m128d lo) pure @trusted +{ + __m256d r = void; + r.ptr[0] = lo.array[0]; + r.ptr[1] = lo.array[1]; + r.ptr[2] = hi.array[0]; + r.ptr[3] = hi.array[1]; + return r; +} +unittest +{ + __m128d lo = _mm_setr_pd(1.0, 2.0); + __m128d hi = _mm_setr_pd(3.0, 4.0); + __m256d R = _mm256_set_m128d(hi, lo); + double[4] correct = [1.0, 2.0, 3.0, 4.0]; + assert(R.array == correct); +} + +/// Set packed `__m256i` vector with the supplied values. +__m256i _mm256_set_m128i (__m128i hi, __m128i lo) pure @trusted +{ + // DMD PERF + static if (GDC_with_AVX) + { + __m256i r = cast(long4) __builtin_ia32_si256_si (lo); + return cast(long4) __builtin_ia32_vinsertf128_si256(cast(int8)r, hi, 1); + } + else + { + int8 r = void; + r.ptr[0] = lo.array[0]; + r.ptr[1] = lo.array[1]; + r.ptr[2] = lo.array[2]; + r.ptr[3] = lo.array[3]; + r.ptr[4] = hi.array[0]; + r.ptr[5] = hi.array[1]; + r.ptr[6] = hi.array[2]; + r.ptr[7] = hi.array[3]; + return cast(long4)r; + } +} +unittest +{ + __m128i lo = _mm_setr_epi32( 1, 2, 3, 4); + __m128i hi = _mm_set_epi32(-3, -4, -5, -6); + int8 R = cast(int8)_mm256_set_m128i(hi, lo); + int[8] correct = [1, 2, 3, 4, -6, -5, -4, -3]; + assert(R.array == correct); +} + +/// Set packed double-precision (64-bit) floating-point elements with the supplied values. +__m256d _mm256_set_pd (double e3, double e2, double e1, double e0) pure @trusted +{ + __m256d r = void; + r.ptr[0] = e0; + r.ptr[1] = e1; + r.ptr[2] = e2; + r.ptr[3] = e3; + return r; +} +unittest +{ + __m256d A = _mm256_set_pd(3, 2, 1, 546); + double[4] correct = [546.0, 1.0, 2.0, 3.0]; + assert(A.array == correct); +} + +/// Set packed single-precision (32-bit) floating-point elements with the supplied values. +__m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) pure @trusted +{ + // PERF: see #102, use = void? + __m256 r; + r.ptr[0] = e0; + r.ptr[1] = e1; + r.ptr[2] = e2; + r.ptr[3] = e3; + r.ptr[4] = e4; + r.ptr[5] = e5; + r.ptr[6] = e6; + r.ptr[7] = e7; + return r; +} +unittest +{ + __m256 A = _mm256_set_ps(3, 2, 1, 546.0f, -1.25f, -2, -3, 0); + float[8] correct = [0, -3, -2, -1.25f, 546.0f, 1.0, 2.0, 3.0]; + assert(A.array == correct); +} + +/// Broadcast 16-bit integer `a` to all elements of the return value. +__m256i _mm256_set1_epi16 (short a) pure @trusted +{ + version(DigitalMars) + { + // workaround https://issues.dlang.org/show_bug.cgi?id=21469 + // It used to ICE, after that the codegen was just wrong. 
+ // No issue anymore in DMD 2.101, we can eventually remove that + static if (__VERSION__ < 2101) + { + short16 v = a; + return cast(__m256i) v; + } + else + { + pragma(inline, true); + return cast(__m256i)(short16(a)); + } + } + else + { + pragma(inline, true); + return cast(__m256i)(short16(a)); + } +} +unittest +{ + short16 a = cast(short16) _mm256_set1_epi16(31); + for (int i = 0; i < 16; ++i) + assert(a.array[i] == 31); +} + +/// Broadcast 32-bit integer `a` to all elements. +__m256i _mm256_set1_epi32 (int a) pure @trusted +{ + version(DigitalMars) + { + // No issue anymore in DMD 2.101, we can eventually remove that + static if (__VERSION__ < 2101) + { + int8 v = a; + return cast(__m256i) v; + } + else + { + pragma(inline, true); + return cast(__m256i)(int8(a)); + } + } + else + { + pragma(inline, true); + return cast(__m256i)(int8(a)); + } +} +unittest +{ + int8 a = cast(int8) _mm256_set1_epi32(31); + for (int i = 0; i < 8; ++i) + assert(a.array[i] == 31); +} + +/// Broadcast 64-bit integer `a` to all elements of the return value. +__m256i _mm256_set1_epi64x (long a) pure +{ + return cast(__m256i)(long4(a)); +} +unittest +{ + long4 a = cast(long4) _mm256_set1_epi64x(-31); + for (int i = 0; i < 4; ++i) + assert(a.array[i] == -31); +} +///ditto +alias _mm256_set1_epi64 = _mm256_set1_epi64x; // #BONUS, not sure why this isn't in Intel Intrinsics API. + +/// Broadcast 8-bit integer `a` to all elements of the return value. +__m256i _mm256_set1_epi8 (byte a) pure @trusted +{ + version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469 + { + byte32 v = a; + return cast(__m256i) v; + } + else + { + pragma(inline, true); + return cast(__m256i)(byte32(a)); + } +} +unittest +{ + byte32 a = cast(byte32) _mm256_set1_epi8(31); + for (int i = 0; i < 32; ++i) + assert(a.array[i] == 31); +} + +/// Broadcast double-precision (64-bit) floating-point value `a` to all elements of the return value. +__m256d _mm256_set1_pd (double a) pure @trusted +{ + return __m256d(a); +} +unittest +{ + double a = 464.21; + double[4] correct = [a, a, a, a]; + double4 A = cast(double4) _mm256_set1_pd(a); + assert(A.array == correct); +} + +/// Broadcast single-precision (32-bit) floating-point value `a` to all elements of the return value. +__m256 _mm256_set1_ps (float a) pure @trusted +{ + return __m256(a); +} +unittest +{ + float a = 464.21f; + float[8] correct = [a, a, a, a, a, a, a, a]; + float8 A = cast(float8) _mm256_set1_ps(a); + assert(A.array == correct); +} + +/// Set packed 16-bit integers with the supplied values in reverse order. 
+__m256i _mm256_setr_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, + short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted +{ + short[16] result = [ e15, e14, e13, e12, e11, e10, e9, e8, + e7, e6, e5, e4, e3, e2, e1, e0]; + static if (GDC_with_AVX) + { + return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) result.ptr); + } + else static if (LDC_with_optimizations) + { + return cast(__m256i)( loadUnaligned!(short16)(result.ptr) ); + } + else + { + short16 r; + for(int n = 0; n < 16; ++n) + r.ptr[n] = result[n]; + return cast(__m256i)r; + } +} +unittest +{ + short16 A = cast(short16) _mm256_setr_epi16(-1, 0, -21, 21, 42, 127, -42, -128, + -1, 0, -21, 21, 42, 127, -42, -128); + short[16] correct = [-1, 0, -21, 21, 42, 127, -42, -128, + -1, 0, -21, 21, 42, 127, -42, -128]; + assert(A.array == correct); +} + +/// Set packed 32-bit integers with the supplied values in reverse order. +__m256i _mm256_setr_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) pure @trusted +{ + // Inlines a constant with GCC -O1, LDC -O2 + int8 r; // = void would prevent GDC from inlining a constant call + r.ptr[0] = e7; + r.ptr[1] = e6; + r.ptr[2] = e5; + r.ptr[3] = e4; + r.ptr[4] = e3; + r.ptr[5] = e2; + r.ptr[6] = e1; + r.ptr[7] = e0; + return cast(__m256i)r; +} +unittest +{ + int8 A = cast(int8) _mm256_setr_epi32(-1, 0, -2147483648, 2147483647, 42, 666, -42, -666); + int[8] correct = [-1, 0, -2147483648, 2147483647, 42, 666, -42, -666]; + assert(A.array == correct); +} + +/// Set packed 64-bit integers with the supplied values in reverse order. +__m256i _mm256_setr_epi64x (long e3, long e2, long e1, long e0) pure @trusted +{ + long4 r = void; + r.ptr[0] = e3; + r.ptr[1] = e2; + r.ptr[2] = e1; + r.ptr[3] = e0; + return r; +} +unittest +{ + __m256i A = _mm256_setr_epi64x(-1, 42, long.min, long.max); + long[4] correct = [-1, 42, long.min, long.max]; + assert(A.array == correct); +} +///ditto +alias _mm256_setr_epi64 = _mm256_setr_epi64x; // #BONUS, not sure why this isn't in Intel Intrinsics API. + +/// Set packed 8-bit integers with the supplied values in reverse order. +__m256i _mm256_setr_epi8 (byte e31, byte e30, byte e29, byte e28, byte e27, byte e26, byte e25, byte e24, + byte e23, byte e22, byte e21, byte e20, byte e19, byte e18, byte e17, byte e16, + byte e15, byte e14, byte e13, byte e12, byte e11, byte e10, byte e9, byte e8, + byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted +{ + // Inline a constant call in GDC -O1 and LDC -O2 + align(32) byte[32] result = [ e31, e30, e29, e28, e27, e26, e25, e24, + e23, e22, e21, e20, e19, e18, e17, e16, + e15, e14, e13, e12, e11, e10, e9, e8, + e7, e6, e5, e4, e3, e2, e1, e0]; + return *cast(__m256i*)(result.ptr); +} +unittest +{ + byte32 A = cast(byte32) _mm256_setr_epi8( -1, 0, -21, 21, 42, 127, -42, -128, + -1, 0, -21, 21, 42, 127, -42, -128, + -1, 0, -21, 21, 42, 127, -42, -128, + -1, 0, -21, 21, 42, 127, -42, -128); + byte[32] correct = [-1, 0, -21, 21, 42, 127, -42, -128, + -1, 0, -21, 21, 42, 127, -42, -128, + -1, 0, -21, 21, 42, 127, -42, -128, + -1, 0, -21, 21, 42, 127, -42, -128]; + assert(A.array == correct); +} + +/// Set packed `__m256` vector with the supplied values. 
+__m256 _mm256_setr_m128 (__m128 lo, __m128 hi) pure +{ + return _mm256_set_m128(hi, lo); +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2, 3, 4); + __m128 B = _mm_setr_ps(3.0f, 4, 5, 6); + __m256 R = _mm256_setr_m128(B, A); + float[8] correct = [3.0f, 4, 5, 6, 1, 2, 3, 4,]; + assert(R.array == correct); +} + +/// Set packed `__m256d` vector with the supplied values. +__m256d _mm256_setr_m128d (__m128d lo, __m128d hi) pure +{ + return _mm256_set_m128d(hi, lo); +} +unittest +{ + __m128d A = _mm_setr_pd(1.0, 2.0); + __m128d B = _mm_setr_pd(3.0, 4.0); + __m256d R = _mm256_setr_m128d(B, A); + double[4] correct = [3.0, 4.0, 1.0, 2.0]; + assert(R.array == correct); +} + +/// Set packed `__m256i` vector with the supplied values. +__m256i _mm256_setr_m128i (__m128i lo, __m128i hi) pure +{ + return _mm256_set_m128i(hi, lo); +} +unittest +{ + __m128i A = _mm_setr_epi32( 1, 2, 3, 4); + __m128i B = _mm_set_epi32(-3, -4, -5, -6); + int8 R = cast(int8)_mm256_setr_m128i(B, A); + int[8] correct = [-6, -5, -4, -3, 1, 2, 3, 4]; + assert(R.array == correct); +} + +/// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order. +__m256d _mm256_setr_pd (double e3, double e2, double e1, double e0) pure @trusted +{ + static if (LDC_with_optimizations) + { + // PERF, probably not the best + double[4] result = [e3, e2, e1, e0]; + return loadUnaligned!(double4)(result.ptr); + } + else + { + __m256d r; + r.ptr[0] = e3; + r.ptr[1] = e2; + r.ptr[2] = e1; + r.ptr[3] = e0; + return r; + } +} +unittest +{ + __m256d A = _mm256_setr_pd(3, 2, 1, 546.125); + double[4] correct = [3.0, 2.0, 1.0, 546.125]; + assert(A.array == correct); +} + + +/// Set packed single-precision (32-bit) floating-point elements with the supplied values in reverse order. +__m256 _mm256_setr_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) pure @trusted +{ + // PERF DMD + static if (GDC_with_AVX) + { + align(32) float[8] r = [ e7, e6, e5, e4, e3, e2, e1, e0]; + return *cast(__m256*)r; + } + else version(LDC) + { + align(32) float[8] r = [ e7, e6, e5, e4, e3, e2, e1, e0]; + return *cast(__m256*)r; + } + else + { + __m256 r; + r.ptr[0] = e7; + r.ptr[1] = e6; + r.ptr[2] = e5; + r.ptr[3] = e4; + r.ptr[4] = e3; + r.ptr[5] = e2; + r.ptr[6] = e1; + r.ptr[7] = e0; + return r; + } +} +unittest +{ + __m256 A = _mm256_setr_ps( 3, 2, 1, 546.125f, 4, 5, 6, 7); + float[8] correct = [3.0f, 2, 1, 546.125f, 4, 5, 6, 7]; + assert(A.array == correct); +} + +/// Return vector of type `__m256d` with all elements set to zero. +__m256d _mm256_setzero_pd() pure @safe +{ + return double4(0.0); +} +unittest +{ + __m256d A = _mm256_setzero_pd(); + double[4] correct = [0.0, 0.0, 0.0, 0.0]; + assert(A.array == correct); +} + +/// Return vector of type `__m256` with all elements set to zero. +__m256 _mm256_setzero_ps() pure @safe +{ + return float8(0.0f); +} +unittest +{ + __m256 A = _mm256_setzero_ps(); + float[8] correct = [0.0f, 0, 0, 0, 0, 0, 0, 0]; + assert(A.array == correct); +} + +/// Return vector of type `__m256i` with all elements set to zero. +__m256i _mm256_setzero_si256() pure @trusted +{ + return __m256i(0); +} +unittest +{ + __m256i A = _mm256_setzero_si256(); + long[4] correct = [0, 0, 0, 0]; + assert(A.array == correct); +} + +/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the +/// control in `imm8`. 
+__m256d _mm256_shuffle_pd(int imm8)(__m256d a, __m256d b) pure @trusted +{ + // PERF DMD + static if (GDC_with_AVX) + { + return __builtin_ia32_shufpd256(a, b, imm8); + } + else version(LDC) + { + return shufflevectorLDC!(double4, + (imm8 >> 0) & 1, + 4 + ( (imm8 >> 1) & 1), + 2 + ( (imm8 >> 2) & 1), + 6 + ( (imm8 >> 3) & 1) )(a, b); + } + else + { + double4 r = void; + r.ptr[0] = a.array[(imm8 >> 0) & 1]; + r.ptr[1] = b.array[(imm8 >> 1) & 1]; + r.ptr[2] = a.array[2 + ( (imm8 >> 2) & 1)]; + r.ptr[3] = b.array[2 + ( (imm8 >> 3) & 1)]; + return r; + } +} +unittest +{ + __m256d A = _mm256_setr_pd( 0, 1, 2, 3); + __m256d B = _mm256_setr_pd( 4, 5, 6, 7); + __m256d C = _mm256_shuffle_pd!75 /* 01001011 */(A, B); + double[4] correct = [1.0, 5.0, 2.0, 7.0]; + assert(C.array == correct); +} + +/// Shuffle single-precision (32-bit) floating-point elements in `a` within 128-bit lanes using +/// the control in `imm8`. +__m256 _mm256_shuffle_ps(int imm8)(__m256 a, __m256 b) pure @trusted +{ + // PERF DMD + static if (GDC_with_AVX) + { + return __builtin_ia32_shufps256(a, b, imm8); + } + else version(LDC) + { + return shufflevectorLDC!(float8, (imm8 >> 0) & 3, + (imm8 >> 2) & 3, + 8 + ( (imm8 >> 4) & 3), + 8 + ( (imm8 >> 6) & 3), + 4 + ( (imm8 >> 0) & 3), + 4 + ( (imm8 >> 2) & 3), + 12 + ( (imm8 >> 4) & 3), + 12 + ( (imm8 >> 6) & 3) )(a, b); + } + else + { + float8 r = void; + r.ptr[0] = a.array[(imm8 >> 0) & 3]; + r.ptr[1] = a.array[(imm8 >> 2) & 3]; + r.ptr[2] = b.array[(imm8 >> 4) & 3]; + r.ptr[3] = b.array[(imm8 >> 6) & 3]; + r.ptr[4] = a.array[4 + ( (imm8 >> 0) & 3 )]; + r.ptr[5] = a.array[4 + ( (imm8 >> 2) & 3 )]; + r.ptr[6] = b.array[4 + ( (imm8 >> 4) & 3 )]; + r.ptr[7] = b.array[4 + ( (imm8 >> 6) & 3 )]; + return r; + } +} +unittest +{ + __m256 A = _mm256_setr_ps( 0, 1, 2, 3, 4, 5, 6, 7); + __m256 B = _mm256_setr_ps( 8, 9, 10, 11, 12, 13, 14, 15); + __m256 C = _mm256_shuffle_ps!75 /* 01001011 */(A, B); + float[8] correct = [3.0f, 2, 8, 9, 7, 6, 12, 13]; + assert(C.array == correct); +} + +/// Compute the square root of packed double-precision (64-bit) floating-point elements in `a`. +__m256d _mm256_sqrt_pd (__m256d a) pure @trusted +{ + static if (GDC_with_AVX) + { + return __builtin_ia32_sqrtpd256(a); + } + else version(LDC) + { + static if (__VERSION__ >= 2084) + return llvm_sqrt(a); // that capability appeared in LDC 1.14 + else + { + a.ptr[0] = llvm_sqrt(a.array[0]); + a.ptr[1] = llvm_sqrt(a.array[1]); + a.ptr[2] = llvm_sqrt(a.array[2]); + a.ptr[3] = llvm_sqrt(a.array[3]); + return a; + } + } + else + { + a.ptr[0] = sqrt(a.array[0]); + a.ptr[1] = sqrt(a.array[1]); + a.ptr[2] = sqrt(a.array[2]); + a.ptr[3] = sqrt(a.array[3]); + return a; + } +} +unittest +{ + __m256d A = _mm256_sqrt_pd(_mm256_set1_pd(4.0)); + double[4] correct = [2.0, 2, 2, 2]; + assert(A.array == correct); +} + +/// Compute the square root of packed single-precision (32-bit) floating-point elements in `a`. 
+__m256 _mm256_sqrt_ps (__m256 a) pure @trusted +{ + static if (GDC_with_AVX) + { + return __builtin_ia32_sqrtps256(a); + } + else version(LDC) + { + static if (__VERSION__ >= 2084) + return llvm_sqrt(a); // that capability appeared in LDC 1.14 + else + { + a.ptr[0] = llvm_sqrt(a.array[0]); + a.ptr[1] = llvm_sqrt(a.array[1]); + a.ptr[2] = llvm_sqrt(a.array[2]); + a.ptr[3] = llvm_sqrt(a.array[3]); + a.ptr[4] = llvm_sqrt(a.array[4]); + a.ptr[5] = llvm_sqrt(a.array[5]); + a.ptr[6] = llvm_sqrt(a.array[6]); + a.ptr[7] = llvm_sqrt(a.array[7]); + return a; + } + } + else + { + a.ptr[0] = sqrt(a.array[0]); + a.ptr[1] = sqrt(a.array[1]); + a.ptr[2] = sqrt(a.array[2]); + a.ptr[3] = sqrt(a.array[3]); + a.ptr[4] = sqrt(a.array[4]); + a.ptr[5] = sqrt(a.array[5]); + a.ptr[6] = sqrt(a.array[6]); + a.ptr[7] = sqrt(a.array[7]); + return a; + } +} +unittest +{ + __m256 A = _mm256_sqrt_ps(_mm256_set1_ps(4.0f)); + float[8] correct = [2.0f, 2, 2, 2, 2, 2, 2, 2]; + assert(A.array == correct); +} + +/// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from +/// `a` into memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection +/// exception may be generated. +void _mm256_store_pd (double* mem_addr, __m256d a) pure @system +{ + *cast(__m256d*)mem_addr = a; +} +unittest +{ + align(32) double[4] mem; + double[4] correct = [1.0, 2, 3, 4]; + _mm256_store_pd(mem.ptr, _mm256_setr_pd(1.0, 2, 3, 4)); + assert(mem == correct); +} + +/// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from +/// `a` into memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection +/// exception may be generated. +void _mm256_store_ps (float* mem_addr, __m256 a) pure @system +{ + *cast(__m256*)mem_addr = a; +} +unittest +{ + align(32) float[8] mem; + float[8] correct = [1.0, 2, 3, 4, 5, 6, 7, 8]; + _mm256_store_ps(mem.ptr, _mm256_set_ps(8.0, 7, 6, 5, 4, 3, 2, 1)); + assert(mem == correct); +} + +/// Store 256-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 32-byte +/// boundary or a general-protection exception may be generated. +void _mm256_store_si256 (__m256i * mem_addr, __m256i a) pure @safe +{ + *mem_addr = a; +} +unittest +{ + align(32) long[4] mem; + long[4] correct = [5, -6, -7, 8]; + _mm256_store_si256(cast(__m256i*)(mem.ptr), _mm256_setr_epi64x(5, -6, -7, 8)); + assert(mem == correct); +} + +/// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from +/// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary. +void _mm256_storeu_pd (double * mem_addr, __m256d a) pure @system +{ + // PERF DMD + static if (GDC_with_AVX) + { + __builtin_ia32_storeupd256(mem_addr, a); + } + else static if (LDC_with_optimizations) + { + storeUnaligned!__m256d(a, mem_addr); + } + else + { + for(int n = 0; n < 4; ++n) + mem_addr[n] = a.array[n]; + } +} +unittest +{ + align(32) double[6] arr = [0.0, 0, 0, 0, 0, 0]; + _mm256_storeu_pd(&arr[1], _mm256_set1_pd(4.0)); + double[4] correct = [4.0, 4, 4, 4]; + assert(arr[1..5] == correct); +} + +/// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from +/// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary. 
+void _mm256_storeu_ps (float* mem_addr, __m256 a) pure @system +{ + // PERF DMD + static if (GDC_with_AVX) + { + __builtin_ia32_storeups256(mem_addr, a); + } + else static if (LDC_with_optimizations) + { + storeUnaligned!__m256(a, mem_addr); + } + else + { + for(int n = 0; n < 8; ++n) + mem_addr[n] = a.array[n]; + } +} +unittest +{ + align(32) float[10] arr = [0.0f, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + _mm256_storeu_ps(&arr[1], _mm256_set1_ps(4.0f)); + float[8] correct = [4.0f, 4, 4, 4, 4, 4, 4, 4]; + assert(arr[1..9] == correct); +} + + +/// Store 256-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned +/// on any particular boundary. +void _mm256_storeu_si256 (__m256i* mem_addr, __m256i a) pure @trusted +{ + // PERF DMD + static if (GDC_with_AVX) + { + __builtin_ia32_storedqu256(cast(char*)mem_addr, cast(ubyte32) a); + } + else static if (LDC_with_optimizations) + { + storeUnaligned!__m256i(a, cast(long*)mem_addr); + } + else + { + long4 v = cast(long4)a; + long* p = cast(long*)mem_addr; + for(int n = 0; n < 4; ++n) + p[n] = v[n]; + } +} +unittest +{ + align(32) long[6] arr = [0, 0, 0, 0, 0, 0]; + _mm256_storeu_si256( cast(__m256i*) &arr[1], _mm256_set1_epi64x(4)); + long[4] correct = [4, 4, 4, 4]; + assert(arr[1..5] == correct); +} + +/// Store the high and low 128-bit halves (each composed of 4 packed single-precision (32-bit) +/// floating-point elements) from `a` into memory two different 128-bit locations. +/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. +void _mm256_storeu2_m128 (float* hiaddr, float* loaddr, __m256 a) pure @system +{ + // This is way better on GDC, and similarly in LDC, vs using other intrinsics + loaddr[0] = a.array[0]; + loaddr[1] = a.array[1]; + loaddr[2] = a.array[2]; + loaddr[3] = a.array[3]; + hiaddr[0] = a.array[4]; + hiaddr[1] = a.array[5]; + hiaddr[2] = a.array[6]; + hiaddr[3] = a.array[7]; +} +unittest +{ + align(32) float[11] A = [0.0f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + _mm256_storeu2_m128(&A[1], &A[6], _mm256_set1_ps(2.0f)); + float[11] correct = [0.0f, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0]; + assert(A == correct); +} + +/// Store the high and low 128-bit halves (each composed of 2 packed double-precision (64-bit) +/// floating-point elements) from `a` into memory two different 128-bit locations. +/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. +void _mm256_storeu2_m128d (double* hiaddr, double* loaddr, __m256d a) pure @system +{ + loaddr[0] = a.array[0]; + loaddr[1] = a.array[1]; + hiaddr[0] = a.array[2]; + hiaddr[1] = a.array[3]; +} +unittest +{ + double[2] A; + double[2] B; + _mm256_storeu2_m128d(A.ptr, B.ptr, _mm256_set1_pd(-43.0)); + double[2] correct = [-43.0, -43]; + assert(A == correct); + assert(B == correct); +} + +/// Store the high and low 128-bit halves (each composed of integer data) from `a` into memory two +/// different 128-bit locations. +/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. 
+void _mm256_storeu2_m128i (__m128i* hiaddr, __m128i* loaddr, __m256i a) pure @trusted
+{
+ long* hi = cast(long*)hiaddr;
+ long* lo = cast(long*)loaddr;
+ lo[0] = a.array[0];
+ lo[1] = a.array[1];
+ hi[0] = a.array[2];
+ hi[1] = a.array[3];
+}
+unittest
+{
+ long[2] A;
+ long[2] B;
+ _mm256_storeu2_m128i(cast(__m128i*)A.ptr, cast(__m128i*)B.ptr, _mm256_set1_epi64x(-42));
+ long[2] correct = [-42, -42];
+ assert(A == correct);
+ assert(B == correct);
+}
+
+/// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from
+/// `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 32-byte
+/// boundary or a general-protection exception may be generated.
+/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
+void _mm256_stream_pd (double* mem_addr, __m256d a) pure @system
+{
+ // PERF DMD
+ // PERF GDC + SSE2
+ static if (LDC_with_InlineIREx && LDC_with_optimizations)
+ {
+ enum prefix = `!0 = !{ i32 1 }`;
+ enum ir = `
+ store <4 x double> %1, <4 x double>* %0, align 32, !nontemporal !0
+ ret void`;
+ LDCInlineIREx!(prefix, ir, "", void, double4*, double4)(cast(double4*)mem_addr, a);
+ }
+ else static if (GDC_with_AVX) // non-temporal store with vmovntpd
+ {
+ __builtin_ia32_movntpd256 (mem_addr, a);
+ }
+ else
+ {
+ // Regular store instead.
+ __m256d* dest = cast(__m256d*)mem_addr;
+ *dest = a;
+ }
+}
+unittest
+{
+ align(32) double[4] mem;
+ double[4] correct = [5.0, -6, -7, 8];
+ _mm256_stream_pd(mem.ptr, _mm256_setr_pd(5.0, -6, -7, 8));
+ assert(mem == correct);
+}
+
+/// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from
+/// `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 32-byte
+/// boundary or a general-protection exception may be generated.
+/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
+void _mm256_stream_ps (float* mem_addr, __m256 a) pure @system
+{
+ // PERF DMD
+ // PERF GDC + SSE2
+ static if (LDC_with_InlineIREx && LDC_with_optimizations)
+ {
+ enum prefix = `!0 = !{ i32 1 }`;
+ enum ir = `
+ store <8 x float> %1, <8 x float>* %0, align 32, !nontemporal !0
+ ret void`;
+ LDCInlineIREx!(prefix, ir, "", void, float8*, float8)(cast(float8*)mem_addr, a);
+ }
+ else static if (GDC_with_AVX)
+ {
+ __builtin_ia32_movntps256 (mem_addr, a);
+ }
+ else
+ {
+ // Regular store instead.
+ __m256* dest = cast(__m256*)mem_addr;
+ *dest = a;
+ }
+}
+unittest
+{
+ align(32) float[8] mem;
+ float[8] correct = [5, -6, -7, 8, 1, 2, 3, 4];
+ _mm256_stream_ps(mem.ptr, _mm256_setr_ps(5, -6, -7, 8, 1, 2, 3, 4));
+ assert(mem == correct);
+}
+
+/// Store 256-bits of integer data from `a` into memory using a non-temporal memory hint.
+/// `mem_addr` must be aligned on a 32-byte boundary or a general-protection exception may be
+/// generated.
+/// Note: there isn't any particular instruction in AVX to do that. It just defers to SSE2.
+/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
+void _mm256_stream_si256 (__m256i * mem_addr, __m256i a) pure @trusted
+{
+ // PERF DMD
+ // PERF GDC
+ static if (LDC_with_InlineIREx && LDC_with_optimizations)
+ {
+ enum prefix = `!0 = !{ i32 1 }`;
+ enum ir = `
+ store <4 x i64> %1, <4 x i64>* %0, align 16, !nontemporal !0
+ ret void`;
+ LDCInlineIREx!(prefix, ir, "", void, long4*, long4)(mem_addr, a);
+ }
+ else static if (GDC_with_SSE2) // any hope to be non-temporal? Using SSE2 instructions.
+ {
+ long2 lo, hi;
+ lo.ptr[0] = a.array[0];
+ lo.ptr[1] = a.array[1];
+ hi.ptr[0] = a.array[2];
+ hi.ptr[1] = a.array[3];
+ _mm_stream_si128(cast(__m128i*)mem_addr, cast(__m128i)lo);
+ _mm_stream_si128((cast(__m128i*)mem_addr) + 1, cast(__m128i)hi);
+ }
+ else
+ {
+ // Regular store instead.
+ __m256i* dest = cast(__m256i*)mem_addr;
+ *dest = a;
+ }
+}
+unittest
+{
+ align(32) long[4] mem;
+ long[4] correct = [5, -6, -7, 8];
+ _mm256_stream_si256(cast(__m256i*)(mem.ptr), _mm256_setr_epi64x(5, -6, -7, 8));
+ assert(mem == correct);
+}
+
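+// The notes above recommend following non-temporal stores with `_mm_sfence()` before reader
+// threads are signalled. A minimal sketch of that pattern, assuming the buffer is written
+// entirely with streaming stores and only published afterwards:
+unittest
+{
+ align(32) float[16] buf;
+ __m256 v = _mm256_set1_ps(1.0f);
+ _mm256_stream_ps(&buf[0], v);
+ _mm256_stream_ps(&buf[8], v);
+ _mm_sfence(); // make the streamed data visible before setting any "ready" flag
+ foreach(x; buf)
+ assert(x == 1.0f);
+}
+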
+/// Subtract packed double-precision (64-bit) floating-point elements in `b` from
+/// packed double-precision (64-bit) floating-point elements in `a`.
+__m256d _mm256_sub_pd (__m256d a, __m256d b) pure @safe
+{
+ return a - b;
+}
+unittest
+{
+ __m256d a = [1.5, -2.0, 3.0, 200000.0];
+ a = _mm256_sub_pd(a, a);
+ double[4] correct = [0.0, 0, 0, 0];
+ assert(a.array == correct);
+}
+
+/// Subtract packed single-precision (32-bit) floating-point elements in `b` from
+/// packed single-precision (32-bit) floating-point elements in `a`.
+__m256 _mm256_sub_ps (__m256 a, __m256 b) pure @safe
+{
+ return a - b;
+}
+unittest
+{
+ __m256 a = [1.5f, -2.0f, 3.0f, 1.0f, 1.5f, -2000.0f, 3.0f, 1.0f];
+ a = _mm256_sub_ps(a, a);
+ float[8] correct = [0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f];
+ assert(a.array == correct);
+}
+
+/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and
+/// return 1 if the sign bit of each 64-bit element in the intermediate value is zero,
+/// otherwise return 0.
+int _mm_testc_pd (__m128d a, __m128d b) pure @trusted
+{
+ static if (GDC_or_LDC_with_AVX)
+ {
+ return __builtin_ia32_vtestcpd(a, b);
+ }
+ else
+ {
+ // PERF: maybe do the generic version more like simde
+ long2 la = cast(long2)a;
+ long2 lb = cast(long2)b;
+ long2 r = ~la & lb;
+ return r.array[0] >= 0 && r.array[1] >= 0;
+ }
+}
+unittest
+{
+ __m128d A = _mm_setr_pd(-1, 1);
+ __m128d B = _mm_setr_pd(-1, -1);
+ __m128d C = _mm_setr_pd(1, -1);
+ assert(_mm_testc_pd(A, A) == 1);
+ assert(_mm_testc_pd(A, B) == 0);
+ assert(_mm_testc_pd(B, A) == 1);
+ assert(_mm_testc_pd(C, B) == 0);
+ assert(_mm_testc_pd(B, C) == 1);
+}
+
+///ditto
+int _mm256_testc_pd (__m256d a, __m256d b) pure @safe
+{
+ static if (GDC_or_LDC_with_AVX)
+ {
+ return __builtin_ia32_vtestcpd256(a, b);
+ }
+ else static if (LDC_with_ARM64)
+ {
+ // better to split than do vanilla (down to 10 inst)
+ __m128d lo_a = _mm256_extractf128_pd!0(a);
+ __m128d lo_b = _mm256_extractf128_pd!0(b);
+ __m128d hi_a = _mm256_extractf128_pd!1(a);
+ __m128d hi_b = _mm256_extractf128_pd!1(b);
+ return _mm_testc_pd(lo_a, lo_b) & _mm_testc_pd(hi_a, hi_b);
+ }
+ else
+ {
+ // PERF: do the generic version more like simde, maybe this gets rid of the arm64 version
+ long4 la = cast(long4)a;
+ long4 lb = cast(long4)b;
+ long4 r = ~la & lb;
+ return r.array[0] >= 0 && r.array[1] >= 0 && r.array[2] >= 0 && r.array[3] >= 0;
+ }
+}
+unittest
+{
+ __m256d A = _mm256_setr_pd(-1, 1, -1, 1);
+ __m256d B = _mm256_setr_pd(-1, -1, -1, -1);
+ __m256d C = _mm256_setr_pd(1, -1, 1, -1);
+ assert(_mm256_testc_pd(A, A) == 1);
+ assert(_mm256_testc_pd(A, B) == 0);
+ assert(_mm256_testc_pd(B, A) == 1);
+ assert(_mm256_testc_pd(C, B) == 0);
+ assert(_mm256_testc_pd(B, C) == 1);
+}
+
+/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and
+/// return 1 if the sign bit of each 32-bit element in the intermediate value is zero,
+/// otherwise return 0.
+int _mm_testc_ps (__m128 a, __m128 b) pure @safe +{ + // PERF DMD + static if (GDC_or_LDC_with_AVX) + { + return __builtin_ia32_vtestcps(a, b); + } + else static if (LDC_with_ARM64) + { + int4 la = cast(int4)a; + int4 lb = cast(int4)b; + int4 r = ~la & lb; + int4 shift; + shift = 31; + r >>= shift; + int[4] zero = [0, 0, 0, 0]; + return r.array == zero; + } + else + { + // PERF: do the generic version more like simde, maybe this get rids of arm64 version + int4 la = cast(int4)a; + int4 lb = cast(int4)b; + int4 r = ~la & lb; + return r.array[0] >= 0 && r.array[1] >= 0 && r.array[2] >= 0 && r.array[3] >= 0; + } +} +unittest +{ + __m128 A = _mm_setr_ps(-1, 1, -1, 1); + __m128 B = _mm_setr_ps(-1, -1, -1, -1); + __m128 C = _mm_setr_ps(1, -1, 1, -1); + assert(_mm_testc_ps(A, A) == 1); + assert(_mm_testc_ps(A, B) == 0); + assert(_mm_testc_ps(B, A) == 1); +} + +///ditto +int _mm256_testc_ps (__m256 a, __m256 b) pure @safe +{ + // PERF DMD + static if (GDC_or_LDC_with_AVX) + { + return __builtin_ia32_vtestcps256(a, b); + } + else static if (LDC_with_ARM64) + { + int8 la = cast(int8)a; + int8 lb = cast(int8)b; + int8 r = ~la & lb; + int8 shift; + shift = 31; + r >>= shift; + int[8] zero = [0, 0, 0, 0, 0, 0, 0, 0]; + return r.array == zero; + } + else + { + // PERF: do the generic version more like simde, maybe this get rids of arm64 version + int8 la = cast(int8)a; + int8 lb = cast(int8)b; + int8 r = ~la & lb; + return r.array[0] >= 0 + && r.array[1] >= 0 + && r.array[2] >= 0 + && r.array[3] >= 0 + && r.array[4] >= 0 + && r.array[5] >= 0 + && r.array[6] >= 0 + && r.array[7] >= 0; + } +} +unittest +{ + __m256 A = _mm256_setr_ps(-1, 1, -1, 1, -1, 1, -1, 1); + __m256 B = _mm256_setr_ps(-1, -1, -1, -1, -1, -1, -1, -1); + __m256 C = _mm256_setr_ps( 1, -1, 1, -1, 1, 1, 1, 1); + assert(_mm256_testc_ps(A, A) == 1); + assert(_mm256_testc_ps(B, B) == 1); + assert(_mm256_testc_ps(A, B) == 0); + assert(_mm256_testc_ps(B, A) == 1); + assert(_mm256_testc_ps(C, B) == 0); + assert(_mm256_testc_ps(B, C) == 1); +} + +/// Compute the bitwise NOT of `a` and then AND with `b`, and return 1 if the result is zero, +/// otherwise return 0. +/// In other words, test if all bits masked by `b` are also 1 in `a`. +int _mm256_testc_si256 (__m256i a, __m256i b) pure @trusted +{ + static if (GDC_or_LDC_with_AVX) + { + return __builtin_ia32_ptestc256(cast(long4)a, cast(long4)b); + } + else static if (LDC_with_ARM64) + { + // better to split than do vanilla (down to 10 inst) + __m128i lo_a = _mm256_extractf128_si256!0(a); + __m128i lo_b = _mm256_extractf128_si256!0(b); + __m128i hi_a = _mm256_extractf128_si256!1(a); + __m128i hi_b = _mm256_extractf128_si256!1(b); + return _mm_testc_si128(lo_a, lo_b) & _mm_testc_si128(hi_a, hi_b); + } + else + { + __m256i c = ~a & b; + long[4] zero = [0, 0, 0, 0]; + return c.array == zero; + } +} +unittest +{ + __m256i A = _mm256_setr_epi64(0x01, 0x02, 0x04, 0xf8); + __m256i M1 = _mm256_setr_epi64(0xfe, 0xfd, 0x00, 0x00); + __m256i M2 = _mm256_setr_epi64(0x00, 0x00, 0x04, 0x00); + assert(_mm256_testc_si256(A, A) == 1); + assert(_mm256_testc_si256(A, M1) == 0); + assert(_mm256_testc_si256(A, M2) == 1); +} + +/// Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point +/// elements) in `a` and `b`, producing an intermediate 128-bit value, and set ZF to 1 if the +/// sign bit of each 64-bit element in the intermediate value is zero, otherwise set ZF to 0. 
+/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and set
+/// CF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise
+/// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
+///
+/// In other words: there is at least one negative number in `b` that corresponds to a positive number in `a`,
+/// AND there is at least one negative number in `b` that corresponds to a negative number in `a`.
+int _mm_testnzc_pd (__m128d a, __m128d b) pure @safe
+{
+    // PERF DMD
+    static if (GDC_or_LDC_with_AVX)
+    {
+        return __builtin_ia32_vtestnzcpd(a, b);
+    }
+    else
+    {
+        // ZF = 0 means "there is at least one pair of negative numbers"
+        // ZF = 1 means "no pairs of negative numbers"
+        // CF = 0 means "there is a negative number in b that is next to a positive number in a"
+        // CF = 1 means "all negative numbers in b are also negative in a"
+        // Consequently, CF = 0 and ZF = 0 means:
+        // "There is a pair of matching negative numbers in a and b,
+        // AND also there is a negative number in b, that is matching a positive number in a"
+        // Phew.
+
+        // courtesy of simd-everywhere
+        __m128i m = _mm_and_si128(cast(__m128i)a, cast(__m128i)b);
+        __m128i m2 = _mm_andnot_si128(cast(__m128i)a, cast(__m128i)b);
+        m = _mm_srli_epi64(m, 63);
+        m2 = _mm_srli_epi64(m2, 63);
+        return cast(int)( m.array[0] | m.array[2]) & (m2.array[0] | m2.array[2]);
+    }
+}
+unittest
+{
+    __m128d PM = _mm_setr_pd( 1, -1);
+    __m128d MP = _mm_setr_pd(-1,  1);
+    __m128d MM = _mm_setr_pd(-1, -1);
+    assert(_mm_testnzc_pd(PM, MP) == 0);
+    assert(_mm_testnzc_pd(PM, MM) == 1);
+    assert(_mm_testnzc_pd(MP, MP) == 0);
+    assert(_mm_testnzc_pd(MP, MM) == 1);
+    assert(_mm_testnzc_pd(MM, MM) == 0);
+}
+
+/// Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point
+/// elements) in `a` and `b`, producing an intermediate 256-bit value, and set ZF to 1 if the
+/// sign bit of each 64-bit element in the intermediate value is zero, otherwise set ZF to 0.
+/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and set
+/// CF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise
+/// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
+///
+/// In other words: there is at least one negative number in `b` that corresponds to a positive number in `a`,
+/// AND there is at least one negative number in `b` that corresponds to a negative number in `a`.
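+// Illustrative worked example (a sketch, not upstream documentation) of the ZF/CF logic
+// described for `_mm_testnzc_pd` above, using only intrinsics defined in this module.
+// With a = [1.0, -1.0] and b = [-1.0, -1.0]:
+//   a & b  has a sign bit set in lane 1  => ZF = 0
+//   ~a & b has a sign bit set in lane 0  => CF = 0
+// so the result is 1, which matches the unittest above:
+//
+//     __m128d a = _mm_setr_pd( 1.0, -1.0);
+//     __m128d b = _mm_setr_pd(-1.0, -1.0);
+//     assert(_mm_testnzc_pd(a, b) == 1);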
+int _mm256_testnzc_pd (__m256d a, __m256d b) pure @safe
+{
+    // PERF DMD
+    static if (GDC_or_LDC_with_AVX)
+    {
+        return __builtin_ia32_vtestnzcpd256(a, b);
+    }
+    else
+    {
+        long4 la = cast(long4)a;
+        long4 lb = cast(long4)b;
+        long4 r = la & lb;
+        long m = r.array[0] | r.array[1] | r.array[2] | r.array[3];
+        int ZF = (~m >> 63) & 1;
+        long4 r2 = ~la & lb;
+        long m2 = r2.array[0] | r2.array[1] | r2.array[2] | r2.array[3];
+        int CF = (~m2 >> 63) & 1;
+        return (CF | ZF) == 0;
+    }
+}
+unittest
+{
+    __m256d PM = _mm256_setr_pd( 1, -1, 1, 1);
+    __m256d MP = _mm256_setr_pd(-1,  1, 1, 1);
+    __m256d MM = _mm256_setr_pd(-1, -1, 1, 1);
+    assert(_mm256_testnzc_pd(PM, MP) == 0);
+    assert(_mm256_testnzc_pd(PM, MM) == 1);
+    assert(_mm256_testnzc_pd(MP, MP) == 0);
+    assert(_mm256_testnzc_pd(MP, MM) == 1);
+    assert(_mm256_testnzc_pd(MM, MM) == 0);
+}
+
+/// Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point
+/// elements) in `a` and `b`, producing an intermediate 128-bit value, and set ZF to 1 if the
+/// sign bit of each 32-bit element in the intermediate value is zero, otherwise set ZF to 0.
+/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and set
+/// CF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise
+/// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
+///
+/// In other words: there is at least one negative number in `b` that corresponds to a positive number in `a`,
+/// AND there is at least one negative number in `b` that corresponds to a negative number in `a`.
+int _mm_testnzc_ps (__m128 a, __m128 b) pure @safe
+{
+    // PERF DMD
+    static if (GDC_or_LDC_with_AVX)
+    {
+        return __builtin_ia32_vtestnzcps(a, b);
+    }
+    else
+    {
+        int4 la = cast(int4)a;
+        int4 lb = cast(int4)b;
+        int4 r = la & lb;
+        int m = r.array[0] | r.array[1] | r.array[2] | r.array[3];
+        int ZF = (~m >> 31) & 1;
+        int4 r2 = ~la & lb;
+        int m2 = r2.array[0] | r2.array[1] | r2.array[2] | r2.array[3];
+        int CF = (~m2 >> 31) & 1;
+        return (CF | ZF) == 0;
+    }
+}
+unittest
+{
+    __m128 PM = _mm_setr_ps( 1, -1, 1, 1);
+    __m128 MP = _mm_setr_ps(-1,  1, 1, 1);
+    __m128 MM = _mm_setr_ps(-1, -1, 1, 1);
+    assert(_mm_testnzc_ps(PM, MP) == 0);
+    assert(_mm_testnzc_ps(PM, MM) == 1);
+    assert(_mm_testnzc_ps(MP, MP) == 0);
+    assert(_mm_testnzc_ps(MP, MM) == 1);
+    assert(_mm_testnzc_ps(MM, MM) == 0);
+}
+
+/// Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point
+/// elements) in `a` and `b`, producing an intermediate 256-bit value, and set ZF to 1 if the
+/// sign bit of each 32-bit element in the intermediate value is zero, otherwise set ZF to 0.
+/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and set
+/// CF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise
+/// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
+///
+/// In other words: there is at least one negative number in `b` that corresponds to a positive number in `a`,
+/// AND there is at least one negative number in `b` that corresponds to a negative number in `a`.
+int _mm256_testnzc_ps (__m256 a, __m256 b) pure @safe +{ + // PERF DMD + static if (GDC_or_LDC_with_AVX) + { + return __builtin_ia32_vtestnzcps256(a, b); + } + else + { + int8 la = cast(int8)a; + int8 lb = cast(int8)b; + int8 r = la & lb; + int m = r.array[0] | r.array[1] | r.array[2] | r.array[3] + | r.array[4] | r.array[5] | r.array[6] | r.array[7]; + int ZF = (~m >> 31) & 1; + int8 r2 = ~la & lb; + int m2 = r2.array[0] | r2.array[1] | r2.array[2] | r2.array[3] + | r2.array[4] | r2.array[5] | r2.array[6] | r2.array[7]; + int CF = (~m2 >> 31) & 1; + return (CF | ZF) == 0; + } +} +unittest +{ + __m256 PM = _mm256_setr_ps(1, 1, 1, 1, 1, -1, 1, 1); + __m256 MP = _mm256_setr_ps(1, 1, 1, 1, -1, 1, 1, 1); + __m256 MM = _mm256_setr_ps(1, 1, 1, 1, -1, -1, 1, 1); + assert(_mm256_testnzc_ps(PM, MP) == 0); + assert(_mm256_testnzc_ps(PM, MM) == 1); + assert(_mm256_testnzc_ps(MP, MP) == 0); + assert(_mm256_testnzc_ps(MP, MM) == 1); + assert(_mm256_testnzc_ps(MM, MM) == 0); +} + +/// Compute the bitwise AND of 256 bits (representing integer data) in `a` and `b`, +/// and set ZF to 1 if the result is zero, otherwise set ZF to 0. +/// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the +/// result is zero, otherwise set CF to 0. +/// Return 1 if both the ZF and CF values are zero, otherwise return 0. +int _mm256_testnzc_si256 (__m256i a, __m256i b) pure @trusted +{ + // PERF ARM64 + // PERF DMD + // PERF LDC without AVX + static if (GDC_or_LDC_with_AVX) + { + return __builtin_ia32_ptestnzc256(cast(long4) a, cast(long4) b); + } + else + { + // Need to defer to _mm_testnzc_si128 if possible, for more speed + __m256i c = a & b; + __m256i d = ~a & b; + long m = c.array[0] | c.array[1] | c.array[2] | c.array[3]; + long n = d.array[0] | d.array[1] | d.array[2] | d.array[3]; + return (m != 0) & (n != 0); + } +} +unittest +{ + __m256i A = _mm256_setr_epi32(0x01, 0x02, 0x04, 0xf8, 0, 0, 0, 0); + __m256i M = _mm256_setr_epi32(0x01, 0x40, 0x00, 0x00, 0, 0, 0, 0); + __m256i Z = _mm256_setzero_si256(); + assert(_mm256_testnzc_si256(A, Z) == 0); + assert(_mm256_testnzc_si256(A, M) == 1); + assert(_mm256_testnzc_si256(A, A) == 0); +} + +/// Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point +/// elements) in `a` and `b`, producing an intermediate 128-bit value, return 1 if the sign bit of +/// each 64-bit element in the intermediate value is zero, otherwise return 0. +/// In other words, return 1 if `a` and `b` don't both have a negative number as the same place. +int _mm_testz_pd (__m128d a, __m128d b) pure @trusted +{ + static if (GDC_or_LDC_with_AVX) + { + return __builtin_ia32_vtestzpd(a, b); + } + else + { + long2 la = cast(long2)a; + long2 lb = cast(long2)b; + long2 r = la & lb; + long m = r.array[0] | r.array[1]; + return (~m >> 63) & 1; + } +} +unittest +{ + __m128d A = _mm_setr_pd(-1, 1); + __m128d B = _mm_setr_pd(-1, -1); + __m128d C = _mm_setr_pd(1, -1); + assert(_mm_testz_pd(A, A) == 0); + assert(_mm_testz_pd(A, B) == 0); + assert(_mm_testz_pd(C, A) == 1); +} + +/// Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point +/// elements) in `a` and `b`, producing an intermediate 256-bit value, return 1 if the sign bit of +/// each 64-bit element in the intermediate value is zero, otherwise return 0. +/// In other words, return 1 if `a` and `b` don't both have a negative number as the same place. 
+int _mm256_testz_pd (__m256d a, __m256d b) pure @trusted
+{
+    static if (GDC_or_LDC_with_AVX)
+    {
+        return __builtin_ia32_vtestzpd256(a, b);
+    }
+    else
+    {
+        long4 la = cast(long4)a;
+        long4 lb = cast(long4)b;
+        long4 r = la & lb;
+        long r2 = r.array[0] | r.array[1] | r.array[2] | r.array[3];
+        return (~r2 >> 63) & 1;
+    }
+}
+unittest
+{
+    __m256d A = _mm256_setr_pd(-1, 1, -1, 1);
+    __m256d B = _mm256_setr_pd(1, 1, -1, 1);
+    __m256d C = _mm256_setr_pd(1, -1, 1, -1);
+    assert(_mm256_testz_pd(A, A) == 0);
+    assert(_mm256_testz_pd(A, B) == 0);
+    assert(_mm256_testz_pd(C, A) == 1);
+}
+
+/// Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point
+/// elements) in `a` and `b`, producing an intermediate 128-bit value, return 1 if the sign bit of
+/// each 32-bit element in the intermediate value is zero, otherwise return 0.
+/// In other words, return 1 if `a` and `b` don't both have a negative number at the same place.
+int _mm_testz_ps (__m128 a, __m128 b) pure @safe
+{
+    // PERF DMD
+    static if (GDC_or_LDC_with_AVX)
+    {
+        return __builtin_ia32_vtestzps(a, b);
+    }
+    else
+    {
+        int4 la = cast(int4)a;
+        int4 lb = cast(int4)b;
+        int4 r = la & lb;
+        int m = r.array[0] | r.array[1] | r.array[2] | r.array[3];
+        return (~m >> 31) & 1;
+    }
+}
+unittest
+{
+    __m128 A = _mm_setr_ps(-1, 1, -1, 1);
+    __m128 B = _mm_setr_ps( 1, 1, -1, 1);
+    __m128 C = _mm_setr_ps( 1, -1, 1, -1);
+    assert(_mm_testz_ps(A, A) == 0);
+    assert(_mm_testz_ps(A, B) == 0);
+    assert(_mm_testz_ps(C, A) == 1);
+    assert(_mm_testz_ps(C, B) == 1);
+}
+
+/// Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point
+/// elements) in `a` and `b`, producing an intermediate 256-bit value, return 1 if the sign bit of
+/// each 32-bit element in the intermediate value is zero, otherwise return 0.
+/// In other words, return 1 if `a` and `b` don't both have a negative number at the same place.
+int _mm256_testz_ps (__m256 a, __m256 b) pure @safe
+{
+    // PERF DMD
+    static if (GDC_or_LDC_with_AVX)
+    {
+        return __builtin_ia32_vtestzps256(a, b);
+    }
+    else
+    {
+        int8 la = cast(int8)a;
+        int8 lb = cast(int8)b;
+        int8 r = la & lb;
+        int m = r.array[0] | r.array[1] | r.array[2] | r.array[3]
+              | r.array[4] | r.array[5] | r.array[6] | r.array[7];
+        return (~m >> 31) & 1;
+    }
+}
+
+/// Compute the bitwise AND of 256 bits (representing integer data) in `a` and `b`,
+/// and return 1 if the result is zero, otherwise return 0.
+/// In other words, test if all bits masked by `b` are 0 in `a`.
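+// Illustrative sketch (assumes nothing beyond intrinsics defined in this module):
+// `_mm256_testz_si256(a, mask)` below answers "are all bits selected by `mask` clear in `a`?",
+// which is handy for testing flag words in bulk. For example:
+//
+//     __m256i flags = _mm256_setr_epi32(0x10, 0x20, 0x40, 0x80, 0, 0, 0, 0);
+//     __m256i dirty = _mm256_set1_epi32(0x01);                  // hypothetical "dirty" bit
+//     bool noneDirty = _mm256_testz_si256(flags, dirty) != 0;   // true here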
+int _mm256_testz_si256 (__m256i a, __m256i b) @trusted +{ + // PERF DMD + static if (GDC_with_AVX) + { + return __builtin_ia32_ptestz256(cast(long4)a, cast(long4)b); + } + else static if (LDC_with_AVX) + { + return __builtin_ia32_ptestz256(cast(long4)a, cast(long4)b); + } + else version(LDC) + { + // better to split than do vanilla (down to 8 inst in arm64) + __m128i lo_a = _mm256_extractf128_si256!0(a); + __m128i lo_b = _mm256_extractf128_si256!0(b); + __m128i hi_a = _mm256_extractf128_si256!1(a); + __m128i hi_b = _mm256_extractf128_si256!1(b); + return _mm_testz_si128(lo_a, lo_b) & _mm_testz_si128(hi_a, hi_b); + } + else + { + __m256i c = a & b; + long[4] zero = [0, 0, 0, 0]; + return c.array == zero; + } +} +unittest +{ + __m256i A = _mm256_setr_epi32(0x01, 0x02, 0x04, 0xf8, 0x01, 0x02, 0x04, 0xf8); + __m256i M1 = _mm256_setr_epi32(0xfe, 0xfd, 0x00, 0x07, 0xfe, 0xfd, 0x00, 0x07); + __m256i M2 = _mm256_setr_epi32(0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00); + assert(_mm256_testz_si256(A, A) == 0); + assert(_mm256_testz_si256(A, M1) == 1); + assert(_mm256_testz_si256(A, M2) == 0); +} + +/// Return vector of type __m256d with undefined elements. +__m256d _mm256_undefined_pd () pure @safe +{ + __m256d r = void; + return r; +} + +/// Return vector of type __m256 with undefined elements. +__m256 _mm256_undefined_ps () pure @safe +{ + __m256 r = void; + return r; +} + +/// Return vector of type __m256i with undefined elements. +__m256i _mm256_undefined_si256 () pure @safe +{ + __m256i r = void; + return r; +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of +/// each 128-bit lane in `a` and `b`. +__m256d _mm256_unpackhi_pd (__m256d a, __m256d b) pure @trusted +{ + static if (LDC_with_optimizations) + { + enum ir = `%r = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> + ret <4 x double> %r`; + return LDCInlineIR!(ir, double4, double4, double4)(a, b); + } + else static if (GDC_with_AVX) + { + return __builtin_ia32_unpckhpd256 (a, b); + } + else + { + __m256d r; + r.ptr[0] = a.array[1]; + r.ptr[1] = b.array[1]; + r.ptr[2] = a.array[3]; + r.ptr[3] = b.array[3]; + return r; + } +} +unittest +{ + __m256d A = _mm256_setr_pd(1.0, 2, 3, 4); + __m256d B = _mm256_setr_pd(5.0, 6, 7, 8); + __m256d C = _mm256_unpackhi_pd(A, B); + double[4] correct = [2.0, 6, 4, 8]; + assert(C.array == correct); +} + + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of +/// each 128-bit lane in `a` and `b`. +__m256 _mm256_unpackhi_ps (__m256 a, __m256 b) pure @trusted +{ + static if (LDC_with_optimizations) + { + enum ir = `%r = shufflevector <8 x float> %0, <8 x float> %1, <8 x i32> + ret <8 x float> %r`; + return LDCInlineIR!(ir, float8, float8, float8)(a, b); + } + else static if (GDC_with_AVX) + { + return __builtin_ia32_unpckhps256 (a, b); + } + else + { + __m256 r; + r.ptr[0] = a.array[2]; + r.ptr[1] = b.array[2]; + r.ptr[2] = a.array[3]; + r.ptr[3] = b.array[3]; + r.ptr[4] = a.array[6]; + r.ptr[5] = b.array[6]; + r.ptr[6] = a.array[7]; + r.ptr[7] = b.array[7]; + return r; + } +} +unittest +{ + __m256 A = _mm256_setr_ps(0.0f, 1, 2, 3, 4, 5, 6, 7); + __m256 B = _mm256_setr_ps(8.0f, 9, 10, 11, 12, 13, 14, 15); + __m256 C = _mm256_unpackhi_ps(A, B); + float[8] correct = [2.0f, 10, 3, 11, 6, 14, 7, 15]; + assert(C.array == correct); +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of +/// each 128-bit lane in `a` and `b`. 
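+// Note (illustrative): as the unittests above show, the AVX unpack intrinsics interleave
+// within each 128-bit lane separately; they are not a full 256-bit interleave. For example,
+// _mm256_unpackhi_pd([1, 2, 3, 4], [5, 6, 7, 8]) yields [2, 6, 4, 8], not [3, 7, 4, 8].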
+__m256d _mm256_unpacklo_pd (__m256d a, __m256d b) +{ + static if (LDC_with_optimizations) + { + enum ir = `%r = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> + ret <4 x double> %r`; + return LDCInlineIR!(ir, double4, double4, double4)(a, b); + } + else static if (GDC_with_AVX) + { + return __builtin_ia32_unpcklpd256 (a, b); + } + else + { + __m256d r; + r.ptr[0] = a.array[0]; + r.ptr[1] = b.array[0]; + r.ptr[2] = a.array[2]; + r.ptr[3] = b.array[2]; + return r; + } +} +unittest +{ + __m256d A = _mm256_setr_pd(1.0, 2, 3, 4); + __m256d B = _mm256_setr_pd(5.0, 6, 7, 8); + __m256d C = _mm256_unpacklo_pd(A, B); + double[4] correct = [1.0, 5, 3, 7]; + assert(C.array == correct); +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of +/// each 128-bit lane in `a` and `b`. +__m256 _mm256_unpacklo_ps (__m256 a, __m256 b) +{ + static if (LDC_with_optimizations) + { + enum ir = `%r = shufflevector <8 x float> %0, <8 x float> %1, <8 x i32> + ret <8 x float> %r`; + return LDCInlineIR!(ir, float8, float8, float8)(a, b); + } + else static if (GDC_with_AVX) + { + return __builtin_ia32_unpcklps256 (a, b); + } + else + { + __m256 r; + r.ptr[0] = a.array[0]; + r.ptr[1] = b.array[0]; + r.ptr[2] = a.array[1]; + r.ptr[3] = b.array[1]; + r.ptr[4] = a.array[4]; + r.ptr[5] = b.array[4]; + r.ptr[6] = a.array[5]; + r.ptr[7] = b.array[5]; + return r; + } +} +unittest +{ + __m256 A = _mm256_setr_ps(0.0f, 1, 2, 3, 4, 5, 6, 7); + __m256 B = _mm256_setr_ps(8.0f, 9, 10, 11, 12, 13, 14, 15); + __m256 C = _mm256_unpacklo_ps(A, B); + float[8] correct = [0.0f, 8, 1, 9, 4, 12, 5, 13]; + assert(C.array == correct); +} + +/// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`. +__m256d _mm256_xor_pd (__m256d a, __m256d b) pure @safe +{ + return cast(__m256d)( cast(__m256i)a ^ cast(__m256i)b ); +} + +/// Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in `a` and `b`. +__m256 _mm256_xor_ps (__m256 a, __m256 b) pure @safe +{ + return cast(__m256)( cast(__m256i)a ^ cast(__m256i)b ); +} + +void _mm256_zeroall () pure @safe +{ + // PERF DMD needs to do it explicitely if AVX is ever used one day. + + static if (GDC_with_AVX) + { + __builtin_ia32_vzeroall(); + } + else + { + // Do nothing. The transitions penalty are supposed handled by the backend (eg: LLVM). + } +} + +void _mm256_zeroupper () pure @safe +{ + // PERF DMD needs to do it explicitely if AVX is ever used. + + static if (GDC_with_AVX) + { + __builtin_ia32_vzeroupper(); + } + else + { + // Do nothing. The transitions penalty are supposed handled by the backend (eg: LLVM). + } + +} + +/// Cast vector of type `__m128d` to type `__m256d`; the upper 128 bits of the result are zeroed. +__m256d _mm256_zextpd128_pd256 (__m128d a) pure @trusted +{ + __m256d r; + r.ptr[0] = a.array[0]; + r.ptr[1] = a.array[1]; + r.ptr[2] = 0; + r.ptr[3] = 0; + return r; +} +unittest +{ + __m256d R = _mm256_zextpd128_pd256(_mm_setr_pd(2.0, -3.0)); + double[4] correct = [2.0, -3, 0, 0]; + assert(R.array == correct); +} + +/// Cast vector of type `__m128` to type `__m256`; the upper 128 bits of the result are zeroed. 
+__m256 _mm256_zextps128_ps256 (__m128 a) pure @trusted +{ + double2 la = cast(double2)a; + double4 r; + r.ptr[0] = la.array[0]; + r.ptr[1] = la.array[1]; + r.ptr[2] = 0; + r.ptr[3] = 0; + return cast(__m256)r; +} +unittest +{ + __m256 R = _mm256_zextps128_ps256(_mm_setr_ps(2.0, -3.0, 4, -5)); + float[8] correct = [2.0, -3, 4, -5, 0, 0, 0, 0]; + assert(R.array == correct); +} + +/// Cast vector of type `__m128i` to type `__m256i`; the upper 128 bits of the result are zeroed. +__m256i _mm256_zextsi128_si256 (__m128i a) pure @trusted +{ + long2 la = cast(long2)a; + __m256i r; + r.ptr[0] = la.array[0]; + r.ptr[1] = la.array[1]; + r.ptr[2] = 0; + r.ptr[3] = 0; + return r; +} +unittest +{ + __m256i R = _mm256_zextsi128_si256(_mm_setr_epi64(-1, 99)); + long[4] correct = [-1, 99, 0, 0]; + assert(R.array == correct); +} + + +// F16C start here + +/// Convert 4 packed half-precision (16-bit) floating-point elements +/// in `a` to packed single-precision (32-bit) floating-point elements. +/// Note: Only lowest 64-bit of input considered. +/// Preserve infinities, sign of zeroes, and NaN-ness. +__m128 _mm_cvtph_ps(__m128i a) pure @trusted +{ + short8 sa = cast(short8)a; + + static if (LDC_with_F16C) + { + // Note: clang has a __builtin_ia32_vcvtph2ps256 but we don't + // Note: LLVM IR fpext leads to call __extendhfsf2@PLT + // Same with the pragma llvm.convert.from.fp16, so not sure + // what to do + return cast(__m128)__asm!(float4)("vcvtph2ps $1, $0", "=v,v", a); + } + else + { + // Reference: stb_image_resize2.h has F16C emulation. + // See: + // Originated from: + __m128i mask_nosign = _mm_set1_epi32(0x7fff); + __m128i smallest_normal = _mm_set1_epi32(0x0400); + __m128i infinity = _mm_set1_epi32(0x7c00); + __m128i expadjust_normal = _mm_set1_epi32((127 - 15) << 23); + __m128i magic_denorm = _mm_set1_epi32(113 << 23); + __m128i i = a; + __m128i h = _mm_unpacklo_epi16 ( i, _mm_setzero_si128() ); + __m128i mnosign = mask_nosign; + __m128i eadjust = expadjust_normal; + __m128i smallest = smallest_normal; + __m128i infty = infinity; + __m128i expmant = _mm_and_si128(mnosign, h); + __m128i justsign = _mm_xor_si128(h, expmant); + __m128i b_notinfnan = _mm_cmpgt_epi32(infty, expmant); + __m128i b_isdenorm = _mm_cmpgt_epi32(smallest, expmant); + __m128i shifted = _mm_slli_epi32(expmant, 13); + __m128i adj_infnan = _mm_andnot_si128(b_notinfnan, eadjust); + __m128i adjusted = _mm_add_epi32(eadjust, shifted); + __m128i den1 = _mm_add_epi32(shifted, magic_denorm); + __m128i adjusted2 = _mm_add_epi32(adjusted, adj_infnan); + __m128 den2 = _mm_sub_ps(cast(__m128)den1, *cast(const(__m128)*)&magic_denorm); + __m128 adjusted3 = _mm_and_ps(den2, cast(__m128)b_isdenorm); + __m128 adjusted4 = _mm_andnot_ps(cast(__m128)b_isdenorm, cast(__m128)adjusted2); + __m128 adjusted5 = _mm_or_ps(adjusted3, adjusted4); + __m128i sign = _mm_slli_epi32(justsign, 16); + __m128 final_ = _mm_or_ps(adjusted5, cast(__m128)sign); + return final_; + } +} +unittest +{ + __m128i A = _mm_setr_epi16(cast(short)0x8000, 0x7C00, cast(short)0xDA90, 0x5000, 0, 0, 0, 0); + float[4] correct = [-0.0f, float.infinity, -210.0f, 32.0f]; + __m128 R = _mm_cvtph_ps(A); + assert(R.array == correct); +} + +/// Convert 8 packed half-precision (16-bit) floating-point elements +/// in `a` to packed single-precision (32-bit) floating-point elements. +/// Note: Preserve infinities, sign of zeroes, and NaN-ness. 
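+// Usage sketch (illustrative, not from the upstream docs): decoding a few IEEE half-precision
+// values with `_mm_cvtph_ps` defined above. The bit patterns below are example inputs
+// (0x3C00 encodes 1.0 and 0xC000 encodes -2.0 in half precision):
+//
+//     __m128i h = _mm_setr_epi16(0x3C00, cast(short)0xC000, 0, 0,  0, 0, 0, 0);
+//     __m128  f = _mm_cvtph_ps(h);    // [1.0f, -2.0f, 0.0f, 0.0f]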
+__m256 _mm256_cvtph_ps(__m128i a) pure @trusted +{ + static if (LDC_with_F16C) + { + return __asm!(float8)("vcvtph2ps $1, $0", "=v,v", a); + } + else + { + // In stb_image_resize2.h, _mm_cvtph_ps is simply hand-inlined 2x + // so we do the same here. + int4 ihi; + ihi.ptr[0] = a.array[2]; + ihi.ptr[1] = a.array[3]; + __m128 lo = _mm_cvtph_ps(a); + __m128 hi = _mm_cvtph_ps(ihi); + return _mm256_set_m128(hi, lo); + } +} +unittest +{ + __m128i A = _mm_setr_epi16(0, cast(short)-32768, 0, cast(short)0xFC00, 0x7C00, 0x5A90,cast(short)0xDA90, 0x5000); + float[8] correct = [0.0f, -0.0f, 0.0f, -float.infinity, float.infinity, 210.0f, -210.0f, 32.0f]; + __m256 R = _mm256_cvtph_ps(A); + assert(R.array == correct); +} + +// __m128i _mm_cvtps_ph (__m128 a, int imm8) TODO +// __m128i _mm256_cvtps_ph (__m256 a, int imm8) TODO \ No newline at end of file diff --git a/external/inteli/bmi2intrin.d b/external/inteli/bmi2intrin.d new file mode 100644 index 0000000..50bcde7 --- /dev/null +++ b/external/inteli/bmi2intrin.d @@ -0,0 +1,363 @@ +/** +* BMI2 intrinsics. +* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=BMI2 +* +* Copyright: Copyright Johan Engelen 2021. +* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) +*/ +module inteli.bmi2intrin; + +import inteli.internals; + +nothrow @nogc pure @safe: + +/// Copy all bits from unsigned 32-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index. +uint _bzhi_u32 (uint a, uint index) +{ + static if (GDC_or_LDC_with_BMI2) + { + if (!__ctfe) + return __builtin_ia32_bzhi_si(a, index); + else + return bzhi!uint(a, index); + } + else + { + return bzhi!uint(a, index); + } +} +unittest +{ + static assert (_bzhi_u32(0x1234_5678, 5) == 0x18); + assert (_bzhi_u32(0x1234_5678, 5) == 0x18); + static assert (_bzhi_u32(0x1234_5678, 10) == 0x278); + assert (_bzhi_u32(0x1234_5678, 10) == 0x278); + static assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678); + assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678); +} + +/// Copy all bits from unsigned 64-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index. +ulong _bzhi_u64 (ulong a, uint index) +{ + static if (GDC_or_LDC_with_BMI2) + { + if (!__ctfe) + { + version(X86_64) + { + // This instruction not available in 32-bit x86. + return __builtin_ia32_bzhi_di(a, index); + } + else + return bzhi!ulong(a, index); + } + else + return bzhi!ulong(a, index); + } + else + { + return bzhi!ulong(a, index); + } +} +unittest +{ + static assert (_bzhi_u64(0x1234_5678, 5) == 0x18); + assert (_bzhi_u64(0x1234_5678, 5) == 0x18); + static assert (_bzhi_u64(0x1234_5678, 10) == 0x278); + assert (_bzhi_u64(0x1234_5678, 10) == 0x278); + static assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678); + assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678); + static assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678); + assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678); +} + +// Helper function for BZHI +private T bzhi(T)(T a, uint index) +{ + /+ + n := index[7:0] + dst := a + IF (n < number of bits) + dst[MSB:n] := 0 + FI + +/ + enum numbits = T.sizeof*8; + T dst = a; + if (index < numbits) + { + T mask = (T(1) << index) - 1; + dst &= mask; + } + return dst; +} + +/// Multiply unsigned 32-bit integers `a` and `b`, store the low 32-bits of the result in dst, +/// and store the high 32-bits in `hi`. This does not read or write arithmetic flags. 
+/// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say. +/// But, those particular semantics don't exist at the level of intrinsics. +uint _mulx_u32 (uint a, uint b, uint* hi) +{ + // Note: that does NOT generate mulx with LDC, and there seems to be no way to do that for + // some reason, even with LLVM IR. + // Also same with GDC. + ulong result = cast(ulong) a * b; + *hi = cast(uint) (result >>> 32); + return cast(uint)result; +} +@system unittest +{ + uint hi; + assert (_mulx_u32(0x1234_5678, 0x1234_5678, &hi) == 0x1DF4_D840); + assert (hi == 0x014B_66DC); +} + +/// Multiply unsigned 64-bit integers `a` and `b`, store the low 64-bits of the result in dst, and +/// store the high 64-bits in `hi`. This does not read or write arithmetic flags. +/// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say. +/// But, those particular semantics don't exist at the level of intrinsics. +ulong _mulx_u64 (ulong a, ulong b, ulong* hi) +{ + /+ + dst[63:0] := (a * b)[63:0] + MEM[hi+63:hi] := (a * b)[127:64] + +/ + + static if (LDC_with_optimizations) + { + static if (__VERSION__ >= 2094) + enum bool withLDCIR = true; + else + enum bool withLDCIR = false; + } + else + { + enum bool withLDCIR = false; + } + + static if (withLDCIR) + { + // LDC x86: Generates mulx from -O0 + enum ir = ` + %4 = zext i64 %0 to i128 + %5 = zext i64 %1 to i128 + %6 = mul nuw i128 %5, %4 + %7 = lshr i128 %6, 64 + %8 = trunc i128 %7 to i64 + store i64 %8, i64* %2, align 8 + %9 = trunc i128 %6 to i64 + ret i64 %9`; + return LDCInlineIR!(ir, ulong, ulong, ulong, ulong*)(a, b, hi); + } + else + { + /+ Straight-forward implementation with `ucent`: + ucent result = cast(ucent) a * b; + *hi = cast(ulong) ((result >>> 64) & 0xFFFF_FFFF_FFFF_FFFF); + return cast(ulong) (result & 0xFFFF_FFFF_FFFF_FFFF); + +/ + + /+ + Implementation using 64bit math is more complex... + a * b = (a_high << 32 + a_low) * (b_high << 32 + b_low) + = (a_high << 32)*(b_high << 32) + (a_high << 32)*b_low + a_low* (b_high << 32) + a_low*b_low + = (a_high*b_high) << 64 + (a_high*b_low) << 32 + (a_low*b_high) << 32 + a_low*b_low + = c2 << 64 + c11 << 32 + c12 << 32 + c0 + = z1 << 64 + z0 + // The sums may overflow, so we need to carry the carry (from low 64bits to high 64bits). We can do that + // by separately creating the sum to get the high 32 bits of z0 using 64bit math. The high 32 bits of that + // intermediate result is then the 'carry' that we need to add when calculating z1's sum. 
+ z0 = (c0 & 0xFFFF_FFFF) + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) << 32 + The carry part from z0's sum = (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32 + z1 = c2 + (c11 >> 32 + c12 >> 32 + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32 + +/ + + const ulong a_low = a & 0xFFFF_FFFF; + const ulong a_high = a >>> 32; + const ulong b_low = b & 0xFFFF_FFFF; + const ulong b_high = b >>> 32; + + const ulong c2 = a_high*b_high; + const ulong c11 = a_high*b_low; + const ulong c12 = a_low*b_high; + const ulong c0 = a_low*b_low; + + const ulong common_term = (c0 >> 32) + (c11 & 0xFFFF_FFFF) + (c12 & 0xFFFF_FFFF); + const ulong z0 = (c0 & 0xFFFF_FFFF) + (common_term << 32); + const ulong z1 = c2 + (c11 >> 32) + (c12 >> 32) + (common_term >> 32); + + *hi = z1; + return z0; + } +} +@system unittest +{ + ulong hi; + // 0x1234_5678_9ABC_DEF0 * 0x1234_5678_9ABC_DEF0 == 0x14b_66dc_33f6_acdc_a5e2_0890_f2a5_2100 + assert (_mulx_u64(0x1234_5678_9ABC_DEF0, 0x1234_5678_9ABC_DEF0, &hi) == 0xa5e2_0890_f2a5_2100); + assert (hi == 0x14b_66dc_33f6_acdc); +} + +/// Deposit contiguous low bits from unsigned 32-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero. +uint _pdep_u32 (uint a, uint mask) +{ + static if (GDC_or_LDC_with_BMI2) + { + if (!__ctfe) + return __builtin_ia32_pdep_si(a, mask); + else + return pdep!uint(a, mask); + } + else + { + return pdep!uint(a, mask); + } +} +unittest +{ + static assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708); + assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708); +} + +/// Deposit contiguous low bits from unsigned 64-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero. +ulong _pdep_u64 (ulong a, ulong mask) +{ + static if (GDC_or_LDC_with_BMI2) + { + if (!__ctfe) + { + version(X86_64) + { + // This instruction not available in 32-bit x86. + return __builtin_ia32_pdep_di(a, mask); + } + else + return pdep!ulong(a, mask); + } + else + return pdep!ulong(a, mask); + } + else + { + return pdep!ulong(a, mask); + } +} +unittest +{ + static assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201); + assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201); +} + +// Helper function for PDEP +private T pdep(T)(T a, T mask) +{ + /+ + tmp := a + dst := 0 + m := 0 + k := 0 + DO WHILE m < 32 + IF mask[m] == 1 + dst[m] := tmp[k] + k := k + 1 + FI + m := m + 1 + OD + +/ + T dst; + T k_bitpos = 1; + T m_bitpos = 1; // for each iteration, this has one bit set to 1 in the position probed + foreach (m; 0..T.sizeof*8) + { + if (mask & m_bitpos) + { + dst |= (a & k_bitpos) ? m_bitpos : 0; + k_bitpos <<= 1; + } + m_bitpos <<= 1; + } + return dst; +} + + +/// Extract bits from unsigned 32-bit integer `a` at the corresponding bit locations specified by +/// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero. 
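+// Illustrative sketch: `_pdep_u32` above scatters the low bits of its first argument into the
+// positions selected by `mask`, and `_pext_u32` below gathers them back, so the two are
+// inverses over a fixed mask. A typical use is packing and unpacking bit fields:
+//
+//     enum uint MASK = 0x0F0F_0F0F;
+//     uint packed    = _pext_u32(0x1234_5678, MASK);      // 0x2468
+//     uint scattered = _pdep_u32(packed, MASK);            // 0x0204_0608
+//     assert(_pext_u32(scattered, MASK) == packed);        // round-trips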
+uint _pext_u32 (uint a, uint mask) +{ + static if (GDC_or_LDC_with_BMI2) + { + if (!__ctfe) + return __builtin_ia32_pext_si(a, mask); + else + return pext!uint(a, mask); + } + else + { + return pext!uint(a, mask); + } +} +unittest +{ + static assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468); + assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468); +} + +/// Extract bits from unsigned 64-bit integer `a` at the corresponding bit locations specified by +/// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero. +ulong _pext_u64 (ulong a, ulong mask) +{ + static if (GDC_or_LDC_with_BMI2) + { + if (!__ctfe) + { + version(X86_64) + { + // This instruction not available in 32-bit x86. + return __builtin_ia32_pext_di(a, mask); + } + else + return pext!ulong(a, mask); + } + else + return pext!ulong(a, mask); + } + else + { + return pext!ulong(a, mask); + } +} +unittest +{ + static assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531); + assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531); +} + +// Helper function for PEXT +private T pext(T)(T a, T mask) +{ + /+ + tmp := a + dst := 0 + m := 0 + k := 0 + DO WHILE m < number of bits in T + IF mask[m] == 1 + dst[k] := tmp[m] + k := k + 1 + FI + m := m + 1 + OD + +/ + T dst; + T k_bitpos = 1; + T m_bitpos = 1; // for each iteration, this has one bit set to 1 in the position probed + foreach (m; 0..T.sizeof*8) + { + if (mask & m_bitpos) + { + dst |= (a & m_bitpos) ? k_bitpos : 0; + k_bitpos <<= 1; + } + m_bitpos <<= 1; + } + return dst; +} diff --git a/external/inteli/emmintrin.d b/external/inteli/emmintrin.d new file mode 100644 index 0000000..67bf67a --- /dev/null +++ b/external/inteli/emmintrin.d @@ -0,0 +1,5773 @@ +/** +* SSE2 intrinsics. +* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE2 +* +* Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019. +* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) +*/ +module inteli.emmintrin; + +public import inteli.types; +public import inteli.xmmintrin; // SSE2 includes SSE1 +import inteli.mmx; +import inteli.internals; + +nothrow @nogc: + + +// SSE2 instructions +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2 + +/// Add packed 16-bit integers in `a` and `b`. +__m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe +{ + pragma(inline, true); + return cast(__m128i)(cast(short8)a + cast(short8)b); +} +unittest +{ + __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77); + short8 R = cast(short8) _mm_add_epi16(A, A); + short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154]; + assert(R.array == correct); +} + +/// Add packed 32-bit integers in `a` and `b`. +__m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe +{ + pragma(inline, true); + return cast(__m128i)(cast(int4)a + cast(int4)b); +} +unittest +{ + __m128i A = _mm_setr_epi32( -7, -1, 0, 9); + int4 R = _mm_add_epi32(A, A); + int[4] correct = [ -14, -2, 0, 18 ]; + assert(R.array == correct); +} + +/// Add packed 64-bit integers in `a` and `b`. +__m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe +{ + pragma(inline, true); + return cast(__m128i)(cast(long2)a + cast(long2)b); +} +unittest +{ + __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000); + long2 R = cast(long2) _mm_add_epi64(A, A); + long[2] correct = [ -2, 0 ]; + assert(R.array == correct); +} + +/// Add packed 8-bit integers in `a` and `b`. 
+__m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe +{ + pragma(inline, true); + return cast(__m128i)(cast(byte16)a + cast(byte16)b); +} +unittest +{ + __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78); + byte16 R = cast(byte16) _mm_add_epi8(A, A); + byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100]; + assert(R.array == correct); +} + +/// Add the lower double-precision (64-bit) floating-point element +/// in `a` and `b`, store the result in the lower element of dst, +/// and copy the upper element from `a` to the upper element of destination. +__m128d _mm_add_sd(__m128d a, __m128d b) pure @safe +{ + static if (DMD_with_DSIMD) + { + return cast(__m128d) __simd(XMM.ADDSD, a, b); + } + else static if (GDC_with_SSE2) + { + return __builtin_ia32_addsd(a, b); + } + else version(DigitalMars) + { + // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 + // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again + asm pure nothrow @nogc @trusted { nop;} + a[0] = a[0] + b[0]; + return a; + } + else + { + a[0] += b[0]; + return a; + } +} +unittest +{ + __m128d a = [1.5, -2.0]; + a = _mm_add_sd(a, a); + assert(a.array == [3.0, -2.0]); +} + +/// Add packed double-precision (64-bit) floating-point elements in `a` and `b`. +__m128d _mm_add_pd (__m128d a, __m128d b) pure @safe +{ + pragma(inline, true); + return a + b; +} +unittest +{ + __m128d a = [1.5, -2.0]; + a = _mm_add_pd(a, a); + assert(a.array == [3.0, -4.0]); +} + +/// Add 64-bit integers `a` and `b`. +__m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe +{ + // PERF DMD + pragma(inline, true); + return a + b; +} + +/// Add packed 16-bit integers in `a` and `b` using signed saturation. +__m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted +{ + static if (DMD_with_DSIMD) + { + return cast(__m128i) __simd(XMM.PADDSW, a, b); + } + else static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b); + } + else static if(LDC_with_saturated_intrinsics) + { + return cast(__m128i) inteli_llvm_adds!short8(cast(short8)a, cast(short8)b); + } + else + { + short[8] res; // PERF =void; + short8 sa = cast(short8)a; + short8 sb = cast(short8)b; + foreach(i; 0..8) + res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]); + return _mm_loadu_si128(cast(int4*)res.ptr); + } +} +unittest +{ + short8 res = cast(short8) _mm_adds_epi16(_mm_setr_epi16( 7, 6, 5, -32768, 3, 3, 32767, 0), + _mm_setr_epi16( 7, 6, 5, -30000, 3, 1, 1, -10)); + static immutable short[8] correctResult = [14, 12, 10, -32768, 6, 4, 32767, -10]; + assert(res.array == correctResult); +} + +/// Add packed 8-bit signed integers in `a` and `b` using signed saturation. 
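+// Illustrative sketch: unlike `_mm_add_epi16`, which wraps around on overflow, the saturating
+// adds (`_mm_adds_epi16` above, `_mm_adds_epi8` below) clamp to the representable range, which
+// is usually what you want when mixing 16-bit audio samples:
+//
+//     __m128i x = _mm_set1_epi16(30000);
+//     short wrapped   = (cast(short8) _mm_add_epi16 (x, x)).array[0];  // -5536 (wrapped)
+//     short saturated = (cast(short8) _mm_adds_epi16(x, x)).array[0];  //  32767 (clamped)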
+__m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted +{ + static if (DMD_with_DSIMD) + { + return cast(__m128i) __simd(XMM.PADDSB, a, b); + } + else static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_paddsb128(cast(ubyte16)a, cast(ubyte16)b); + } + else static if(LDC_with_saturated_intrinsics) + { + return cast(__m128i) inteli_llvm_adds!byte16(cast(byte16)a, cast(byte16)b); + } + else + { + byte[16] res; // PERF =void; + byte16 sa = cast(byte16)a; + byte16 sb = cast(byte16)b; + foreach(i; 0..16) + res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]); + return _mm_loadu_si128(cast(int4*)res.ptr); + } +} +unittest +{ + byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0), + _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, -4, 3, 2, 1, 0)); + static immutable byte[16] correctResult = [0, 2, 4, 6, -128, 10, 12, 14, + 16, 18, 127, 22, 24, 26, 28, 30]; + assert(res.array == correctResult); +} + +/// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation. +__m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted +{ + static if (DMD_with_DSIMD) + { + return cast(__m128i) __simd(XMM.PADDUSB, a, b); + } + else static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_paddusb128(cast(ubyte16)a, cast(ubyte16)b); + } + else static if(LDC_with_saturated_intrinsics) + { + return cast(__m128i) inteli_llvm_addus!byte16(cast(byte16)a, cast(byte16)b); + } + else + { + ubyte[16] res; // PERF =void; + byte16 sa = cast(byte16)a; + byte16 sb = cast(byte16)b; + foreach(i; 0..16) + res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i])); + return _mm_loadu_si128(cast(int4*)res.ptr); + } +} +unittest +{ + byte16 res = cast(byte16) + _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0), + _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0)); + static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14, + 0, cast(byte)255, 4, 6, 8, 10, 12, 14]; + assert(res.array == correctResult); +} + +/// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation. +__m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted +{ + static if (DMD_with_DSIMD) + { + // Note: DMD generates a reverted paddusw vs LDC and GDC, but that doesn't change the result anyway + return cast(__m128i) __simd(XMM.PADDUSW, a, b); + } + else static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_paddusw128(cast(short8)a, cast(short8)b); + } + else static if(LDC_with_saturated_intrinsics) + { + return cast(__m128i) inteli_llvm_addus!short8(cast(short8)a, cast(short8)b); + } + else + { + ushort[8] res; // PERF =void; + short8 sa = cast(short8)a; + short8 sb = cast(short8)b; + foreach(i; 0..8) + res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i])); + return _mm_loadu_si128(cast(int4*)res.ptr); + } +} +unittest +{ + short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0), + _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0)); + static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6]; + assert(res.array == correctResult); +} + +/// Compute the bitwise AND of packed double-precision (64-bit) +/// floating-point elements in `a` and `b`. 
+__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe +{ + pragma(inline, true); + return cast(__m128d)( cast(long2)a & cast(long2)b ); +} +unittest +{ + double a = 4.32; + double b = -78.99; + long correct = (*cast(long*)(&a)) & (*cast(long*)(&b)); + __m128d A = _mm_set_pd(a, b); + __m128d B = _mm_set_pd(b, a); + long2 R = cast(long2)( _mm_and_pd(A, B) ); + assert(R.array[0] == correct); + assert(R.array[1] == correct); +} + +/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`. +__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe +{ + pragma(inline, true); + return a & b; +} +unittest +{ + __m128i A = _mm_set1_epi32(7); + __m128i B = _mm_set1_epi32(14); + __m128i R = _mm_and_si128(A, B); + int[4] correct = [6, 6, 6, 6]; + assert(R.array == correct); +} + +/// Compute the bitwise NOT of packed double-precision (64-bit) +/// floating-point elements in `a` and then AND with `b`. +__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe +{ + static if (DMD_with_DSIMD) + { + return cast(__m128d) __simd(XMM.ANDNPD, a, b); + } + else + { + return cast(__m128d)( ~(cast(long2)a) & cast(long2)b); + } +} +unittest +{ + double a = 4.32; + double b = -78.99; + long correct = (~*cast(long*)(&a)) & ( *cast(long*)(&b)); + long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b)); + __m128d A = _mm_setr_pd(a, b); + __m128d B = _mm_setr_pd(b, a); + long2 R = cast(long2)( _mm_andnot_pd(A, B) ); + assert(R.array[0] == correct); + assert(R.array[1] == correct2); +} + +/// Compute the bitwise NOT of 128 bits (representing integer data) +/// in `a` and then AND with `b`. +__m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe +{ + static if (DMD_with_DSIMD) + { + return cast(__m128i) __simd(XMM.PANDN, a, b); + } + else + { + return (~a) & b; + } +} +unittest +{ + __m128i A = _mm_setr_epi32(7, -2, 9, 54654); + __m128i B = _mm_setr_epi32(14, 78, 111, -256); + __m128i R = _mm_andnot_si128(A, B); + int[4] correct = [8, 0, 102, -54784]; + assert(R.array == correct); +} + +/// Average packed unsigned 16-bit integers in `a` and `b`. 
+__m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted +{ + static if (DMD_with_DSIMD) + { + return cast(__m128i) __simd(XMM.PAVGW, a, b); + } + else static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b); + } + else static if (LDC_with_ARM64) + { + return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b); + } + else static if (LDC_with_SSE2 && __VERSION__ >= 2094) + { + // Exists since LDC 1.18 + return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b); + } + else static if (LDC_with_optimizations) + { + // Generates pavgw even in LDC 1.0, even in -O0 + // But not in ARM + enum ir = ` + %ia = zext <8 x i16> %0 to <8 x i32> + %ib = zext <8 x i16> %1 to <8 x i32> + %isum = add <8 x i32> %ia, %ib + %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %r = trunc <8 x i32> %isums to <8 x i16> + ret <8 x i16> %r`; + return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b); + } + else + { + short8 sa = cast(short8)a; + short8 sb = cast(short8)b; + short8 sr = void; + foreach(i; 0..8) + { + sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 ); + } + return cast(int4)sr; + } +} +unittest +{ + __m128i A = _mm_set1_epi16(31); + __m128i B = _mm_set1_epi16(64); + short8 avg = cast(short8)(_mm_avg_epu16(A, B)); + foreach(i; 0..8) + assert(avg.array[i] == 48); +} + +/// Average packed unsigned 8-bit integers in `a` and `b`. +__m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted +{ + static if (DMD_with_DSIMD) + { + return cast(__m128i) __simd(XMM.PAVGB, a, b); + } + else static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_pavgb128(cast(ubyte16)a, cast(ubyte16)b); + } + else static if (LDC_with_SSE2 && __VERSION__ >= 2094) + { + // Exists since LDC 1.18 + return cast(__m128i) __builtin_ia32_pavgb128(cast(byte16)a, cast(byte16)b); + } + else static if (LDC_with_ARM64) + { + return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b); + } + else static if (LDC_with_optimizations) + { + // Generates pavgb even in LDC 1.0, even in -O0 + // But not in ARM + enum ir = ` + %ia = zext <16 x i8> %0 to <16 x i16> + %ib = zext <16 x i8> %1 to <16 x i16> + %isum = add <16 x i16> %ia, %ib + %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %r = trunc <16 x i16> %isums to <16 x i8> + ret <16 x i8> %r`; + return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); + } + else + { + byte16 sa = cast(byte16)a; + byte16 sb = cast(byte16)b; + byte16 sr = void; + foreach(i; 0..16) + { + sr.ptr[i] = cast(ubyte)( (cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]) + 1) >> 1 ); + } + return cast(int4)sr; + } +} +unittest +{ + __m128i A = _mm_set1_epi8(31); + __m128i B = _mm_set1_epi8(64); + byte16 avg = cast(byte16)(_mm_avg_epu8(A, B)); + foreach(i; 0..16) + assert(avg.array[i] == 48); +} + +/// Shift `a` left by `bytes` bytes while shifting in zeros. 
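+// Illustrative sketch: `_mm_avg_epu8` above computes (x + y + 1) >> 1 per byte, i.e. a rounded
+// 50% blend, so averaging 16 RGBA bytes of two pixels is a single operation:
+//
+//     __m128i px0 = _mm_set1_epi8(100);
+//     __m128i px1 = _mm_set1_epi8(cast(byte)201);
+//     __m128i half = _mm_avg_epu8(px0, px1);   // every byte is (100 + 201 + 1) >> 1 == 151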
+alias _mm_bslli_si128 = _mm_slli_si128; +unittest +{ + __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + byte[16] exact = [0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; + __m128i result = _mm_bslli_si128!5(toShift); + assert( (cast(byte16)result).array == exact); +} + +/// Shift `v` right by `bytes` bytes while shifting in zeros. +alias _mm_bsrli_si128 = _mm_srli_si128; +unittest +{ + __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + byte[16] exact = [5, 6, 7, 8, 9,10,11,12,13,14, 15, 0, 0, 0, 0, 0]; + __m128i result = _mm_bsrli_si128!5(toShift); + assert( (cast(byte16)result).array == exact); +} + +/// Cast vector of type `__m128d` to type `__m128`. +/// Note: Also possible with a regular `cast(__m128)(a)`. +__m128 _mm_castpd_ps (__m128d a) pure @safe +{ + return cast(__m128)a; +} + +/// Cast vector of type `__m128d` to type `__m128i`. +/// Note: Also possible with a regular `cast(__m128i)(a)`. +__m128i _mm_castpd_si128 (__m128d a) pure @safe +{ + return cast(__m128i)a; +} + +/// Cast vector of type `__m128` to type `__m128d`. +/// Note: Also possible with a regular `cast(__m128d)(a)`. +__m128d _mm_castps_pd (__m128 a) pure @safe +{ + return cast(__m128d)a; +} + +/// Cast vector of type `__m128` to type `__m128i`. +/// Note: Also possible with a regular `cast(__m128i)(a)`. +__m128i _mm_castps_si128 (__m128 a) pure @safe +{ + return cast(__m128i)a; +} + +/// Cast vector of type `__m128i` to type `__m128d`. +/// Note: Also possible with a regular `cast(__m128d)(a)`. +__m128d _mm_castsi128_pd (__m128i a) pure @safe +{ + return cast(__m128d)a; +} + +/// Cast vector of type `__m128i` to type `__m128`. +/// Note: Also possible with a regular `cast(__m128)(a)`. +__m128 _mm_castsi128_ps (__m128i a) pure @safe +{ + return cast(__m128)a; +} + +/// Invalidate and flush the cache line that contains `p` +/// from all levels of the cache hierarchy. +void _mm_clflush (const(void)* p) @trusted +{ + static if (GDC_with_SSE2) + { + __builtin_ia32_clflush(p); + } + else static if (LDC_with_SSE2) + { + __builtin_ia32_clflush(cast(void*)p); + } + else version(D_InlineAsm_X86) + { + asm pure nothrow @nogc @trusted + { + mov EAX, p; + clflush [EAX]; + } + } + else version(D_InlineAsm_X86_64) + { + asm pure nothrow @nogc @trusted + { + mov RAX, p; + clflush [RAX]; + } + } + else + { + // Do nothing. Invalidating cacheline does + // not affect correctness. + } +} +unittest +{ + ubyte[64] cacheline; + _mm_clflush(cacheline.ptr); +} + +/// Compare packed 16-bit integers in `a` and `b` for equality. +__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe +{ + static if (SIMD_COMPARISON_MASKS_16B) + { + return cast(__m128i)(cast(short8)a == cast(short8)b); + } + else static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8)a, cast(short8)b); + } + else + { + return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b); + } +} +unittest +{ + short8 A = [-3, -2, -1, 0, 0, 1, 2, 3]; + short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3]; + short[8] E = [ 0, 0, 0, 0, -1, 0, 0, 0]; + short8 R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B)); + assert(R.array == E); +} + +/// Compare packed 32-bit integers in `a` and `b` for equality. 
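+// Illustrative sketch: the compare intrinsics in this file return lanes of all-ones (-1) or
+// all-zeros, which combine with AND/ANDNOT/OR into a branchless per-lane select
+// (a, b, x, y below are placeholders for any __m128i values):
+//
+//     __m128i mask = _mm_cmpeq_epi16(a, b);                    // -1 where a[i] == b[i]
+//     __m128i r    = _mm_or_si128(_mm_and_si128(mask, x),      // take x where equal
+//                                 _mm_andnot_si128(mask, y));  // take y elsewhere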
+__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe +{ + static if (SIMD_COMPARISON_MASKS_16B) + { + return cast(__m128i)(cast(int4)a == cast(int4)b); + } + else static if (GDC_with_SSE2) + { + return __builtin_ia32_pcmpeqd128(a, b); + } + else + { + return equalMask!__m128i(a, b); + } +} +unittest +{ + int4 A = [-3, -2, -1, 0]; + int4 B = [ 4, -2, 2, 0]; + int[4] E = [ 0, -1, 0, -1]; + int4 R = cast(int4)(_mm_cmpeq_epi32(A, B)); + assert(R.array == E); +} + +/// Compare packed 8-bit integers in `a` and `b` for equality. +__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe +{ + static if (SIMD_COMPARISON_MASKS_16B) + { + return cast(__m128i)(cast(byte16)a == cast(byte16)b); + } + else static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_pcmpeqb128(cast(ubyte16)a, cast(ubyte16)b); + } + else + { + return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b); + } +} +unittest +{ + __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1); + __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1); + byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B); + byte[16] correct = [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1]; + assert(C.array == correct); +} + +/// Compare packed double-precision (64-bit) floating-point elements +/// in `a` and `b` for equality. +__m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe +{ + static if (SIMD_COMPARISON_MASKS_16B) + { + return cast(double2)(cast(double2)a == cast(double2)b); + } + else static if (GDC_with_SSE2) + { + return __builtin_ia32_cmpeqpd(a, b); + } + else + { + return cast(__m128d) cmppd!(FPComparison.oeq)(a, b); + } +} +unittest +{ + double2 A = _mm_setr_pd(1.0, 2.0); + double2 B = _mm_setr_pd(0.0, 2.0); + double2 N = _mm_setr_pd(double.nan, double.nan); + long2 C = cast(long2) _mm_cmpeq_pd(A, B); + long[2] correctC = [0, -1]; + assert(C.array == correctC); + long2 D = cast(long2) _mm_cmpeq_pd(N, N); + long[2] correctD = [0, 0]; + assert(D.array == correctD); +} + +/// Compare the lower double-precision (64-bit) floating-point elements +/// in `a` and `b` for equality, store the result in the lower element, +/// and copy the upper element from `a`. +__m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe +{ + static if (DMD_with_DSIMD) + { + return cast(__m128d) __simd(XMM.CMPSD, a, b, 0); + } + else static if (GDC_with_SSE2) + { + return __builtin_ia32_cmpeqsd(a, b); + } + else + { + return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b); + } +} +unittest +{ + double2 A = _mm_setr_pd(0.0, 2.0); + double2 B = _mm_setr_pd(1.0, 2.0); + double2 C = _mm_setr_pd(1.0, 3.0); + double2 D = cast(double2) _mm_cmpeq_sd(A, B); + long2 E = cast(long2) _mm_cmpeq_sd(B, C); + double[2] correctD = [0.0, 2.0]; + double two = 2.0; + long[2] correctE = [-1, *cast(long*)&two]; + assert(D.array == correctD); + assert(E.array == correctE); +} + +/// Compare packed 16-bit integers elements in `a` and `b` for greater-than-or-equal. 
+/// #BONUS +__m128i _mm_cmpge_epi16 (__m128i a, __m128i b) pure @safe +{ + static if (SIMD_COMPARISON_MASKS_16B) + { + return cast(__m128i)(cast(short8)a >= cast(short8)b); + } + else version (LDC) + { + // LDC ARM64: generates cmge since -O1 + return cast(__m128i) greaterOrEqualMask!short8(cast(short8)a, cast(short8)b); + } + else + { + return _mm_xor_si128(_mm_cmpeq_epi16(a, b), _mm_cmpgt_epi16(a, b)); + } +} +unittest +{ + short8 A = [-3, -2, -32768, 0, 0, 1, 2, 3]; + short8 B = [ 4, 3, 32767, 1, 0, -1, -2, -3]; + short[8] E = [ 0, 0, 0, 0, -1, -1, -1, -1]; + short8 R = cast(short8)(_mm_cmpge_epi16(cast(__m128i)A, cast(__m128i)B)); + assert(R.array == E); +} + +/// Compare packed double-precision (64-bit) floating-point elements +/// in `a` and `b` for greater-than-or-equal. +__m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe +{ + static if (SIMD_COMPARISON_MASKS_16B) + { + return cast(__m128d)(a >= b); + } + else static if (GDC_with_SSE2) + { + return __builtin_ia32_cmpgepd(a, b); + } + else + { + return cast(__m128d) cmppd!(FPComparison.oge)(a, b); + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements +/// in `a` and `b` for greater-than-or-equal, store the result in the +/// lower element, and copy the upper element from `a`. +__m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe +{ + static if (DMD_with_DSIMD) + { + return cast(__m128d) __simd(XMM.CMPSD, b, a, 2); + } + else static if (GDC_with_SSE2) + { + return __builtin_ia32_cmplesd(b, a); + } + else + { + return cast(__m128d) cmpsd!(FPComparison.oge)(a, b); + } +} +unittest +{ + __m128d A = _mm_setr_pd(1.0, 0.0); + __m128d B = _mm_setr_pd(double.nan, 0.0); + __m128d C = _mm_setr_pd(2.0, 0.0); + assert( (cast(long2)_mm_cmpge_sd(A, A)).array[0] == -1); + assert( (cast(long2)_mm_cmpge_sd(A, B)).array[0] == 0); + assert( (cast(long2)_mm_cmpge_sd(A, C)).array[0] == 0); + assert( (cast(long2)_mm_cmpge_sd(B, A)).array[0] == 0); + assert( (cast(long2)_mm_cmpge_sd(B, B)).array[0] == 0); + assert( (cast(long2)_mm_cmpge_sd(B, C)).array[0] == 0); + assert( (cast(long2)_mm_cmpge_sd(C, A)).array[0] == -1); + assert( (cast(long2)_mm_cmpge_sd(C, B)).array[0] == 0); + assert( (cast(long2)_mm_cmpge_sd(C, C)).array[0] == -1); +} + +/// Compare packed 16-bit integers in `a` and `b` for greater-than. +__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe +{ + static if (SIMD_COMPARISON_MASKS_16B) + { + return cast(__m128i)(cast(short8)a > cast(short8)b); + } + else static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_pcmpgtw128(cast(short8)a, cast(short8)b); + } + else + { + return cast(__m128i) greaterMask!short8(cast(short8)a, cast(short8)b); + } +} +unittest +{ + short8 A = [-3, -2, -1, 0, 0, 1, 2, 3]; + short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3]; + short[8] E = [ 0, 0, 0, 0, 0, -1, -1, -1]; + short8 R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B)); + assert(R.array == E); +} + +/// Compare packed 32-bit integers in `a` and `b` for greater-than. 
+__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
+{
+    static if (SIMD_COMPARISON_MASKS_16B)
+    {
+        return cast(__m128i)(cast(int4)a > cast(int4)b);
+    }
+    else static if (GDC_with_SSE2)
+    {
+        return __builtin_ia32_pcmpgtd128(a, b);
+    }
+    else
+    {
+        return cast(__m128i)( greaterMask!int4(a, b));
+    }
+}
+unittest
+{
+    int4 A = [-3, 2, -1, 0];
+    int4 B = [ 4, -2, 2, 0];
+    int[4] E = [ 0, -1, 0, 0];
+    int4 R = cast(int4)(_mm_cmpgt_epi32(A, B));
+    assert(R.array == E);
+}
+
+/// Compare packed 8-bit integers in `a` and `b` for greater-than.
+__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
+{
+    static if (SIMD_COMPARISON_MASKS_16B)
+    {
+        return cast(__m128i)(cast(byte16)a > cast(byte16)b);
+    }
+    else
+    {
+        // Note: __builtin_ia32_pcmpgtb128 is buggy, do not use with GDC
+        // TODO: re-check that
+        return cast(__m128i) greaterMask!byte16(cast(byte16)a, cast(byte16)b);
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 127, -80, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
+    __m128i B = _mm_setr_epi8(2, 2, 1, 2, -128, -42, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
+    byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
+    byte[16] correct = [0, 0,-1, 0, -1, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
+    __m128i D = _mm_cmpeq_epi8(A, B);
+    assert(C.array == correct);
+}
+
+/// Compare packed double-precision (64-bit) floating-point elements
+/// in `a` and `b` for greater-than.
+__m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
+{
+    static if (SIMD_COMPARISON_MASKS_16B)
+    {
+        return cast(__m128d)(a > b);
+    }
+    else static if (GDC_with_SSE2)
+    {
+        return __builtin_ia32_cmpgtpd(a, b);
+    }
+    else
+    {
+        return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
+    }
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements
+/// in `a` and `b` for greater-than, store the result in the lower element,
+/// and copy the upper element from `a`.
+__m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
+{
+    static if (DMD_with_DSIMD)
+    {
+        return cast(__m128d) __simd(XMM.CMPSD, b, a, 1);
+    }
+    else static if (GDC_with_SSE2)
+    {
+        return __builtin_ia32_cmpltsd(b, a);
+    }
+    else
+    {
+        return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
+    }
+}
+unittest
+{
+    __m128d A = _mm_setr_pd(1.0, 0.0);
+    __m128d B = _mm_setr_pd(double.nan, 0.0);
+    __m128d C = _mm_setr_pd(2.0, 0.0);
+    assert( (cast(long2)_mm_cmpgt_sd(A, A)).array[0] == 0);
+    assert( (cast(long2)_mm_cmpgt_sd(A, B)).array[0] == 0);
+    assert( (cast(long2)_mm_cmpgt_sd(A, C)).array[0] == 0);
+    assert( (cast(long2)_mm_cmpgt_sd(B, A)).array[0] == 0);
+    assert( (cast(long2)_mm_cmpgt_sd(B, B)).array[0] == 0);
+    assert( (cast(long2)_mm_cmpgt_sd(B, C)).array[0] == 0);
+    assert( (cast(long2)_mm_cmpgt_sd(C, A)).array[0] == -1);
+    assert( (cast(long2)_mm_cmpgt_sd(C, B)).array[0] == 0);
+    assert( (cast(long2)_mm_cmpgt_sd(C, C)).array[0] == 0);
+}
+
+
+/// Compare packed 16-bit integer elements in `a` and `b` for less-than-or-equal.
+/// #BONUS +__m128i _mm_cmple_epi16 (__m128i a, __m128i b) pure @safe +{ + static if (SIMD_COMPARISON_MASKS_16B) + { + return cast(__m128i)(cast(short8)a <= cast(short8)b); + } + else version (LDC) + { + // LDC ARM64: generates cmge since -O1 + return cast(__m128i) greaterOrEqualMask!short8(cast(short8)b, cast(short8)a); + } + else + { + return _mm_xor_si128(_mm_cmpeq_epi16(b, a), _mm_cmpgt_epi16(b, a)); + } +} +unittest +{ + short8 A = [-3, -2, -32768, 1, 0, 1, 2, 3]; + short8 B = [ 4, 3, 32767, 0, 0, -1, -2, -3]; + short[8] E = [-1, -1, -1, 0, -1, 0, 0, 0]; + short8 R = cast(short8)(_mm_cmple_epi16(cast(__m128i)A, cast(__m128i)B)); + assert(R.array == E); +} + +/// Compare packed double-precision (64-bit) floating-point elements +/// in `a` and `b` for less-than-or-equal. +__m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe +{ + static if (SIMD_COMPARISON_MASKS_16B) + { + return cast(__m128d)(a <= b); + } + else static if (GDC_with_SSE2) + { + return __builtin_ia32_cmplepd(a, b); + } + else + { + return cast(__m128d) cmppd!(FPComparison.ole)(a, b); + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements +/// in `a` and `b` for less-than-or-equal, store the result in the +/// lower element, and copy the upper element from `a`. +__m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe +{ + static if (DMD_with_DSIMD) + { + return cast(__m128d) __simd(XMM.CMPSD, a, b, 2); + } + else static if (GDC_with_SSE2) + { + return __builtin_ia32_cmplesd(a, b); + } + else + { + return cast(__m128d) cmpsd!(FPComparison.ole)(a, b); + } +} + +/// Compare packed 16-bit integers in `a` and `b` for less-than. +__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe +{ + return _mm_cmpgt_epi16(b, a); +} + +/// Compare packed 32-bit integers in `a` and `b` for less-than. +__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe +{ + return _mm_cmpgt_epi32(b, a); +} + +/// Compare packed 8-bit integers in `a` and `b` for less-than. +__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe +{ + return _mm_cmpgt_epi8(b, a); +} + +/// Compare packed double-precision (64-bit) floating-point elements +/// in `a` and `b` for less-than. +__m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe +{ + static if (SIMD_COMPARISON_MASKS_16B) + { + return cast(__m128d)(a < b); + } + else static if (GDC_with_SSE2) + { + return __builtin_ia32_cmpltpd(a, b); + } + else + { + return cast(__m128d) cmppd!(FPComparison.olt)(a, b); + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements +/// in `a` and `b` for less-than, store the result in the lower +/// element, and copy the upper element from `a`. +__m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe +{ + static if (DMD_with_DSIMD) + { + return cast(__m128d) __simd(XMM.CMPSD, a, b, 1); + } + else static if (GDC_with_SSE2) + { + return __builtin_ia32_cmpltsd(a, b); + } + else + { + return cast(__m128d) cmpsd!(FPComparison.olt)(a, b); + } +} + +/// Compare packed double-precision (64-bit) floating-point elements +/// in `a` and `b` for not-equal. +__m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe +{ + static if (GDC_with_SSE2) + { + return __builtin_ia32_cmpneqpd(a, b); + } + else + { + return cast(__m128d) cmppd!(FPComparison.une)(a, b); + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements +/// in `a` and `b` for not-equal, store the result in the lower +/// element, and copy the upper element from `a`. 
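+// As with _mm_cmpneq_pd above, "not-equal" is the unordered predicate (UNE): a lane
+// compares true when the values differ or when either input is NaN. For example,
+// _mm_cmpneq_pd(_mm_set1_pd(double.nan), _mm_set1_pd(double.nan)) yields all-ones in both lanes.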
+__m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe +{ + static if (GDC_with_SSE2) + { + return __builtin_ia32_cmpneqsd(a, b); + } + else + { + return cast(__m128d) cmpsd!(FPComparison.une)(a, b); + } +} + +/// Compare packed double-precision (64-bit) floating-point elements +/// in `a` and `b` for not-greater-than-or-equal. +__m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe +{ + static if (GDC_with_SSE2) + { + return __builtin_ia32_cmpngepd(a, b); + } + else + { + return cast(__m128d) cmppd!(FPComparison.ult)(a, b); + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements +/// in `a` and `b` for not-greater-than-or-equal, store the result in +/// the lower element, and copy the upper element from `a`. +__m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe +{ + // Note: There is no __builtin_ia32_cmpngesd builtin. + static if (GDC_with_SSE2) + { + return __builtin_ia32_cmpltsd(b, a); + } + else + { + return cast(__m128d) cmpsd!(FPComparison.ult)(a, b); + } +} + +/// Compare packed double-precision (64-bit) floating-point elements +/// in `a` and `b` for not-greater-than. +__m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe +{ + static if (GDC_with_SSE2) + { + return __builtin_ia32_cmpngtpd(a, b); + } + else + { + return cast(__m128d) cmppd!(FPComparison.ule)(a, b); + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements +/// in `a` and `b` for not-greater-than, store the result in the +/// lower element, and copy the upper element from `a`. +__m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe +{ + // Note: There is no __builtin_ia32_cmpngtsd builtin. + static if (GDC_with_SSE2) + { + return __builtin_ia32_cmplesd(b, a); + } + else + { + return cast(__m128d) cmpsd!(FPComparison.ule)(a, b); + } +} + +/// Compare packed double-precision (64-bit) floating-point elements +/// in `a` and `b` for not-less-than-or-equal. +__m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe +{ + static if (GDC_with_SSE2) + { + return __builtin_ia32_cmpnlepd(a, b); + } + else + { + return cast(__m128d) cmppd!(FPComparison.ugt)(a, b); + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements +/// in `a` and `b` for not-less-than-or-equal, store the result in the +/// lower element, and copy the upper element from `a`. +__m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe +{ + static if (GDC_with_SSE2) + { + return __builtin_ia32_cmpnlesd(a, b); + } + else + { + return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b); + } +} + +/// Compare packed double-precision (64-bit) floating-point elements +/// in `a` and `b` for not-less-than. +__m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe +{ + static if (GDC_with_SSE2) + { + return __builtin_ia32_cmpnltpd(a, b); + } + else + { + return cast(__m128d) cmppd!(FPComparison.uge)(a, b); + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements +/// in `a` and `b` for not-less-than, store the result in the lower +/// element, and copy the upper element from `a`. +__m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe +{ + static if (GDC_with_SSE2) + { + return __builtin_ia32_cmpnltsd(a, b); + } + else + { + return cast(__m128d) cmpsd!(FPComparison.uge)(a, b); + } +} + +/// Compare packed double-precision (64-bit) floating-point elements +/// in `a` and `b` to see if neither is NaN. 
+__m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe +{ + static if (GDC_with_SSE2) + { + return __builtin_ia32_cmpordpd(a, b); + } + else + { + return cast(__m128d) cmppd!(FPComparison.ord)(a, b); + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements +/// in `a` and `b` to see if neither is NaN, store the result in the +/// lower element, and copy the upper element from `a` to the upper element. +__m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe +{ + static if (GDC_with_SSE2) + { + return __builtin_ia32_cmpordsd(a, b); + } + else + { + return cast(__m128d) cmpsd!(FPComparison.ord)(a, b); + } +} + +/// Compare packed double-precision (64-bit) floating-point elements +/// in `a` and `b` to see if either is NaN. +__m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe +{ + static if (GDC_with_SSE2) + { + return __builtin_ia32_cmpunordpd(a, b); + } + else + { + return cast(__m128d) cmppd!(FPComparison.uno)(a, b); + } +} + +/// Compare the lower double-precision (64-bit) floating-point elements +/// in `a` and `b` to see if either is NaN, store the result in the lower +/// element, and copy the upper element from `a` to the upper element. +__m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe +{ + static if (GDC_with_SSE2) + { + return __builtin_ia32_cmpunordsd(a, b); + } + else + { + return cast(__m128d) cmpsd!(FPComparison.uno)(a, b); + } +} + +/// Compare the lower double-precision (64-bit) floating-point element +/// in `a` and `b` for equality, and return the boolean result (0 or 1). +int _mm_comieq_sd (__m128d a, __m128d b) pure @safe +{ + // Note: For some of the _mm_comixx_sx intrinsics, NaN semantics of the intrinsic are not the same as the + // comisd instruction, it returns false in case of unordered instead. + // + // Actually C++ compilers disagree over the meaning of that instruction. + // GCC will manage NaNs like the comisd instruction (return true if unordered), + // but ICC, clang and MSVC will deal with NaN like the Intel Intrinsics Guide says. + // We choose to do like the most numerous. It seems GCC is buggy with NaNs. + return a.array[0] == b.array[0]; +} +unittest +{ + assert(1 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); + assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); + assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); + assert(0 == _mm_comieq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); + assert(1 == _mm_comieq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0))); +} + +/// Compare the lower double-precision (64-bit) floating-point element +/// in `a` and `b` for greater-than-or-equal, and return the boolean +/// result (0 or 1). +int _mm_comige_sd (__m128d a, __m128d b) pure @safe +{ + return a.array[0] >= b.array[0]; +} +unittest +{ + assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); + assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); + assert(0 == _mm_comige_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0))); + assert(0 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); + assert(0 == _mm_comige_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); + assert(1 == _mm_comige_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0))); +} + +/// Compare the lower double-precision (64-bit) floating-point element +/// in `a` and `b` for greater-than, and return the boolean result (0 or 1). 
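+// Like _mm_comieq_sd above, the comi* helpers return a plain 0/1 int, so they can drive a
+// branch directly (e.g. `if (_mm_comigt_sd(a, b)) ...`) without going through
+// _mm_movemask_pd; any NaN operand makes the result 0 here.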
+int _mm_comigt_sd (__m128d a, __m128d b) pure @safe +{ + return a.array[0] > b.array[0]; +} +unittest +{ + assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); + assert(1 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); + assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); + assert(0 == _mm_comigt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); + assert(0 == _mm_comigt_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0))); +} + +/// Compare the lower double-precision (64-bit) floating-point element +/// in `a` and `b` for less-than-or-equal. +int _mm_comile_sd (__m128d a, __m128d b) pure @safe +{ + return a.array[0] <= b.array[0]; +} +unittest +{ + assert(1 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); + assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); + assert(1 == _mm_comile_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0))); + assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); + assert(0 == _mm_comile_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); + assert(1 == _mm_comile_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0))); +} + +/// Compare the lower double-precision (64-bit) floating-point element +/// in `a` and `b` for less-than, and return the boolean result (0 or 1). +int _mm_comilt_sd (__m128d a, __m128d b) pure @safe +{ + return a.array[0] < b.array[0]; +} +unittest +{ + assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); + assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); + assert(1 == _mm_comilt_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0))); + assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); + assert(0 == _mm_comilt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); + assert(0 == _mm_comilt_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0))); +} + +/// Compare the lower double-precision (64-bit) floating-point element +/// in `a` and `b` for not-equal, and return the boolean result (0 or 1). +int _mm_comineq_sd (__m128d a, __m128d b) pure @safe +{ + return a.array[0] != b.array[0]; +} +unittest +{ + assert(0 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); + assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); + assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); + assert(1 == _mm_comineq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); + assert(0 == _mm_comineq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0))); +} + +/// Convert packed 32-bit integers in `a` to packed double-precision (64-bit) +/// floating-point elements. +__m128d _mm_cvtepi32_pd (__m128i a) pure @trusted +{ + static if (LDC_with_optimizations) + { + // Generates cvtdq2pd since LDC 1.0, even without optimizations + enum ir = ` + %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> + %r = sitofp <2 x i32> %v to <2 x double> + ret <2 x double> %r`; + return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a); + } + else static if (GDC_with_SSE2) + { + return __builtin_ia32_cvtdq2pd(a); + } + else + { + double2 r = void; + r.ptr[0] = a.array[0]; + r.ptr[1] = a.array[1]; + return r; + } +} +unittest +{ + __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54)); + assert(A.array[0] == 54.0); + assert(A.array[1] == 54.0); +} + +/// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) +/// floating-point elements. 
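+// Note: the conversion is exact only for magnitudes below 2^24; larger 32-bit values are
+// rounded to the nearest representable float (e.g. 16777217 converts to 16777216.0f).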
+__m128 _mm_cvtepi32_ps(__m128i a) pure @trusted +{ + static if (DMD_with_DSIMD) + { + return cast(__m128)__simd(XMM.CVTDQ2PS, cast(void16) a); + } + else static if (GDC_with_SSE2) + { + return __builtin_ia32_cvtdq2ps(a); + } + else static if (LDC_with_optimizations) + { + // See #86 for why we had to resort to LLVM IR. + // Plain code below was leading to catastrophic behaviour. + // x86: Generates cvtdq2ps since LDC 1.1.0 -O0 + // ARM: Generats scvtf.4s since LDC 1.8.0 -O0 + enum ir = ` + %r = sitofp <4 x i32> %0 to <4 x float> + ret <4 x float> %r`; + return cast(__m128) LDCInlineIR!(ir, float4, int4)(a); + } + else static if (LDC_with_x86_asm) + { + __m128 r; + asm pure nothrow @nogc @trusted + { + movdqu XMM0, a; + cvtdq2ps XMM0, XMM0; + movdqu r, XMM0; + } + return r; + } + else + { + __m128 res; // PERF =void; + res.ptr[0] = cast(float)a.array[0]; + res.ptr[1] = cast(float)a.array[1]; + res.ptr[2] = cast(float)a.array[2]; + res.ptr[3] = cast(float)a.array[3]; + return res; + } +} +unittest +{ + __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000)); + assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]); +} + +/// Convert packed double-precision (64-bit) floating-point elements +/// in `a` to packed 32-bit integers. +__m128i _mm_cvtpd_epi32 (__m128d a) @trusted +{ + // PERF ARM32 + static if (LDC_with_SSE2) + { + return __builtin_ia32_cvtpd2dq(a); + } + else static if (GDC_with_SSE2) + { + return __builtin_ia32_cvtpd2dq(a); + } + else static if (LDC_with_ARM64) + { + // Get current rounding mode. + uint fpscr = arm_get_fpcr(); + long2 i; + switch(fpscr & _MM_ROUND_MASK_ARM) + { + default: + case _MM_ROUND_NEAREST_ARM: i = vcvtnq_s64_f64(a); break; + case _MM_ROUND_DOWN_ARM: i = vcvtmq_s64_f64(a); break; + case _MM_ROUND_UP_ARM: i = vcvtpq_s64_f64(a); break; + case _MM_ROUND_TOWARD_ZERO_ARM: i = vcvtzq_s64_f64(a); break; + } + int4 zero = 0; + return cast(__m128i) shufflevectorLDC!(int4, 0, 2, 4, 6)(cast(int4)i, zero); // PERF: this slow down build for nothing, test without shufflevector + } + else + { + // PERF ARM32 + __m128i r = _mm_setzero_si128(); + r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]); + r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]); + return r; + } +} +unittest +{ + int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0)); + assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0); +} + +/// Convert packed double-precision (64-bit) floating-point elements in `v` +/// to packed 32-bit integers +__m64 _mm_cvtpd_pi32 (__m128d v) @safe +{ + return to_m64(_mm_cvtpd_epi32(v)); +} +unittest +{ + int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0)); + assert(A.array[0] == 55 && A.array[1] == 61); +} + +/// Convert packed double-precision (64-bit) floating-point elements +/// in `a` to packed single-precision (32-bit) floating-point elements. +__m128 _mm_cvtpd_ps (__m128d a) pure @trusted +{ + static if (LDC_with_SSE2) + { + return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately + } + else static if (GDC_with_SSE2) + { + return __builtin_ia32_cvtpd2ps(a); + } + else + { + __m128 r = void; + r.ptr[0] = a.array[0]; + r.ptr[1] = a.array[1]; + r.ptr[2] = 0; + r.ptr[3] = 0; + return r; + } +} +unittest +{ + __m128d A = _mm_set_pd(5.25, 4.0); + __m128 B = _mm_cvtpd_ps(A); + assert(B.array == [4.0f, 5.25f, 0, 0]); +} + +/// Convert packed 32-bit integers in `v` to packed double-precision +/// (64-bit) floating-point elements. 
+__m128d _mm_cvtpi32_pd (__m64 v) pure @safe +{ + return _mm_cvtepi32_pd(to_m128i(v)); +} +unittest +{ + __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5)); + assert(A.array[0] == 4.0 && A.array[1] == -5.0); +} + +/// Convert packed single-precision (32-bit) floating-point elements +/// in `a` to packed 32-bit integers +__m128i _mm_cvtps_epi32 (__m128 a) @trusted +{ + static if (LDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_cvtps2dq(a); + } + else static if (GDC_with_SSE2) + { + return __builtin_ia32_cvtps2dq(a); + } + else static if (LDC_with_ARM64) + { + // Get current rounding mode. + uint fpscr = arm_get_fpcr(); + switch(fpscr & _MM_ROUND_MASK_ARM) + { + default: + case _MM_ROUND_NEAREST_ARM: return vcvtnq_s32_f32(a); + case _MM_ROUND_DOWN_ARM: return vcvtmq_s32_f32(a); + case _MM_ROUND_UP_ARM: return vcvtpq_s32_f32(a); + case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a); + } + } + else + { + __m128i r = void; + r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]); + r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]); + r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]); + r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]); + return r; + } +} +unittest +{ + // GDC bug #98607 + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607 + // GDC does not provide optimization barrier for rounding mode. + // Workarounded with different literals. This bug will likely only manifest in unittest. + // GCC people provided no actual fix and instead say other compilers are buggy... when they aren't. + + uint savedRounding = _MM_GET_ROUNDING_MODE(); + + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f)); + assert(A.array == [1, -2, 54, -3]); + + _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.11f, 53.4f, -2.8f)); + assert(A.array == [1, -3, 53, -3]); + + _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f)); + assert(A.array == [2, -2, 54, -2]); + + _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.17f, 53.8f, -2.91f)); + assert(A.array == [1, -2, 53, -2]); + + _MM_SET_ROUNDING_MODE(savedRounding); +} + +/// Convert packed single-precision (32-bit) floating-point elements +/// in `a` to packed double-precision (64-bit) floating-point elements. +__m128d _mm_cvtps_pd (__m128 a) pure @trusted +{ + static if (LDC_with_optimizations) + { + // Generates cvtps2pd since LDC 1.0 -O0 + enum ir = ` + %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> + %r = fpext <2 x float> %v to <2 x double> + ret <2 x double> %r`; + return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a); + } + else static if (GDC_with_SSE2) + { + return __builtin_ia32_cvtps2pd(a); + } + else + { + double2 r = void; + r.ptr[0] = a.array[0]; + r.ptr[1] = a.array[1]; + return r; + } +} +unittest +{ + __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f)); + assert(A.array[0] == 54.0); + assert(A.array[1] == 54.0); +} + +/// Copy the lower double-precision (64-bit) floating-point element of `a`. +double _mm_cvtsd_f64 (__m128d a) pure @safe +{ + return a.array[0]; +} + +/// Convert the lower double-precision (64-bit) floating-point element +/// in `a` to a 32-bit integer. 
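+// This conversion honours the current rounding mode (see _MM_SET_ROUNDING_MODE), unlike the
+// truncating _mm_cvttsd_si32 further down; under the default round-to-nearest-even mode,
+// e.g. 2.5 converts to 2 and 3.5 converts to 4.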
+int _mm_cvtsd_si32 (__m128d a) @safe +{ + static if (LDC_with_SSE2) + { + return __builtin_ia32_cvtsd2si(a); + } + else static if (GDC_with_SSE2) + { + return __builtin_ia32_cvtsd2si(a); + } + else + { + return convertDoubleToInt32UsingMXCSR(a[0]); + } +} +unittest +{ + assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0))); +} + +/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer. +long _mm_cvtsd_si64 (__m128d a) @trusted +{ + static if (LDC_with_SSE2) + { + version (X86_64) + { + return __builtin_ia32_cvtsd2si64(a); + } + else + { + // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer + // using SSE instructions only. So the builtin doesn't exist for this arch. + return convertDoubleToInt64UsingMXCSR(a[0]); + } + } + else + { + return convertDoubleToInt64UsingMXCSR(a.array[0]); + } +} +unittest +{ + assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0))); + + uint savedRounding = _MM_GET_ROUNDING_MODE(); + + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49))); + + _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1))); + + _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1))); + + _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9))); + + _MM_SET_ROUNDING_MODE(savedRounding); +} + +deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; /// + +/// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit) +/// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a` +/// to the upper elements of result. +__m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted +{ + static if (GDC_with_SSE2) + { + return __builtin_ia32_cvtsd2ss(a, b); + } + else + { + // Generates cvtsd2ss since LDC 1.3 -O0 + a.ptr[0] = b.array[0]; + return a; + } +} +unittest +{ + __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0)); + assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]); +} + +/// Get the lower 32-bit integer in `a`. +int _mm_cvtsi128_si32 (__m128i a) pure @safe +{ + return a.array[0]; +} + +/// Get the lower 64-bit integer in `a`. +long _mm_cvtsi128_si64 (__m128i a) pure @safe +{ + long2 la = cast(long2)a; + return la.array[0]; +} +deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64; + +/// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the +/// lower element of result, and copy the upper element from `a` to the upper element of result. +__m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted +{ + a.ptr[0] = cast(double)b; + return a; +} +unittest +{ + __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42); + assert(a.array == [42.0, 0]); +} + +/// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements. +__m128i _mm_cvtsi32_si128 (int a) pure @trusted +{ + int4 r = [0, 0, 0, 0]; + r.ptr[0] = a; + return r; +} +unittest +{ + __m128i a = _mm_cvtsi32_si128(65); + assert(a.array == [65, 0, 0, 0]); +} + +/// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store the result in +/// the lower element of result, and copy the upper element from `a` to the upper element of result. 
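+// Note: a double can represent integers exactly only up to 2^53 in magnitude, so very large
+// 64-bit inputs are rounded (e.g. 2L^^53 + 1 converts to 9007199254740992.0).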
+ +__m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted +{ + a.ptr[0] = cast(double)b; + return a; +} +unittest +{ + __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42); + assert(a.array == [42.0, 0]); +} + +/// Copy 64-bit integer `a` to the lower element of result, and zero the upper element. +__m128i _mm_cvtsi64_si128 (long a) pure @trusted +{ + long2 r = [0, 0]; + r.ptr[0] = a; + return cast(__m128i)(r); +} + +deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; /// +deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; /// + +/// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit) +/// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the upper +// element of result. +double2 _mm_cvtss_sd(double2 a, float4 b) pure @trusted +{ + a.ptr[0] = b.array[0]; + return a; +} +unittest +{ + __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f)); + assert(a.array == [42.0, 0]); +} + +/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation. +long _mm_cvttss_si64 (__m128 a) pure @safe +{ + return cast(long)(a.array[0]); // Generates cvttss2si as expected +} +unittest +{ + assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f))); +} + +/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation. +/// Put zeroes in the upper elements of result. +__m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted +{ + static if (LDC_with_SSE2) + { + return __builtin_ia32_cvttpd2dq(a); + } + else static if (GDC_with_SSE2) + { + return __builtin_ia32_cvttpd2dq(a); + } + else + { + // Note: doesn't generate cvttpd2dq as of LDC 1.13 + __m128i r; // PERF =void; + r.ptr[0] = cast(int)a.array[0]; + r.ptr[1] = cast(int)a.array[1]; + r.ptr[2] = 0; + r.ptr[3] = 0; + return r; + } +} +unittest +{ + __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f)); + assert(R.array == [-4, 45641, 0, 0]); +} + +/// Convert packed double-precision (64-bit) floating-point elements in `v` +/// to packed 32-bit integers with truncation. +__m64 _mm_cvttpd_pi32 (__m128d v) pure @safe +{ + return to_m64(_mm_cvttpd_epi32(v)); +} +unittest +{ + int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f)); + int[2] correct = [-4, 45641]; + assert(R.array == correct); +} + +/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation. +__m128i _mm_cvttps_epi32 (__m128 a) pure @trusted +{ + // x86: Generates cvttps2dq since LDC 1.3 -O2 + // ARM64: generates fcvtze since LDC 1.8 -O2 + __m128i r; // PERF = void; + r.ptr[0] = cast(int)a.array[0]; + r.ptr[1] = cast(int)a.array[1]; + r.ptr[2] = cast(int)a.array[2]; + r.ptr[3] = cast(int)a.array[3]; + return r; +} +unittest +{ + __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f)); + assert(R.array == [-4, 45641, 0, 1]); +} + +/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation. +int _mm_cvttsd_si32 (__m128d a) +{ + // Generates cvttsd2si since LDC 1.3 -O0 + return cast(int)a.array[0]; +} + +/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation. 
+long _mm_cvttsd_si64 (__m128d a) +{ + // Generates cvttsd2si since LDC 1.3 -O0 + // but in 32-bit instead, it's a long sequence that resort to FPU + return cast(long)a.array[0]; +} + +deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; /// + +/// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`. +__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe +{ + pragma(inline, true); + return a / b; +} + +__m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted +{ + static if (GDC_with_SSE2) + { + return __builtin_ia32_divsd(a, b); + } + else version(DigitalMars) + { + // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 + // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again + asm pure nothrow @nogc @trusted { nop;} + a.array[0] = a.array[0] / b.array[0]; + return a; + } + else + { + a.ptr[0] /= b.array[0]; + return a; + } +} +unittest +{ + __m128d a = [2.0, 4.5]; + a = _mm_div_sd(a, a); + assert(a.array == [1.0, 4.5]); +} + +/// Extract a 16-bit integer from `v`, selected with `index`. +/// Warning: the returned value is zero-extended to 32-bits. +int _mm_extract_epi16(__m128i v, int index) pure @safe +{ + short8 r = cast(short8)v; + return cast(ushort)(r.array[index & 7]); +} +unittest +{ + __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1); + assert(_mm_extract_epi16(A, 6) == 6); + assert(_mm_extract_epi16(A, 0) == 65535); + assert(_mm_extract_epi16(A, 5 + 8) == 5); +} + +/// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`. +__m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted +{ + short8 r = cast(short8)v; + r.ptr[index & 7] = cast(short)i; + return cast(__m128i)r; +} +unittest +{ + __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); + short8 R = cast(short8) _mm_insert_epi16(A, 42, 6); + short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7]; + assert(R.array == correct); +} + +/// Perform a serializing operation on all load-from-memory instructions that were issued prior +/// to this instruction. Guarantees that every load instruction that precedes, in program order, +/// is globally visible before any load instruction which follows the fence in program order. +void _mm_lfence() @trusted +{ + version(GNU) + { + static if (GDC_with_SSE2) + { + __builtin_ia32_lfence(); + } + else version(X86) + { + asm pure nothrow @nogc @trusted + { + "lfence;\n" : : : ; + } + } + else __warn_noop(); + } + else static if (LDC_with_SSE2) + { + __builtin_ia32_lfence(); + } + else static if (LDC_with_ARM64) + { + __builtin_arm_dmb(9); // dmb ishld + } + else static if (DMD_with_asm) + { + asm nothrow @nogc pure @trusted + { + lfence; + } + } + else version(LDC) + { + // When the architecture is unknown, generate a full memory barrier, + // as the semantics of sfence do not really match those of atomics. + llvm_memory_fence(); + } + else + static assert(false); +} +unittest +{ + _mm_lfence(); +} + +/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. +/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
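+// In D, a stack or static buffer can be given the required alignment with align(16), as the
+// unittest below does; for pointers with unknown alignment prefer _mm_loadu_pd.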
+__m128d _mm_load_pd (const(double) * mem_addr) pure +{ + pragma(inline, true); + __m128d* aligned = cast(__m128d*)mem_addr; + return *aligned; +} +unittest +{ + align(16) double[2] S = [-5.0, 7.0]; + __m128d R = _mm_load_pd(S.ptr); + assert(R.array == S); +} + +/// Load a double-precision (64-bit) floating-point element from memory into both elements of dst. +/// `mem_addr` does not need to be aligned on any particular boundary. +__m128d _mm_load_pd1 (const(double)* mem_addr) pure +{ + double m = *mem_addr; + __m128d r; // PERF =void; + r.ptr[0] = m; + r.ptr[1] = m; + return r; +} +unittest +{ + double what = 4; + __m128d R = _mm_load_pd1(&what); + double[2] correct = [4.0, 4]; + assert(R.array == correct); +} + +/// Load a double-precision (64-bit) floating-point element from memory into the lower of result, and zero the upper +/// element. `mem_addr` does not need to be aligned on any particular boundary. +__m128d _mm_load_sd (const(double)* mem_addr) pure @trusted +{ + double2 r = [0, 0]; + r.ptr[0] = *mem_addr; + return r; +} +unittest +{ + double x = -42; + __m128d a = _mm_load_sd(&x); + assert(a.array == [-42.0, 0.0]); +} + +/// Load 128-bits of integer data from memory into dst. +/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. +__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @safe +{ + pragma(inline, true); + return *mem_addr; +} +unittest +{ + align(16) int[4] correct = [-1, 2, 3, 4]; + int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr); + assert(A.array == correct); +} + +alias _mm_load1_pd = _mm_load_pd1; /// + +/// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the +/// lower element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary. +__m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted +{ + pragma(inline, true); + a.ptr[1] = *mem_addr; + return a; +} +unittest +{ + double A = 7.0; + __m128d B = _mm_setr_pd(4.0, -5.0); + __m128d R = _mm_loadh_pd(B, &A); + double[2] correct = [ 4.0, 7.0 ]; + assert(R.array == correct); +} + +/// Load 64-bit integer from memory into the first element of result. Zero out the other. +/// Note: strange signature since the memory doesn't have to aligned, and should point to addressable 64-bit, not 128-bit. +/// You may use `_mm_loadu_si64` instead. +__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted +{ + pragma(inline, true); + static if (DMD_with_DSIMD) + { + return cast(__m128i) __simd(XMM.LODQ, *cast(__m128i*)mem_addr); + } + else + { + auto pLong = cast(const(long)*)mem_addr; + long2 r = [0, 0]; + r.ptr[0] = *pLong; + return cast(__m128i)(r); + } +} +unittest +{ + long A = 0x7878787870707070; + long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A); + long[2] correct = [0x7878787870707070, 0]; + assert(R.array == correct); +} + +/// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and copy the +/// upper element from `a` to result. mem_addr does not need to be aligned on any particular boundary. +__m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted +{ + a.ptr[0] = *mem_addr; + return a; +} +unittest +{ + double A = 7.0; + __m128d B = _mm_setr_pd(4.0, -5.0); + __m128d R = _mm_loadl_pd(B, &A); + double[2] correct = [ 7.0, -5.0 ]; + assert(R.array == correct); +} + +/// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order. 
+/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. +__m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted +{ + __m128d a = *cast(__m128d*)(mem_addr); + __m128d r; // PERF =void; + r.ptr[0] = a.array[1]; + r.ptr[1] = a.array[0]; + return r; +} +unittest +{ + align(16) double[2] A = [56.0, -74.0]; + __m128d R = _mm_loadr_pd(A.ptr); + double[2] correct = [-74.0, 56.0]; + assert(R.array == correct); +} + +/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. +/// `mem_addr` does not need to be aligned on any particular boundary. +__m128d _mm_loadu_pd (const(double)* mem_addr) pure @trusted +{ + pragma(inline, true); + static if (GDC_with_SSE2) + { + return __builtin_ia32_loadupd(mem_addr); + } + else static if (LDC_with_optimizations) + { + return loadUnaligned!(double2)(mem_addr); + } + else version(DigitalMars) + { + // Apparently inside __simd you can use aligned dereferences without fear. + // That was issue 23048 on dlang's Bugzilla. + static if (DMD_with_DSIMD) + { + return cast(__m128d)__simd(XMM.LODUPD, *cast(double2*)mem_addr); + } + else static if (SSESizedVectorsAreEmulated) + { + // Since this vector is emulated, it doesn't have alignement constraints + // and as such we can just cast it. + return *cast(__m128d*)(mem_addr); + } + else + { + __m128d result; + result.ptr[0] = mem_addr[0]; + result.ptr[1] = mem_addr[1]; + return result; + } + } + else + { + __m128d result; + result.ptr[0] = mem_addr[0]; + result.ptr[1] = mem_addr[1]; + return result; + } +} +unittest +{ + double[2] A = [56.0, -75.0]; + __m128d R = _mm_loadu_pd(A.ptr); + double[2] correct = [56.0, -75.0]; + assert(R.array == correct); +} + +/// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary. +__m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted +{ + // PERF DMD + pragma(inline, true); + static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char*))mem_addr); + } + else static if (LDC_with_optimizations) + { + return loadUnaligned!(__m128i)(cast(int*)mem_addr); + } + else + { + const(int)* p = cast(const(int)*)mem_addr; + __m128i r = void; + r.ptr[0] = p[0]; + r.ptr[1] = p[1]; + r.ptr[2] = p[2]; + r.ptr[3] = p[3]; + return r; + } +} +unittest +{ + align(16) int[4] correct = [-1, 2, -3, 4]; + int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr); + assert(A.array == correct); +} + +/// Load unaligned 16-bit integer from memory into the first element, fill with zeroes otherwise. +__m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted // TODO: should be @system actually +{ + static if (DMD_with_DSIMD) + { + int r = *cast(short*)(mem_addr); + return cast(__m128i) __simd(XMM.LODD, *cast(__m128i*)&r); + } + else version(DigitalMars) + { + // Workaround issue: https://issues.dlang.org/show_bug.cgi?id=21672 + // DMD cannot handle the below code... + align(16) short[8] r = [0, 0, 0, 0, 0, 0, 0, 0]; + r[0] = *cast(short*)(mem_addr); + return *cast(int4*)(r.ptr); + } + else + { + short r = *cast(short*)(mem_addr); + short8 result = [0, 0, 0, 0, 0, 0, 0, 0]; + result.ptr[0] = r; + return cast(__m128i)result; + } +} +unittest +{ + short r = 13; + short8 A = cast(short8) _mm_loadu_si16(&r); + short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0]; + assert(A.array == correct); +} + +/// Load unaligned 32-bit integer from memory into the first element of result. 
+__m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted // TODO: should be @system actually +{ + pragma(inline, true); + int r = *cast(int*)(mem_addr); + int4 result = [0, 0, 0, 0]; + result.ptr[0] = r; + return result; +} +unittest +{ + int r = 42; + __m128i A = _mm_loadu_si32(&r); + int[4] correct = [42, 0, 0, 0]; + assert(A.array == correct); +} + +/// Load unaligned 64-bit integer from memory into the first element of result. +/// Upper 64-bit is zeroed. +__m128i _mm_loadu_si64 (const(void)* mem_addr) pure @system +{ + pragma(inline, true); + static if (DMD_with_DSIMD) + { + return cast(__m128i) __simd(XMM.LODQ, *cast(__m128i*)mem_addr); + } + else + { + auto pLong = cast(const(long)*)mem_addr; + long2 r = [0, 0]; + r.ptr[0] = *pLong; + return cast(__m128i)r; + } +} +unittest +{ + long r = 446446446446; + long2 A = cast(long2) _mm_loadu_si64(&r); + long[2] correct = [446446446446, 0]; + assert(A.array == correct); +} + +/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate +/// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, +/// and pack the results in destination. +__m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted +{ + static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b); + } + else static if (LDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b); + } + else static if (LDC_with_optimizations) + { + // 5 inst with arm64 + LDC 1.32 + -O1 + enum ir = ` + %ia = sext <8 x i16> %0 to <8 x i32> + %ib = sext <8 x i16> %1 to <8 x i32> + %p = mul <8 x i32> %ia, %ib + %p_even = shufflevector <8 x i32> %p, <8 x i32> undef, <4 x i32> + %p_odd = shufflevector <8 x i32> %p, <8 x i32> undef, <4 x i32> + %p_sum = add <4 x i32> %p_even, %p_odd + ret <4 x i32> %p_sum`; + return cast(__m128i) LDCInlineIR!(ir, int4, short8, short8)(cast(short8)a, cast(short8)b); + } + else + { + short8 sa = cast(short8)a; + short8 sb = cast(short8)b; + int4 r; + foreach(i; 0..4) + { + r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1]; + } + return r; + } +} +unittest +{ + short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767]; + short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767]; + int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B); + int[4] correct = [1, 13, -2147483648, 2*32767*32767]; + assert(R.array == correct); +} + +/// Conditionally store 8-bit integer elements from `a` into memory using `mask` +/// (elements are not stored when the highest bit is not set in the corresponding element) +/// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular +/// boundary. 
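+// Only the most significant bit of each mask byte is consulted, so a mask produced by the
+// byte comparisons above (e.g. _mm_cmpgt_epi8) can be passed straight in to store only the
+// selected bytes.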
+void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted +{ + static if (GDC_with_SSE2) + { + return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr); + } + else static if (LDC_with_SSE2) + { + return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr); + } + else static if (LDC_with_ARM64) + { + // PERF: catastrophic on ARM32 + byte16 bmask = cast(byte16)mask; + byte16 shift = 7; + bmask = bmask >> shift; // sign-extend to have a 0xff or 0x00 mask + mask = cast(__m128i) bmask; + __m128i dest = loadUnaligned!__m128i(cast(int*)mem_addr); + dest = (a & mask) | (dest & ~mask); + storeUnaligned!__m128i(dest, cast(int*)mem_addr); + } + else + { + byte16 b = cast(byte16)a; + byte16 m = cast(byte16)mask; + byte* dest = cast(byte*)(mem_addr); + foreach(j; 0..16) + { + if (m.array[j] & 128) + { + dest[j] = b.array[j]; + } + } + } +} +unittest +{ + ubyte[16] dest = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]; + __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0); + __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15); + _mm_maskmoveu_si128(A, mask, dest.ptr); + ubyte[16] correct = [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42]; + assert(dest == correct); +} + +/// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values. +__m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe +{ + static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_pmaxsw128(cast(short8)a, cast(short8)b); + } + else version(LDC) + { + // x86: pmaxsw since LDC 1.0 -O1 + // ARM: smax.8h since LDC 1.5 -01 + short8 sa = cast(short8)a; + short8 sb = cast(short8)b; + static if (SIMD_COMPARISON_MASKS_16B) + short8 greater = sa > sb; + else + short8 greater = greaterMask!short8(sa, sb); + return cast(__m128i)( (greater & sa) | (~greater & sb) ); + } + else + { + __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else + __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b + __m128i mask = _mm_and_si128(aTob, lowerShorts); + return _mm_xor_si128(b, mask); + } +} +unittest +{ + short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0,-57), + _mm_setr_epi16(-4,-8, 9, 7, 0,-32768, 0, 0)); + short[8] correct = [32767, 1, 9, 7, 9, 7, 0, 0]; + assert(R.array == correct); +} + +/// Compare packed unsigned 8-bit integers in a and b, and return packed maximum values. 
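+// Note the asymmetry in SSE2: integer max/min exist only as signed 16-bit (_mm_max_epi16 /
+// _mm_min_epi16) and unsigned 8-bit (this one / _mm_min_epu8); the other width/signedness
+// combinations only arrive with SSE4.1.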
+__m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe +{ + // PERF DMD + static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_pmaxub128(cast(ubyte16)a, cast(ubyte16)b); + } + else version(LDC) + { + // x86: pmaxub since LDC 1.0.0 -O1 + // ARM64: umax.16b since LDC 1.5.0 -O1 + // PERF: catastrophic on ARM32 + ubyte16 sa = cast(ubyte16)a; + ubyte16 sb = cast(ubyte16)b; + static if (SIMD_COMPARISON_MASKS_16B) + ubyte16 greater = (cast(ubyte16)a > cast(ubyte16)b); + else + ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb); + return cast(__m128i)( (greater & sa) | (~greater & sb) ); + } + else + { + // PERF: use algorithm from _mm_max_epu16 + __m128i value128 = _mm_set1_epi8(-128); + __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison + __m128i aTob = a ^ b; // a ^ (a ^ b) == b + __m128i mask = aTob & higher; + return b ^ mask; + + } +} +unittest +{ + byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), + _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); + byte[16] correct = [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57]; + assert(R.array == correct); +} + +/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return +/// packed maximum values. +__m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted +{ + static if (GDC_with_SSE2) + { + return __builtin_ia32_maxpd(a, b); + } + else + { + // x86: Generates maxpd starting with LDC 1.9 -O2 + a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0]; + a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1]; + return a; + } +} +unittest +{ + __m128d A = _mm_setr_pd(4.0, 1.0); + __m128d B = _mm_setr_pd(1.0, 8.0); + __m128d M = _mm_max_pd(A, B); + assert(M.array[0] == 4.0); + assert(M.array[1] == 8.0); +} + +/// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the +/// lower element of result, and copy the upper element from `a` to the upper element of result. +__m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted +{ + static if (GDC_with_SSE2) + { + return __builtin_ia32_maxsd(a, b); + } + else + { + __m128d r = a; + // Generates maxsd starting with LDC 1.3 + r.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0]; + return r; + } +} +unittest +{ + __m128d A = _mm_setr_pd(1.0, 1.0); + __m128d B = _mm_setr_pd(4.0, 2.0); + __m128d M = _mm_max_sd(A, B); + assert(M.array[0] == 4.0); + assert(M.array[1] == 1.0); +} + +/// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to +/// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction +/// is globally visible before any memory instruction which follows the fence in program order. +void _mm_mfence() @trusted // not pure! 
+{ + version(GNU) + { + static if (GDC_with_SSE2) + { + __builtin_ia32_mfence(); + } + else version(X86) + { + asm pure nothrow @nogc @trusted + { + "mfence;\n" : : : ; + } + } + else __warn_noop(); + } + else static if (LDC_with_SSE2) + { + __builtin_ia32_mfence(); + } + else static if (DMD_with_asm) + { + asm nothrow @nogc pure @trusted + { + mfence; + } + } + else version(LDC) + { + // Note: will generate the DMB ish instruction on ARM + llvm_memory_fence(); + } + else + static assert(false); +} +unittest +{ + _mm_mfence(); +} + +/// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values. +__m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe +{ + static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_pminsw128(cast(short8)a, cast(short8)b); + } + else version(LDC) + { + // x86: pminsw since LDC 1.0 -O1 + // ARM64: smin.8h since LDC 1.5 -01 + short8 sa = cast(short8)a; + short8 sb = cast(short8)b; + static if (SIMD_COMPARISON_MASKS_16B) + short8 greater = sa > sb; + else + short8 greater = greaterMask!short8(sa, sb); + return cast(__m128i)( (~greater & sa) | (greater & sb) ); + } + else + { + __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else + __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b + __m128i mask = _mm_and_si128(aTob, lowerShorts); + return _mm_xor_si128(b, mask); + } +} +unittest +{ + short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9, 7, 0,-32768), + _mm_setr_epi16(-4,-8, 9, 7, 0,-57, 0, 0)); + short[8] correct = [-4,-8, -4, -8, 0,-57, 0, -32768]; + assert(R.array == correct); +} + +/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values. +__m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe +{ + static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_pminub128(cast(ubyte16)a, cast(ubyte16)b); + } + else version(LDC) + { + // x86: pminub since LDC 1.0.0 -O1 + // ARM: umin.16b since LDC 1.5.0 -O1 + // PERF: catastrophic on ARM32 + ubyte16 sa = cast(ubyte16)a; + ubyte16 sb = cast(ubyte16)b; + static if (SIMD_COMPARISON_MASKS_16B) + ubyte16 greater = (cast(ubyte16)a > cast(ubyte16)b); + else + ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb); + return cast(__m128i)( (~greater & sa) | (greater & sb) ); + } + else + { + // PERF: use the algorithm from _mm_max_epu16 + __m128i value128 = _mm_set1_epi8(-128); + __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison + __m128i aTob = a ^ b; // a ^ (a ^ b) == b + __m128i mask = aTob & lower; + return b ^ mask; + } +} +unittest +{ + byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), + _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); + byte[16] correct = [45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0]; + assert(R.array == correct); +} + +/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values. +__m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted +{ + static if (GDC_with_SSE2) + { + return __builtin_ia32_minpd(a, b); + } + else + { + // Generates minpd starting with LDC 1.9 + a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0]; + a.ptr[1] = (a.array[1] < b.array[1]) ? 
a.array[1] : b.array[1]; + return a; + } +} +unittest +{ + __m128d A = _mm_setr_pd(1.0, 2.0); + __m128d B = _mm_setr_pd(4.0, 1.0); + __m128d M = _mm_min_pd(A, B); + assert(M.array[0] == 1.0); + assert(M.array[1] == 1.0); +} + +/// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in +/// the lower element of result, and copy the upper element from `a` to the upper element of result. +__m128d _mm_min_sd (__m128d a, __m128d b) pure @safe +{ + static if (GDC_with_SSE2) + { + return __builtin_ia32_minsd(a, b); + } + else + { + // Generates minsd starting with LDC 1.3 + __m128d r = a; + r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0]; + return r; + } +} +unittest +{ + __m128d A = _mm_setr_pd(1.0, 3.0); + __m128d B = _mm_setr_pd(4.0, 2.0); + __m128d M = _mm_min_sd(A, B); + assert(M.array[0] == 1.0); + assert(M.array[1] == 3.0); +} + +/// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element. +__m128i _mm_move_epi64 (__m128i a) pure @trusted +{ + static if (GDC_with_SSE2) + { + // slightly better with GDC -O0 + return cast(__m128i) __builtin_ia32_movq128(cast(long2)a); + } + else + { + long2 result = [ 0, 0 ]; + long2 la = cast(long2) a; + result.ptr[0] = la.array[0]; + return cast(__m128i)(result); + } +} +unittest +{ + long2 A = [13, 47]; + long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A ); + long[2] correct = [13, 0]; + assert(B.array == correct); +} + +/// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy +/// the upper element from `a` to the upper element of dst. +__m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted +{ + static if (GDC_with_SSE2) + { + return __builtin_ia32_movsd(a, b); + } + else + { + b.ptr[1] = a.array[1]; + return b; + } +} +unittest +{ + double2 A = [13.0, 47.0]; + double2 B = [34.0, 58.0]; + double2 C = _mm_move_sd(A, B); + double[2] correct = [34.0, 47.0]; + assert(C.array == correct); +} + +/// Create mask from the most significant bit of each 8-bit element in `v`. +int _mm_movemask_epi8 (__m128i a) pure @trusted +{ + // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047 + static if (GDC_with_SSE2) + { + return __builtin_ia32_pmovmskb128(cast(ubyte16)a); + } + else static if (LDC_with_SSE2) + { + return __builtin_ia32_pmovmskb128(cast(byte16)a); + } + else static if (LDC_with_ARM64) + { + // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon + // The other two solutions lead to unfound intrinsics in LLVM and that took a long time. + // SO there might be something a bit faster, but this one is reasonable and branchless. 
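+        // Outline of the approach: keep only the sign bit of each byte, shift lane i of each
+        // 8-byte half down to bit position i, then fold each half with three pairwise adds
+        // (adding disjoint single-bit values acts like OR); the high half finally lands in
+        // bits 8..15 of the returned mask.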
+        byte8 mask_shift;
+        mask_shift.ptr[0] = 7;
+        mask_shift.ptr[1] = 6;
+        mask_shift.ptr[2] = 5;
+        mask_shift.ptr[3] = 4;
+        mask_shift.ptr[4] = 3;
+        mask_shift.ptr[5] = 2;
+        mask_shift.ptr[6] = 1;
+        mask_shift.ptr[7] = 0;
+        byte8 mask_and = byte8(-128);
+        byte8 lo = vget_low_u8(cast(byte16)a);
+        byte8 hi = vget_high_u8(cast(byte16)a);
+        lo = vand_u8(lo, mask_and);
+        lo = vshr_u8(lo, mask_shift);
+        hi = vand_u8(hi, mask_and);
+        hi = vshr_u8(hi, mask_shift);
+        lo = vpadd_u8(lo,lo);
+        lo = vpadd_u8(lo,lo);
+        lo = vpadd_u8(lo,lo);
+        hi = vpadd_u8(hi,hi);
+        hi = vpadd_u8(hi,hi);
+        hi = vpadd_u8(hi,hi);
+        return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]);
+    }
+    else
+    {
+        byte16 ai = cast(byte16)a;
+        int r = 0;
+        foreach(bit; 0..16)
+        {
+            if (ai.array[bit] < 0) r += (1 << bit);
+        }
+        return r;
+    }
+}
+unittest
+{
+    assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0)));
+}
+
+/// Create mask from the most significant bit of each 16-bit element in `v`. #BONUS
+int _mm_movemask_epi16 (__m128i a) pure @trusted
+{
+    return _mm_movemask_epi8(_mm_packs_epi16(a, _mm_setzero_si128()));
+}
+unittest
+{
+    assert(0x9C == _mm_movemask_epi16(_mm_set_epi16(-1, 1, 2, -3, -32768, -1, 32767, 8)));
+}
+
+/// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit)
+/// floating-point element in `v`.
+int _mm_movemask_pd(__m128d v) pure @safe
+{
+    // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
+    static if (GDC_or_LDC_with_SSE2)
+    {
+        return __builtin_ia32_movmskpd(v);
+    }
+    else
+    {
+        long2 lv = cast(long2)v;
+        int r = 0;
+        if (lv.array[0] < 0) r += 1;
+        if (lv.array[1] < 0) r += 2;
+        return r;
+    }
+}
+unittest
+{
+    __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
+    assert(_mm_movemask_pd(A) == 2);
+}
+
+/// Copy the lower 64-bit integer in `v`.
+__m64 _mm_movepi64_pi64 (__m128i v) pure @safe
+{
+    long2 lv = cast(long2)v;
+    return long1(lv.array[0]);
+}
+unittest
+{
+    __m128i A = _mm_set_epi64x(-1, -2);
+    __m64 R = _mm_movepi64_pi64(A);
+    assert(R.array[0] == -2);
+}
+
+/// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element.
+__m128i _mm_movpi64_epi64 (__m64 a) pure @trusted
+{
+    long2 r;
+    r.ptr[0] = a.array[0];
+    r.ptr[1] = 0;
+    return cast(__m128i)r;
+}
+
+/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`,
+/// and store the unsigned 64-bit results.
+__m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted +{ + // PERF DMD D_SIMD + static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_pmuludq128 (a, b); + } + else + { + version(LDC) + { + static if (__VERSION__ >= 2088) + { + // Need LLVM9 for proper optimization + long2 la, lb; + la.ptr[0] = cast(uint)a.array[0]; + la.ptr[1] = cast(uint)a.array[2]; + lb.ptr[0] = cast(uint)b.array[0]; + lb.ptr[1] = cast(uint)b.array[2]; + } + else + { + __m128i zero; + zero = 0; + long2 la = cast(long2) shufflevectorLDC!(int4, 0, 4, 2, 6)(a, zero); + long2 lb = cast(long2) shufflevectorLDC!(int4, 0, 4, 2, 6)(b, zero); + } + } + else + { + long2 la, lb; + la.ptr[0] = cast(uint)a.array[0]; + la.ptr[1] = cast(uint)a.array[2]; + lb.ptr[0] = cast(uint)b.array[0]; + lb.ptr[1] = cast(uint)b.array[2]; + } + + version(DigitalMars) + { + // DMD has no long2 mul + la.ptr[0] *= lb.array[0]; + la.ptr[1] *= lb.array[1]; + return cast(__m128i)(la); + } + else + { + static if (__VERSION__ >= 2076) + { + return cast(__m128i)(la * lb); + } + else + { + // long2 mul not supported before LDC 1.5 + la.ptr[0] *= lb.array[0]; + la.ptr[1] *= lb.array[1]; + return cast(__m128i)(la); + } + } + } +} +unittest +{ + __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff); + __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff); + __m128i C = _mm_mul_epu32(A, B); + long2 LC = cast(long2)C; + assert(LC.array[0] == 18446744065119617025uL); + assert(LC.array[1] == 12723420444339690338uL); +} + +/// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results. +__m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe +{ + pragma(inline, true); + return a * b; +} +unittest +{ + __m128d a = [-2.0, 1.5]; + a = _mm_mul_pd(a, a); + assert(a.array == [4.0, 2.25]); +} + +/// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower +/// element of result, and copy the upper element from `a` to the upper element of result. +__m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted +{ + version(DigitalMars) + { + // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 + // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again + asm pure nothrow @nogc @trusted { nop;} + a.array[0] = a.array[0] * b.array[0]; + return a; + } + else static if (GDC_with_SSE2) + { + return __builtin_ia32_mulsd(a, b); + } + else + { + a.ptr[0] *= b.array[0]; + return a; + } +} +unittest +{ + __m128d a = [-2.0, 1.5]; + a = _mm_mul_sd(a, a); + assert(a.array == [4.0, 1.5]); +} + +/// Multiply the low unsigned 32-bit integers from `a` and `b`, +/// and get an unsigned 64-bit result. +__m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe +{ + return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b))); +} +unittest +{ + __m64 A = _mm_set_pi32(42, 0xDEADBEEF); + __m64 B = _mm_set_pi32(42, 0xCAFEBABE); + __m64 C = _mm_mul_su32(A, B); + assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL); +} + +/// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the +/// high 16 bits of the intermediate integers. 
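+/// Example (editor's illustrative sketch):
+/// ---
+/// // 1000 * 1000 = 1_000_000 = 0xF4240, whose high 16 bits are 0xF = 15.
+/// short8 r = cast(short8) _mm_mulhi_epi16(_mm_set1_epi16(1000), _mm_set1_epi16(1000));
+/// // every lane of r is 15
+/// ---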
+__m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted +{ + static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b); + } + else static if (LDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b); + } + else + { + // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x smull.4s and shrn.4h shrn2.8h + // PERF: it seems the simde solution has one less instruction in ARM64. + // PERF: Catastrophic in ARM32. + short8 sa = cast(short8)a; + short8 sb = cast(short8)b; + short8 r = void; + r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16; + r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16; + r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16; + r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16; + r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16; + r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16; + r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16; + r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16; + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); + __m128i B = _mm_set1_epi16(16384); + short8 R = cast(short8)_mm_mulhi_epi16(A, B); + short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1]; + assert(R.array == correct); +} + +/// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the +/// high 16 bits of the intermediate integers. +__m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted +{ + static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b); + } + else static if (LDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b); + } + else + { + // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x umull.4s and shrn.4h shrn2.8h + // it seems the simde solution has one less instruction in ARM64 + // PERF: Catastrophic in ARM32. + short8 sa = cast(short8)a; + short8 sb = cast(short8)b; + short8 r = void; + r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 ); + r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 ); + r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 ); + r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 ); + r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 ); + r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 ); + r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 ); + r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 ); + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); + __m128i B = _mm_set1_epi16(16384); + short8 R = cast(short8)_mm_mulhi_epu16(A, B); + short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1]; + assert(R.array == correct); +} + +/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16 +/// bits of the intermediate integers. 
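+/// Example (editor's illustrative sketch):
+/// ---
+/// // Only the low 16 bits of each 32-bit product are kept: 1_000_000 & 0xFFFF = 16960.
+/// short8 r = cast(short8) _mm_mullo_epi16(_mm_set1_epi16(1000), _mm_set1_epi16(1000));
+/// // every lane of r is 16960
+/// ---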
+__m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe +{ + return cast(__m128i)(cast(short8)a * cast(short8)b); +} +unittest +{ + __m128i A = _mm_setr_epi16(16384, -16, 0, 3, 4, 1, 16, 7); + __m128i B = _mm_set1_epi16(16384); + short8 R = cast(short8)_mm_mullo_epi16(A, B); + short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384]; + assert(R.array == correct); +} + +/// Compute the bitwise NOT of 128 bits in `a`. #BONUS +__m128i _mm_not_si128 (__m128i a) pure @safe +{ + return ~a; +} +unittest +{ + __m128i A = _mm_set1_epi32(-748); + int4 notA = cast(int4) _mm_not_si128(A); + int[4] correct = [747, 747, 747, 747]; + assert(notA.array == correct); +} + +/// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`. +__m128d _mm_or_pd (__m128d a, __m128d b) pure @safe +{ + pragma(inline, true); + return cast(__m128d)( cast(__m128i)a | cast(__m128i)b ); +} + +/// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`. +__m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe +{ + pragma(inline, true); + return a | b; +} + +/// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation. +__m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted +{ + static if (DMD_with_DSIMD) + { + return cast(__m128i) __simd(XMM.PACKSSDW, a, b); + } + else static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_packssdw128(a, b); + } + else static if (LDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_packssdw128(a, b); + } + else static if (LDC_with_ARM64) + { + short4 ra = vqmovn_s32(cast(int4)a); + short4 rb = vqmovn_s32(cast(int4)b); + return cast(__m128i)vcombine_s16(ra, rb); + } + else + { + // PERF: catastrophic on ARM32 + short8 r; + r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]); + r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]); + r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]); + r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]); + r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]); + r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]); + r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]); + r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]); + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0); + short8 R = cast(short8) _mm_packs_epi32(A, A); + short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0]; + assert(R.array == correct); +} + +/// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation. 
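+/// Example (editor's illustrative sketch):
+/// ---
+/// // Values outside [-128, 127] saturate when narrowing.
+/// byte16 r = cast(byte16) _mm_packs_epi16(_mm_set1_epi16(300), _mm_set1_epi16(-300));
+/// // lanes 0..7 are 127 (from `a`), lanes 8..15 are -128 (from `b`)
+/// ---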
+__m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted +{ + static if (DMD_with_DSIMD) + { + return cast(__m128i) __simd(XMM.PACKSSWB, a, b); + } + else static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b); + } + else static if (LDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b); + } + else static if (LDC_with_ARM64) + { + // generate a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -02 + byte8 ra = vqmovn_s16(cast(short8)a); + byte8 rb = vqmovn_s16(cast(short8)b); + return cast(__m128i)vcombine_s8(ra, rb); + } + else + { + // PERF: ARM32 is missing + byte16 r; + short8 sa = cast(short8)a; + short8 sb = cast(short8)b; + foreach(i; 0..8) + r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]); + foreach(i; 0..8) + r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]); + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0); + byte16 R = cast(byte16) _mm_packs_epi16(A, A); + byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0, + 127, -128, 127, 0, 127, -128, 127, 0]; + assert(R.array == correct); +} + +/// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation. +__m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted +{ + // PERF DMD catastrophic + static if (DMD_with_DSIMD) + { + return cast(__m128i) __simd(XMM.PACKUSWB, a, b); + } + else static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b); + } + else static if (LDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b); + } + else static if (LDC_with_ARM64) + { + // generate a nice pair of sqxtun + sqxtun2 since LDC 1.5 -02 + byte8 ra = vqmovun_s16(cast(short8)a); + byte8 rb = vqmovun_s16(cast(short8)b); + return cast(__m128i)vcombine_s8(ra, rb); + } + else + { + short8 sa = cast(short8)a; + short8 sb = cast(short8)b; + align(16) ubyte[16] result = void; + for (int i = 0; i < 8; ++i) + { + short s = sa[i]; + if (s < 0) s = 0; + if (s > 255) s = 255; + result[i] = cast(ubyte)s; + + s = sb[i]; + if (s < 0) s = 0; + if (s > 255) s = 255; + result[i+8] = cast(ubyte)s; + } + return *cast(__m128i*)(result.ptr); + } +} +unittest +{ + __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0); + byte16 AA = cast(byte16) _mm_packus_epi16(A, A); + static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0, + 0, 255, 0, 255, 255, 2, 1, 0]; + foreach(i; 0..16) + assert(AA.array[i] == cast(byte)(correctResult[i])); +} + +/// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance +/// and power consumption of spin-wait loops. +void _mm_pause() @trusted +{ + version(GNU) + { + static if (GDC_with_SSE2) + { + __builtin_ia32_pause(); + } + else version(X86) + { + asm pure nothrow @nogc @trusted + { + "pause;\n" : : : ; + } + } + else __warn_noop(); + } + else static if (LDC_with_SSE2) + { + __builtin_ia32_pause(); + } + else static if (DMD_with_asm) + { + asm nothrow @nogc pure @trusted + { + rep; nop; // F3 90 = pause + } + } + else version (LDC) + { + // PERF: Do nothing currently , could be the "yield" intruction on ARM. 
+ } + else + static assert(false); +} +unittest +{ + _mm_pause(); +} + +/// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each +/// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the +/// low 16 bits of 64-bit elements in result. +__m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted +{ + static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_psadbw128(cast(ubyte16)a, cast(ubyte16)b); + } + else static if (LDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b); + } + else static if (LDC_with_ARM64) + { + ushort8 t = cast(ushort8) vpaddlq_u8(vabdq_u8(cast(byte16) a, cast(byte16) b)); + + // PERF: Looks suboptimal vs addp + ushort r0 = cast(ushort)(t[0] + t[1] + t[2] + t[3]); + ushort r4 = cast(ushort)(t[4] + t[5] + t[6] + t[7]); + ushort8 r = 0; + r[0] = r0; + r[4] = r4; + return cast(__m128i) r; + } + else + { + // PERF: ARM32 is lacking + byte16 ab = cast(byte16)a; + byte16 bb = cast(byte16)b; + ubyte[16] t; + foreach(i; 0..16) + { + int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]); + if (diff < 0) diff = -diff; + t[i] = cast(ubyte)(diff); + } + int4 r = _mm_setzero_si128(); + r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7]; + r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15]; + return r; + } +} +unittest +{ + __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1 + __m128i B = _mm_set1_epi8(1); + __m128i R = _mm_sad_epu8(A, B); + int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19, + 0, + 23 + 29 + 31 + 37 + 41 + 43 + 47 + 53, + 0]; + assert(R.array == correct); +} + +/// Set packed 16-bit integers with the supplied values. +__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted +{ + short8 r = void; + r.ptr[0] = e0; + r.ptr[1] = e1; + r.ptr[2] = e2; + r.ptr[3] = e3; + r.ptr[4] = e4; + r.ptr[5] = e5; + r.ptr[6] = e6; + r.ptr[7] = e7; + return cast(__m128i) r; +} +unittest +{ + __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); + short8 B = cast(short8) A; + foreach(i; 0..8) + assert(B.array[i] == i); +} + +/// Set packed 32-bit integers with the supplied values. +__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted +{ + // PERF: does a constant inline correctly? vs int4 field assignment + align(16) int[4] r = [e0, e1, e2, e3]; + return *cast(int4*)&r; +} +unittest +{ + __m128i A = _mm_set_epi32(3, 2, 1, 0); + foreach(i; 0..4) + assert(A.array[i] == i); + + static if (__VERSION__ >= 2094) + enum __m128i B = _mm_setr_epi32(0, 1, 2, 3); +} + +/// Set packed 64-bit integers with the supplied values. +__m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted +{ + pragma(inline, true); + long2 r = void; + r.ptr[0] = e0.array[0]; + r.ptr[1] = e1.array[0]; + return cast(__m128i)(r); +} +unittest +{ + __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678)); + long2 B = cast(long2) A; + assert(B.array[0] == 5678); + assert(B.array[1] == 1234); +} + +/// Set packed 64-bit integers with the supplied values. 
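+/// Example (editor's illustrative sketch):
+/// ---
+/// // As with the other _mm_set_* functions, arguments are given high element first.
+/// long2 r = cast(long2) _mm_set_epi64x(222, 111);
+/// // r.array == [111, 222]
+/// ---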
+__m128i _mm_set_epi64x (long e1, long e0) pure @trusted +{ + pragma(inline, true); + long2 r = void; + r.ptr[0] = e0; + r.ptr[1] = e1; + return cast(__m128i)(r); +} +unittest +{ + __m128i A = _mm_set_epi64x(1234, -5678); + long2 B = cast(long2) A; + assert(B.array[0] == -5678); + assert(B.array[1] == 1234); +} + +/// Set packed 8-bit integers with the supplied values. +__m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12, + byte e11, byte e10, byte e9, byte e8, + byte e7, byte e6, byte e5, byte e4, + byte e3, byte e2, byte e1, byte e0) pure @trusted +{ + align(16) byte[16] result = [e0, e1, e2, e3, e4, e5, e6, e7, + e8, e9, e10, e11, e12, e13, e14, e15]; + return *cast(__m128i*)(result.ptr); +} +unittest +{ + byte16 R = cast(byte16) _mm_set_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14); + byte[16] correct = [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, -128, 127, 56, 0, -1]; + assert(R.array == correct); +} + +/// Set packed double-precision (64-bit) floating-point elements with the supplied values. +__m128d _mm_set_pd (double e1, double e0) pure @trusted +{ + pragma(inline, true); + double2 r = void; + r.ptr[0] = e0; + r.ptr[1] = e1; + return r; +} +unittest +{ + __m128d A = _mm_set_pd(61.0, 55.0); + double[2] correct = [55.0, 61.0]; + assert(A.array == correct); +} + +/// Broadcast double-precision (64-bit) floating-point value `a` to all element. +__m128d _mm_set_pd1 (double a) pure @trusted +{ + pragma(inline, true); + __m128d r = void; + r.ptr[0] = a; + r.ptr[1] = a; + return r; +} +unittest +{ + __m128d A = _mm_set_pd1(61.0); + double[2] correct = [61.0, 61.0]; + assert(A.array == correct); +} + +/// Copy double-precision (64-bit) floating-point element `a` to the lower element of result, +/// and zero the upper element. +__m128d _mm_set_sd (double a) pure @trusted +{ + double2 r = void; + r.ptr[0] = a; + r.ptr[1] = 0.0; + return r; +} +unittest +{ + __m128d A = _mm_set_sd(61.0); + double[2] correct = [61.0, 0.0]; + assert(A.array == correct); +} + +/// Broadcast 16-bit integer a to all elements of dst. +__m128i _mm_set1_epi16 (short a) pure @trusted +{ + version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469 + { + short8 v = a; + return cast(__m128i) v; + } + else + { + pragma(inline, true); + return cast(__m128i)(short8(a)); + } +} +unittest +{ + short8 a = cast(short8) _mm_set1_epi16(31); + for (int i = 0; i < 8; ++i) + assert(a.array[i] == 31); +} + +/// Broadcast 32-bit integer `a` to all elements. +__m128i _mm_set1_epi32 (int a) pure @trusted +{ + pragma(inline, true); + return cast(__m128i)(int4(a)); +} +unittest +{ + int4 A = cast(int4) _mm_set1_epi32(31); + for (int i = 0; i < 4; ++i) + assert(A.array[i] == 31); + + // compile-time should work + static if (__VERSION__ >= 2094) + enum __m128i B = _mm_set1_epi32(3); +} + +/// Broadcast 64-bit integer `a` to all elements. 
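+/// Example (editor's illustrative sketch):
+/// ---
+/// __m128i r = _mm_set1_epi64(_mm_cvtsi64_m64(42)); // both 64-bit lanes become 42
+/// // starting from a plain long, _mm_set1_epi64x below is usually the more convenient overload
+/// ---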
+__m128i _mm_set1_epi64 (__m64 a) pure @safe +{ + return _mm_set_epi64(a, a); +} +unittest +{ + long b = 0x1DEADCAFE; + __m64 a; + a.ptr[0] = b; + long2 c = cast(long2) _mm_set1_epi64(a); + assert(c.array[0] == b); + assert(c.array[1] == b); +} + +/// Broadcast 64-bit integer `a` to all elements +__m128i _mm_set1_epi64x (long a) pure @trusted +{ + long2 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470 + return cast(__m128i)(b); +} +unittest +{ + long b = 0x1DEADCAFE; + long2 c = cast(long2) _mm_set1_epi64x(b); + for (int i = 0; i < 2; ++i) + assert(c.array[i] == b); +} + +/// Broadcast 8-bit integer `a` to all elements. +__m128i _mm_set1_epi8 (byte a) pure @trusted +{ + pragma(inline, true); + byte16 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470 + return cast(__m128i)(b); +} +unittest +{ + byte16 b = cast(byte16) _mm_set1_epi8(31); + for (int i = 0; i < 16; ++i) + assert(b.array[i] == 31); +} + +alias _mm_set1_pd = _mm_set_pd1; + +/// Set packed 16-bit integers with the supplied values in reverse order. +__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, + short e3, short e2, short e1, short e0) pure @trusted +{ + short8 r = void; + r.ptr[0] = e7; + r.ptr[1] = e6; + r.ptr[2] = e5; + r.ptr[3] = e4; + r.ptr[4] = e3; + r.ptr[5] = e2; + r.ptr[6] = e1; + r.ptr[7] = e0; + return cast(__m128i)(r); +} +unittest +{ + short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0); + short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0]; + assert(A.array == correct); +} + +/// Set packed 32-bit integers with the supplied values in reverse order. +__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted +{ + if (__ctfe) + { + __m128i r; + r.ptr[0] = e3; + r.ptr[1] = e2; + r.ptr[2] = e1; + r.ptr[3] = e0; + return r; + } + else + { + // Performs better than = void; with GDC + pragma(inline, true); + align(16) int[4] result = [e3, e2, e1, e0]; + return *cast(__m128i*)(result.ptr); + } +} +unittest +{ + int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647); + int[4] correct = [-1, 0, -2147483648, 2147483647]; + assert(A.array == correct); + + // compile-time should work + static if (__VERSION__ >= 2094) + enum __m128i B = _mm_setr_epi32(0, 1, 2, 3); +} + +/// Set packed 64-bit integers with the supplied values in reverse order. +__m128i _mm_setr_epi64 (long e1, long e0) pure @trusted +{ + long2 r = void; + r.ptr[0] = e1; + r.ptr[1] = e0; + return cast(__m128i)(r); +} +unittest +{ + long2 A = cast(long2) _mm_setr_epi64(-1, 0); + long[2] correct = [-1, 0]; + assert(A.array == correct); +} + +/// Set packed 8-bit integers with the supplied values in reverse order. +__m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12, + byte e11, byte e10, byte e9, byte e8, + byte e7, byte e6, byte e5, byte e4, + byte e3, byte e2, byte e1, byte e0) pure @trusted +{ + align(16) byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8, + e7, e6, e5, e4, e3, e2, e1, e0]; + return *cast(__m128i*)(result.ptr); +} +unittest +{ + byte16 R = cast(byte16) _mm_setr_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14); + byte[16] correct = [-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]; + assert(R.array == correct); +} + +/// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order. 
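+/// Example (editor's illustrative sketch):
+/// ---
+/// // "setr" takes elements in memory order: the first argument lands in the low lane.
+/// __m128d v = _mm_setr_pd(1.0, 2.0);
+/// // v.array == [1.0, 2.0], whereas _mm_set_pd(1.0, 2.0).array == [2.0, 1.0]
+/// ---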
+__m128d _mm_setr_pd (double e1, double e0) pure @trusted +{ + pragma(inline, true); + double2 result; + result.ptr[0] = e1; + result.ptr[1] = e0; + return result; +} +unittest +{ + __m128d A = _mm_setr_pd(61.0, 55.0); + double[2] correct = [61.0, 55.0]; + assert(A.array == correct); +} + +/// Return vector of type `__m128d` with all elements set to zero. +__m128d _mm_setzero_pd() pure @trusted +{ + pragma(inline, true); + double2 r = void; + r.ptr[0] = 0.0; + r.ptr[1] = 0.0; + return r; +} +unittest +{ + __m128d A = _mm_setzero_pd(); + double[2] correct = [0.0, 0.0]; + assert(A.array == correct); +} + +/// Return vector of type `__m128i` with all elements set to zero. +__m128i _mm_setzero_si128() pure @trusted +{ + pragma(inline, true); + int4 r = void; + r.ptr[0] = 0; + r.ptr[1] = 0; + r.ptr[2] = 0; + r.ptr[3] = 0; + return r; +} +unittest +{ + __m128i A = _mm_setzero_si128(); + int[4] correct = [0, 0, 0, 0]; + assert(A.array == correct); +} + +/// Shuffle 32-bit integers in `a` using the control in `imm8`. +/// See_also: `_MM_SHUFFLE`. +__m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @trusted +{ + // PERF DMD D_SIMD + static if (GDC_with_SSE2) + { + return __builtin_ia32_pshufd(a, imm8); + } + else static if (LDC_with_optimizations) + { + return shufflevectorLDC!(int4, (imm8 >> 0) & 3, + (imm8 >> 2) & 3, + (imm8 >> 4) & 3, + (imm8 >> 6) & 3)(a, a); + } + else + { + int4 r = void; + r.ptr[0] = a.ptr[(imm8 >> 0) & 3]; + r.ptr[1] = a.ptr[(imm8 >> 2) & 3]; + r.ptr[2] = a.ptr[(imm8 >> 4) & 3]; + r.ptr[3] = a.ptr[(imm8 >> 6) & 3]; + return r; + } +} +unittest +{ + __m128i A = _mm_setr_epi32(0, 1, 2, 3); + enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); + int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A); + int[4] expectedB = [ 3, 2, 1, 0 ]; + assert(B.array == expectedB); +} + +/// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`. +/// See_also: `_MM_SHUFFLE2`. +__m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @trusted +{ + // PERF DMD D_SIMD + static if (GDC_with_SSE2) + { + return __builtin_ia32_shufpd(a, b, imm8); + } + else version(LDC) + { + return shufflevectorLDC!(double2, 0 + ( imm8 & 1 ), + 2 + ( (imm8 >> 1) & 1 ))(a, b); + } + else + { + double2 r = void; + r.ptr[0] = a.array[imm8 & 1]; + r.ptr[1] = b.array[(imm8 >> 1) & 1]; + return r; + } +} +unittest +{ + __m128d A = _mm_setr_pd(0.5, 2.0); + __m128d B = _mm_setr_pd(4.0, 5.0); + enum int SHUFFLE = _MM_SHUFFLE2(1, 1); + __m128d R = _mm_shuffle_pd!SHUFFLE(A, B); + double[2] correct = [ 2.0, 5.0 ]; + assert(R.array == correct); +} + +/// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high +/// 64 bits of result, with the low 64 bits being copied from from `a` to result. +/// See also: `_MM_SHUFFLE`. 
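+/// Example (editor's illustrative sketch):
+/// ---
+/// // _MM_SHUFFLE lists lane indices from highest to lowest, so (3, 2, 1, 0) is the identity.
+/// __m128i v = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+/// enum int IDENTITY = _MM_SHUFFLE(3, 2, 1, 0);
+/// __m128i same = _mm_shufflehi_epi16!IDENTITY(v); // unchanged
+/// ---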
+__m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @trusted +{ + static if (DMD_with_DSIMD) + { + return cast(__m128i) __simd(XMM.PSHUFHW, a, a, cast(ubyte)imm8); + } + else static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_pshufhw(cast(short8)a, imm8); + } + else static if (LDC_with_optimizations) + { + return cast(__m128i) shufflevectorLDC!(short8, 0, 1, 2, 3, + 4 + ( (imm8 >> 0) & 3 ), + 4 + ( (imm8 >> 2) & 3 ), + 4 + ( (imm8 >> 4) & 3 ), + 4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a); + } + else + { + short8 r = cast(short8)a; + short8 sa = cast(short8)a; + r.ptr[4] = sa.array[4 + ( (imm8 >> 0) & 3 ) ]; + r.ptr[5] = sa.array[4 + ( (imm8 >> 2) & 3 ) ]; + r.ptr[6] = sa.array[4 + ( (imm8 >> 4) & 3 ) ]; + r.ptr[7] = sa.array[4 + ( (imm8 >> 6) & 3 ) ]; + return cast(__m128i) r; + } +} +unittest +{ + __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); + enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); + short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A); + short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ]; + assert(C.array == expectedC); +} + +/// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64 +/// bits of result, with the high 64 bits being copied from from `a` to result. +/// See_also: `_MM_SHUFFLE`. +__m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @trusted +{ + static if (DMD_with_DSIMD) + { + return cast(__m128i) __simd(XMM.PSHUFLW, a, a, cast(ubyte)imm8); + } + else static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_pshuflw(cast(short8)a, imm8); + } + else static if (LDC_with_optimizations) + { + return cast(__m128i) shufflevectorLDC!(short8, ( (imm8 >> 0) & 3 ), + ( (imm8 >> 2) & 3 ), + ( (imm8 >> 4) & 3 ), + ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a); + } + else + { + short8 r = cast(short8)a; + short8 sa = cast(short8)a; + r.ptr[0] = sa.array[(imm8 >> 0) & 3]; + r.ptr[1] = sa.array[(imm8 >> 2) & 3]; + r.ptr[2] = sa.array[(imm8 >> 4) & 3]; + r.ptr[3] = sa.array[(imm8 >> 6) & 3]; + return cast(__m128i) r; + } +} +unittest +{ + __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); + enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); + short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A); + short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ]; + assert(B.array == expectedB); +} + +/// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros. +/// Bit-shift is a single value in the low-order 64-bit of `count`. +/// If bit-shift > 31, result is defined to be all zeroes. +/// Note: prefer `_mm_slli_epi32`, less of a trap. 
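+/// Example (editor's illustrative sketch):
+/// ---
+/// // The count is read from the low 64 bits of `count`; counts above 31 zero the whole result,
+/// // which is why the immediate form _mm_slli_epi32 is usually the safer call.
+/// __m128i z = _mm_sll_epi32(_mm_set1_epi32(1), _mm_cvtsi32_si128(33)); // all lanes become 0
+/// ---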
+__m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted +{ + static if (GDC_or_LDC_with_SSE2) + { + return __builtin_ia32_pslld128(a, count); + } + else + { + int4 r = void; + long2 lc = cast(long2)count; + ulong bits = cast(ulong)(lc.array[0]); + foreach(i; 0..4) + r[i] = cast(uint)(a[i]) << bits; + if (bits > 31) + r = int4(0); + return r; + } +} +unittest +{ + __m128i shift0 = _mm_setzero_si128(); + __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift + __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5); + __m128i A = _mm_setr_epi32(4, -9, 11, -2147483648); + int[4] correct0 = A.array; + int[4] correctX = [0, 0, 0, 0]; + int[4] correct2 = [16, -36, 44, 0]; + int4 B0 = cast(int4) _mm_sll_epi32(A, shift0); + int4 BX = cast(int4) _mm_sll_epi32(A, shiftX); + int4 B2 = cast(int4) _mm_sll_epi32(A, shift2); + assert(B0.array == correct0); + assert(BX.array == correctX); + assert(B2.array == correct2); +} + +/// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros. +/// Bit-shift is a single value in the low-order 64-bit of `count`. +/// If bit-shift > 63, result is defined to be all zeroes. +/// Note: prefer `_mm_slli_epi64`, less of a trap. +__m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted +{ + static if (GDC_or_LDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count); + } + else + { + // ARM: good since LDC 1.12 -O2 + // ~but -O0 version is catastrophic + long2 r = void; + long2 sa = cast(long2)a; + long2 lc = cast(long2)count; + ulong bits = cast(ulong)(lc.array[0]); + foreach(i; 0..2) + r.array[i] = cast(ulong)(sa.array[i]) << bits; + if (bits > 63) + r = long2(0); + return cast(__m128i)r; + } +} +unittest +{ + __m128i shift0 = _mm_setzero_si128(); + __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift + __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5); + __m128i A = _mm_setr_epi64(4, -9); + long[2] correct0 = [ 4, -9]; + long[2] correctX = [ 0, 0]; + long[2] correct2 = [16, -36]; + long2 B0 = cast(long2) _mm_sll_epi64(A, shift0); + long2 BX = cast(long2) _mm_sll_epi64(A, shiftX); + long2 B2 = cast(long2) _mm_sll_epi64(A, shift2); + assert(B0.array == correct0); + assert(BX.array == correctX); + assert(B2.array == correct2); +} + +/// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros. +/// Bit-shift is a single value in the low-order 64-bit of `count`. +/// If bit-shift > 15, result is defined to be all zeroes. +/// Warning: prefer `_mm_slli_epi16`, less of a trap. 
+__m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted +{ + static if (GDC_or_LDC_with_SSE2) + { + return cast(__m128i)__builtin_ia32_psllw128(cast(short8)a, cast(short8)count); + } + else + { + short8 sa = cast(short8)a; + long2 lc = cast(long2)count; + ulong bits = cast(ulong)(lc.array[0]); + short8 r = void; + foreach(i; 0..8) + r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits); + if (bits > 15) + r = short8(0); + return cast(int4)r; + } +} +unittest +{ + __m128i shift0 = _mm_setzero_si128(); + __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift + __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5); + __m128i A = _mm_setr_epi16(4, -8, 11, -32768, 4, -8, 11, -32768); + short[8] correct0 = (cast(short8)A).array; + short[8] correctX = [0, 0, 0, 0, 0, 0, 0, 0]; + short[8] correct2 = [16, -32, 44, 0, 16, -32, 44, 0]; + short8 B0 = cast(short8) _mm_sll_epi16(A, shift0); + short8 BX = cast(short8) _mm_sll_epi16(A, shiftX); + short8 B2 = cast(short8) _mm_sll_epi16(A, shift2); + assert(B0.array == correct0); + assert(BX.array == correctX); + assert(B2.array == correct2); +} + +/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros. +__m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted +{ + static if (GDC_with_SSE2) + { + return __builtin_ia32_pslldi128(a, cast(ubyte)imm8); + } + else static if (LDC_with_SSE2) + { + return __builtin_ia32_pslldi128(a, cast(ubyte)imm8); + } + else + { + // Note: the intrinsics guarantee imm8[0..7] is taken, however + // D says "It's illegal to shift by the same or more bits + // than the size of the quantity being shifted" + // and it's UB instead. + int4 r = _mm_setzero_si128(); + + ubyte count = cast(ubyte) imm8; + if (count > 31) + return r; + + foreach(i; 0..4) + r.array[i] = cast(uint)(a.array[i]) << count; + return r; + } +} +unittest +{ + __m128i A = _mm_setr_epi32(0, 2, 3, -4); + __m128i B = _mm_slli_epi32(A, 1); + __m128i B2 = _mm_slli_epi32(A, 1 + 256); + int[4] expectedB = [ 0, 4, 6, -8]; + assert(B.array == expectedB); + assert(B2.array == expectedB); + + __m128i C = _mm_slli_epi32(A, 0); + int[4] expectedC = [ 0, 2, 3, -4]; + assert(C.array == expectedC); + + __m128i D = _mm_slli_epi32(A, 65); + int[4] expectedD = [ 0, 0, 0, 0]; + assert(D.array == expectedD); +} + +/// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros. +__m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted +{ + static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8); + } + else static if (LDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8); + } + else + { + long2 sa = cast(long2)a; + + // Note: the intrinsics guarantee imm8[0..7] is taken, however + // D says "It's illegal to shift by the same or more bits + // than the size of the quantity being shifted" + // and it's UB instead. 
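+        // (Editor's note) The ubyte cast below keeps only imm8[0..7] as the intrinsic requires,
+        // and the early return reproduces the hardware behaviour for counts above 63 (all zeroes)
+        // without ever performing an over-wide shift in D.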
+ long2 r = cast(long2) _mm_setzero_si128(); + ubyte count = cast(ubyte) imm8; + if (count > 63) + return cast(__m128i)r; + + r.ptr[0] = cast(ulong)(sa.array[0]) << count; + r.ptr[1] = cast(ulong)(sa.array[1]) << count; + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi64(8, -4); + long2 B = cast(long2) _mm_slli_epi64(A, 1); + long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024); + long[2] expectedB = [ 16, -8]; + assert(B.array == expectedB); + assert(B2.array == expectedB); + + long2 C = cast(long2) _mm_slli_epi64(A, 0); + long[2] expectedC = [ 8, -4]; + assert(C.array == expectedC); + + long2 D = cast(long2) _mm_slli_epi64(A, 64); + long[2] expectedD = [ 0, -0]; + assert(D.array == expectedD); +} + +/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros. +__m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted +{ + static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8); + } + else static if (LDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8); + } + else static if (LDC_with_ARM64) + { + short8 sa = cast(short8)a; + short8 r = cast(short8)_mm_setzero_si128(); + ubyte count = cast(ubyte) imm8; + if (count > 15) + return cast(__m128i)r; + r = sa << short8(count); + return cast(__m128i)r; + } + else + { + short8 sa = cast(short8)a; + short8 r = cast(short8)_mm_setzero_si128(); + ubyte count = cast(ubyte) imm8; + if (count > 15) + return cast(__m128i)r; + foreach(i; 0..8) + r.ptr[i] = cast(short)(sa.array[i] << count); + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); + short8 B = cast(short8)( _mm_slli_epi16(A, 1) ); + short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) ); + short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ]; + assert(B.array == expectedB); + assert(B2.array == expectedB); + + short8 C = cast(short8)( _mm_slli_epi16(A, 16) ); + short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ]; + assert(C.array == expectedC); +} + + +/// Shift `a` left by `bytes` bytes while shifting in zeros. 
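+/// Example (editor's illustrative sketch):
+/// ---
+/// // A byte shift towards the high lanes: element i of the result is element i-1 of the input.
+/// int4 r = cast(int4) _mm_slli_si128!4(_mm_setr_epi32(1, 2, 3, 4));
+/// // r.array == [0, 1, 2, 3]
+/// ---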
+__m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted +{ + static if (bytes & 0xF0) + { + return _mm_setzero_si128(); + } + else static if (DMD_with_DSIMD) + { + return cast(__m128i) __simd_ib(XMM.PSLLDQ, op, bytes); + } + else static if (GDC_with_SSE2) + { + pragma(inline, true); // else it doesn't seem to be inlined at all by GDC TODO _mm_srli_si128 + return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2)op, cast(ubyte)(bytes * 8)); + } + else static if (LDC_with_optimizations) + { + return cast(__m128i) shufflevectorLDC!(byte16, + 16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes, + 22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes, + 28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes) + (cast(byte16)_mm_setzero_si128(), cast(byte16)op); + } + else static if (DMD_with_32bit_asm) + { + asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64 + { + movdqu XMM0, op; + pslldq XMM0, bytes; + movdqu op, XMM0; + } + return op; + } + else + { + byte16 A = cast(byte16)op; + byte16 R = void; + for (int n = 15; n >= bytes; --n) + R.ptr[n] = A.array[n-bytes]; + for (int n = bytes-1; n >= 0; --n) + R.ptr[n] = 0; + return cast(__m128i)R; + } +} +unittest +{ + __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); + short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left + short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ]; + assert(R.array == correct); + + __m128i B = _mm_slli_si128!16(_mm_set1_epi32(-1)); + int[4] expectedB = [0, 0, 0, 0]; + assert(B.array == expectedB); +} + +/// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`. +__m128d _mm_sqrt_pd(__m128d vec) pure @trusted +{ + version(LDC) + { + // Disappeared with LDC 1.11 + static if (__VERSION__ < 2081) + return __builtin_ia32_sqrtpd(vec); + else + { + // PERF: use llvm_sqrt on the vector + vec.array[0] = llvm_sqrt(vec.array[0]); + vec.array[1] = llvm_sqrt(vec.array[1]); + return vec; + } + } + else static if (GDC_with_SSE2) + { + return __builtin_ia32_sqrtpd(vec); + } + else + { + vec.ptr[0] = sqrt(vec.array[0]); + vec.ptr[1] = sqrt(vec.array[1]); + return vec; + } +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in +/// the lower element of result, and copy the upper element from `a` to the upper element of result. +__m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted +{ + // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only. + // "128-bit Legacy SSE version: The first source operand and the destination operand are the same. + // The quadword at bits 127:64 of the destination operand remains unchanged." + version(LDC) + { + // Disappeared with LDC 1.11 + static if (__VERSION__ < 2081) + { + __m128d c = __builtin_ia32_sqrtsd(b); + a[0] = c[0]; + return a; + } + else + { + a.array[0] = llvm_sqrt(b.array[0]); + return a; + } + } + else static if (GDC_with_SSE2) + { + __m128d c = __builtin_ia32_sqrtsd(b); + a.ptr[0] = c.array[0]; + return a; + } + else + { + a.ptr[0] = sqrt(b.array[0]); + return a; + } +} +unittest +{ + __m128d A = _mm_setr_pd(1.0, 3.0); + __m128d B = _mm_setr_pd(4.0, 5.0); + __m128d R = _mm_sqrt_sd(A, B); + double[2] correct = [2.0, 3.0 ]; + assert(R.array == correct); +} + +/// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits. +/// Bit-shift is a single value in the low-order 64-bit of `count`. 
+/// If bit-shift > 15, result is defined to be all sign bits. +/// Warning: prefer `_mm_srai_epi16`, less of a trap. +__m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted +{ + static if (GDC_or_LDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count); + } + else + { + short8 sa = cast(short8)a; + long2 lc = cast(long2)count; + ulong bits = cast(ulong)(lc.array[0]); + if (bits > 15) + bits = 15; + short8 r = void; + foreach(i; 0..8) + r.ptr[i] = cast(short)(sa.array[i] >> bits); + return cast(int4)r; + } +} +unittest +{ + __m128i shift0 = _mm_setzero_si128(); + __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift + __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5); + __m128i A = _mm_setr_epi16(4, -9, 11, -32768, 4, -8, 11, -32768); + short[8] correct0 = (cast(short8)A).array; + short[8] correctX = [0, -1, 0, -1, 0, -1, 0, -1]; + short[8] correct2 = [1, -3, 2, -8192, 1, -2, 2, -8192]; + short8 B0 = cast(short8) _mm_sra_epi16(A, shift0); + short8 BX = cast(short8) _mm_sra_epi16(A, shiftX); + short8 B2 = cast(short8) _mm_sra_epi16(A, shift2); + assert(B0.array == correct0); + assert(BX.array == correctX); + assert(B2.array == correct2); +} + +/// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits. +/// Bit-shift is a single value in the low-order 64-bit of `count`. +/// If bit-shift > 31, result is defined to be all sign bits. +/// Note: prefer `_mm_srai_epi32`, less of a trap. +__m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted +{ + static if (GDC_or_LDC_with_SSE2) + { + return __builtin_ia32_psrad128(a, count); + } + else + { + int4 r = void; + long2 lc = cast(long2)count; + ulong bits = cast(ulong)(lc.array[0]); + if (bits > 31) + bits = 31; + r.ptr[0] = (a.array[0] >> bits); + r.ptr[1] = (a.array[1] >> bits); + r.ptr[2] = (a.array[2] >> bits); + r.ptr[3] = (a.array[3] >> bits); + return r; + } +} +unittest +{ + __m128i shift0 = _mm_setzero_si128(); + __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift + __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5); + __m128i A = _mm_setr_epi32(4, -9, 11, -2147483648); + int[4] correct0 = A.array; + int[4] correctX = [0, -1, 0, -1]; + int[4] correct2 = [1, -3, 2, -536870912]; + int4 B0 = cast(int4) _mm_sra_epi32(A, shift0); + int4 BX = cast(int4) _mm_sra_epi32(A, shiftX); + int4 B2 = cast(int4) _mm_sra_epi32(A, shift2); + assert(B0.array == correct0); + assert(BX.array == correctX); + assert(B2.array == correct2); +} + +/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits. +__m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted +{ + static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8); + } + else static if (LDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8); + } + else static if (LDC_with_ARM64) + { + short8 sa = cast(short8)a; + ubyte count = cast(ubyte)imm8; + if (count > 15) + count = 15; + short8 r = sa >> short8(count); + return cast(__m128i)r; + } + else + { + short8 sa = cast(short8)a; + short8 r = void; + + // Note: the intrinsics guarantee imm8[0..7] is taken, however + // D says "It's illegal to shift by the same or more bits + // than the size of the quantity being shifted" + // and it's UB instead. 
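+        // (Editor's note) The count is saturated at 15 below, so oversized shifts fill every lane
+        // with its sign bit rather than zeroing it, matching the x86 `psraw` behaviour.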
+ ubyte count = cast(ubyte)imm8; + if (count > 15) + count = 15; + foreach(i; 0..8) + r.ptr[i] = cast(short)(sa.array[i] >> count); + return cast(int4)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); + short8 B = cast(short8)( _mm_srai_epi16(A, 1) ); + short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) ); + short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ]; + assert(B.array == expectedB); + assert(B2.array == expectedB); + + short8 C = cast(short8)( _mm_srai_epi16(A, 18) ); + short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ]; + assert(C.array == expectedC); +} + +/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits. +__m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted +{ + static if (LDC_with_SSE2) + { + return __builtin_ia32_psradi128(a, cast(ubyte)imm8); + } + else static if (GDC_with_SSE2) + { + return __builtin_ia32_psradi128(a, cast(ubyte)imm8); + } + else + { + int4 r = void; + + // Note: the intrinsics guarantee imm8[0..7] is taken, however + // D says "It's illegal to shift by the same or more bits + // than the size of the quantity being shifted" + // and it's UB instead. + // See Issue: #56 + ubyte count = cast(ubyte) imm8; + if (count > 31) + count = 31; + + r.ptr[0] = (a.array[0] >> count); + r.ptr[1] = (a.array[1] >> count); + r.ptr[2] = (a.array[2] >> count); + r.ptr[3] = (a.array[3] >> count); + return r; + } +} +unittest +{ + __m128i A = _mm_setr_epi32(0, 2, 3, -4); + __m128i B = _mm_srai_epi32(A, 1); + __m128i B2 = _mm_srai_epi32(A, 1 + 256); + int[4] expectedB = [ 0, 1, 1, -2]; + assert(B.array == expectedB); + assert(B2.array == expectedB); + + __m128i C = _mm_srai_epi32(A, 32); + int[4] expectedC = [ 0, 0, 0, -1]; + assert(C.array == expectedC); + + __m128i D = _mm_srai_epi32(A, 0); + int[4] expectedD = [ 0, 2, 3, -4]; + assert(D.array == expectedD); +} + +/// Shift packed 16-bit integers in `a` right by `count` while shifting in zeros. +/// Bit-shift is a single value in the low-order 64-bit of `count`. +/// If bit-shift > 15, result is defined to be all zeroes. +/// Warning: prefer `_mm_srli_epi16`, less of a trap. +__m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted +{ + // PERF ARM64 + static if (GDC_or_LDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count); + } + else + { + short8 sa = cast(short8)a; + long2 lc = cast(long2)count; + ulong bits = cast(ulong)(lc.array[0]); + short8 r = void; + foreach(i; 0..8) + r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits); + if (bits > 15) + r = short8(0); + return cast(__m128i)r; + } +} +unittest +{ + __m128i shift0 = _mm_setzero_si128(); + __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift + __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5); + __m128i A = _mm_setr_epi16(4, -8, 11, -32768, 4, -8, 11, -32768); + short[8] correct0 = (cast(short8)A).array; + short[8] correctX = [0, 0, 0, 0, 0, 0, 0, 0]; + short[8] correct2 = [1, 16382, 2, 8192, 1, 16382, 2, 8192]; + short8 B0 = cast(short8) _mm_srl_epi16(A, shift0); + short8 BX = cast(short8) _mm_srl_epi16(A, shiftX); + short8 B2 = cast(short8) _mm_srl_epi16(A, shift2); + assert(B0.array == correct0); + assert(BX.array == correctX); + assert(B2.array == correct2); +} + +/// Shift packed 32-bit integers in `a` right by `count` while shifting in zeros. +/// Bit-shift is a single value in the low-order 64-bit of `count`. +/// If bit-shift > 31, result is defined to be all zeroes. +/// Note: prefer `_mm_srli_epi32`, less of a trap. 
+__m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted +{ + static if (GDC_or_LDC_with_SSE2) + { + return __builtin_ia32_psrld128(a, count); + } + else + { + int4 r = void; + long2 lc = cast(long2)count; + ulong bits = cast(ulong)(lc.array[0]); + r.ptr[0] = cast(uint)(a.array[0]) >> bits; + r.ptr[1] = cast(uint)(a.array[1]) >> bits; + r.ptr[2] = cast(uint)(a.array[2]) >> bits; + r.ptr[3] = cast(uint)(a.array[3]) >> bits; + if (bits > 31) // Same semantics as x86 instruction + r = int4(0); + return r; + } +} +unittest +{ + __m128i shift0 = _mm_setzero_si128(); + __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift + __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5); + __m128i A = _mm_setr_epi32(4, -8, 11, -0x80000000); + int[4] correct0 = A.array; + int[4] correctX = [0, 0, 0, 0]; + int[4] correct2 = [1, 1073741822, 2, 536870912]; + int4 B0 = cast(int4) _mm_srl_epi32(A, shift0); + int4 BX = cast(int4) _mm_srl_epi32(A, shiftX); + int4 B2 = cast(int4) _mm_srl_epi32(A, shift2); + assert(B0.array == correct0); + assert(BX.array == correctX); + assert(B2.array == correct2); +} + +/// Shift packed 64-bit integers in `a` right by `count` while shifting in zeroes. +/// Bit-shift is a single value in the low-order 64-bit of `count`. +/// If bit-shift > 63, result is defined to be all zeroes. +/// Note: prefer `_mm_srli_epi64`, less of a trap. +__m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted +{ + static if (GDC_or_LDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count); + } + else + { + long2 r; + long2 sa = cast(long2)a; + long2 lc = cast(long2)count; + ulong bits = cast(ulong)(lc.array[0]); + r.ptr[0] = cast(ulong)(sa.array[0]) >> bits; + r.ptr[1] = cast(ulong)(sa.array[1]) >> bits; + if (bits > 63) + r = long2(0); + return cast(__m128i)r; + } +} +unittest +{ + __m128i shift0 = _mm_setzero_si128(); + __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift + __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5); + __m128i A = _mm_setr_epi64(4, -9); + long[2] correct0 = [4, -9]; + long[2] correctX = [0, 0]; + long[2] correct2 = [1, 4611686018427387901]; + long2 B0 = cast(long2) _mm_srl_epi64(A, shift0); + long2 BX = cast(long2) _mm_srl_epi64(A, shiftX); + long2 B2 = cast(long2) _mm_srl_epi64(A, shift2); + assert(B0.array == correct0); + assert(BX.array == correctX); + assert(B2.array == correct2); +} + +/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros. +__m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted +{ + static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8); + } + else static if (LDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8); + } + else static if (LDC_with_ARM64) + { + short8 sa = cast(short8)a; + short8 r = cast(short8) _mm_setzero_si128(); + + ubyte count = cast(ubyte)imm8; + if (count >= 16) + return cast(__m128i)r; + + r = sa >>> short8(count); // This facility offered with LDC, but not DMD. 
+ return cast(__m128i)r; + } + else + { + short8 sa = cast(short8)a; + ubyte count = cast(ubyte)imm8; + + short8 r = cast(short8) _mm_setzero_si128(); + if (count >= 16) + return cast(__m128i)r; + + foreach(i; 0..8) + r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> count); + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); + short8 B = cast(short8)( _mm_srli_epi16(A, 1) ); + short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) ); + short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ]; + assert(B.array == expectedB); + assert(B2.array == expectedB); + + short8 C = cast(short8)( _mm_srli_epi16(A, 16) ); + short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0]; + assert(C.array == expectedC); + + short8 D = cast(short8)( _mm_srli_epi16(A, 0) ); + short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ]; + assert(D.array == expectedD); +} + + +/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros. +__m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted +{ + static if (GDC_with_SSE2) + { + return __builtin_ia32_psrldi128(a, cast(ubyte)imm8); + } + else static if (LDC_with_SSE2) + { + return __builtin_ia32_psrldi128(a, cast(ubyte)imm8); + } + else + { + ubyte count = cast(ubyte) imm8; + + // Note: the intrinsics guarantee imm8[0..7] is taken, however + // D says "It's illegal to shift by the same or more bits + // than the size of the quantity being shifted" + // and it's UB instead. + int4 r = _mm_setzero_si128(); + if (count >= 32) + return r; + r.ptr[0] = a.array[0] >>> count; + r.ptr[1] = a.array[1] >>> count; + r.ptr[2] = a.array[2] >>> count; + r.ptr[3] = a.array[3] >>> count; + return r; + } +} +unittest +{ + __m128i A = _mm_setr_epi32(0, 2, 3, -4); + __m128i B = _mm_srli_epi32(A, 1); + __m128i B2 = _mm_srli_epi32(A, 1 + 256); + int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE]; + assert(B.array == expectedB); + assert(B2.array == expectedB); + + __m128i C = _mm_srli_epi32(A, 255); + int[4] expectedC = [ 0, 0, 0, 0 ]; + assert(C.array == expectedC); +} + +/// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros. +__m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted +{ + // PERF DMD + static if (GDC_or_LDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8); + } + else + { + long2 r = cast(long2) _mm_setzero_si128(); + long2 sa = cast(long2)a; + + ubyte count = cast(ubyte) imm8; + if (count >= 64) + return cast(__m128i)r; + + r.ptr[0] = sa.array[0] >>> count; + r.ptr[1] = sa.array[1] >>> count; + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi64(8, -4); + long2 B = cast(long2) _mm_srli_epi64(A, 1); + long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512); + long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE]; + assert(B.array == expectedB); + assert(B2.array == expectedB); + + long2 C = cast(long2) _mm_srli_epi64(A, 64); + long[2] expectedC = [ 0, 0 ]; + assert(C.array == expectedC); +} + +/// Shift `v` right by `bytes` bytes while shifting in zeros. 
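+/// Example (editor's illustrative sketch):
+/// ---
+/// // `bytes` is a compile-time template argument; the shift moves data towards the low lanes.
+/// int4 r = cast(int4) _mm_srli_si128!4(_mm_setr_epi32(1, 2, 3, 4));
+/// // r.array == [2, 3, 4, 0]
+/// ---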
+__m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @trusted +{ + static if (bytes & 0xF0) + { + return _mm_setzero_si128(); + } + else static if (DMD_with_DSIMD) + { + return cast(__m128i) __simd_ib(XMM.PSRLDQ, v, bytes); + } + else static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2)v, cast(ubyte)(bytes * 8)); + } + else static if (DMD_with_32bit_asm) + { + asm pure nothrow @nogc @trusted + { + movdqu XMM0, v; + psrldq XMM0, bytes; + movdqu v, XMM0; + } + return v; + } + else static if (LDC_with_optimizations) + { + return cast(__m128i) shufflevectorLDC!(byte16, + bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7, + bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15) + (cast(byte16) v, cast(byte16)_mm_setzero_si128()); + } + else + { + byte16 A = cast(byte16)v; + byte16 R = void; + for (int n = 0; n < bytes; ++n) + R.ptr[15-n] = 0; + for (int n = bytes; n < 16; ++n) + R.ptr[15-n] = A.array[15 - n + bytes]; + return cast(__m128i)R; + } +} +unittest +{ + __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, -2, 1)); + int[4] correct = [-2, 3, 4, 0]; + assert(R.array == correct); + + __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1)); + int[4] expectedA = [0, 0, 0, 0]; + assert(A.array == expectedA); +} + +/// Shift `v` right by `bytes` bytes while shifting in zeros. +/// #BONUS +__m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe +{ + return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v); +} +unittest +{ + __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f)); + float[4] correct = [3.0f, 4.0f, 0, 0]; + assert(R.array == correct); +} + +/// Shift `v` right by `bytes` bytes while shifting in zeros. +/// #BONUS +__m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe +{ + return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v); +} + +/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. +/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. +void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted +{ + pragma(inline, true); + __m128d* aligned = cast(__m128d*)mem_addr; + *aligned = a; +} +unittest +{ + align(16) double[2] A; + __m128d B = _mm_setr_pd(-8.0, 9.0); + _mm_store_pd(A.ptr, B); + assert(A == [-8.0, 9.0]); +} + +/// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory. +/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. +void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted +{ + __m128d* aligned = cast(__m128d*)mem_addr; + __m128d r; // PERF =void; + r.ptr[0] = a.array[0]; + r.ptr[1] = a.array[0]; + *aligned = r; +} + +/// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to +/// be aligned on any particular boundary. +void _mm_store_sd (double* mem_addr, __m128d a) pure @safe +{ + pragma(inline, true); + *mem_addr = a.array[0]; +} + +/// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a +/// general-protection exception may be generated. +void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe +{ + pragma(inline, true); + *mem_addr = a; +} + +alias _mm_store1_pd = _mm_store_pd1; /// + +/// Store the upper double-precision (64-bit) floating-point element from `a` into memory. 
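+/// Example (editor's illustrative sketch):
+/// ---
+/// double hi;
+/// _mm_storeh_pd(&hi, _mm_setr_pd(1.0, 2.0)); // hi == 2.0
+/// ---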
+void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe +{ + pragma(inline, true); + *mem_addr = a.array[1]; +} + +// Note: `mem_addr` doesn't have to actually be aligned, which breaks +// expectations from the user point of view. This problem also exist in C++. +void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe +{ + pragma(inline, true); + long* dest = cast(long*)mem_addr; + long2 la = cast(long2)a; + *dest = la.array[0]; +} +unittest +{ + long[3] A = [1, 2, 3]; + _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000)); + long[3] correct = [1, 0x1_0000_0000, 3]; + assert(A == correct); +} + +/// Store the lower double-precision (64-bit) floating-point element from `a` into memory. +void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe +{ + pragma(inline, true); + *mem_addr = a.array[0]; +} + +/// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse +/// order. `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception +/// may be generated. +void _mm_storer_pd (double* mem_addr, __m128d a) pure @system +{ + __m128d reversed = void; + reversed.ptr[0] = a.array[1]; + reversed.ptr[1] = a.array[0]; + *cast(__m128d*)mem_addr = reversed; +} +unittest +{ + align(16) double[2] A = [0.0, 1.0]; + _mm_storer_pd(A.ptr, _mm_setr_pd(2.0, 3.0)); + assert(A[0] == 3.0 && A[1] == 2.0); +} + +/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from +/// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary. +void _mm_storeu_pd (double* mem_addr, __m128d a) pure @trusted // TODO: signature, should be system +{ + // PERF DMD + pragma(inline, true); + static if (GDC_with_SSE2) + { + __builtin_ia32_storeupd(mem_addr, a); + } + else static if (LDC_with_optimizations) + { + storeUnaligned!double2(a, mem_addr); + } + else + { + mem_addr[0] = a.array[0]; + mem_addr[1] = a.array[1]; + } +} +unittest +{ + __m128d A = _mm_setr_pd(3.0, 4.0); + align(16) double[4] R = [0.0, 0, 0, 0]; + double[2] correct = [3.0, 4.0]; + _mm_storeu_pd(&R[1], A); + assert(R[1..3] == correct); +} + +/// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular +/// boundary. +void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @trusted // TODO: signature is wrong, mem_addr is not aligned. Make it @system +{ + // PERF: DMD + pragma(inline, true); + static if (GDC_with_SSE2) + { + __builtin_ia32_storedqu(cast(char*)mem_addr, cast(ubyte16)a); + } + else static if (LDC_with_optimizations) + { + storeUnaligned!__m128i(a, cast(int*)mem_addr); + } + else + { + int* p = cast(int*)mem_addr; + p[0] = a.array[0]; + p[1] = a.array[1]; + p[2] = a.array[2]; + p[3] = a.array[3]; + } +} +unittest +{ + __m128i A = _mm_setr_epi32(1, 2, 3, 4); + align(16) int[6] R = [0, 0, 0, 0, 0, 0]; + int[4] correct = [1, 2, 3, 4]; + _mm_storeu_si128(cast(__m128i*)(&R[1]), A); + assert(R[1..5] == correct); +} + +/// Store 16-bit integer from the first element of `a` into memory. +/// `mem_addr` does not need to be aligned on any particular boundary. +void _mm_storeu_si16 (void* mem_addr, __m128i a) pure @system +{ + short* dest = cast(short*)mem_addr; + *dest = (cast(short8)a).array[0]; +} +unittest +{ + short[2] arr = [-24, 12]; + _mm_storeu_si16(&arr[1], _mm_set1_epi16(26)); + short[2] correct = [-24, 26]; + assert(arr == correct); +} + +/// Store 32-bit integer from the first element of `a` into memory. 
+/// `mem_addr` does not need to be aligned on any particular boundary. +void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted // TODO should really be @ssytem +{ + pragma(inline, true); + int* dest = cast(int*)mem_addr; + *dest = a.array[0]; +} +unittest +{ + int[2] arr = [-24, 12]; + _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7)); + assert(arr == [-24, -1]); +} + +/// Store 64-bit integer from the first element of `a` into memory. +/// `mem_addr` does not need to be aligned on any particular boundary. +void _mm_storeu_si64 (void* mem_addr, __m128i a) pure @system +{ + pragma(inline, true); + long* dest = cast(long*)mem_addr; + long2 la = cast(long2)a; + *dest = la.array[0]; +} +unittest +{ + long[3] A = [1, 2, 3]; + _mm_storeu_si64(&A[1], _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000)); + long[3] correct = [1, 0x1_0000_0000, 3]; + assert(A == correct); +} + +/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) +/// from `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 16-byte +/// boundary or a general-protection exception may be generated. +/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads. +void _mm_stream_pd (double* mem_addr, __m128d a) pure @system +{ + // PERF DMD D_SIMD + static if (GDC_with_SSE2) + { + return __builtin_ia32_movntpd(mem_addr, a); + } + else static if (LDC_with_InlineIREx && LDC_with_optimizations) + { + enum prefix = `!0 = !{ i32 1 }`; + enum ir = ` + store <2 x double> %1, <2 x double>* %0, align 16, !nontemporal !0 + ret void`; + LDCInlineIREx!(prefix, ir, "", void, double2*, double2)(cast(double2*)mem_addr, a); + } + else + { + // Regular store instead. + __m128d* dest = cast(__m128d*)mem_addr; + *dest = a; + } +} +unittest +{ + align(16) double[2] A; + __m128d B = _mm_setr_pd(-8.0, 9.0); + _mm_stream_pd(A.ptr, B); + assert(A == [-8.0, 9.0]); +} + +/// Store 128-bits of integer data from a into memory using a non-temporal memory hint. +/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception +/// may be generated. +/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads. +void _mm_stream_si128 (__m128i* mem_addr, __m128i a) pure @trusted +{ + // PERF DMD D_SIMD + static if (GDC_with_SSE2) + { + return __builtin_ia32_movntdq (cast(long2*)mem_addr, cast(long2)a); + } + else static if (LDC_with_InlineIREx && LDC_with_optimizations) + { + enum prefix = `!0 = !{ i32 1 }`; + enum ir = ` + store <4 x i32> %1, <4 x i32>* %0, align 16, !nontemporal !0 + ret void`; + LDCInlineIREx!(prefix, ir, "", void, int4*, int4)(cast(int4*)mem_addr, a); + } + else + { + // Regular store instead. + __m128i* dest = cast(__m128i*)mem_addr; + *dest = a; + } +} +unittest +{ + align(16) int[4] A; + __m128i B = _mm_setr_epi32(-8, 9, 10, -11); + _mm_stream_si128(cast(__m128i*)A.ptr, B); + assert(A == [-8, 9, 10, -11]); +} + +/// Store 32-bit integer a into memory using a non-temporal hint to minimize cache +/// pollution. If the cache line containing address `mem_addr` is already in the cache, +/// the cache will be updated. +/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads. 
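+///
+/// A minimal usage sketch, mirroring the unittest below (`dst` is just an illustrative local):
+/// ---
+/// int dst;
+/// _mm_stream_si32(&dst, -34);
+/// _mm_sfence(); // make the non-temporal store visible to reader threads
+/// assert(dst == -34);
+/// ---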
+void _mm_stream_si32 (int* mem_addr, int a) pure @trusted +{ + // PERF DMD D_SIMD + static if (GDC_with_SSE2) + { + return __builtin_ia32_movnti(mem_addr, a); + } + else static if (LDC_with_InlineIREx && LDC_with_optimizations) + { + enum prefix = `!0 = !{ i32 1 }`; + enum ir = ` + store i32 %1, i32* %0, !nontemporal !0 + ret void`; + LDCInlineIREx!(prefix, ir, "", void, int*, int)(mem_addr, a); + } + else + { + // Regular store instead. + *mem_addr = a; + } +} +unittest +{ + int A; + _mm_stream_si32(&A, -34); + assert(A == -34); +} + +/// Store 64-bit integer a into memory using a non-temporal hint to minimize +/// cache pollution. If the cache line containing address `mem_addr` is already +/// in the cache, the cache will be updated. +/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads. +void _mm_stream_si64 (long* mem_addr, long a) pure @trusted +{ + // PERF DMD D_SIMD + static if (GDC_with_SSE2) + { + return __builtin_ia32_movnti64(mem_addr, a); + } + else static if (LDC_with_InlineIREx && LDC_with_optimizations) + { + enum prefix = `!0 = !{ i32 1 }`; + enum ir = ` + store i64 %1, i64* %0, !nontemporal !0 + ret void`; + LDCInlineIREx!(prefix, ir, "", void, long*, long)(mem_addr, a); + + } + else + { + // Regular store instead. + *mem_addr = a; + } +} +unittest +{ + long A; + _mm_stream_si64(&A, -46); + assert(A == -46); +} + +/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`. +__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe +{ + pragma(inline, true); + return cast(__m128i)(cast(short8)a - cast(short8)b); +} +unittest +{ + __m128i A = _mm_setr_epi16(16, 32767, 1, 2, 3, 4, 6, 6); + __m128i B = _mm_setr_epi16(15, -32768, 6, 8, 1000, 1, 5, 6); + short8 C = cast(short8) _mm_sub_epi16(A, B); + short[8] correct = [ 1, -1,-5,-6, -997, 3, 1, 0]; + assert(C.array == correct); +} + +/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. +__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe +{ + pragma(inline, true); + return cast(__m128i)(cast(int4)a - cast(int4)b); +} +unittest +{ + __m128i A = _mm_setr_epi32(16, int.max, 1, 8); + __m128i B = _mm_setr_epi32(15, int.min, 6, 2); + int4 C = cast(int4) _mm_sub_epi32(A, B); + int[4] correct = [ 1, -1,-5, 6]; + assert(C.array == correct); +} + +/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`. +__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe +{ + pragma(inline, true); + return cast(__m128i)(cast(long2)a - cast(long2)b); +} +unittest +{ + __m128i A = _mm_setr_epi64( 16, long.max); + __m128i B = _mm_setr_epi64( 199, long.min); + long2 C = cast(long2) _mm_sub_epi64(A, B); + long[2] correct = [-183, -1]; + assert(C.array == correct); +} + +/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`. +__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe +{ + pragma(inline, true); + return cast(__m128i)(cast(byte16)a - cast(byte16)b); +} +unittest +{ + __m128i A = _mm_setr_epi8(16, 127, 1, 2, 3, 4, 6, 6, 16, 127, 1, 2, 3, 4, 6, 6); + __m128i B = _mm_setr_epi8(15, -128, 6, 8, 3, 1, 5, 6, 16, 127, 1, 2, 3, 4, 6, 6); + byte16 C = cast(byte16) _mm_sub_epi8(A, B); + byte[16] correct = [ 1, -1,-5,-6, 0, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + assert(C.array == correct); +} + +/// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit) +/// floating-point elements in `a`. 
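+/// Each lane of the result is simply `a[i] - b[i]`.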
+__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe +{ + pragma(inline, true); + return a - b; +} +unittest +{ + __m128d A = _mm_setr_pd(4000.0, -8.0); + __m128d B = _mm_setr_pd(12.0, -8450.0); + __m128d C = _mm_sub_pd(A, B); + double[2] correct = [3988.0, 8442.0]; + assert(C.array == correct); +} + +/// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit) +/// floating-point element in `a`, store that in the lower element of result, and copy the upper element from `a` to the +/// upper element of result. +__m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted +{ + version(DigitalMars) + { + // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 + // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again + asm pure nothrow @nogc @trusted { nop;} + a[0] = a[0] - b[0]; + return a; + } + else static if (GDC_with_SSE2) + { + return __builtin_ia32_subsd(a, b); + } + else + { + a.ptr[0] -= b.array[0]; + return a; + } +} +unittest +{ + __m128d a = [1.5, -2.0]; + a = _mm_sub_sd(a, a); + assert(a.array == [0.0, -2.0]); +} + +/// Subtract 64-bit integer `b` from 64-bit integer `a`. +__m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe +{ + pragma(inline, true); + return a - b; +} +unittest +{ + __m64 A, B; + A = -1214; + B = 489415; + __m64 C = _mm_sub_si64(B, A); + assert(C.array[0] == 489415 + 1214); +} + +/// Subtract packed signed 16-bit integers in `b` from packed 16-bit integers in `a` using +/// saturation. +__m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted +{ + // PERF DMD psubsw + static if(LDC_with_saturated_intrinsics) + { + return cast(__m128i) inteli_llvm_subs!short8(cast(short8)a, cast(short8)b); + } + else static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b); + } + else + { + short[8] res; // PERF =void; + short8 sa = cast(short8)a; + short8 sb = cast(short8)b; + foreach(i; 0..8) + res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]); + return _mm_loadu_si128(cast(int4*)res.ptr); + } +} +unittest +{ + short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0), + _mm_setr_epi16(-10 , 16, 5, 4, 3, 2, 1, 0)); + static immutable short[8] correctResult = [32767, -32768, 0, 0, 0, 0, 0, 0]; + assert(res.array == correctResult); +} + +/// Subtract packed signed 8-bit integers in `b` from packed 8-bit integers in `a` using +/// saturation. +__m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted +{ + static if(LDC_with_saturated_intrinsics) + { + return cast(__m128i) inteli_llvm_subs!byte16(cast(byte16)a, cast(byte16)b); + } + else static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_psubsb128(cast(ubyte16) a, cast(ubyte16) b); + } + else + { + byte[16] res; // PERF =void; + byte16 sa = cast(byte16)a; + byte16 sb = cast(byte16)b; + foreach(i; 0..16) + res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]); + return _mm_loadu_si128(cast(int4*)res.ptr); + } +} +unittest +{ + byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), + _mm_setr_epi8( 15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + static immutable byte[16] correctResult = [-128, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + assert(res.array == correctResult); +} + +/// Subtract packed 16-bit unsigned integers in `a` and `b` using unsigned saturation. 
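+/// Differences that would fall below 0 are clamped to 0 instead of wrapping around.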
+__m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted +{ + static if(LDC_with_saturated_intrinsics) + { + return cast(__m128i) inteli_llvm_subus!short8(cast(short8)a, cast(short8)b); + } + else static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b); + } + else + { + short[8] res; // PERF =void; + short8 sa = cast(short8)a; + short8 sb = cast(short8)b; + foreach(i; 0..8) + { + int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]); + res[i] = saturateSignedIntToUnsignedShort(sum); + } + return _mm_loadu_si128(cast(int4*)res.ptr); + } +} +unittest +{ + short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534, 1, 5, 4, 3, 2, 1, 0), + _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0)); + static immutable short[8] correct = [ 0, 0, 1, 0, 0, 2, 0, 0]; + assert(R.array == correct); +} + +/// Subtract packed 8-bit unsigned integers in `a` and `b` using unsigned saturation. +__m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted +{ + static if(LDC_with_saturated_intrinsics) + { + return cast(__m128i) inteli_llvm_subus!byte16(cast(byte16)a, cast(byte16)b); + } + else static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_psubusb128(cast(ubyte16) a, cast(ubyte16) b); + } + else + { + ubyte[16] res; // PERF =void; + byte16 sa = cast(byte16)a; + byte16 sb = cast(byte16)b; + foreach(i; 0..16) + res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i])); + return _mm_loadu_si128(cast(int4*)res.ptr); + } +} +unittest +{ + byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), + _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + static immutable byte[16] correctResult = [ 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + assert(res.array == correctResult); +} + +// Note: the only difference between these intrinsics is the signalling +// behaviour of quiet NaNs. This is incorrect but the case where +// you would want to differentiate between qNaN and sNaN and then +// treat them differently on purpose seems extremely rare. +alias _mm_ucomieq_sd = _mm_comieq_sd; /// +alias _mm_ucomige_sd = _mm_comige_sd; /// +alias _mm_ucomigt_sd = _mm_comigt_sd; /// +alias _mm_ucomile_sd = _mm_comile_sd; /// +alias _mm_ucomilt_sd = _mm_comilt_sd; /// +alias _mm_ucomineq_sd = _mm_comineq_sd; /// + +/// Return vector of type `__m128d` with undefined elements. +__m128d _mm_undefined_pd() pure @safe +{ + pragma(inline, true); + __m128d result = void; + return result; +} + +/// Return vector of type `__m128i` with undefined elements. +__m128i _mm_undefined_si128() pure @safe +{ + pragma(inline, true); + __m128i result = void; + return result; +} + +/// Unpack and interleave 16-bit integers from the high half of `a` and `b`. 
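+/// For example, high halves `[8, 9, 10, 11]` (from `a`) and `[16, 17, 18, 19]` (from `b`)
+/// interleave to `[8, 16, 9, 17, 10, 18, 11, 19]`, as in the unittest below.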
+__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @trusted +{ + static if (DMD_with_DSIMD) + { + return cast(__m128i) __simd(XMM.PUNPCKHWD, a, b); + } + else static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_punpckhwd128(cast(short8) a, cast(short8) b); + } + else static if (LDC_with_optimizations) + { + enum ir = `%r = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32> + ret <8 x i16> %r`; + return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b); + } + else static if (DMD_with_32bit_asm || LDC_with_x86_asm) + { + asm pure nothrow @nogc @trusted + { + movdqu XMM0, a; + movdqu XMM1, b; + punpckhwd XMM0, XMM1; + movdqu a, XMM0; + } + return a; + } + else + { + short8 r = void; + short8 sa = cast(short8)a; + short8 sb = cast(short8)b; + r.ptr[0] = sa.array[4]; + r.ptr[1] = sb.array[4]; + r.ptr[2] = sa.array[5]; + r.ptr[3] = sb.array[5]; + r.ptr[4] = sa.array[6]; + r.ptr[5] = sb.array[6]; + r.ptr[6] = sa.array[7]; + r.ptr[7] = sb.array[7]; + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi16(4, 5, 6, 7, 8, 9, 10, 11); + __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19); + short8 C = cast(short8)(_mm_unpackhi_epi16(A, B)); + short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19]; + assert(C.array == correct); +} + +/// Unpack and interleave 32-bit integers from the high half of `a` and `b`. +__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @trusted +{ + static if (DMD_with_DSIMD) + { + return cast(__m128i) __simd(XMM.PUNPCKHDQ, a, b); + } + else static if (GDC_with_SSE2) + { + return __builtin_ia32_punpckhdq128(a, b); + } + else static if (LDC_with_optimizations) + { + enum ir = `%r = shufflevector <4 x i32> %0, <4 x i32> %1, <4 x i32> + ret <4 x i32> %r`; + return LDCInlineIR!(ir, int4, int4, int4)(cast(int4)a, cast(int4)b); + } + else + { + __m128i r = void; + r.ptr[0] = a.array[2]; + r.ptr[1] = b.array[2]; + r.ptr[2] = a.array[3]; + r.ptr[3] = b.array[3]; + return r; + } +} +unittest +{ + __m128i A = _mm_setr_epi32(1, 2, 3, 4); + __m128i B = _mm_setr_epi32(5, 6, 7, 8); + __m128i C = _mm_unpackhi_epi32(A, B); + int[4] correct = [3, 7, 4, 8]; + assert(C.array == correct); +} + +/// Unpack and interleave 64-bit integers from the high half of `a` and `b`. +__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted +{ + static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b); + } + else + { + __m128i r = cast(__m128i)b; + r[0] = a[2]; + r[1] = a[3]; + return r; + } +} +unittest // Issue #36 +{ + __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333); + __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555); + long2 C = cast(long2)(_mm_unpackhi_epi64(A, B)); + long[2] correct = [0x33333333_33333333, 0x55555555_55555555]; + assert(C.array == correct); +} + +/// Unpack and interleave 8-bit integers from the high half of `a` and `b`. 
+__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @trusted +{ + static if (DMD_with_DSIMD) + { + return cast(__m128i) __simd(XMM.PUNPCKHBW, a, b); + } + else static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_punpckhbw128(cast(ubyte16)a, cast(ubyte16)b); + } + else static if (LDC_with_optimizations) + { + enum ir = `%r = shufflevector <16 x i8> %0, <16 x i8> %1, <16 x i32> + ret <16 x i8> %r`; + return cast(__m128i)LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); + } + else static if (DMD_with_32bit_asm || LDC_with_x86_asm) + { + asm pure nothrow @nogc @trusted + { + movdqu XMM0, a; + movdqu XMM1, b; + punpckhbw XMM0, XMM1; + movdqu a, XMM0; + } + return a; + } + else + { + byte16 r = void; + byte16 ba = cast(byte16)a; + byte16 bb = cast(byte16)b; + r.ptr[0] = ba.array[8]; + r.ptr[1] = bb.array[8]; + r.ptr[2] = ba.array[9]; + r.ptr[3] = bb.array[9]; + r.ptr[4] = ba.array[10]; + r.ptr[5] = bb.array[10]; + r.ptr[6] = ba.array[11]; + r.ptr[7] = bb.array[11]; + r.ptr[8] = ba.array[12]; + r.ptr[9] = bb.array[12]; + r.ptr[10] = ba.array[13]; + r.ptr[11] = bb.array[13]; + r.ptr[12] = ba.array[14]; + r.ptr[13] = bb.array[14]; + r.ptr[14] = ba.array[15]; + r.ptr[15] = bb.array[15]; + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + byte16 C = cast(byte16) _mm_unpackhi_epi8(A, B); + byte[16] correct = [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]; + assert(C.array == correct); +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`. +__m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @trusted +{ + // PERF DMD D_SIMD + static if (GDC_with_SSE2) + { + return __builtin_ia32_unpckhpd(a, b); + } + else static if (LDC_with_optimizations) + { + enum ir = `%r = shufflevector <2 x double> %0, <2 x double> %1, <2 x i32> + ret <2 x double> %r`; + return LDCInlineIR!(ir, double2, double2, double2)(a, b); + } + else + { + double2 r = void; + r.ptr[0] = a.array[1]; + r.ptr[1] = b.array[1]; + return r; + } +} +unittest +{ + __m128d A = _mm_setr_pd(4.0, 6.0); + __m128d B = _mm_setr_pd(7.0, 9.0); + __m128d C = _mm_unpackhi_pd(A, B); + double[2] correct = [6.0, 9.0]; + assert(C.array == correct); +} + +/// Unpack and interleave 16-bit integers from the low half of `a` and `b`. 
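+/// This mirrors `_mm_unpackhi_epi16`, but draws from elements 0..3 of each operand instead of 4..7.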
+__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @trusted +{ + static if (DMD_with_DSIMD) + { + return cast(__m128i) __simd(XMM.PUNPCKLWD, a, b); + } + else static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_punpcklwd128(cast(short8) a, cast(short8) b); + } + else static if (LDC_with_optimizations) + { + enum ir = `%r = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32> + ret <8 x i16> %r`; + return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b); + } + else static if (DMD_with_32bit_asm || LDC_with_x86_asm) + { + asm pure nothrow @nogc @trusted + { + movdqu XMM0, a; + movdqu XMM1, b; + punpcklwd XMM0, XMM1; + movdqu a, XMM0; + } + return a; + } + else + { + short8 r = void; + short8 sa = cast(short8)a; + short8 sb = cast(short8)b; + r.ptr[0] = sa.array[0]; + r.ptr[1] = sb.array[0]; + r.ptr[2] = sa.array[1]; + r.ptr[3] = sb.array[1]; + r.ptr[4] = sa.array[2]; + r.ptr[5] = sb.array[2]; + r.ptr[6] = sa.array[3]; + r.ptr[7] = sb.array[3]; + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); + __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15); + short8 C = cast(short8) _mm_unpacklo_epi16(A, B); + short[8] correct = [0, 8, 1, 9, 2, 10, 3, 11]; + assert(C.array == correct); +} + +/// Unpack and interleave 32-bit integers from the low half of `a` and `b`. +__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @trusted +{ + static if (DMD_with_DSIMD) + { + return cast(__m128i) __simd(XMM.PUNPCKLDQ, a, b); + } + else static if (GDC_with_SSE2) + { + return __builtin_ia32_punpckldq128(a, b); + } + else static if (LDC_with_optimizations) + { + enum ir = `%r = shufflevector <4 x i32> %0, <4 x i32> %1, <4 x i32> + ret <4 x i32> %r`; + return LDCInlineIR!(ir, int4, int4, int4)(cast(int4)a, cast(int4)b); + } + else + { + __m128i r; + r.ptr[0] = a.array[0]; + r.ptr[1] = b.array[0]; + r.ptr[2] = a.array[1]; + r.ptr[3] = b.array[1]; + return r; + } +} +unittest +{ + __m128i A = _mm_setr_epi32(1, 2, 3, 4); + __m128i B = _mm_setr_epi32(5, 6, 7, 8); + __m128i C = _mm_unpacklo_epi32(A, B); + int[4] correct = [1, 5, 2, 6]; + assert(C.array == correct); +} + +/// Unpack and interleave 64-bit integers from the low half of `a` and `b`. +__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted +{ + static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b); + } + else + { + long2 lA = cast(long2)a; + long2 lB = cast(long2)b; + long2 R; // PERF =void; + R.ptr[0] = lA.array[0]; + R.ptr[1] = lB.array[0]; + return cast(__m128i)R; + } +} +unittest // Issue #36 +{ + __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333); + __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555); + long2 C = cast(long2)(_mm_unpacklo_epi64(A, B)); + long[2] correct = [0x22222222_22222222, 0x44444444_44444444]; + assert(C.array == correct); +} + +/// Unpack and interleave 8-bit integers from the low half of `a` and `b`. 
+__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @trusted +{ + static if (DMD_with_DSIMD) + { + return cast(__m128i) __simd(XMM.PUNPCKLBW, a, b); + } + else static if (GDC_with_SSE2) + { + return cast(__m128i) __builtin_ia32_punpcklbw128(cast(ubyte16) a, cast(ubyte16) b); + } + else static if (LDC_with_optimizations) + { + enum ir = `%r = shufflevector <16 x i8> %0, <16 x i8> %1, <16 x i32> + ret <16 x i8> %r`; + return cast(__m128i)LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); + } + else static if (DMD_with_32bit_asm || LDC_with_x86_asm) + { + asm pure nothrow @nogc @trusted + { + movdqu XMM0, a; + movdqu XMM1, b; + punpcklbw XMM0, XMM1; + movdqu a, XMM0; + } + return a; + } + else + { + byte16 r = void; + byte16 ba = cast(byte16)a; + byte16 bb = cast(byte16)b; + r.ptr[0] = ba.array[0]; + r.ptr[1] = bb.array[0]; + r.ptr[2] = ba.array[1]; + r.ptr[3] = bb.array[1]; + r.ptr[4] = ba.array[2]; + r.ptr[5] = bb.array[2]; + r.ptr[6] = ba.array[3]; + r.ptr[7] = bb.array[3]; + r.ptr[8] = ba.array[4]; + r.ptr[9] = bb.array[4]; + r.ptr[10] = ba.array[5]; + r.ptr[11] = bb.array[5]; + r.ptr[12] = ba.array[6]; + r.ptr[13] = bb.array[6]; + r.ptr[14] = ba.array[7]; + r.ptr[15] = bb.array[7]; + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + byte16 C = cast(byte16) _mm_unpacklo_epi8(A, B); + byte[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]; + assert(C.array == correct); +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`. +__m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @trusted +{ + // PERF DMD D_SIMD + static if (GDC_with_SSE2) + { + return __builtin_ia32_unpcklpd(a, b); + } + else static if (LDC_with_optimizations) + { + enum ir = `%r = shufflevector <2 x double> %0, <2 x double> %1, <2 x i32> + ret <2 x double> %r`; + return LDCInlineIR!(ir, double2, double2, double2)(a, b); + } + else + { + double2 r = void; + r.ptr[0] = a.array[0]; + r.ptr[1] = b.array[0]; + return r; + } +} +unittest +{ + __m128d A = _mm_setr_pd(4.0, 6.0); + __m128d B = _mm_setr_pd(7.0, 9.0); + __m128d C = _mm_unpacklo_pd(A, B); + double[2] correct = [4.0, 7.0]; + assert(C.array == correct); +} + +/// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`. +__m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe +{ + return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b); +} +unittest +{ + __m128d A = _mm_setr_pd(-4.0, 6.0); + __m128d B = _mm_setr_pd(4.0, -6.0); + long2 R = cast(long2) _mm_xor_pd(A, B); + long[2] correct = [long.min, long.min]; + assert(R.array == correct); +} + +/// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`. 
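+/// XOR-ing a value with itself yields zero, though `_mm_setzero_si128()` expresses that intent more directly.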
+__m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe +{ + return a ^ b; +} +unittest +{ + __m128i A = _mm_setr_epi64(975394, 619809709); + __m128i B = _mm_setr_epi64(-920275025, -6); + long2 R = cast(long2) _mm_xor_si128(A, B); + long[2] correct = [975394 ^ (-920275025L), 619809709L ^ -6]; + assert(R.array == correct); +} + +unittest +{ + float distance(float[4] a, float[4] b) nothrow @nogc + { + __m128 va = _mm_loadu_ps(a.ptr); + __m128 vb = _mm_loadu_ps(b.ptr); + __m128 diffSquared = _mm_sub_ps(va, vb); + diffSquared = _mm_mul_ps(diffSquared, diffSquared); + __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared)); + sum = _mm_add_ps(sum, _mm_srli_ps!4(sum)); + return _mm_cvtss_f32(_mm_sqrt_ss(sum)); + } + assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2); +} diff --git a/external/inteli/internals.d b/external/inteli/internals.d new file mode 100644 index 0000000..fe37bca --- /dev/null +++ b/external/inteli/internals.d @@ -0,0 +1,1988 @@ +/** +* Internal stuff only, do not import. +* +* Copyright: Copyright Guillaume Piolat 2016-2025, Stefanos Baziotis 2019. +* cet 2024. +* Copyright Kitsunebi Games 2025. +* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) +*/ +module inteli.internals; + +import inteli.types; + +package: +nothrow: +@nogc: + +// nurt compatibility + +version(Have_nurt) +{ + import numem.core.hooks : nu_malloc, nu_free, nu_memcpy; + public import core.internal.exception : onOutOfMemoryError; + alias malloc = nu_malloc; + alias free = nu_free; + alias memcpy = nu_memcpy; +} +else +{ + public import core.stdc.stdlib: malloc, free; + public import core.stdc.string: memcpy; + public import core.exception: onOutOfMemoryError; +} + +// The only math functions needed for intel-intrinsics +public import core.math: sqrt; +public import core.bitop: bsf, bsr; + + + +/// Helps portability with yet unsupported platforms +void __warn_noop(string fname = __FUNCTION__)() +{ + pragma(msg, "Warning: ", fname, " is currently not supported, it will become a NO-OP!"); +} +///ditto +RetT __warn_noop_ret(RetT, string fname = __FUNCTION__)(RetT rval = RetT.init) + if (!is(RetT == void)) +{ + pragma(msg, "Warning: ", fname, " is currently not supported, it will become a NO-OP!"); + return rval; +} + + + +version(GNU) +{ + version (X86) + { + // For 32-bit x86, disable vector extensions with GDC. + // It just doesn't work well. + enum GDC_with_x86 = true; + enum GDC_with_MMX = false; + enum GDC_with_SSE = false; + enum GDC_with_SSE2 = false; + enum GDC_with_SSE3 = false; + enum GDC_with_SSSE3 = false; + enum GDC_with_SSE41 = false; + enum GDC_with_SSE42 = false; + enum GDC_with_AVX = false; + enum GDC_with_AVX2 = false; + enum GDC_with_SHA = false; + enum GDC_with_BMI2 = false; + } + else version (X86_64) + { + // GDC support uses extended inline assembly: + // https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html (general information and hints) + // https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html (binding variables to registers) + // https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names) + + public import core.simd: byte16, short8, int4, float4, double2; + + // NOTE: These intrinsics are not available in every i386 and x86_64 CPU. 
+ // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html + public import gcc.builtins; + + // TODO: SSE and SSE2 should be truly optional instead, in the future, if we + // want to support other archs with GDC + + enum GDC_with_x86 = true; + enum GDC_with_MMX = true; // We don't have a way to detect that at CT, but we assume it's there + enum GDC_with_SSE = true; // We don't have a way to detect that at CT, but we assume it's there + enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there + + static if (__VERSION__ >= 2100) // Starting at GDC 12.1 + { + enum GDC_with_SSE3 = __traits(compiles, __builtin_ia32_haddps); + enum GDC_with_SSSE3 = __traits(compiles, __builtin_ia32_pmulhrsw128); + enum GDC_with_SSE41 = __traits(compiles, __builtin_ia32_dpps); + enum GDC_with_SSE42 = __traits(compiles, __builtin_ia32_pcmpgtq); + enum GDC_with_AVX = __traits(compiles, __builtin_ia32_vbroadcastf128_pd256); + enum GDC_with_AVX2 = __traits(compiles, __builtin_ia32_gathersiv2df); + enum GDC_with_BMI2 = __traits(compiles, __builtin_ia32_pext_si); + + } + else + { + // Before GCC 11.3, no reliable way to detect instruction sets. + // We start above detection at GCC 12, with DMDFE 2.100, which + // is more conservative. + enum GDC_with_SSE3 = false; + enum GDC_with_SSSE3 = false; + enum GDC_with_SSE41 = false; + enum GDC_with_SSE42 = false; + enum GDC_with_AVX = false; + enum GDC_with_AVX2 = false; + enum GDC_with_BMI2 = false; + } + + enum GDC_with_SHA = false; // TODO: detect that + } + else + { + enum GDC_with_x86 = false; + enum GDC_with_MMX = false; + enum GDC_with_SSE = false; + enum GDC_with_SSE2 = false; + enum GDC_with_SSE3 = false; + enum GDC_with_SSSE3 = false; + enum GDC_with_SSE41 = false; + enum GDC_with_SSE42 = false; + enum GDC_with_AVX = false; + enum GDC_with_AVX2 = false; + enum GDC_with_SHA = false; + enum GDC_with_BMI2 = false; + } +} +else +{ + enum GDC_with_x86 = false; + enum GDC_with_MMX = false; + enum GDC_with_SSE = false; + enum GDC_with_SSE2 = false; + enum GDC_with_SSE3 = false; + enum GDC_with_SSSE3 = false; + enum GDC_with_SSE41 = false; + enum GDC_with_SSE42 = false; + enum GDC_with_AVX = false; + enum GDC_with_AVX2 = false; + enum GDC_with_SHA = false; + enum GDC_with_BMI2 = false; +} + +version(LDC) +{ + public import core.simd; + public import ldc.simd; + public import ldc.intrinsics; + public import ldc.llvmasm: __asm; + + version (X86) + private enum bool some_x86 = true; + else version (X86_64) + private enum bool some_x86 = true; + else + private enum bool some_x86 = false; + + // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR + static if (__VERSION__ >= 2083) + { + import ldc.llvmasm; + alias LDCInlineIR = __ir_pure; + + // A version of inline IR with prefix/suffix didn't exist before LDC 1.13 + alias LDCInlineIREx = __irEx_pure; + + enum bool LDC_with_InlineIREx = true; + } + else + { + alias LDCInlineIR = inlineIR; + enum bool LDC_with_InlineIREx = false; + } + + // This is used to disable LDC feature that are expensive at compile time: + // everything that relies on inline LLVM IR. + version(D_Optimized) + { + enum bool LDC_with_optimizations = true; + } + else + { + static if (__VERSION__ < 2101) + { + // See Issue #136, D_Optimized only appeared in DMDFE 2.101. + // Relying on this had terrible consequences. 
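+            // For front-ends older than 2.101 (no D_Optimized version), assume the
+            // optimized code paths are usable rather than silently taking the slow path.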
+ enum bool LDC_with_optimizations = true; + } + else + enum bool LDC_with_optimizations = false; + } + + version(ARM) + { + public import ldc.gccbuiltins_arm; + + enum LDC_with_ARM32 = true; + enum LDC_with_ARM64 = false; + enum LDC_with_ARM64_CRC = false; + enum LDC_with_SSE = false; + enum LDC_with_SSE2 = false; + enum LDC_with_SSE3 = false; + enum LDC_with_SSSE3 = false; + enum LDC_with_SSE41 = false; + enum LDC_with_SSE42 = false; + enum LDC_with_CRC32 = false; + enum LDC_with_AVX = false; + enum LDC_with_F16C = false; + enum LDC_with_AVX2 = false; + enum LDC_with_SHA = false; + enum LDC_with_BMI2 = false; + } + else version(AArch64) + { + public import ldc.gccbuiltins_aarch64; + enum LDC_with_ARM32 = false; + enum LDC_with_ARM64 = true; // implies "has Neon" + enum LDC_with_ARM64_CRC = __traits(targetHasFeature, "crc"); + enum LDC_with_SSE = false; + enum LDC_with_SSE2 = false; + enum LDC_with_SSE3 = false; + enum LDC_with_SSSE3 = false; + enum LDC_with_SSE41 = false; + enum LDC_with_SSE42 = false; + enum LDC_with_CRC32 = false; + enum LDC_with_AVX = false; + enum LDC_with_F16C = false; + enum LDC_with_AVX2 = false; + enum LDC_with_SHA = false; + enum LDC_with_BMI2 = false; + } + else static if (some_x86) + { + public import ldc.gccbuiltins_x86; + + // Workaround LDC 1.32.0 having NO builtins at all. + // See LDC issue 4347 https://github.com/ldc-developers/ldc/issues/4347 + enum LDC_with_ia32_builtins = __traits(compiles, __builtin_ia32_clflush); // This one must be available in all of LDC history. + + static if (!LDC_with_ia32_builtins) + { + // in case our __builtin_ia32_clflush workaround breaks + pragma(msg, "Warning: LDC v1.32.0 has no SIMD builtins. intel-intrinsics will use slow path. Please avoid LDC 1.32.0"); + } + + enum LDC_with_ARM32 = false; + enum LDC_with_ARM64 = false; + enum LDC_with_ARM64_CRC = false; + enum LDC_with_SSE = __traits(targetHasFeature, "sse") && LDC_with_ia32_builtins; + enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2") && LDC_with_ia32_builtins; + enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3") && LDC_with_ia32_builtins; + enum LDC_with_SSSE3 = __traits(targetHasFeature, "ssse3") && LDC_with_ia32_builtins; + enum LDC_with_SSE41 = __traits(targetHasFeature, "sse4.1") && LDC_with_ia32_builtins; + enum LDC_with_SSE42 = __traits(targetHasFeature, "sse4.2") && LDC_with_ia32_builtins; + + // Since LDC 1.30, crc32 is a separate (and sufficient) attribute from sse4.2 + // As of Jan 2023, GDC doesn't make that distinction, -msse4.2 includes -mcrc32 for GDC. 
+ static if (__VERSION__ >= 2100) + { + enum LDC_with_CRC32 = __traits(targetHasFeature, "crc32") && LDC_with_ia32_builtins; + } + else + { + enum LDC_with_CRC32 = __traits(targetHasFeature, "sse4.2") && LDC_with_ia32_builtins; // crc32 used to be included in sse4.2 + } + + enum LDC_with_AVX = __traits(targetHasFeature, "avx") && LDC_with_ia32_builtins; + enum LDC_with_F16C = __traits(targetHasFeature, "f16c") && LDC_with_ia32_builtins; + enum LDC_with_AVX2 = __traits(targetHasFeature, "avx2") && LDC_with_ia32_builtins; + enum LDC_with_SHA = __traits(targetHasFeature, "sha") && LDC_with_ia32_builtins; + enum LDC_with_BMI2 = __traits(targetHasFeature, "bmi2") && LDC_with_ia32_builtins; + } + else + { + enum LDC_with_ARM32 = false; + enum LDC_with_ARM64 = false; + enum LDC_with_ARM64_CRC = false; + enum LDC_with_SSE = false; + enum LDC_with_SSE2 = false; + enum LDC_with_SSE3 = false; + enum LDC_with_SSSE3 = false; + enum LDC_with_SSE41 = false; + enum LDC_with_SSE42 = false; + enum LDC_with_CRC32 = false; + enum LDC_with_AVX = false; + enum LDC_with_F16C = false; + enum LDC_with_AVX2 = false; + enum LDC_with_SHA = false; + enum LDC_with_BMI2 = false; + } + + // Should we use inline x86 assembly with DMD syntax, in LDC? + version(D_InlineAsm_X86) + { + enum LDC_with_32b_x86_asm = LDC_with_SSE2; // if no SSE support, disable the x86 asm code path + enum LDC_with_64b_x86_asm = false; + } + else version(D_InlineAsm_X86_64) + { + enum LDC_with_32b_x86_asm = false; + enum LDC_with_64b_x86_asm = LDC_with_SSE2; + } + else + { + enum LDC_with_32b_x86_asm = false; + enum LDC_with_64b_x86_asm = false; + } +} +else +{ + enum LDC_with_ARM32 = false; + enum LDC_with_ARM64 = false; + enum LDC_with_ARM64_CRC = false; + enum LDC_with_SSE = false; + enum LDC_with_SSE2 = false; + enum LDC_with_SSE3 = false; + enum LDC_with_SSSE3 = false; + enum LDC_with_SSE41 = false; + enum LDC_with_SSE42 = false; + enum LDC_with_CRC32 = false; + enum LDC_with_AVX = false; + enum LDC_with_F16C = false; + enum LDC_with_AVX2 = false; + enum LDC_with_SHA = false; + enum LDC_with_BMI2 = false; + + enum LDC_with_InlineIREx = false; + enum bool LDC_with_optimizations = false; + enum bool LDC_with_32b_x86_asm = false; + enum bool LDC_with_64b_x86_asm = false; +} +enum LDC_with_x86_asm = LDC_with_32b_x86_asm || LDC_with_64b_x86_asm; + + +enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64; + +version(DigitalMars) +{ + version(D_InlineAsm_X86) + enum DMD_with_asm = true; + else version(D_InlineAsm_X86_64) + enum DMD_with_asm = true; + else + enum DMD_with_asm = false; + + version(D_InlineAsm_X86) + enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution + else + enum DMD_with_32bit_asm = false; + + version (D_SIMD) + { + enum DMD_with_DSIMD = !SSESizedVectorsAreEmulated; + + // Going further, does DMD has SSE4.1 through -mcpu? 
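+        // (Vector `int4` multiplication lowers to PMULLD, an SSE4.1 instruction, so
+        //  `__traits(compiles)` on it doubles as a feature probe.)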
+ static if (DMD_with_DSIMD) + enum bool DMD_with_DSIMD_and_SSE41 = __traits(compiles, int4(0) * int4(0)); + else + enum bool DMD_with_DSIMD_and_SSE41 = false; + + // No DMD way to detect those instruction sets => pessimize + // would be cool to have a way to detect support for this at CT + enum DMD_with_DSIMD_and_SSE3 = DMD_with_DSIMD_and_SSE41; + enum DMD_with_DSIMD_and_SSSE3 = DMD_with_DSIMD_and_SSE41; + + version(D_AVX) + enum DMD_with_DSIMD_and_AVX = true; + else + enum DMD_with_DSIMD_and_AVX = false; + + version(D_AVX2) + enum DMD_with_DSIMD_and_AVX2 = true; + else + enum DMD_with_DSIMD_and_AVX2 = false; + + enum DMD_with_DSIMD_and_SSE42 = DMD_with_DSIMD_and_AVX; + } + else + { + enum DMD_with_DSIMD = false; + enum DMD_with_DSIMD_and_SSE3 = false; + enum DMD_with_DSIMD_and_SSSE3 = false; + enum DMD_with_DSIMD_and_SSE41 = false; + enum DMD_with_DSIMD_and_SSE42 = false; + enum DMD_with_DSIMD_and_AVX = false; + enum DMD_with_DSIMD_and_AVX2 = false; + } +} +else +{ + enum DMD_with_asm = false; + enum DMD_with_32bit_asm = false; + enum DMD_with_DSIMD = false; + enum DMD_with_DSIMD_and_SSE3 = false; + enum DMD_with_DSIMD_and_SSSE3 = false; + enum DMD_with_DSIMD_and_SSE41 = false; + enum DMD_with_DSIMD_and_SSE42 = false; + enum DMD_with_DSIMD_and_AVX = false; + enum DMD_with_DSIMD_and_AVX2 = false; +} + + +// Sometimes, can be helpful to merge builtin code, however keep in mind that +// LDC and GDC builtins often subtly diverge, wrt. unsigned vs signed vectors, +// return types, purity... test it in Godbolt! this is safer with float and double intrinsics. +enum GDC_or_LDC_with_SSE = GDC_with_SSE || LDC_with_SSE; +enum GDC_or_LDC_with_SSE2 = GDC_with_SSE2 || LDC_with_SSE2; +enum GDC_or_LDC_with_SSE3 = GDC_with_SSE3 || LDC_with_SSE3; +enum GDC_or_LDC_with_SSE41 = GDC_with_SSE41 || LDC_with_SSE41; +enum GDC_or_LDC_with_SSE42 = GDC_with_SSE42 || LDC_with_SSE42; + +enum GDC_or_LDC_with_AVX = GDC_with_AVX || LDC_with_AVX; +enum GDC_or_LDC_with_AVX2 = GDC_with_AVX2 || LDC_with_AVX2; +enum GDC_or_LDC_with_SHA = GDC_with_SHA || LDC_with_SHA; +enum GDC_or_LDC_with_BMI2 = GDC_with_BMI2 || LDC_with_BMI2; + +static if (__VERSION__ >= 2102) +{ + enum SIMD_COMPARISON_MASKS_8B = !MMXSizedVectorsAreEmulated; // can do < <= => > == with builtin 8 bytes __vectors. + enum SIMD_COMPARISON_MASKS_16B = !SSESizedVectorsAreEmulated; // can do < <= => > == with builtin 16 bytes __vectors. + enum SIMD_COMPARISON_MASKS_32B = !AVXSizedVectorsAreEmulated; // can do < <= => > == with builtin 32 bytes __vectors. +} +else +{ + enum SIMD_COMPARISON_MASKS_8B = false; + enum SIMD_COMPARISON_MASKS_16B = false; + enum SIMD_COMPARISON_MASKS_32B = false; +} + + +static if (LDC_with_ARM32) +{ + package uint arm_get_fpcr() nothrow @nogc @trusted + { + return __builtin_arm_get_fpscr(); + } + + package void arm_set_fpcr(uint cw) nothrow @nogc @trusted + { + __builtin_arm_set_fpscr(cw); + } +} + +static if (LDC_with_ARM64) +{ + pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr") + long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe; + + package uint arm_get_fpcr() pure nothrow @nogc @trusted + { + // LLVM intrinsic "llvm.aarch64.get.fpcr" seems buggy and doesn't return FPCR + return __asm!uint("mrs $0, fpcr", "=r"); + } + + package void arm_set_fpcr(uint cw) nothrow @nogc @trusted + { + // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR. 
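+        // Fall back to inline assembly: spill x2, load `cw` into it, `msr fpcr, x2`,
+        // then restore x2.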
+ long save_x2; + __asm!void("str x2, $1 \n" ~ + "ldr w2, $0 \n" ~ + "msr fpcr, x2 \n" ~ + "ldr x2, $1 " , "m,m", cw, &save_x2); + } +} + + +// For internal use only, since public API deals with a x86 semantic emulation +enum uint _MM_ROUND_NEAREST_ARM = 0x00000000; +enum uint _MM_ROUND_DOWN_ARM = 0x00800000; +enum uint _MM_ROUND_UP_ARM = 0x00400000; +enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000; +enum uint _MM_ROUND_MASK_ARM = 0x00C00000; +enum uint _MM_FLUSH_ZERO_MASK_ARM = 0x01000000; + + +// +// +// +// Why is that there? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE +// doesn't change the FPU rounding mode, and isn't expected to do so. +// So we devised these rounding function to help having consistent rounding between +// LDC and DMD. It's important that DMD uses whatever is in MXCSR to round. +// +// Note: There is no MXCSR in ARM. But there is fpcr/fpscr that implements similar +// functionality. +// https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register +// We use fpcr/fpscr since it's thread-local, so we can emulate those x86 conversion albeit slowly. + +int convertFloatToInt32UsingMXCSR(float value) @trusted +{ + int result; + version(GNU) + { + version(X86) + { + asm pure nothrow @nogc @trusted + { + "cvtss2si %1, %0\n": "=r"(result) : "x" (value); + } + } + else version(X86_64) + { + asm pure nothrow @nogc @trusted + { + "cvtss2si %1, %0\n": "=r"(result) : "x" (value); + } + } + else + { + // BUG: this is truncation instead of MXCSR + result = cast(int)value; + } + } + else static if (LDC_with_ARM32) + { + result = __asm!int(`vldr s2, $1 + vcvtr.s32.f32 s2, s2 + vmov $0, s2`, "=r,m,~{s2}", value); + } + else static if (LDC_with_ARM64) + { + // Get current rounding mode. + uint fpscr = arm_get_fpcr(); + + switch(fpscr & _MM_ROUND_MASK_ARM) + { + default: + case _MM_ROUND_NEAREST_ARM: result = vcvtns_s32_f32(value); break; + case _MM_ROUND_DOWN_ARM: result = vcvtms_s32_f32(value); break; + case _MM_ROUND_UP_ARM: result = vcvtps_s32_f32(value); break; + case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f32(value); break; + } + } + else + { + asm pure nothrow @nogc @trusted + { + cvtss2si EAX, value; + mov result, EAX; + } + } + return result; +} + +int convertDoubleToInt32UsingMXCSR(double value) @trusted +{ + int result; + version(GNU) + { + version(X86) + { + asm pure nothrow @nogc @trusted + { + "cvtsd2si %1, %0\n": "=r"(result) : "x" (value); + } + } + else version(X86_64) + { + asm pure nothrow @nogc @trusted + { + "cvtsd2si %1, %0\n": "=r"(result) : "x" (value); + } + } + else + { + // BUG: this is truncation instead of MXCSR + result = cast(int)value; + } + } + else static if (LDC_with_ARM32) + { + result = __asm!int(`vldr d2, $1 + vcvtr.s32.f64 s2, d2 + vmov $0, s2`, "=r,m,~{s2},~{d2}", value); + } + else static if (LDC_with_ARM64) + { + // Get current rounding mode. 
+ uint fpscr = arm_get_fpcr(); + + switch(fpscr & _MM_ROUND_MASK_ARM) + { + default: + case _MM_ROUND_NEAREST_ARM: result = vcvtns_s32_f64(value); break; + case _MM_ROUND_DOWN_ARM: result = vcvtms_s32_f64(value); break; + case _MM_ROUND_UP_ARM: result = vcvtps_s32_f64(value); break; + case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f64(value); break; + } + } + else + { + asm pure nothrow @nogc @trusted + { + cvtsd2si EAX, value; + mov result, EAX; + } + } + return result; +} + +long convertFloatToInt64UsingMXCSR(float value) @trusted +{ + static if (LDC_with_ARM32) + { + // We have to resort to libc since 32-bit ARM + // doesn't seem to have 64-bit registers. + + uint fpscr = arm_get_fpcr(); // Get current rounding mode. + + // Note: converting to double precision else rounding could be different for large integers + double asDouble = value; + + switch(fpscr & _MM_ROUND_MASK_ARM) + { + default: + case _MM_ROUND_NEAREST_ARM: return cast(long)(llvm_round(asDouble)); + case _MM_ROUND_DOWN_ARM: return cast(long)(llvm_floor(asDouble)); + case _MM_ROUND_UP_ARM: return cast(long)(llvm_ceil(asDouble)); + case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble); + } + } + else static if (LDC_with_ARM64) + { + uint fpscr = arm_get_fpcr(); + + switch(fpscr & _MM_ROUND_MASK_ARM) + { + default: + case _MM_ROUND_NEAREST_ARM: return vcvtns_s64_f32(value); + case _MM_ROUND_DOWN_ARM: return vcvtms_s64_f32(value); + case _MM_ROUND_UP_ARM: return vcvtps_s64_f32(value); + case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f32(value); + } + } + // 64-bit can use an SSE instruction + else version(D_InlineAsm_X86_64) + { + long result; + version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet." + { + asm pure nothrow @nogc @trusted + { + movss XMM0, value; + cvtss2si RAX, XMM0; + mov result, RAX; + } + } + else + { + asm pure nothrow @nogc @trusted + { + movss XMM0, value; + db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit) + mov result, RAX; + } + } + return result; + } + else version(D_InlineAsm_X86) + { + // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int + // This leads to an unfortunate FPU sequence in every C++ compiler. + // See: https://godbolt.org/z/vZym77 + + // Get current MXCSR rounding + uint sseRounding; + ushort savedFPUCW; + ushort newFPUCW; + long result; + asm pure nothrow @nogc @trusted + { + stmxcsr sseRounding; + fld value; + fnstcw savedFPUCW; + mov AX, savedFPUCW; + and AX, 0xf3ff; // clear FPU rounding bits + movzx ECX, word ptr sseRounding; + and ECX, 0x6000; // only keep SSE rounding bits + shr ECX, 3; + or AX, CX; // make a new control word for FPU with SSE bits + mov newFPUCW, AX; + fldcw newFPUCW; + fistp qword ptr result; // convert, respecting MXCSR (but not other control word things) + fldcw savedFPUCW; + } + return result; + } + else static if (GDC_with_x86) + { + version(X86_64) // 64-bit can just use the right instruction + { + static assert(GDC_with_SSE); + __m128 A; + A.ptr[0] = value; + return __builtin_ia32_cvtss2si64 (A); + } + else version(X86) // 32-bit + { + // This is untested! 
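+            // Same idea as the D_InlineAsm_X86 path above: splice the MXCSR rounding
+            // bits into a temporary x87 control word so that `fistpll` rounds the way
+            // SSE would.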
+ uint sseRounding; + ushort savedFPUCW; + ushort newFPUCW; + long result; + asm pure nothrow @nogc @trusted + { + "stmxcsr %1;\n" ~ + "fld %2;\n" ~ + "fnstcw %3;\n" ~ + "movw %3, %%ax;\n" ~ + "andw $0xf3ff, %%ax;\n" ~ + "movzwl %1, %%ecx;\n" ~ + "andl $0x6000, %%ecx;\n" ~ + "shrl $3, %%ecx;\n" ~ + "orw %%cx, %%ax\n" ~ + "movw %%ax, %4;\n" ~ + "fldcw %4;\n" ~ + "fistpll %0;\n" ~ + "fldcw %3;\n" + : "=m"(result) // %0 + : "m" (sseRounding), + "f" (value), + "m" (savedFPUCW), + "m" (newFPUCW) + : "eax", "ecx", "st"; + } + return result; + } + else + static assert(false); + } + else + { + // BUG + // This is a last result and wrong, typically + // for GDC architectures we don't yet support + return cast(long)value; + } +} + + +///ditto +long convertDoubleToInt64UsingMXCSR(double value) @trusted +{ + static if (LDC_with_ARM32) + { + // We have to resort to libc since 32-bit ARM + // doesn't seem to have 64-bit registers. + uint fpscr = arm_get_fpcr(); // Get current rounding mode. + switch(fpscr & _MM_ROUND_MASK_ARM) + { + default: + case _MM_ROUND_NEAREST_ARM: return cast(long)(llvm_round(value)); + case _MM_ROUND_DOWN_ARM: return cast(long)(llvm_floor(value)); + case _MM_ROUND_UP_ARM: return cast(long)(llvm_ceil(value)); + case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value); + } + } + else static if (LDC_with_ARM64) + { + // Get current rounding mode. + uint fpscr = arm_get_fpcr(); + + switch(fpscr & _MM_ROUND_MASK_ARM) + { + default: + case _MM_ROUND_NEAREST_ARM: return vcvtns_s64_f64(value); + case _MM_ROUND_DOWN_ARM: return vcvtms_s64_f64(value); + case _MM_ROUND_UP_ARM: return vcvtps_s64_f64(value); + case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f64(value); + } + } + // 64-bit can use an SSE instruction + else version(D_InlineAsm_X86_64) + { + long result; + version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet." + { + asm pure nothrow @nogc @trusted + { + movsd XMM0, value; + cvtsd2si RAX, XMM0; + mov result, RAX; + } + } + else + { + asm pure nothrow @nogc @trusted + { + movsd XMM0, value; + db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit) + mov result, RAX; + } + } + return result; + } + else version(D_InlineAsm_X86) + { + // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int + // This leads to an unfortunate FPU sequence in every C++ compiler. + // See: https://godbolt.org/z/vZym77 + + // Get current MXCSR rounding + uint sseRounding; + ushort savedFPUCW; + ushort newFPUCW; + long result; + asm pure nothrow @nogc @trusted + { + stmxcsr sseRounding; + fld value; + fnstcw savedFPUCW; + mov AX, savedFPUCW; + and AX, 0xf3ff; + movzx ECX, word ptr sseRounding; + and ECX, 0x6000; + shr ECX, 3; + or AX, CX; + mov newFPUCW, AX; + fldcw newFPUCW; + fistp result; + fldcw savedFPUCW; + } + return result; + } + else static if (GDC_with_x86) + { + version(X86_64) + { + static assert(GDC_with_SSE2); + __m128d A; + A.ptr[0] = value; + return __builtin_ia32_cvtsd2si64 (A); + } + else + { + // This is untested! 
+ uint sseRounding; + ushort savedFPUCW; + ushort newFPUCW; + long result; + asm pure nothrow @nogc @trusted + { + "stmxcsr %1;\n" ~ + "fld %2;\n" ~ + "fnstcw %3;\n" ~ + "movw %3, %%ax;\n" ~ + "andw $0xf3ff, %%ax;\n" ~ + "movzwl %1, %%ecx;\n" ~ + "andl $0x6000, %%ecx;\n" ~ + "shrl $3, %%ecx;\n" ~ + "orw %%cx, %%ax\n" ~ + "movw %%ax, %4;\n" ~ + "fldcw %4;\n" ~ + "fistpll %0;\n" ~ + "fldcw %3;\n" + : "=m"(result) // %0 + : "m" (sseRounding), + "t" (value), + "m" (savedFPUCW), + "m" (newFPUCW) + : "eax", "ecx", "st"; + } + return result; + } + } + else + { + // BUG + // This is a last result and wrong, typically + // for GDC architectures we don't yet support + return cast(long)value; + } +} + +// +// +// + + +// using the Intel terminology here + +byte saturateSignedWordToSignedByte(short value) pure @safe +{ + if (value > 127) value = 127; + if (value < -128) value = -128; + return cast(byte) value; +} + +ubyte saturateSignedWordToUnsignedByte(short value) pure @safe +{ + if (value > 255) value = 255; + if (value < 0) value = 0; + return cast(ubyte) value; +} + +short saturateSignedIntToSignedShort(int value) pure @safe +{ + if (value > 32767) value = 32767; + if (value < -32768) value = -32768; + return cast(short) value; +} + +ushort saturateSignedIntToUnsignedShort(int value) pure @safe +{ + if (value > 65535) value = 65535; + if (value < 0) value = 0; + return cast(ushort) value; +} + +unittest // test saturate operations +{ + assert( saturateSignedWordToSignedByte(32000) == 127); + assert( saturateSignedWordToUnsignedByte(32000) == 255); + assert( saturateSignedWordToSignedByte(-4000) == -128); + assert( saturateSignedWordToUnsignedByte(-4000) == 0); + assert( saturateSignedIntToSignedShort(32768) == 32767); + assert( saturateSignedIntToUnsignedShort(32768) == 32768); + assert( saturateSignedIntToSignedShort(-32769) == -32768); + assert( saturateSignedIntToUnsignedShort(-32769) == 0); +} + +version(unittest) +{ + // This is just for debugging tests + import core.stdc.stdio: printf; + + // printing vectors for implementation + // Note: you can override `pure` within a `debug` clause + + void _mm_print_pi64(__m64 v) @trusted + { + long1 vl = cast(long1)v; + printf("%lld\n", vl.array[0]); + } + + void _mm_print_pi32(__m64 v) @trusted + { + int[2] C = (cast(int2)v).array; + printf("%d %d\n", C[0], C[1]); + } + + void _mm_print_pi16(__m64 v) @trusted + { + short[4] C = (cast(short4)v).array; + printf("%d %d %d %d\n", C[0], C[1], C[2], C[3]); + } + + void _mm_print_pi8(__m64 v) @trusted + { + byte[8] C = (cast(byte8)v).array; + printf("%d %d %d %d %d %d %d %d\n", + C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]); + } + + void _mm_print_epi64(__m128i v) @trusted + { + long2 vl = cast(long2)v; + printf("%lld %lld\n", vl.array[0], vl.array[1]); + } + + void _mm_print_epi32(__m128i v) @trusted + { + printf("%d %d %d %d\n", + v.array[0], v.array[1], v.array[2], v.array[3]); + } + + void _mm_print_epi16(__m128i v) @trusted + { + short[8] C = (cast(short8)v).array; + printf("%d %d %d %d %d %d %d %d\n", + C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]); + } + + void _mm_print_epi8(__m128i v) @trusted + { + byte[16] C = (cast(byte16)v).array; + printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", + C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15]); + } + + void _mm_print_ps(__m128 v) @trusted + { + // %g because %f can conceal very small numbers and prints zero instead + float[4] C = (cast(float4)v).array; + printf("%g %g %g %g\n", C[0], 
C[1], C[2], C[3]); + } + + void _mm_print_pd(__m128d v) @trusted + { + double[2] C = (cast(double2)v).array; + printf("%f %f\n", C[0], C[1]); + } + + void _mm256_print_pd(__m256d v) @trusted + { + // %g because %f can conceal very small numbers and prints zero instead + printf("%g %g %g %g\n", v.array[0], v.array[1], v.array[2], v.array[3]); + } + + void _mm256_print_ps(__m256 v) @trusted + { + // %g because %f can conceal very small numbers and prints zero instead + printf("%g %g %g %g %g %g %g %g\n", + v.array[0], v.array[1], v.array[2], v.array[3], + v.array[4], v.array[5], v.array[6], v.array[7]); + } + + void _mm256_print_epi16(__m256i v) @trusted + { + short16 vl = cast(short16)v; + printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", + vl.array[0], vl.array[1], vl.array[2], vl.array[3], + vl.array[4], vl.array[5], vl.array[6], vl.array[7], + vl.array[8], vl.array[9], vl.array[10], vl.array[11], + vl.array[12], vl.array[13], vl.array[14], vl.array[15]); + } + + void _mm256_print_epi32(__m256i v) @trusted + { + int8 vl = cast(int8)v; + printf("%d %d %d %d %d %d %d %d\n", vl.array[0], vl.array[1], vl.array[2], vl.array[3], + vl.array[4], vl.array[5], vl.array[6], vl.array[7]); + } + + void _mm256_print_epi64(__m256i v) @trusted + { + long4 vl = cast(long4)v; + printf("%lld %lld %lld %lld\n", vl.array[0], vl.array[1], vl.array[2], vl.array[3]); + } + + void _mm256_print_epi8(__m256i v) @trusted + { + byte[32] C = (cast(byte32)v).array; + printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", + C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], + C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15], + C[16], C[17], C[18], C[19], C[20], C[21], C[22], C[23], + C[24], C[25], C[26], C[27], C[28], C[29], C[30], C[31]); + + } +} + + +// +// +// +// Note: `ldc.simd` cannot express all nuances of FP comparisons, so we +// need different IR generation. + +enum FPComparison +{ + false_,// always false + oeq, // ordered and equal + ogt, // ordered and greater than + oge, // ordered and greater than or equal + olt, // ordered and less than + ole, // ordered and less than or equal + one, // ordered and not equal + ord, // ordered (no nans) + ueq, // unordered or equal + ugt, // unordered or greater than ("nle") + uge, // unordered or greater than or equal ("nlt") + ult, // unordered or less than ("nge") + ule, // unordered or less than or equal ("ngt") + une, // unordered or not equal ("neq") + uno, // unordered (either nans) + true_, // always true +} + +private static immutable string[FPComparison.max+1] FPComparisonToString = +[ + "false", + "oeq", + "ogt", + "oge", + "olt", + "ole", + "one", + "ord", + "ueq", + "ugt", + "uge", + "ult", + "ule", + "une", + "uno", + "true" +]; + +// AVX FP comparison to FPComparison +FPComparison mapAVXFPComparison(int imm8) pure @safe +{ + // Always map on non-signalling + static immutable FPComparison[16] mapping = + [ + FPComparison.oeq, // _CMP_EQ_OQ + FPComparison.olt, // _CMP_LT_OS + FPComparison.ole, // _CMP_LE_OS + FPComparison.uno, // _CMP_UNORD_Q + FPComparison.une, // _CMP_NEQ_UQ // TODO does it mean net-equal OR unordered? 
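+        //   (_CMP_NEQ_UQ is "not-equal, unordered, quiet": true when the operands are
+        //    unordered or differ, which is exactly FPComparison.une.)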
+ FPComparison.uge, // _CMP_NLT_US + FPComparison.ugt, // _CMP_NLE_US + FPComparison.ord, // _CMP_ORD_Q + FPComparison.ueq, // _CMP_EQ_UQ + FPComparison.ult, // _CMP_NGE_US + FPComparison.ule, // _CMP_NGT_US + FPComparison.false_,// _CMP_FALSE_OQ + FPComparison.one, // _CMP_NEQ_OQ + FPComparison.oge, // _CMP_GE_OS + FPComparison.ogt, // _CMP_GT_OS + FPComparison.true_ // _CMP_TRUE_UQ + ]; + + return mapping[imm8 & 0x0f]; // note: signalling NaN information is mixed up +} + +// Individual float comparison: returns -1 for true or 0 for false. +// Useful for DMD and testing +private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe +{ + bool unordered = isnan(a) || isnan(b); + final switch(comparison) with(FPComparison) + { + case false_: return false; + case oeq: return a == b; + case ogt: return a > b; + case oge: return a >= b; + case olt: return a < b; + case ole: return a <= b; + case one: return !unordered && (a != b); // NaN with != always yields true + case ord: return !unordered; + case ueq: return unordered || (a == b); + case ugt: return unordered || (a > b); + case uge: return unordered || (a >= b); + case ult: return unordered || (a < b); + case ule: return unordered || (a <= b); + case une: return (a != b); // NaN with != always yields true + case uno: return unordered; + case true_: return true; + } +} + +static if (LDC_with_optimizations) // this save time for bigger projects, since LDCInlineIR gets more expensive there. +{ + /// Provides packed float comparisons + package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe + { + enum ir = ` + %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1 + %r = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %r`; + + return LDCInlineIR!(ir, int4, float4, float4)(a, b); + } + + ///ditto + package int8 cmpps256(FPComparison comparison)(float8 a, float8 b) pure @safe + { + enum ir = ` + %cmp = fcmp `~ FPComparisonToString[comparison] ~` <8 x float> %0, %1 + %r = sext <8 x i1> %cmp to <8 x i32> + ret <8 x i32> %r`; + return LDCInlineIR!(ir, int8, float8, float8)(a, b); + } + + /// Provides packed double comparisons + package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe + { + enum ir = ` + %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1 + %r = sext <2 x i1> %cmp to <2 x i64> + ret <2 x i64> %r`; + + return LDCInlineIR!(ir, long2, double2, double2)(a, b); + } + + ///ditto + package long4 cmppd256(FPComparison comparison)(double4 a, double4 b) pure @safe + { + enum ir = ` + %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x double> %0, %1 + %r = sext <4 x i1> %cmp to <4 x i64> + ret <4 x i64> %r`; + return LDCInlineIR!(ir, long4, double4, double4)(a, b); + } + + /// CMPSS-style comparisons + /// clang implement it through x86 intrinsics, it is possible with IR alone + /// but leads to less optimal code. + /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7. + /// Not that simple. 
+ package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe + { + /* + enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison]; + enum bool invertOp = (predicateNumber & 0x80) != 0; + static if(invertOp) + return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f); + else + return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f); + */ + enum ir = ` + %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1 + %r = sext i1 %cmp to i32 + %r2 = bitcast i32 %r to float + ret float %r2`; + + float4 r = a; + r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]); + return r; + } + + /// CMPSD-style comparisons + /// clang implement it through x86 intrinsics, it is possible with IR alone + /// but leads to less optimal code. + /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7. + /// Not that simple. + package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe + { + enum ir = ` + %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1 + %r = sext i1 %cmp to i64 + %r2 = bitcast i64 %r to double + ret double %r2`; + + double2 r = a; + r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]); + return r; + } +} +else +{ + /// Provides packed float comparisons + package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted + { + int4 result; + foreach(i; 0..4) + { + result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0; + } + return result; + } + ///ditto + package int8 cmpps256(FPComparison comparison)(float8 a, float8 b) pure @trusted + { + int8 result; + foreach(i; 0..8) + { + result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0; + } + return result; + } + + /// Provides packed double comparisons + package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted + { + long2 result; + foreach(i; 0..2) + { + result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0; + } + return result; + } + ///ditto + package long4 cmppd256(FPComparison comparison)(double4 a, double4 b) pure @trusted + { + long4 result; + foreach(i; 0..4) + { + result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0; + } + return result; + } + + /// Provides CMPSS-style comparison + package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted + { + int4 result = cast(int4)a; + result.ptr[0] = compareFloat!float(comparison, a.array[0], b.array[0]) ? -1 : 0; + return cast(float4)result; + } + + /// Provides CMPSD-style comparison + package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted + { + long2 result = cast(long2)a; + result.ptr[0] = compareFloat!double(comparison, a.array[0], b.array[0]) ? 
-1 : 0; + return cast(double2)result; + } +} +unittest // cmpps +{ + // Check all comparison type is working + float4 A = [1, 3, 5, float.nan]; + float4 B = [2, 3, 4, 5]; + + int4 result_oeq = cmpps!(FPComparison.oeq)(A, B); + int4 result_ogt = cmpps!(FPComparison.ogt)(A, B); + int4 result_oge = cmpps!(FPComparison.oge)(A, B); + int4 result_olt = cmpps!(FPComparison.olt)(A, B); + int4 result_ole = cmpps!(FPComparison.ole)(A, B); + int4 result_one = cmpps!(FPComparison.one)(A, B); + int4 result_ord = cmpps!(FPComparison.ord)(A, B); + int4 result_ueq = cmpps!(FPComparison.ueq)(A, B); + int4 result_ugt = cmpps!(FPComparison.ugt)(A, B); + int4 result_uge = cmpps!(FPComparison.uge)(A, B); + int4 result_ult = cmpps!(FPComparison.ult)(A, B); + int4 result_ule = cmpps!(FPComparison.ule)(A, B); + int4 result_une = cmpps!(FPComparison.une)(A, B); + int4 result_uno = cmpps!(FPComparison.uno)(A, B); + + static immutable int[4] correct_oeq = [ 0,-1, 0, 0]; + static immutable int[4] correct_ogt = [ 0, 0,-1, 0]; + static immutable int[4] correct_oge = [ 0,-1,-1, 0]; + static immutable int[4] correct_olt = [-1, 0, 0, 0]; + static immutable int[4] correct_ole = [-1,-1, 0, 0]; + static immutable int[4] correct_one = [-1, 0,-1, 0]; + static immutable int[4] correct_ord = [-1,-1,-1, 0]; + static immutable int[4] correct_ueq = [ 0,-1, 0,-1]; + static immutable int[4] correct_ugt = [ 0, 0,-1,-1]; + static immutable int[4] correct_uge = [ 0,-1,-1,-1]; + static immutable int[4] correct_ult = [-1, 0, 0,-1]; + static immutable int[4] correct_ule = [-1,-1, 0,-1]; + static immutable int[4] correct_une = [-1, 0,-1,-1]; + static immutable int[4] correct_uno = [ 0, 0, 0,-1]; + + assert(result_oeq.array == correct_oeq); + assert(result_ogt.array == correct_ogt); + assert(result_oge.array == correct_oge); + assert(result_olt.array == correct_olt); + assert(result_ole.array == correct_ole); + assert(result_one.array == correct_one); + assert(result_ord.array == correct_ord); + assert(result_ueq.array == correct_ueq); + assert(result_ugt.array == correct_ugt); + assert(result_uge.array == correct_uge); + assert(result_ult.array == correct_ult); + assert(result_ule.array == correct_ule); + assert(result_une.array == correct_une); + assert(result_uno.array == correct_uno); +} +unittest +{ + double2 a = [1, 3]; + double2 b = [2, 3]; + long2 c = cmppd!(FPComparison.ult)(a, b); + static immutable long[2] correct = [cast(long)(-1), 0]; + assert(c.array == correct); +} +unittest // cmpss +{ + void testComparison(FPComparison comparison)(float4 A, float4 B) + { + float4 result = cmpss!comparison(A, B); + int4 iresult = cast(int4)result; + int expected = compareFloat!float(comparison, A.array[0], B.array[0]) ? 
-1 : 0; + assert(iresult.array[0] == expected); + assert(result.array[1] == A.array[1]); + assert(result.array[2] == A.array[2]); + assert(result.array[3] == A.array[3]); + } + + // Check all comparison type is working + float4 A = [1, 3, 5, 6]; + float4 B = [2, 3, 4, 5]; + float4 C = [float.nan, 3, 4, 5]; + + testComparison!(FPComparison.oeq)(A, B); + testComparison!(FPComparison.oeq)(A, C); + testComparison!(FPComparison.ogt)(A, B); + testComparison!(FPComparison.ogt)(A, C); + testComparison!(FPComparison.oge)(A, B); + testComparison!(FPComparison.oge)(A, C); + testComparison!(FPComparison.olt)(A, B); + testComparison!(FPComparison.olt)(A, C); + testComparison!(FPComparison.ole)(A, B); + testComparison!(FPComparison.ole)(A, C); + testComparison!(FPComparison.one)(A, B); + testComparison!(FPComparison.one)(A, C); + testComparison!(FPComparison.ord)(A, B); + testComparison!(FPComparison.ord)(A, C); + testComparison!(FPComparison.ueq)(A, B); + testComparison!(FPComparison.ueq)(A, C); + testComparison!(FPComparison.ugt)(A, B); + testComparison!(FPComparison.ugt)(A, C); + testComparison!(FPComparison.uge)(A, B); + testComparison!(FPComparison.uge)(A, C); + testComparison!(FPComparison.ult)(A, B); + testComparison!(FPComparison.ult)(A, C); + testComparison!(FPComparison.ule)(A, B); + testComparison!(FPComparison.ule)(A, C); + testComparison!(FPComparison.une)(A, B); + testComparison!(FPComparison.une)(A, C); + testComparison!(FPComparison.uno)(A, B); + testComparison!(FPComparison.uno)(A, C); +} +unittest // cmpsd +{ + void testComparison(FPComparison comparison)(double2 A, double2 B) + { + double2 result = cmpsd!comparison(A, B); + long2 iresult = cast(long2)result; + long expected = compareFloat!double(comparison, A.array[0], B.array[0]) ? -1 : 0; + assert(iresult.array[0] == expected); + assert(result.array[1] == A.array[1]); + } + + // Check all comparison type is working + double2 A = [1, 3]; + double2 B = [2, 4]; + double2 C = [double.nan, 5]; + + testComparison!(FPComparison.oeq)(A, B); + testComparison!(FPComparison.oeq)(A, C); + testComparison!(FPComparison.ogt)(A, B); + testComparison!(FPComparison.ogt)(A, C); + testComparison!(FPComparison.oge)(A, B); + testComparison!(FPComparison.oge)(A, C); + testComparison!(FPComparison.olt)(A, B); + testComparison!(FPComparison.olt)(A, C); + testComparison!(FPComparison.ole)(A, B); + testComparison!(FPComparison.ole)(A, C); + testComparison!(FPComparison.one)(A, B); + testComparison!(FPComparison.one)(A, C); + testComparison!(FPComparison.ord)(A, B); + testComparison!(FPComparison.ord)(A, C); + testComparison!(FPComparison.ueq)(A, B); + testComparison!(FPComparison.ueq)(A, C); + testComparison!(FPComparison.ugt)(A, B); + testComparison!(FPComparison.ugt)(A, C); + testComparison!(FPComparison.uge)(A, B); + testComparison!(FPComparison.uge)(A, C); + testComparison!(FPComparison.ult)(A, B); + testComparison!(FPComparison.ult)(A, C); + testComparison!(FPComparison.ule)(A, B); + testComparison!(FPComparison.ule)(A, C); + testComparison!(FPComparison.une)(A, B); + testComparison!(FPComparison.une)(A, C); + testComparison!(FPComparison.uno)(A, B); + testComparison!(FPComparison.uno)(A, C); +} + +// +// +// + + +__m64 to_m64(__m128i a) pure @trusted +{ + long2 la = cast(long2)a; + long1 r = la.array[0]; + return r; +} + +__m128i to_m128i(__m64 a) pure @trusted +{ + /* Not sufficient to avoid https://issues.dlang.org/show_bug.cgi?id=21474 + + version(DigitalMars) // Workaround for https://issues.dlang.org/show_bug.cgi?id=21474 + { + long2 r = 
a.array[0]; + r.ptr[1] = 0; + return cast(int4)r; + } + else */ + { + long2 r = [0, 0]; + r.ptr[0] = a.array[0]; + return cast(__m128i)r; + } +} + + +// ADDITIONAL LLVM INTRINSICS +// Basically LDC didn't add them yet +version(LDC) +{ + static if (__VERSION__ >= 2097) // LDC 1.27+ + { + pragma(LDC_intrinsic, "llvm.abs.i#") + T inteli_llvm_abs(T)(T val, bool attrib); + } + + static if (__VERSION__ >= 2092) // LDC 1.22+ + { + pragma(LDC_intrinsic, "llvm.sadd.sat.i#") + T inteli_llvm_adds(T)(T a, T b) pure @safe; + pragma(LDC_intrinsic, "llvm.ssub.sat.i#") + T inteli_llvm_subs(T)(T a, T b) pure @safe; + pragma(LDC_intrinsic, "llvm.uadd.sat.i#") + T inteli_llvm_addus(T)(T a, T b) pure @safe; + pragma(LDC_intrinsic, "llvm.usub.sat.i#") + T inteli_llvm_subus(T)(T a, T b) pure @safe; + + enum LDC_with_saturated_intrinsics = true; + } + else + enum LDC_with_saturated_intrinsics = false; +} +else + enum LDC_with_saturated_intrinsics = false; + +// ADDITIONAL x86 INTRINSICS +// Absent from ldc.gccbuiltins_x86 for some reason, but needed. +// https://github.com/ldc-developers/llvm-project/blob/ldc-release/12.x/llvm/include/llvm/IR/IntrinsicsX86.td +static if (LDC_with_SSE41) +{ + pragma(LDC_intrinsic, "llvm.x86.sse41.pblendvb") + byte16 __builtin_ia32_pblendvb(byte16, byte16, byte16) pure @safe; +} + +// SOME NEON INTRINSICS +// Emulating some x86 intrinsics needs access to a range of ARM intrinsics. +// Not in the public API but the simde project expose it all for the user to use. +// MAYDO: create a new neon.d module, for internal use only. +// MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64. +static if (LDC_with_ARM64) +{ + // VERY USEFUL LINK + // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td + // Also: https://developer.arm.com/architectures/instruction-sets/intrinsics/ + + // Note: it is helpful to verify, in case of complex sequence of intrinsics, that the result is actually false. + // Some intrinsics have trouble when inlined inside another, such as vmovl_low_s32. In this case, it's better to use builtins + // from backend to have an inlining that still match the instruction. 
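// Illustrative sketch of how these shims compose (guarded by version(none) so it is never
// compiled; the helpers used are the ones defined further down this block): splitting a
// short8 with vget_low_s16/vget_high_s16 and rebuilding it with vcombine_s16 should round-trip.
version(none) unittest
{
    short8 x = [0, 1, 2, 3, 4, 5, 6, 7];
    short8 y = vcombine_s16(vget_low_s16(x), vget_high_s16(x));
    assert(y.array == x.array);
}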
+ + pragma(LDC_intrinsic, "llvm.aarch64.crc32cb") + uint __crc32cb(uint a, uint b) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.crc32ch") + uint __crc32ch(uint a, uint b) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.crc32cw") + uint __crc32cw(uint a, uint b) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.crc32cx") + uint __crc32cd(uint a, ulong b) pure @safe; + + //pragma(LDC_intrinsic, "llvm.aarch64.dmb") + // uint __dmb(int a) @safe; // didn't found a name in intrinsic list + + pragma(LDC_intrinsic, "llvm.aarch64.neon.uabd.v16i8") + byte16 vabdq_u8(byte16 a, byte16 b) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v8i16") + short8 vabsq_s16(short8 a) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v4i32") + int4 vabsq_s32(int4 a) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v16i8") + byte16 vabsq_s8(byte16 a) pure @safe; + + byte8 vand_u8(byte8 a, byte8 b) pure @safe + { + return a & b; + } + + long2 vandq_s64(long2 a, long2 b) + { + return a & b; + } + + long2 vbicq_s64(long2 a, long2 b) pure @safe + { + return a & ~b; + } + + int4 vbslq_s32(int4 a, int4 b, int4 c) pure @safe + { + return c ^ ((c ^ b) & a); + } + + byte16 vbslq_s8(byte16 a, byte16 b, byte16 c) pure @safe + { + return c ^ ((c ^ b) & a); + } + + long2 vbslq_s64(long2 a, long2 b, long2 c) pure @safe + { + return c ^ ((c ^ b) & a); + } + + short8 vcombine_s16(short4 lo, short4 hi) pure @trusted + { + short8 r; + r.ptr[0] = lo.array[0]; + r.ptr[1] = lo.array[1]; + r.ptr[2] = lo.array[2]; + r.ptr[3] = lo.array[3]; + r.ptr[4] = hi.array[0]; + r.ptr[5] = hi.array[1]; + r.ptr[6] = hi.array[2]; + r.ptr[7] = hi.array[3]; + return r; + } + + int4 vcombine_s32(int2 lo, int2 hi) pure @trusted + { + int4 r; + r.ptr[0] = lo.array[0]; + r.ptr[1] = lo.array[1]; + r.ptr[2] = hi.array[0]; + r.ptr[3] = hi.array[1]; + return r; + } + + byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted + { + byte16 r; + r.ptr[0] = lo.array[0]; + r.ptr[1] = lo.array[1]; + r.ptr[2] = lo.array[2]; + r.ptr[3] = lo.array[3]; + r.ptr[4] = lo.array[4]; + r.ptr[5] = lo.array[5]; + r.ptr[6] = lo.array[6]; + r.ptr[7] = lo.array[7]; + r.ptr[8] = hi.array[0]; + r.ptr[9] = hi.array[1]; + r.ptr[10] = hi.array[2]; + r.ptr[11] = hi.array[3]; + r.ptr[12] = hi.array[4]; + r.ptr[13] = hi.array[5]; + r.ptr[14] = hi.array[6]; + r.ptr[15] = hi.array[7]; + return r; + } + + short8 vcombine_u16(short4 lo, short4 hi) pure @trusted + { + short8 r; + r.ptr[0] = lo.array[0]; + r.ptr[1] = lo.array[1]; + r.ptr[2] = lo.array[2]; + r.ptr[3] = lo.array[3]; + r.ptr[4] = hi.array[0]; + r.ptr[5] = hi.array[1]; + r.ptr[6] = hi.array[2]; + r.ptr[7] = hi.array[3]; + return r; + } + + + // float4 => int4 + + pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32") + int4 vcvtmq_s32_f32(float4 a) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32") + int4 vcvtnq_s32_f32(float4 a) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32") + int4 vcvtpq_s32_f32(float4 a) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32") + int4 vcvtzq_s32_f32(float4 a) pure @safe; + + + // double2 => long2 + + pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v2i64.v2f64") + long2 vcvtmq_s64_f64(double2 a) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v2i64.v2f64") + long2 vcvtnq_s64_f64(double2 a) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v2i64.v2f64") + long2 vcvtpq_s64_f64(double2 a) pure @safe; + + pragma(LDC_intrinsic, 
"llvm.aarch64.neon.fcvtzs.v2i64.v2f64") + long2 vcvtzq_s64_f64(double2 a) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f32") + int vcvtms_s32_f32(float a) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f32") + int vcvtns_s32_f32(float a) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f32") + int vcvtps_s32_f32(float a) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f32") + int vcvts_s32_f32(float a) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f64") + int vcvtms_s32_f64(double a) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f64") + int vcvtns_s32_f64(double a) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f64") + int vcvtps_s32_f64(double a) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f64") + int vcvts_s32_f64(double a) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f32") + long vcvtms_s64_f32(float a) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f32") + long vcvtns_s64_f32(float a) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f32") + long vcvtps_s64_f32(float a) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f32") + long vcvts_s64_f32(float a) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f64") + long vcvtms_s64_f64(double a) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f64") + long vcvtns_s64_f64(double a) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f64") + long vcvtps_s64_f64(double a) pure @safe; // Note: technically should be named vcvtpd_s64_f64 + + pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f64") + long vcvts_s64_f64(double a) pure @safe; + + long2 vdupq_n_s64(long value) pure @safe + { + long2 r; + r = value; + return r; + } + + short4 vget_high_s16(short8 a) pure @trusted + { + short4 r; + r.ptr[0] = a.array[4]; + r.ptr[1] = a.array[5]; + r.ptr[2] = a.array[6]; + r.ptr[3] = a.array[7]; + return r; + } + + int2 vget_high_s32(int4 a) pure @trusted + { + int2 r; + r.ptr[0] = a.array[2]; + r.ptr[1] = a.array[3]; + return r; + } + + byte8 vget_high_u8(byte16 a) pure @trusted + { + byte8 r; + r.ptr[0] = a.array[8]; + r.ptr[1] = a.array[9]; + r.ptr[2] = a.array[10]; + r.ptr[3] = a.array[11]; + r.ptr[4] = a.array[12]; + r.ptr[5] = a.array[13]; + r.ptr[6] = a.array[14]; + r.ptr[7] = a.array[15]; + return r; + } + + short4 vget_low_s16(short8 a) pure @trusted + { + short4 r; + r.ptr[0] = a.array[0]; + r.ptr[1] = a.array[1]; + r.ptr[2] = a.array[2]; + r.ptr[3] = a.array[3]; + return r; + } + + int2 vget_low_s32(int4 a) pure @trusted + { + int2 r; + r.ptr[0] = a.array[0]; + r.ptr[1] = a.array[1]; + return r; + } + + byte8 vget_low_u8(byte16 a) pure @trusted + { + byte8 r; + r.ptr[0] = a.array[0]; + r.ptr[1] = a.array[1]; + r.ptr[2] = a.array[2]; + r.ptr[3] = a.array[3]; + r.ptr[4] = a.array[4]; + r.ptr[5] = a.array[5]; + r.ptr[6] = a.array[6]; + r.ptr[7] = a.array[7]; + return r; + } + + long vgetq_lane_s64(long2 v, const int lane) pure @safe + { + return v.array[lane]; + } + + pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16") + short8 vmaxq_s16(short8 a, short8 b) pure @safe; + + int4 vmaxq_s32(int4 a, int4 b) pure @safe + { + int4 r; + r[0] = a[0] >= b[0] ? a[0] : b[0]; + r[1] = a[1] >= b[1] ? a[1] : b[1]; + r[2] = a[2] >= b[2] ? a[2] : b[2]; + r[3] = a[3] >= b[3] ? 
a[3] : b[3]; + return r; + } + + pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16") + short8 vminq_s16(short8 a, short8 b) pure @safe; + + int4 vmovl_u16(short4 a) pure @trusted + { + int4 r; + r.ptr[0] = cast(ushort)a.array[0]; + r.ptr[1] = cast(ushort)a.array[1]; + r.ptr[2] = cast(ushort)a.array[2]; + r.ptr[3] = cast(ushort)a.array[3]; + return r; + } + + int2 vmovn_s64(long2 a) pure @trusted + { + int2 r; + r.ptr[0] = cast(int)(a.array[0]); + r.ptr[1] = cast(int)(a.array[1]); + return r; + } + + int4 vmull_s16(short4 a, short4 b) pure @trusted + { + int4 r; + r.ptr[0] = a.array[0] * b.array[0]; + r.ptr[1] = a.array[1] * b.array[1]; + r.ptr[2] = a.array[2] * b.array[2]; + r.ptr[3] = a.array[3] * b.array[3]; + return r; + } + + pragma(LDC_intrinsic, "llvm.aarch64.neon.smull.v2i64") + long2 vmull_s32(int2 a, int2 b) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i16") + short4 vpadd_s16(short4 a, short4 b) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32") + int2 vpadd_s32(int2 a, int2 b) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8") + byte8 vpadd_u8(byte8 a, byte8 b) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.uaddlp.v8i16.v16i8") + short8 vpaddlq_u8 (byte16 a) pure @safe; + + static if(__VERSION__ >= 2088) // LDC 1.18 start using LLVM9 who changes the name of the builtin + { + pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32") + float4 vpaddq_f32(float4 a, float4 b) pure @safe; + } + else + { + pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32") + float4 vpaddq_f32(float4 a, float4 b) pure @safe; + } + + pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i16") + short8 vpaddq_s16(short8 a, short8 b) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8") + byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i32") + int4 vpaddq_s32(int4 a, int4 b) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v4i16") + short4 vqadd_s16(short4 a, short4 b) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v8i16") + short8 vqaddq_s16(short8 a, short8 b) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8") + byte8 vqmovn_s16(short8 a) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v4i16") + short4 vqmovn_s32(int4 a) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.uqxtn.v4i16") + short4 vqmovn_u32(int4 a) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8") + byte8 vqmovun_s16(short8 a) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v4i16") + short4 vqsub_s16(short4 a, short4 b) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v8i16") + short8 vqsubq_s16(short8 a, short8 b) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v16i8") + byte16 vqtbl1q_s8(byte16 t, byte16 idx) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8") + byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16") + short8 vrhadd_u16(short8 a, short8 b) pure @safe; + + pragma(LDC_intrinsic, "llvm.aarch64.neon.rshrn.v4i16") + short4 vrshrn_n_s32(int4 a, int n) pure @safe; + + byte8 vshr_u8(byte8 a, byte8 b) pure @safe + { + return a >>> b; + } + + byte16 vshrq_n_s8(byte16 a, byte r) pure @safe + { + a = a >> byte16(cast(byte)r); + return a; + } + + pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v8i8") + byte8 vtbl1_s8(byte16 t, byte8 idx) pure @safe; +} + +version(unittest) +{ + double abs_double(double x) 
@trusted + { + version(LDC) + return llvm_fabs(x); + else + { + long uf = *cast(long*)(&x); + uf &= 0x7fffffff_ffffffff; + return *cast(double*)(&uf); + } + } +} + +// needed because in old GDC from travis, core.stdc.math.isnan isn't pure + +bool isnan(float x) pure @trusted +{ + uint u = *cast(uint*)(&x); + bool result = ((u & 0x7F800000) == 0x7F800000) && (u & 0x007FFFFF); + return result; +} +unittest +{ + float x = float.nan; + assert(isnan(x)); + + x = 0; + assert(!isnan(x)); + + x = float.infinity; + assert(!isnan(x)); +} + +bool isnan(double x) pure @trusted +{ + ulong u = *cast(ulong*)(&x); + return ((u & 0x7FF00000_00000000) == 0x7FF00000_00000000) && (u & 0x000FFFFF_FFFFFFFF); +} +unittest +{ + double x = double.nan; + assert(isnan(x)); + + x = 0; + assert(!isnan(x)); + + x = double.infinity; + assert(!isnan(x)); +} \ No newline at end of file diff --git a/external/inteli/math.d b/external/inteli/math.d new file mode 100644 index 0000000..fcc79aa --- /dev/null +++ b/external/inteli/math.d @@ -0,0 +1,350 @@ +/** +* Transcendental bonus functions. +* +* Copyright: Copyright Guillaumr Piolat 2016-2020. +* Copyright (C) 2007 Julien Pommier +* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) +*/ +module inteli.math; + +/* Copyright (C) 2007 Julien Pommier + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + (this is the zlib license) +*/ +import inteli.emmintrin; +import inteli.internals; + +nothrow @nogc: + +/// Natural `log` computed for a single 32-bit float. +/// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50 +/// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. +// #BONUS +float _mm_log_ss(float v) pure @safe +{ + __m128 r = _mm_log_ps(_mm_set1_ps(v)); + return r.array[0]; +} + +/// Natural logarithm computed for 4 simultaneous float. +/// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50 +/// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 
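/// A quick usage sketch (illustrative; input values chosen for the example, results are approximate):
/// ---
/// __m128 l = _mm_log_ps(_mm_setr_ps(1.0f, 2.718282f, 7.389056f, 0.5f));
/// // l is approximately [0, 1, 2, -0.693]
/// ---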
+// #BONUS +__m128 _mm_log_ps(__m128 x) pure @safe +{ + static immutable __m128i _psi_inv_mant_mask = [~0x7f800000, ~0x7f800000, ~0x7f800000, ~0x7f800000]; + static immutable __m128 _ps_cephes_SQRTHF = [0.707106781186547524, 0.707106781186547524, 0.707106781186547524, 0.707106781186547524]; + static immutable __m128 _ps_cephes_log_p0 = [7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2]; + static immutable __m128 _ps_cephes_log_p1 = [- 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1]; + static immutable __m128 _ps_cephes_log_p2 = [1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1]; + static immutable __m128 _ps_cephes_log_p3 = [- 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1]; + static immutable __m128 _ps_cephes_log_p4 = [+ 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1]; + static immutable __m128 _ps_cephes_log_p5 = [- 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1]; + static immutable __m128 _ps_cephes_log_p6 = [+ 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1]; + static immutable __m128 _ps_cephes_log_p7 = [- 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1]; + static immutable __m128 _ps_cephes_log_p8 = [+ 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1]; + static immutable __m128 _ps_cephes_log_q1 = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4]; + static immutable __m128 _ps_cephes_log_q2 = [0.693359375, 0.693359375, 0.693359375, 0.693359375]; + + /* the smallest non denormalized float number */ + static immutable __m128i _psi_min_norm_pos = [0x00800000, 0x00800000, 0x00800000, 0x00800000]; + + __m128i emm0; + __m128 one = _ps_1; + __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps()); + x = _mm_max_ps(x, cast(__m128)_psi_min_norm_pos); /* cut off denormalized stuff */ + emm0 = _mm_srli_epi32(cast(__m128i)x, 23); + + /* keep only the fractional part */ + x = _mm_and_ps(x, cast(__m128)_psi_inv_mant_mask); + x = _mm_or_ps(x, _ps_0p5); + + emm0 = _mm_sub_epi32(emm0, _pi32_0x7f); + __m128 e = _mm_cvtepi32_ps(emm0); + e += one; + __m128 mask = _mm_cmplt_ps(x, _ps_cephes_SQRTHF); + __m128 tmp = _mm_and_ps(x, mask); + x -= one; + e -= _mm_and_ps(one, mask); + x += tmp; + __m128 z = x * x; + __m128 y = _ps_cephes_log_p0; + y *= x; + y += _ps_cephes_log_p1; + y *= x; + y += _ps_cephes_log_p2; + y *= x; + y += _ps_cephes_log_p3; + y *= x; + y += _ps_cephes_log_p4; + y *= x; + y += _ps_cephes_log_p5; + y *= x; + y += _ps_cephes_log_p6; + y *= x; + y += _ps_cephes_log_p7; + y *= x; + y += _ps_cephes_log_p8; + y *= x; + + y = y * z; + tmp = e * _ps_cephes_log_q1; + y += tmp; + tmp = z * _ps_0p5; + y = y - tmp; + tmp = e * _ps_cephes_log_q2; + x += y; + x += tmp; + x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN + return x; +} + +/// Natural `exp` computed for a single float. +/// This is an approximation, valid up to approximately -109dB of accuracy +/// IMPORTANT: NaN input not supported. +// #BONUS +float _mm_exp_ss(float v) pure @safe +{ + __m128 r = _mm_exp_ps(_mm_set1_ps(v)); + return r.array[0]; +} + +/// Natural `exp` computed for 4 simultaneous float in `x`. +/// This is an approximation, valid up to approximately -109dB of accuracy +/// IMPORTANT: NaN input not supported. 
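/// A quick usage sketch (illustrative; input values chosen for the example, results are approximate):
/// ---
/// __m128 e = _mm_exp_ps(_mm_setr_ps(0.0f, 1.0f, -1.0f, 2.0f));
/// // e is approximately [1, 2.71828, 0.36788, 7.38906]
/// ---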
+// #BONUS +__m128 _mm_exp_ps(__m128 x) pure @safe +{ + static immutable __m128 _ps_exp_hi = [88.3762626647949f, 88.3762626647949f, 88.3762626647949f, 88.3762626647949f]; + static immutable __m128 _ps_exp_lo = [-88.3762626647949f, -88.3762626647949f, -88.3762626647949f, -88.3762626647949f]; + static immutable __m128 _ps_cephes_LOG2EF = [1.44269504088896341, 1.44269504088896341, 1.44269504088896341, 1.44269504088896341]; + static immutable __m128 _ps_cephes_exp_C1 = [0.693359375, 0.693359375, 0.693359375, 0.693359375]; + static immutable __m128 _ps_cephes_exp_C2 = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4]; + static immutable __m128 _ps_cephes_exp_p0 = [1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4]; + static immutable __m128 _ps_cephes_exp_p1 = [1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3]; + static immutable __m128 _ps_cephes_exp_p2 = [8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3]; + static immutable __m128 _ps_cephes_exp_p3 = [4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2]; + static immutable __m128 _ps_cephes_exp_p4 = [1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1]; + static immutable __m128 _ps_cephes_exp_p5 = [5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1]; + + __m128 tmp = _mm_setzero_ps(), fx; + __m128i emm0; + __m128 one = _ps_1; + + x = _mm_min_ps(x, _ps_exp_hi); + x = _mm_max_ps(x, _ps_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = x * _ps_cephes_LOG2EF; + fx += _ps_0p5; + + /* how to perform a floorf with SSE: just below */ + emm0 = _mm_cvttps_epi32(fx); + tmp = _mm_cvtepi32_ps(emm0); + + /* if greater, substract 1 */ + __m128 mask = _mm_cmpgt_ps(tmp, fx); + mask = _mm_and_ps(mask, one); + fx = tmp - mask; + + tmp = fx * _ps_cephes_exp_C1; + __m128 z = fx * _ps_cephes_exp_C2; + x -= tmp; + x -= z; + + z = x * x; + + __m128 y = _ps_cephes_exp_p0; + y *= x; + y += _ps_cephes_exp_p1; + y *= x; + y += _ps_cephes_exp_p2; + y *= x; + y += _ps_cephes_exp_p3; + y *= x; + y += _ps_cephes_exp_p4; + y *= x; + y += _ps_cephes_exp_p5; + y *= z; + y += x; + y += one; + + /* build 2^n */ + emm0 = _mm_cvttps_epi32(fx); + + emm0 = _mm_add_epi32(emm0, _pi32_0x7f); + emm0 = _mm_slli_epi32(emm0, 23); + __m128 pow2n = cast(__m128)emm0; + y *= pow2n; + return y; +} + +/// Computes `base^exponent` for a single 32-bit float. +/// This is an approximation, valid up to approximately -100dB of accuracy +/// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. +// #BONUS +float _mm_pow_ss(float base, float exponent) pure @safe +{ + __m128 r = _mm_pow_ps(_mm_set1_ps(base), _mm_set1_ps(exponent)); + return r.array[0]; +} + +/// Computes `base^exponent`, for 4 floats at once. +/// This is an approximation, valid up to approximately -100dB of accuracy +/// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. +// #BONUS +__m128 _mm_pow_ps(__m128 base, __m128 exponents) pure @safe +{ + return _mm_exp_ps(exponents * _mm_log_ps(base)); +} + +/// Computes `base^exponent`, for 4 floats at once. +/// This is an approximation, valid up to approximately -100dB of accuracy +/// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 
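/// A quick usage sketch (illustrative; all bases must be > 0 and finite, as noted above):
/// ---
/// __m128 roots = _mm_pow_ps(_mm_setr_ps(2.0f, 4.0f, 9.0f, 16.0f), 0.5f);
/// // roots is approximately [1.414, 2, 3, 4]
/// ---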
+// #BONUS +__m128 _mm_pow_ps(__m128 base, float exponent) pure @safe +{ + return _mm_exp_ps(_mm_set1_ps(exponent) * _mm_log_ps(base)); +} + +unittest +{ + import std.math; + + bool approxEquals(double groundTruth, double approx, double epsilon) pure @trusted @nogc nothrow + { + if (!isFinite(groundTruth)) + return true; // no need to approximate where this is NaN or infinite + + if (groundTruth == 0) // the approximaton should produce zero too if needed + { + return approx == 0; + } + + if (approx == 0) + { + // If the approximation produces zero, the error should be below 140 dB + return ( abs(groundTruth) < 1e-7 ); + } + + if ( ( abs(groundTruth / approx) - 1 ) >= epsilon) + { + import core.stdc.stdio; + debug printf("approxEquals (%g, %g, %g) failed\n", groundTruth, approx, epsilon); + debug printf("ratio is %f\n", abs(groundTruth / approx) - 1); + } + + return ( abs(groundTruth / approx) - 1 ) < epsilon; + } + + // test _mm_log_ps + for (double mantissa = 0.1; mantissa < 1.0; mantissa += 0.05) + { + foreach (exponent; -23..23) + { + double x = mantissa * 2.0 ^^ exponent; + double phobosValue = log(x); + __m128 v = _mm_log_ps(_mm_set1_ps(x)); + foreach(i; 0..4) + assert(approxEquals(phobosValue, v.array[i], 1.1e-6)); + } + } + + // test _mm_exp_ps + for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.1) + { + foreach (exponent; -23..23) + { + double x = mantissa * 2.0 ^^ exponent; + + // don't test too high numbers because they saturate FP precision pretty fast + if (x > 50) continue; + + double phobosValue = exp(x); + __m128 v = _mm_exp_ps(_mm_set1_ps(x)); + foreach(i; 0..4) + { + if (!approxEquals(phobosValue, v.array[i], 3.4e-6)) + { + import core.stdc.stdio; + printf("x = %f truth = %f vs estimate = %fn", x, phobosValue, v.array[i]); + assert(false); + } + } + } + } + + // test than exp(-inf) is 0 + { + __m128 R = _mm_exp_ps(_mm_set1_ps(-float.infinity)); + float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f]; + assert(R.array == correct); + } + + // test log baheviour with NaN and infinities + // the only guarantee for now is that _mm_log_ps(negative) yield a NaN + { + __m128 R = _mm_log_ps(_mm_setr_ps(+0.0f, -0.0f, -1.0f, float.nan)); + // DOESN'T PASS + // assert(isInfinity(R[0]) && R[0] < 0); // log(+0.0f) = -infinity + // DOESN'T PASS + // assert(isInfinity(R[1]) && R[1] < 0); // log(-0.0f) = -infinity + assert(isNaN(R.array[2])); // log(negative number) = NaN + + // DOESN'T PASS + //assert(isNaN(R[3])); // log(NaN) = NaN + } + + + // test _mm_pow_ps + for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.1) + { + foreach (exponent; -8..4) + { + double powExponent = mantissa * 2.0 ^^ exponent; + + for (double mantissa2 = 0.1; mantissa2 < 1.0; mantissa2 += 0.1) + { + foreach (exponent2; -4..4) + { + double powBase = mantissa2 * 2.0 ^^ exponent2; + double phobosValue = pow(powBase, powExponent); + float fPhobos = phobosValue; + if (!isFinite(fPhobos)) continue; + __m128 v = _mm_pow_ps(_mm_set1_ps(powBase), _mm_set1_ps(powExponent)); + + foreach(i; 0..4) + { + if (!approxEquals(phobosValue, v.array[i], 1e-5)) + { + printf("%g ^^ %g\n", powBase, powExponent); + assert(false); + } + } + } + } + } + } +} + +private: + +static immutable __m128 _ps_1 = [1.0f, 1.0f, 1.0f, 1.0f]; +static immutable __m128 _ps_0p5 = [0.5f, 0.5f, 0.5f, 0.5f]; +static immutable __m128i _pi32_0x7f = [0x7f, 0x7f, 0x7f, 0x7f]; \ No newline at end of file diff --git a/external/inteli/mmx.d b/external/inteli/mmx.d new file mode 100644 index 0000000..3a6c216 --- /dev/null +++ b/external/inteli/mmx.d @@ -0,0 
+1,1072 @@ +/** +* MMX intrinsics. +* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=MMX +* +* Copyright: Copyright Guillaume Piolat 2019-2020. +* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) +*/ +module inteli.mmx; + +public import inteli.types; +import inteli.internals; + +import inteli.xmmintrin; +import inteli.emmintrin; + +nothrow @nogc: + +// Important: you don't need to call _mm_empty when using "MMX" capabilities of intel-intrinsics, +// since it just generates the right IR and cleaning-up FPU registers is up to the codegen. +// intel-intrinsics is just semantics. +// Even GDC does not seem to use mm0-mm7 registers, instead preferring xmm0-xmm7. + + +/// Add packed 16-bit integers in `a` and `b`. +__m64 _mm_add_pi16 (__m64 a, __m64 b) +{ + return cast(__m64)(cast(short4)a + cast(short4)b); +} +unittest +{ + short4 R = cast(short4) _mm_add_pi16(_mm_set1_pi16(4), _mm_set1_pi16(3)); + short[4] correct = [7, 7, 7, 7]; + assert(R.array == correct); +} + +/// Add packed 32-bit integers in `a` and `b`. +__m64 _mm_add_pi32 (__m64 a, __m64 b) +{ + return cast(__m64)(cast(int2)a + cast(int2)b); +} +unittest +{ + int2 R = cast(int2) _mm_add_pi32(_mm_set1_pi32(4), _mm_set1_pi32(3)); + int[2] correct = [7, 7]; + assert(R.array == correct); +} + +/// Add packed 8-bit integers in `a` and `b`. +__m64 _mm_add_pi8 (__m64 a, __m64 b) +{ + return cast(__m64)(cast(byte8)a + cast(byte8)b); +} +unittest +{ + byte8 R = cast(byte8) _mm_add_pi8(_mm_set1_pi8(127), _mm_set1_pi8(-128)); + byte[8] correct = [-1, -1, -1, -1, -1, -1, -1, -1]; + assert(R.array == correct); +} + +/// Add packed 16-bit integers in `a` and `b` using signed saturation. +// PERF: PADDSW not generated +__m64 _mm_adds_pi16(__m64 a, __m64 b) pure @trusted +{ + return to_m64(_mm_adds_epi16(to_m128i(a), to_m128i(b))); +} +unittest +{ + short4 res = cast(short4) _mm_adds_pi16(_mm_set_pi16(3, 2, 1, 0), + _mm_set_pi16(3, 2, 1, 0)); + static immutable short[4] correctResult = [0, 2, 4, 6]; + assert(res.array == correctResult); +} + +/// Add packed 8-bit integers in `a` and `b` using signed saturation. +// PERF: PADDSB not generated +__m64 _mm_adds_pi8(__m64 a, __m64 b) pure @trusted +{ + return to_m64(_mm_adds_epi8(to_m128i(a), to_m128i(b))); +} +unittest +{ + byte8 res = cast(byte8) _mm_adds_pi8(_mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0), + _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0)); + static immutable byte[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14]; + assert(res.array == correctResult); +} + +/// Add packed 16-bit integers in `a` and `b` using unsigned saturation. +// PERF: PADDUSW not generated +__m64 _mm_adds_pu16(__m64 a, __m64 b) pure @trusted +{ + return to_m64(_mm_adds_epu16(to_m128i(a), to_m128i(b))); +} +unittest +{ + short4 res = cast(short4) _mm_adds_pu16(_mm_set_pi16(3, 2, cast(short)65535, 0), + _mm_set_pi16(3, 2, 1, 0)); + static immutable short[4] correctResult = [0, cast(short)65535, 4, 6]; + assert(res.array == correctResult); +} + +/// Add packed 8-bit integers in `a` and `b` using unsigned saturation. 
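/// A minimal sketch of the saturating behaviour (illustrative; mirrors the unittest below):
/// ---
/// byte8 r = cast(byte8) _mm_adds_pu8(_mm_set1_pi8(cast(byte)200), _mm_set1_pi8(cast(byte)100));
/// // every lane saturates to 255 (0xFF) instead of wrapping
/// ---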
+// PERF: PADDUSB not generated +__m64 _mm_adds_pu8(__m64 a, __m64 b) pure @trusted +{ + return to_m64(_mm_adds_epu8(to_m128i(a), to_m128i(b))); +} +unittest +{ + byte8 res = cast(byte8) _mm_adds_pu8(_mm_set_pi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0), + _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0)); + static immutable byte[8] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14]; + assert(res.array == correctResult); +} + +/// Compute the bitwise AND of 64 bits (representing integer data) in `a` and `b`. +__m64 _mm_and_si64 (__m64 a, __m64 b) pure @safe +{ + return a & b; +} +unittest +{ + __m64 A = [7]; + __m64 B = [14]; + __m64 R = _mm_and_si64(A, B); + assert(R.array[0] == 6); +} + +/// Compute the bitwise NOT of 64 bits (representing integer data) in `a` and then AND with `b`. +__m64 _mm_andnot_si64 (__m64 a, __m64 b) +{ + return (~a) & b; +} +unittest +{ + __m64 A = [7]; + __m64 B = [14]; + __m64 R = _mm_andnot_si64(A, B); + assert(R.array[0] == 8); +} + +/// Compare packed 16-bit integers in `a` and `b` for equality. +__m64 _mm_cmpeq_pi16 (__m64 a, __m64 b) pure @safe +{ + static if (SIMD_COMPARISON_MASKS_8B) + { + return cast(__m64)(cast(short4)a == cast(short4)b); + } + else static if (GDC_with_MMX) + { + return cast(__m64) __builtin_ia32_pcmpeqw(cast(short4)a, cast(short4)b); + } + else + { + return cast(__m64) equalMask!short4(cast(short4)a, cast(short4)b); + } +} +unittest +{ + short4 A = [-3, -2, -1, 0]; + short4 B = [ 4, 3, 2, 1]; + short[4] E = [ 0, 0, 0, 0]; + short4 R = cast(short4)(_mm_cmpeq_pi16(cast(__m64)A, cast(__m64)B)); + assert(R.array == E); +} + +/// Compare packed 32-bit integers in `a` and `b` for equality. +__m64 _mm_cmpeq_pi32 (__m64 a, __m64 b) pure @safe +{ + static if (SIMD_COMPARISON_MASKS_8B) + { + return cast(__m64)(cast(int2)a == cast(int2)b); + } + else static if (GDC_with_MMX) + { + return cast(__m64) __builtin_ia32_pcmpeqd(cast(int2)a, cast(int2)b); + } + else + { + return cast(__m64) equalMask!int2(cast(int2)a, cast(int2)b); + } +} +unittest +{ + int2 A = [-3, -2]; + int2 B = [ 4, -2]; + int[2] E = [ 0, -1]; + int2 R = cast(int2)(_mm_cmpeq_pi32(cast(__m64)A, cast(__m64)B)); + assert(R.array == E); +} + +/// Compare packed 8-bit integers in `a` and `b` for equality, +__m64 _mm_cmpeq_pi8 (__m64 a, __m64 b) pure @safe +{ + static if (SIMD_COMPARISON_MASKS_8B) + { + return cast(__m64)(cast(byte8)a == cast(byte8)b); + } + else static if (GDC_with_MMX) + { + return cast(__m64) __builtin_ia32_pcmpeqb(cast(ubyte8)a, cast(ubyte8)b); + } + else + { + return cast(__m64) equalMask!byte8(cast(byte8)a, cast(byte8)b); + } +} +unittest +{ + __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2); + __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3); + byte8 C = cast(byte8) _mm_cmpeq_pi8(A, B); + byte[8] correct = [0,-1, 0, 0, 0,-1, 0, 0]; + assert(C.array == correct); +} + +/// Compare packed 16-bit integers in `a` and `b` for greater-than. +__m64 _mm_cmpgt_pi16 (__m64 a, __m64 b) pure @safe +{ + static if (SIMD_COMPARISON_MASKS_8B) + { + return cast(__m64)(cast(short4)a > cast(short4)b); + } + else static if (GDC_with_MMX) + { + return cast(__m64) __builtin_ia32_pcmpgtw (cast(short4)a, cast(short4)b); + } + else + { + return cast(__m64) greaterMask!short4(cast(short4)a, cast(short4)b); + } +} +unittest +{ + short4 A = [-3, -2, -1, 0]; + short4 B = [ 4, 3, 2, 1]; + short[4] E = [ 0, 0, 0, 0]; + short4 R = cast(short4)(_mm_cmpgt_pi16(cast(__m64)A, cast(__m64)B)); + assert(R.array == E); +} + +/// Compare packed 32-bit integers in `a` and `b` for greater-than. 
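/// A minimal usage sketch (illustrative; the comparison is signed, result lanes are all-ones or zero):
/// ---
/// int2 m = cast(int2) _mm_cmpgt_pi32(_mm_setr_pi32(5, -1), _mm_setr_pi32(3, 0));
/// // m.array == [-1, 0]
/// ---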
+__m64 _mm_cmpgt_pi32 (__m64 a, __m64 b) pure @safe +{ + static if (SIMD_COMPARISON_MASKS_8B) + { + return cast(__m64)(cast(int2)a > cast(int2)b); + } + else static if (GDC_with_MMX) + { + return cast(__m64) __builtin_ia32_pcmpgtw (cast(short4)a, cast(short4)b); + } + else + { + return cast(__m64) greaterMask!int2(cast(int2)a, cast(int2)b); + } +} +unittest +{ + int2 A = [-3, 2]; + int2 B = [ 4, -2]; + int[2] E = [ 0, -1]; + int2 R = cast(int2)(_mm_cmpgt_pi32(cast(__m64)A, cast(__m64)B)); + assert(R.array == E); +} + +/// Compare packed signed 8-bit integers in `a` and `b` for greater-than. +__m64 _mm_cmpgt_pi8 (__m64 a, __m64 b) pure @safe +{ + static if (SIMD_COMPARISON_MASKS_8B) + { + return cast(__m64)(cast(byte8)a > cast(byte8)b); + } + else static if (GDC_with_MMX) + { + return cast(__m64) __builtin_ia32_pcmpgtb (cast(ubyte8)a, cast(ubyte8)b); + } + else + { + return cast(__m64) greaterMask!byte8(cast(byte8)a, cast(byte8)b); + } +} +unittest +{ + __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2); + __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3); + byte8 C = cast(byte8) _mm_cmpgt_pi8(A, B); + byte[8] correct = [0, 0,-1, 0, 0, 0, 0, 0]; + assert(C.array == correct); +} + +/// Copy 64-bit integer `a` to `dst`. +long _mm_cvtm64_si64 (__m64 a) pure @safe +{ + long1 la = cast(long1)a; + return a.array[0]; +} +unittest +{ + __m64 A = _mm_setr_pi32(2, 1); + long1 lA = cast(long1)A; + assert(A.array[0] == 0x100000002); +} + +/// Copy 32-bit integer `a` to the lower elements of `dst`, and zero the upper element of `dst`. +__m64 _mm_cvtsi32_si64 (int a) pure @trusted +{ + __m64 r = void; + r.ptr[0] = a; + return r; +} +unittest +{ + __m64 R = _mm_cvtsi32_si64(-1); + assert(R.array[0] == -1); +} + +/// Copy 64-bit integer `a` to `dst`. +__m64 _mm_cvtsi64_m64 (long a) pure @trusted +{ + __m64 r = void; + r.ptr[0] = a; + return r; +} +unittest +{ + __m64 R = _mm_cvtsi64_m64(0x123456789A); + assert(R.array[0] == 0x123456789A); +} + +/// Get the lower 32-bit integer in `a`. +int _mm_cvtsi64_si32 (__m64 a) pure @safe +{ + int2 r = cast(int2)a; + return r.array[0]; +} +unittest +{ + __m64 A = _mm_setr_pi32(-6, 5); + int R = _mm_cvtsi64_si32(A); + assert(R == -6); +} + +/// Empty the MMX state, which marks the x87 FPU registers as available for +/// use by x87 instructions. +/// This instruction is supposed to be used at the end of all MMX technology procedures. +/// But this is useless when using `intel-intrinsics`, with all D compilers. +void _mm_empty() pure @safe +{ + // do nothing, see comment on top of file +} + + +deprecated alias _m_empty = _mm_empty; /// Deprecated intrinsics. +deprecated alias _m_from_int = _mm_cvtsi32_si64; ///ditto +deprecated alias _m_from_int64 = _mm_cvtsi64_m64; ///ditto + +/// Multiply packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers. +/// Horizontally add adjacent pairs of intermediate 32-bit integers +__m64 _mm_madd_pi16 (__m64 a, __m64 b) pure @safe +{ + return to_m64(_mm_madd_epi16(to_m128i(a), to_m128i(b))); +} +unittest +{ + short4 A = [-32768, -32768, 32767, 32767]; + short4 B = [-32768, -32768, 32767, 32767]; + int2 R = cast(int2) _mm_madd_pi16(cast(__m64)A, cast(__m64)B); + int[2] correct = [-2147483648, 2*32767*32767]; + assert(R.array == correct); +} + +/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, +/// and store the high 16 bits of the intermediate integers. 
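/// A minimal usage sketch (illustrative; each lane keeps only the upper 16 bits of the 32-bit product):
/// ---
/// short4 h = cast(short4) _mm_mulhi_pi16(_mm_set1_pi16(0x4000), _mm_setr_pi16(2, 4, -8, 16));
/// // h.array == [0, 1, -2, 4]
/// ---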
+__m64 _mm_mulhi_pi16 (__m64 a, __m64 b) pure @safe +{ + return to_m64(_mm_mulhi_epi16(to_m128i(a), to_m128i(b))); +} +unittest +{ + __m64 A = _mm_setr_pi16(4, 8, -16, 7); + __m64 B = _mm_set1_pi16(16384); + short4 R = cast(short4)_mm_mulhi_pi16(A, B); + short[4] correct = [1, 2, -4, 1]; + assert(R.array == correct); +} + +/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, +/// and store the low 16 bits of the intermediate integers. +__m64 _mm_mullo_pi16 (__m64 a, __m64 b) pure @safe +{ + return to_m64(_mm_mullo_epi16(to_m128i(a), to_m128i(b))); +} +unittest +{ + __m64 A = _mm_setr_pi16(4, 1, 16, 7); + __m64 B = _mm_set1_pi16(16384); + short4 R = cast(short4)_mm_mullo_pi16(A, B); + short[4] correct = [0, 16384, 0, -16384]; + assert(R.array == correct); +} + +/// Compute the bitwise OR of 64 bits in `a` and `b`. +__m64 _mm_or_si64 (__m64 a, __m64 b) pure @safe +{ + return a | b; +} +unittest +{ + __m64 A = _mm_setr_pi16(255, 1, -1, 0); + __m64 B = _mm_set1_pi16(15); + short4 R = cast(short4)_mm_or_si64(A, B); + short[4] correct = [255, 15, -1, 15]; + assert(R.array == correct); +} + +/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation. +__m64 _mm_packs_pi16 (__m64 a, __m64 b) pure @trusted +{ + int4 p = cast(int4) _mm_packs_epi16(to_m128i(a), to_m128i(b)); + int2 r; + r.ptr[0] = p.array[0]; + r.ptr[1] = p.array[2]; + return cast(__m64)r; +} +unittest +{ + __m64 A = _mm_setr_pi16(256, -129, 254, 0); + byte8 R = cast(byte8) _mm_packs_pi16(A, A); + byte[8] correct = [127, -128, 127, 0, 127, -128, 127, 0]; + assert(R.array == correct); +} + +/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation. +__m64 _mm_packs_pi32 (__m64 a, __m64 b) pure @trusted +{ + int4 p = cast(int4) _mm_packs_epi32(to_m128i(a), to_m128i(b)); + int2 r; + r.ptr[0] = p.array[0]; + r.ptr[1] = p.array[2]; + return cast(__m64)r; +} +unittest +{ + __m64 A = _mm_setr_pi32(100000, -100000); + short4 R = cast(short4) _mm_packs_pi32(A, A); + short[4] correct = [32767, -32768, 32767, -32768]; + assert(R.array == correct); +} + +/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation. +__m64 _mm_packs_pu16 (__m64 a, __m64 b) pure @trusted +{ + int4 p = cast(int4) _mm_packus_epi16(to_m128i(a), to_m128i(b)); + int2 r; + r.ptr[0] = p.array[0]; + r.ptr[1] = p.array[2]; + return cast(__m64)r; +} +unittest +{ + __m64 A = _mm_setr_pi16(256, -129, 254, 0); + byte8 R = cast(byte8) _mm_packs_pu16(A, A); + ubyte[8] correct = [255, 0, 254, 0, 255, 0, 254, 0]; + assert(R.array == cast(byte[8])correct); +} + +deprecated alias + _m_packssdw = _mm_packs_pi32, /// Deprecated intrinsics. 
+ _m_packsswb = _mm_packs_pi16, ///ditto + _m_packuswb = _mm_packs_pu16, ///ditto + _m_paddb = _mm_add_pi8, ///ditto + _m_paddd = _mm_add_pi32, ///ditto + _m_paddsb = _mm_adds_pi8, ///ditto + _m_paddsw = _mm_adds_pi16, ///ditto + _m_paddusb = _mm_adds_pu8, ///ditto + _m_paddusw = _mm_adds_pu16, ///ditto + _m_paddw = _mm_add_pi16, ///ditto + _m_pand = _mm_and_si64, ///ditto + _m_pandn = _mm_andnot_si64, ///ditto + _m_pcmpeqb = _mm_cmpeq_pi8, ///ditto + _m_pcmpeqd = _mm_cmpeq_pi32, ///ditto + _m_pcmpeqw = _mm_cmpeq_pi16, ///ditto + _m_pcmpgtb = _mm_cmpgt_pi8, ///ditto + _m_pcmpgtd = _mm_cmpgt_pi32, ///ditto + _m_pcmpgtw = _mm_cmpgt_pi16, ///ditto + _m_pmaddwd = _mm_madd_pi16, ///ditto + _m_pmulhw = _mm_mulhi_pi16, ///ditto + _m_pmullw = _mm_mullo_pi16, ///ditto + _m_por = _mm_or_si64, ///ditto + _m_pslld = _mm_sll_pi32, ///ditto + _m_pslldi = _mm_slli_pi32, ///ditto + _m_psllq = _mm_sll_si64, ///ditto + _m_psllqi = _mm_slli_si64, ///ditto + _m_psllw = _mm_sll_pi16, ///ditto + _m_psllwi = _mm_slli_pi16, ///ditto + _m_psrad = _mm_sra_pi32, ///ditto + _m_psradi = _mm_srai_pi32, ///ditto + _m_psraw = _mm_sra_pi16, ///ditto + _m_psrawi = _mm_srai_pi16, ///ditto + _m_psrld = _mm_srl_pi32, ///ditto + _m_psrldi = _mm_srli_pi32, ///ditto + _m_psrlq = _mm_srl_si64, ///ditto + _m_psrlqi = _mm_srli_si64, ///ditto + _m_psrlw = _mm_srl_pi16, ///ditto + _m_psrlwi = _mm_srli_pi16, ///ditto + _m_psubb = _mm_sub_pi8, ///ditto + _m_psubd = _mm_sub_pi32, ///ditto + _m_psubsb = _mm_subs_pi8, ///ditto + _m_psubsw = _mm_subs_pi16, ///ditto + _m_psubusb = _mm_subs_pu8, ///ditto + _m_psubusw = _mm_subs_pu16, ///ditto + _m_psubw = _mm_sub_pi16, ///ditto + _m_punpckhbw = _mm_unpackhi_pi8, ///ditto + _m_punpckhdq = _mm_unpackhi_pi32, ///ditto + _m_punpckhwd = _mm_unpackhi_pi16, ///ditto + _m_punpcklbw = _mm_unpacklo_pi8, ///ditto + _m_punpckldq = _mm_unpacklo_pi32, ///ditto + _m_punpcklwd = _mm_unpacklo_pi16, ///ditto + _m_pxor = _mm_xor_si64; ///ditto + +/// Set packed 16-bit integers with the supplied values. +__m64 _mm_set_pi16 (short e3, short e2, short e1, short e0) pure @trusted +{ + short[4] arr = [e0, e1, e2, e3]; + return *cast(__m64*)(arr.ptr); +} +unittest +{ + short4 R = cast(short4) _mm_set_pi16(3, 2, 1, 0); + short[4] correct = [0, 1, 2, 3]; + assert(R.array == correct); +} + +/// Set packed 32-bit integers with the supplied values. +__m64 _mm_set_pi32 (int e1, int e0) pure @trusted +{ + int[2] arr = [e0, e1]; + return *cast(__m64*)(arr.ptr); +} +unittest +{ + int2 R = cast(int2) _mm_set_pi32(1, 0); + int[2] correct = [0, 1]; + assert(R.array == correct); +} + +/// Set packed 8-bit integers with the supplied values. +__m64 _mm_set_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted +{ + byte[8] arr = [e0, e1, e2, e3, e4, e5, e6, e7]; + return *cast(__m64*)(arr.ptr); +} +unittest +{ + byte8 R = cast(byte8) _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0); + byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7]; + assert(R.array == correct); +} + +/// Broadcast 16-bit integer `a` to all elements. +__m64 _mm_set1_pi16 (short a) pure @trusted +{ + return cast(__m64)(short4(a)); +} +unittest +{ + short4 R = cast(short4) _mm_set1_pi16(44); + short[4] correct = [44, 44, 44, 44]; + assert(R.array == correct); +} + +/// Broadcast 32-bit integer `a` to all elements. 
+__m64 _mm_set1_pi32 (int a) pure @trusted +{ + return cast(__m64)(int2(a)); +} +unittest +{ + int2 R = cast(int2) _mm_set1_pi32(43); + int[2] correct = [43, 43]; + assert(R.array == correct); +} + +/// Broadcast 8-bit integer `a` to all elements. +__m64 _mm_set1_pi8 (byte a) pure @trusted +{ + return cast(__m64)(byte8(a)); +} +unittest +{ + byte8 R = cast(byte8) _mm_set1_pi8(42); + byte[8] correct = [42, 42, 42, 42, 42, 42, 42, 42]; + assert(R.array == correct); +} + +/// Set packed 16-bit integers with the supplied values in reverse order. +__m64 _mm_setr_pi16 (short e3, short e2, short e1, short e0) pure @trusted +{ + short[4] arr = [e3, e2, e1, e0]; + return *cast(__m64*)(arr.ptr); +} +unittest +{ + short4 R = cast(short4) _mm_setr_pi16(0, 1, 2, 3); + short[4] correct = [0, 1, 2, 3]; + assert(R.array == correct); +} + +/// Set packed 32-bit integers with the supplied values in reverse order. +__m64 _mm_setr_pi32 (int e1, int e0) pure @trusted +{ + int[2] arr = [e1, e0]; + return *cast(__m64*)(arr.ptr); +} +unittest +{ + int2 R = cast(int2) _mm_setr_pi32(0, 1); + int[2] correct = [0, 1]; + assert(R.array == correct); +} + +/// Set packed 8-bit integers with the supplied values in reverse order. +__m64 _mm_setr_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted +{ + byte[8] arr = [e7, e6, e5, e4, e3, e2, e1, e0]; + return *cast(__m64*)(arr.ptr); +} +unittest +{ + byte8 R = cast(byte8) _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7); + byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7]; + assert(R.array == correct); +} + +/// Return vector of type `__m64` with all elements set to zero. +__m64 _mm_setzero_si64 () pure @trusted +{ + __m64 r; // PERF =void; + r.ptr[0] = 0; + return r; +} +unittest +{ + __m64 R = _mm_setzero_si64(); + assert(R.array[0] == 0); +} + +/// Shift packed 16-bit integers in `a` left by `bits` while shifting in zeros. +deprecated("Use _mm_slli_pi16 instead.") __m64 _mm_sll_pi16 (__m64 a, __m64 bits) pure @safe +{ + return to_m64(_mm_sll_epi16(to_m128i(a), to_m128i(bits))); +} + +/// Shift packed 32-bit integers in `a` left by `bits` while shifting in zeros. +deprecated("Use _mm_slli_pi32 instead.") __m64 _mm_sll_pi32 (__m64 a, __m64 bits) pure @safe +{ + return to_m64(_mm_sll_epi32(to_m128i(a), to_m128i(bits))); +} + +/// Shift 64-bit integer `a` left by `bits` while shifting in zeros. +deprecated("Use _mm_slli_si64 instead.") __m64 _mm_sll_si64 (__m64 a, __m64 bits) pure @safe +{ + return to_m64(_mm_sll_epi64(to_m128i(a), to_m128i(bits))); +} + +/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros. +__m64 _mm_slli_pi16 (__m64 a, int imm8) pure @safe +{ + return to_m64(_mm_slli_epi16(to_m128i(a), imm8)); +} +unittest +{ + __m64 A = _mm_setr_pi16(-4, -5, 6, 7); + short4 B = cast(short4)( _mm_slli_pi16(A, 1) ); + short[4] correct = [ -8, -10, 12, 14 ]; + assert(B.array == correct); +} + +/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros. +__m64 _mm_slli_pi32 (__m64 a, int imm8) pure @safe +{ + return to_m64(_mm_slli_epi32(to_m128i(a), imm8)); +} +unittest +{ + __m64 A = _mm_setr_pi32(-4, 5); + int2 B = cast(int2)( _mm_slli_pi32(A, 1) ); + int[2] correct = [ -8, 10 ]; + assert(B.array == correct); +} + +/// Shift 64-bit integer `a` left by `imm8` while shifting in zeros. 
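/// A minimal usage sketch (illustrative):
/// ---
/// __m64 r = _mm_slli_si64(_mm_cvtsi64_m64(1), 40);
/// assert(r.array[0] == 0x100_0000_0000); // 1 << 40
/// ---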
+__m64 _mm_slli_si64 (__m64 a, int imm8) pure @safe +{ + return to_m64(_mm_slli_epi64(to_m128i(a), imm8)); +} +unittest +{ + __m64 A = _mm_cvtsi64_m64(-1); + long1 R = cast(long1)( _mm_slli_si64(A, 1) ); + long[1] correct = [ -2 ]; + assert(R.array == correct); +} + +/// Shift packed 16-bit integers in `a` right by `bits` while shifting in sign bits. +deprecated("Use _mm_srai_pi16 instead.") __m64 _mm_sra_pi16 (__m64 a, __m64 bits) pure @safe +{ + return to_m64(_mm_sra_epi16(to_m128i(a), to_m128i(bits))); +} + +/// Shift packed 32-bit integers in `a` right by `bits` while shifting in sign bits. +deprecated("Use _mm_srai_pi32 instead.") __m64 _mm_sra_pi32 (__m64 a, __m64 bits) pure @safe +{ + return to_m64(_mm_sra_epi32(to_m128i(a), to_m128i(bits))); +} + +/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits. +__m64 _mm_srai_pi16 (__m64 a, int imm8) pure @safe +{ + return to_m64(_mm_srai_epi16(to_m128i(a), imm8)); +} +unittest +{ + __m64 A = _mm_setr_pi16(-4, -5, 6, 7); + short4 B = cast(short4)( _mm_srai_pi16(A, 1) ); + short[4] correct = [ -2, -3, 3, 3 ]; + assert(B.array == correct); +} + +/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits. +__m64 _mm_srai_pi32 (__m64 a, int imm8) pure @safe +{ + return to_m64(_mm_srai_epi32(to_m128i(a), imm8)); +} +unittest +{ + __m64 A = _mm_setr_pi32(-4, 5); + int2 B = cast(int2)( _mm_srai_pi32(A, 1) ); + int[2] correct = [ -2, 2 ]; + assert(B.array == correct); +} + +/// Shift packed 16-bit integers in `a` right by `bits` while shifting in zeros. +deprecated("Use _mm_srli_pi16 instead.") __m64 _mm_srl_pi16 (__m64 a, __m64 bits) pure @safe +{ + return to_m64(_mm_srl_epi16(to_m128i(a), to_m128i(bits))); +} + +/// Shift packed 32-bit integers in `a` right by `bits` while shifting in zeros. +deprecated("Use _mm_srli_pi32 instead.") __m64 _mm_srl_pi32 (__m64 a, __m64 bits) pure @safe +{ + return to_m64(_mm_srl_epi32(to_m128i(a), to_m128i(bits))); +} + +/// Shift 64-bit integer `a` right by `bits` while shifting in zeros. +deprecated("Use _mm_srli_si64 instead.") __m64 _mm_srl_si64 (__m64 a, __m64 bits) pure @safe +{ + return to_m64(_mm_srl_epi64(to_m128i(a), to_m128i(bits))); +} + +/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros. +__m64 _mm_srli_pi16 (__m64 a, int imm8) pure @safe +{ + return to_m64(_mm_srli_epi16(to_m128i(a), imm8)); +} +unittest +{ + __m64 A = _mm_setr_pi16(-4, -5, 6, 7); + short4 B = cast(short4)( _mm_srli_pi16(A, 1) ); + short[4] correct = [ 0x7ffe, 0x7ffd, 3, 3 ]; + assert(B.array == correct); +} + +/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros. +__m64 _mm_srli_pi32 (__m64 a, int imm8) pure @safe +{ + return to_m64(_mm_srli_epi32(to_m128i(a), imm8)); +} +unittest +{ + __m64 A = _mm_setr_pi32(-4, 5); + int2 B = cast(int2)( _mm_srli_pi32(A, 1) ); + int[2] correct = [ 0x7ffffffe, 2 ]; + assert(B.array == correct); +} + +/// Shift 64-bit integer `a` right by `imm8` while shifting in zeros. +__m64 _mm_srli_si64 (__m64 a, int imm8) pure @safe +{ + return to_m64(_mm_srli_epi64(to_m128i(a), imm8)); +} +unittest +{ + __m64 A = _mm_cvtsi64_m64(-1); + long1 R = cast(long1)( _mm_srli_si64(A, 1) ); + long[1] correct = [ 0x7fff_ffff_ffff_ffff ]; + assert(R.array == correct); +} + +/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`. 
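/// A minimal usage sketch (illustrative; plain wrapping subtraction, unlike the `_mm_subs_*` variants):
/// ---
/// short4 d = cast(short4) _mm_sub_pi16(_mm_setr_pi16(10, 0, -5, 32767),
///                                      _mm_setr_pi16( 3, 7,  5,    -1));
/// // d.array == [7, -7, -10, -32768]
/// ---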
+__m64 _mm_sub_pi16 (__m64 a, __m64 b) pure @safe +{ + return cast(__m64)(cast(short4)a - cast(short4)b); +} +unittest +{ + short4 R = cast(short4) _mm_sub_pi16(_mm_setr_pi16(cast(short)65534, 1, 5, -32768), + _mm_setr_pi16(cast(short)65535, 16, 4, 4)); + static immutable short[4] correct = [ -1,-15, 1, 32764]; + assert(R.array == correct); +} + +/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. +__m64 _mm_sub_pi32 (__m64 a, __m64 b) pure @safe +{ + return cast(__m64)(cast(int2)a - cast(int2)b); +} +unittest +{ + int2 R = cast(int2) _mm_sub_pi32(_mm_setr_pi32( 10, 4), + _mm_setr_pi32( 15, -70)); + static immutable int[2] correct = [ -5, 74]; + assert(R.array == correct); +} + +/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`. +__m64 _mm_sub_pi8 (__m64 a, __m64 b) pure @safe +{ + return cast(__m64)(cast(byte8)a - cast(byte8)b); +} +unittest +{ + byte8 R = cast(byte8) _mm_sub_pi8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, -128), + _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8)); + static immutable byte[8] correct = [ -1, 7, -1,-30, 0, 0, 0, 120 ]; + assert(R.array == correct); +} + +/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` using saturation. +__m64 _mm_subs_pi16 (__m64 a, __m64 b) pure @safe +{ + return to_m64(_mm_subs_epi16(to_m128i(a), to_m128i(b))); +} +unittest +{ + short4 R = cast(short4) _mm_subs_pi16(_mm_setr_pi16(cast(short)65534, 1, 5, -32768), + _mm_setr_pi16(cast(short)65535, 16, 4, 4)); + static immutable short[4] correct = [ -1,-15, 1, -32768]; + assert(R.array == correct); +} + +/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` using saturation. +__m64 _mm_subs_pi8 (__m64 a, __m64 b) pure @safe +{ + return to_m64(_mm_subs_epi8(to_m128i(a), to_m128i(b))); +} +unittest +{ + byte8 R = cast(byte8) _mm_subs_pi8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, -128), + _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8)); + static immutable byte[8] correct = [ -1, 7, -1,-30, 0, 0, 0, -128 ]; + assert(R.array == correct); +} + +/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit integers in `a` +/// using saturation. +__m64 _mm_subs_pu16 (__m64 a, __m64 b) pure @safe +{ + return to_m64(_mm_subs_epu16(to_m128i(a), to_m128i(b))); +} +unittest +{ + short4 R = cast(short4) _mm_subs_pu16(_mm_setr_pi16(cast(short)65534, 1, 5, 4), + _mm_setr_pi16(cast(short)65535, 16, 4, 4)); + static immutable short[4] correct = [ 0, 0, 1, 0]; + assert(R.array == correct); +} + +/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit integers in `a` +/// using saturation. +__m64 _mm_subs_pu8 (__m64 a, __m64 b) pure @safe +{ + return to_m64(_mm_subs_epu8(to_m128i(a), to_m128i(b))); +} +unittest +{ + byte8 R = cast(byte8) _mm_subs_pu8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8), + _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8)); + static immutable byte[8] correct = [ 0, 7, 0, 0, 0, 0, 0, 0, ]; + assert(R.array == correct); +} + +deprecated alias _m_to_int = _mm_cvtsi64_si32; /// Deprecated intrinsics. +deprecated alias _m_to_int64 = _mm_cvtm64_si64; ///ditto + +/// Unpack and interleave 16-bit integers from the high half of `a` and `b`. 
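+/// In terms of zero-based lanes, the result is `[a2, b2, a3, b3]`.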
+__m64 _mm_unpackhi_pi16 (__m64 a, __m64 b) pure @trusted +{ + static if (LDC_with_optimizations) + { + enum ir = `%r = shufflevector <4 x i16> %0, <4 x i16> %1, <4 x i32> + ret <4 x i16> %r`; + return cast(__m64) LDCInlineIR!(ir, short4, short4, short4)(cast(short4)a, cast(short4)b); + } + else + { + short4 ia = cast(short4)a; + short4 ib = cast(short4)b; + short4 r; + r.ptr[0] = ia.array[2]; + r.ptr[1] = ib.array[2]; + r.ptr[2] = ia.array[3]; + r.ptr[3] = ib.array[3]; + return cast(__m64)r; + } +} +unittest +{ + __m64 A = _mm_setr_pi16(4, 8, -16, 7); + __m64 B = _mm_setr_pi16(5, 9, -3, 10); + short4 R = cast(short4) _mm_unpackhi_pi16(A, B); + short[4] correct = [-16, -3, 7, 10]; + assert(R.array == correct); +} + +/// Unpack and interleave 32-bit integers from the high half of `a` and `b`. +__m64 _mm_unpackhi_pi32 (__m64 a, __m64 b) pure @trusted +{ + // Generate punpckldq as far back as LDC 1.0.0 -O1 + // (Yes, LLVM does generate punpckldq to reuse SSE2 instructions) + int2 ia = cast(int2)a; + int2 ib = cast(int2)b; + int2 r; + r.ptr[0] = ia.array[1]; + r.ptr[1] = ib.array[1]; + return cast(__m64)r; +} +unittest +{ + __m64 A = _mm_setr_pi32(4, 8); + __m64 B = _mm_setr_pi32(5, 9); + int2 R = cast(int2) _mm_unpackhi_pi32(A, B); + int[2] correct = [8, 9]; + assert(R.array == correct); +} + +/// Unpack and interleave 8-bit integers from the high half of `a` and `b`. +__m64 _mm_unpackhi_pi8 (__m64 a, __m64 b) +{ + static if (LDC_with_optimizations) + { + enum ir = `%r = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> + ret <8 x i8> %r`; + return cast(__m64) LDCInlineIR!(ir, byte8, byte8, byte8)(cast(byte8)a, cast(byte8)b); + } + else + { + byte8 ia = cast(byte8)a; + byte8 ib = cast(byte8)b; + byte8 r; + r.ptr[0] = ia.array[4]; + r.ptr[1] = ib.array[4]; + r.ptr[2] = ia.array[5]; + r.ptr[3] = ib.array[5]; + r.ptr[4] = ia.array[6]; + r.ptr[5] = ib.array[6]; + r.ptr[6] = ia.array[7]; + r.ptr[7] = ib.array[7]; + return cast(__m64)r; + } +} +unittest +{ + __m64 A = _mm_setr_pi8( 1, 2, 3, 4, 5, 6, 7, 8); + __m64 B = _mm_setr_pi8(-1, -2, -3, -4, -5, -6, -7, -8); + byte8 R = cast(byte8) _mm_unpackhi_pi8(A, B); + byte[8] correct = [5, -5, 6, -6, 7, -7, 8, -8]; + assert(R.array == correct); +} + +/// Unpack and interleave 16-bit integers from the low half of `a` and `b`. +__m64 _mm_unpacklo_pi16 (__m64 a, __m64 b) +{ + // Generates punpcklwd since LDC 1.0.0 -01 + short4 ia = cast(short4)a; + short4 ib = cast(short4)b; + short4 r; + r.ptr[0] = ia.array[0]; + r.ptr[1] = ib.array[0]; + r.ptr[2] = ia.array[1]; + r.ptr[3] = ib.array[1]; + return cast(__m64)r; +} +unittest +{ + __m64 A = _mm_setr_pi16(4, 8, -16, 7); + __m64 B = _mm_setr_pi16(5, 9, -3, 10); + short4 R = cast(short4) _mm_unpacklo_pi16(A, B); + short[4] correct = [4, 5, 8, 9]; + assert(R.array == correct); +} + +/// Unpack and interleave 32-bit integers from the low half of `a` and `b`. +__m64 _mm_unpacklo_pi32 (__m64 a, __m64 b) pure @trusted +{ + // x86: Generate punpckldq as far back as LDC 1.0.0 -O1 + // ARM: Generate zip as far back as LDC 1.8.0 -O1 + int2 ia = cast(int2)a; + int2 ib = cast(int2)b; + int2 r; + r.ptr[0] = ia.array[0]; + r.ptr[1] = ib.array[0]; + return cast(__m64)r; +} +unittest +{ + __m64 A = _mm_setr_pi32(4, 8); + __m64 B = _mm_setr_pi32(5, 9); + int2 R = cast(int2) _mm_unpacklo_pi32(A, B); + int[2] correct = [4, 5]; + assert(R.array == correct); +} + +/// Unpack and interleave 8-bit integers from the low half of `a` and `b`. 
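+/// In terms of zero-based lanes, the result is `[a0, b0, a1, b1, a2, b2, a3, b3]`.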
+__m64 _mm_unpacklo_pi8 (__m64 a, __m64 b) +{ + static if (LDC_with_optimizations) + { + enum ir = `%r = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> + ret <8 x i8> %r`; + return cast(__m64) LDCInlineIR!(ir, byte8, byte8, byte8)(cast(byte8)a, cast(byte8)b); + } + else + { + byte8 ia = cast(byte8)a; + byte8 ib = cast(byte8)b; + byte8 r; + r.ptr[0] = ia.array[0]; + r.ptr[1] = ib.array[0]; + r.ptr[2] = ia.array[1]; + r.ptr[3] = ib.array[1]; + r.ptr[4] = ia.array[2]; + r.ptr[5] = ib.array[2]; + r.ptr[6] = ia.array[3]; + r.ptr[7] = ib.array[3]; + return cast(__m64)r; + } +} +unittest +{ + __m64 A = _mm_setr_pi8( 1, 2, 3, 4, 5, 6, 7, 8); + __m64 B = _mm_setr_pi8(-1, -2, -3, -4, -5, -6, -7, -8); + byte8 R = cast(byte8) _mm_unpacklo_pi8(A, B); + byte[8] correct = [1, -1, 2, -2, 3, -3, 4, -4]; + assert(R.array == correct); +} + +/// Compute the bitwise XOR of 64 bits (representing integer data) in `a` and `b`. +__m64 _mm_xor_si64 (__m64 a, __m64 b) +{ + return a ^ b; +} +unittest +{ + __m64 A = _mm_setr_pi16(255, 1, -1, 0); + __m64 B = _mm_set1_pi16(15); + short4 R = cast(short4)_mm_xor_si64(A, B); + short[4] correct = [240, 14, -16, 15]; + assert(R.array == correct); +} + diff --git a/external/inteli/nmmintrin.d b/external/inteli/nmmintrin.d new file mode 100644 index 0000000..f91a1f9 --- /dev/null +++ b/external/inteli/nmmintrin.d @@ -0,0 +1,1394 @@ +/** +* SSE4.2 intrinsics. +* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSSE3 +* +* Copyright: Guillaume Piolat 2022. +* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) +*/ +module inteli.nmmintrin; + +public import inteli.types; +import inteli.internals; +public import inteli.smmintrin; +import core.bitop: bsf, bsr; + + +// Note: this header will work whether you have SSE4.2 enabled or not. +// With LDC, use "dflags-ldc": ["-mattr=+sse4.2"] or equivalent to actively +// generate SSE4.2 instruction (they are often enabled with -O1 or greater). +// - Additionally, you need ["-mattr=+crc"] on ARM if you want hardware CRC instructions. +// - Since LDC 1.30, you need ["-mattr=+crc32"] on x86_64 if you want hardware CRC instructions, +// it is not considered implied by sse4.2 anymore. +// With GDC, use "dflags-gdc": ["-msse4.2"] or equivalent to generate SSE4.2 instructions. + +nothrow @nogc: + +// + +/// String contains unsigned 8-bit characters (default). +enum int _SIDD_UBYTE_OPS = 0; + +/// String contains unsigned 16-bit characters. +enum int _SIDD_UWORD_OPS = 1; + +/// String contains signed 8-bit characters. +enum int _SIDD_SBYTE_OPS = 2; + +/// String contains signed 16-bit characters. +enum int _SIDD_SWORD_OPS = 3; + +// + + +// + +/// For each character in `b`, find if it is in `a` (default) +/// The resulting mask has bit set at b positions that were found in a. +enum int _SIDD_CMP_EQUAL_ANY = 0; + +/// For each character in `b`, determine if +/// `a[0] <= c <= a[1] or a[1] <= c <= a[2]...` +/// Contrarily to false documentation on the Internet, pairs must be in `a`! +enum int _SIDD_CMP_RANGES = 4; + +/// The strings defined by `a` and `b` are equal +enum int _SIDD_CMP_EQUAL_EACH = 8; + +/// Search for the defined substring in the target +enum int _SIDD_CMP_EQUAL_ORDERED = 12; + +// + +// + +/// Do not negate results (default, no effect) +enum int _SIDD_POSITIVE_POLARITY = 0; + +/// Negates results +enum int _SIDD_NEGATIVE_POLARITY = 16; + +/// No effect. Do not negate results before the end of the string. 
(default when using `_SIDD_NEGATIVE_POLARITY`) +/// You basically never want this. +enum int _SIDD_MASKED_POSITIVE_POLARITY = 32; + +/// Negates results only before the end of the string +enum int _SIDD_MASKED_NEGATIVE_POLARITY = 48; + +// + +// + +/// **Index only**: return the least significant bit (default). +enum int _SIDD_LEAST_SIGNIFICANT = 0; + +/// **Index only**: return the most significant bit. +enum int _SIDD_MOST_SIGNIFICANT = 64; + +// + +/// **Mask only**: return the bit mask (default). +enum int _SIDD_BIT_MASK = 0; + +/// **Mask only**: return the byte/word mask. +enum int _SIDD_UNIT_MASK = 64; + +/// So SSE4.2 has a lot of hard-to-understand instructions. Here is another explanation. +/// +/// Alternative explanation of imm8 +/// +/// imm8 is an 8-bit immediate operand specifying whether the characters are bytes or +/// words and the type of comparison to do. +/// +/// Bits [1:0]: Determine source data format. +/// 00: 16 unsigned bytes +/// 01: 8 unsigned words +/// 10: 16 signed bytes +/// 11: 8 signed words +/// +/// Bits [3:2]: Determine comparison type and aggregation method. +/// 00: Subset: Each character in B is compared for equality with all +/// the characters in A. +/// 01: Ranges: Each character in B is compared to A pairs. The comparison +/// basis is greater than or equal for even-indexed elements in A, +/// and less than or equal for odd-indexed elements in A. +/// 10: Match: Compare each pair of corresponding characters in A and +/// B for equality. +/// 11: Substring: Search B for substring matches of A. +/// +/// Bits [5:4]: Determine whether to do a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. +/// + + + +/// Compare packed strings in `a` and `b` with lengths `la` and `lb` using +/// the control in `imm8`, and returns 1 if `b` "does not contain a null character" +/// and the resulting mask was zero, and 0 otherwise. +/// Warning: actually it seems the instruction does accept \0 in input, just the length must be >= count. +/// It's not clear for what purpose. +int _mm_cmpestra(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted +{ + static if (GDC_with_SSE42) + { + return cast(int) __builtin_ia32_pcmpestria128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8); + } + else static if (LDC_with_SSE42) + { + return __builtin_ia32_pcmpestria128(cast(byte16)a, la, cast(byte16)b, lb, imm8); + } + else + { + __m128i mask = cmpstrMaskExplicit!imm8(a, la, b, lb); + __m128i equalZero = _mm_cmpeq_epi8(mask, _mm_setzero_si128()); + int sigbits = _mm_movemask_epi8(equalZero); + enum int Count = (imm8 & 1) ? 
8 : 16; + return (sigbits == 0xffff) && (lb >= Count); + } +} +unittest +{ + char[16] A = "Maximum\x00length!!"; + char[16] B = "Mbximum\x00length!!"; + __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); + __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); + + // string matching a-la strcmp, for 16-bytes of data + // Use _SIDD_NEGATIVE_POLARITY since mask must be null, and all match must be one + assert(1 == _mm_cmpestra!(_SIDD_UBYTE_OPS + | _SIDD_CMP_EQUAL_EACH + | _SIDD_NEGATIVE_POLARITY)(mmA, 16, mmA, 16)); + assert(0 == _mm_cmpestra!(_SIDD_UBYTE_OPS + | _SIDD_CMP_EQUAL_EACH + | _SIDD_NEGATIVE_POLARITY)(mmA, 16, mmB, 16)); + + // test negative length, this will be clamped to 16 + assert(1 == _mm_cmpestra!(_SIDD_UBYTE_OPS + | _SIDD_CMP_EQUAL_EACH + | _SIDD_NEGATIVE_POLARITY)(mmA, -160, mmA, -17)); + + // it seems you can't compare shorter strings for equality using _mm_cmpestra (!) + + // Test 16-bit format + assert(1 == _mm_cmpestra!(_SIDD_SWORD_OPS + | _SIDD_CMP_EQUAL_EACH + | _SIDD_NEGATIVE_POLARITY)(mmA, 8, mmA, 8)); +} + +/// Compare packed strings in `a` and `b` with lengths `la` and `lb` using +/// the control in `imm8`, and returns 1 if the resulting mask was non-zero, +/// and 0 otherwise. +int _mm_cmpestrc(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted +{ + static if (GDC_with_SSE42) + { + return cast(int) __builtin_ia32_pcmpestric128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8); + } + else static if (LDC_with_SSE42) + { + return cast(int) __builtin_ia32_pcmpestric128(cast(byte16)a, la, cast(byte16)b, lb, imm8); + } + else + { + __m128i mask = cmpstrMaskExplicit!imm8(a, la, b, lb); + int sigbits = _mm_movemask_epi8(mask); + return (sigbits != 0); + } +} +unittest +{ + // Compare two shorter strings + { + char[16] A = "Hello world"; + char[16] B = "Hello moon"; + __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); + __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); + __m128i mask = _mm_cmpestrm!(_SIDD_UBYTE_OPS // match gives 0 like strcmp + | _SIDD_CMP_EQUAL_EACH + | _SIDD_NEGATIVE_POLARITY)(mmA, 6, mmB, 6); + assert(0 == _mm_cmpestrc!(_SIDD_UBYTE_OPS // match gives 0 like strcmp + | _SIDD_CMP_EQUAL_EACH + | _SIDD_NEGATIVE_POLARITY)(mmA, 6, mmB, 6)); + assert(1 == _mm_cmpestrc!(_SIDD_UBYTE_OPS + | _SIDD_CMP_EQUAL_EACH + | _SIDD_NEGATIVE_POLARITY)(mmA, 7, mmB, 7)); + } +} + +/// Compare packed strings in `a` and `b` with lengths `la` and `lb` using +/// the control in `imm8`, and return the generated index. +/// Note: if the mask is all zeroes, the returned index is always `Count` +/// (8 or 16 depending on size). 
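+/// With `_SIDD_LEAST_SIGNIFICANT` (the default) the index of the lowest set bit of the
+/// mask is returned; with `_SIDD_MOST_SIGNIFICANT`, the highest.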
+int _mm_cmpestri(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted +{ + static if (GDC_with_SSE42) + { + return __builtin_ia32_pcmpestri128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8); + } + else static if (LDC_with_SSE42) + { + return __builtin_ia32_pcmpestri128(cast(byte16)a, la, cast(byte16)b, lb, imm8); + } + else + { + __m128i mask = cmpstrMaskExplicit!imm8(a, la, b, lb); + + // Convert the unit mask to bit mask + static if (imm8 & 1) + { + enum int Count = 8; + mask = _mm_packs_epi16(mask, _mm_setzero_si128()); + } + else + { + enum int Count = 16; + } + int signbits = _mm_movemask_epi8(mask); + static if (imm8 & _SIDD_MOST_SIGNIFICANT) + { + if (signbits == 0) + return Count; + else + return bsr(signbits); + } + else + { + if (signbits == 0) + return Count; + else + return bsf(signbits); + } + } +} +unittest +{ + // Find the index of the first difference (at index 6) + // v + char[16] A = "Hello sun"; + char[16] B = "Hello moon"; + + __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); + __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); + + int index = _mm_cmpestri!(_SIDD_UBYTE_OPS + | _SIDD_CMP_EQUAL_EACH + | _SIDD_NEGATIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT)(mmA, 9, mmB, 10); + assert(index == 6); + + // Those string must compare equal, regardless of what happens after their length. + index = _mm_cmpestri!(_SIDD_UBYTE_OPS + | _SIDD_CMP_EQUAL_EACH + | _SIDD_NEGATIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT)(mmA, 6, mmB, 6); // only look first six chars + assert(index == 16); + + index = _mm_cmpestri!(_SIDD_UBYTE_OPS + | _SIDD_CMP_EQUAL_EACH + | _SIDD_NEGATIVE_POLARITY + | _SIDD_MOST_SIGNIFICANT)(mmA, 6, mmB, 6); // only look first six chars + assert(index == 16); +} +unittest +{ + // Identify the last character that isn't an identifier character. 
+ // v (at index 7) + char[16] A = "my_i(en)ifie"; + char[16] identRanges = "__azAz09"; + __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); + __m128i mmI = _mm_loadu_si128(cast(__m128i*)identRanges.ptr); + byte16 mask = cast(byte16)_mm_cmpestrm!(_SIDD_UBYTE_OPS + | _SIDD_CMP_RANGES + | _SIDD_MASKED_NEGATIVE_POLARITY + | _SIDD_UNIT_MASK)(mmI, 8, mmA, 12); + byte[16] correctM = [0, 0, 0, 0, -1, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0]; + assert(mask.array == correctM); + + int index = _mm_cmpestri!(_SIDD_UBYTE_OPS + | _SIDD_CMP_RANGES + | _SIDD_MASKED_NEGATIVE_POLARITY + | _SIDD_MOST_SIGNIFICANT)(mmI, 8, mmA, 12); + assert(index == 7); // ')' is the last char not to be in [__azAz09] +} +unittest +{ + // testing _SIDD_CMP_RANGES but with signed shorts comparison instead (this only makes sense for _SIDD_CMP_RANGES) + short[8] ranges = [0, -1, 1000, 2000, 0, 0, 0, 0]; + short[8] numbers = [-32768, -1000, -1, -0, 0, 1, 1000, 32767]; + __m128i mmRanges = _mm_loadu_si128(cast(__m128i*)ranges.ptr); + __m128i mmNumbers = _mm_loadu_si128(cast(__m128i*)numbers.ptr); + + short8 mask = cast(short8)_mm_cmpestrm!(_SIDD_UWORD_OPS + | _SIDD_CMP_RANGES + | _SIDD_UNIT_MASK)(mmRanges, 4, mmNumbers, 8); + short[8] correctM = [ -1, -1, -1, -1, -1, -1, -1, -1]; + mask = cast(short8)_mm_cmpestrm!(_SIDD_SWORD_OPS + | _SIDD_CMP_RANGES + | _SIDD_UNIT_MASK)(mmRanges, 4, mmNumbers, 8); + short[8] correctZ = [ 0, 0, 0, 0, 0, 0, -1, 0]; + assert(mask.array == correctZ); +} +unittest +{ + // Find a substring + char[16] A = "def"; + char[16] B = "abcdefghdefff"; + char[16] C = "no substring"; + __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); + __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); + __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr); + + byte16 mask = cast(byte16)_mm_cmpestrm!(_SIDD_UBYTE_OPS + | _SIDD_CMP_EQUAL_ORDERED + | _SIDD_UNIT_MASK)(mmA, 3, mmB, 13); + byte[16] correctM = [0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0]; + assert(mask.array == correctM); + + int firstMatch = _mm_cmpestri!(_SIDD_UBYTE_OPS + | _SIDD_CMP_EQUAL_ORDERED)(mmA, 3, mmB, 13); + assert(firstMatch == 3); + + int lastMatch = _mm_cmpestri!(_SIDD_UBYTE_OPS + | _SIDD_CMP_EQUAL_ORDERED + | _SIDD_MOST_SIGNIFICANT)(mmA, 3, mmB, 13); + assert(lastMatch == 8); + firstMatch = _mm_cmpestri!(_SIDD_UBYTE_OPS + | _SIDD_CMP_EQUAL_ORDERED)(mmA, -3, mmC, -12); + assert(firstMatch == 16); // no substring found +} + +/// Compare packed strings in `a` and `b` with lengths `la` and `lb` using +/// the control in `imm8`, and return the generated mask. +__m128i _mm_cmpestrm(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted +{ + static if (GDC_with_SSE42) + { + return cast(__m128i) __builtin_ia32_pcmpestrm128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8); + } + else static if (LDC_with_SSE42) + { + return cast(__m128i) __builtin_ia32_pcmpestrm128(cast(byte16)a, la, cast(byte16)b, lb, imm8); + } + else + { + __m128i mask = cmpstrMaskExplicit!imm8(a, la, b, lb); + + static if (imm8 & _SIDD_UNIT_MASK) + { + return mask; + } + else + { + // _SIDD_BIT_MASK + static if (imm8 & 1) + { + mask = _mm_packs_epi16(mask, _mm_setzero_si128()); + } + return _mm_cvtsi32_si128( _mm_movemask_epi8(mask)); + } + } +} +unittest +{ + char[16] A = "Hello world!"; + char[16] B = "aeiou!"; + __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); + __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); + + // Find which letters from B where found in A. 
+ byte16 R = cast(byte16)_mm_cmpestrm!(_SIDD_UBYTE_OPS + | _SIDD_CMP_EQUAL_ANY + | _SIDD_BIT_MASK)(mmA, -12, mmB, -6); + // because 'e', 'o', and '!' were found + byte[16] correctR = [42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + assert(R.array == correctR); + byte16 M = cast(byte16) _mm_cmpestrm!(_SIDD_UBYTE_OPS + | _SIDD_CMP_EQUAL_ANY + | _SIDD_UNIT_MASK)(mmA, 12, mmB, 6); + byte[16] correctM = [0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + assert(M.array == correctM); +} + +/// Compare packed strings in `a` and `b` with lengths `la` and `lb` using +/// the control in `imm8`, and returns bit 0 of the resulting bit mask. +int _mm_cmpestro(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted +{ + static if (GDC_with_SSE42) + { + return __builtin_ia32_pcmpestrio128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8); + } + else static if (LDC_with_SSE42) + { + return __builtin_ia32_pcmpestrio128(cast(byte16)a, la, cast(byte16)b, lb, imm8); + } + else + { + int4 mask = cast(int4) cmpstrMaskExplicit!imm8(a, la, b, lb); + return mask.array[0] & 1; + } +} +unittest +{ + char[16] A = "Hallo world!"; + char[16] B = "aeiou!"; + __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); + __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); + + // Find which letters from B were found in A. + int res = _mm_cmpestro!(_SIDD_UBYTE_OPS + | _SIDD_CMP_EQUAL_ANY + | _SIDD_BIT_MASK)(mmA, 12, mmB, -6); + // because 'a' was found in "Hallo world!" + assert(res == 1); +} + +/// Returns 1 if "any character in a was null", and 0 otherwise. +/// Warning: what they mean is it returns 1 if the given length `la` is < Count. +int _mm_cmpestrs(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted +{ + static if (GDC_with_SSE42) + { + return __builtin_ia32_pcmpestris128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8); + } + else static if (LDC_with_SSE42) + { + return __builtin_ia32_pcmpestris128(cast(byte16)a, la, cast(byte16)b, lb, imm8); + } + else + { + // Yes, this intrinsic is there for symmetrical reasons and probably useless. + // saturates lengths (the Intrinsics Guide doesn't tell this) + if (la < 0) la = -la; + if (la > 16) la = 16; + enum int Count = (imm8 & 1) ? 8 : 16; + return (la < Count); + } +} +unittest +{ + __m128i a; + a = 0; + assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, 15, a, 8) == 1); + assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, 16, a, 8) == 0); + assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, -15, a, 8) == 1); + assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, -16, a, 8) == 0); +} + +/// Returns 1 if "any character in b was null", and 0 otherwise. +/// Warning: what they mean is it returns 1 if the given length `lb` is < Count. +int _mm_cmpestrz(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted +{ + static if (GDC_with_SSE42) + { + return __builtin_ia32_pcmpestriz128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8); + } + else static if (LDC_with_SSE42) + { + return __builtin_ia32_pcmpestriz128(cast(byte16)a, la, cast(byte16)b, lb, imm8); + } + else + { + // Yes, this intrinsic is there for symmetrical reasons and probably useless. + // saturates lengths (the Intrinsics Guide doesn't tell this) + if (lb < 0) lb = -lb; + if (lb > 16) lb = 16; + enum int Count = (imm8 & 1) ? 
8 : 16; + return (lb < Count); + } +} +unittest +{ + __m128i b; + b = 0; + assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(b, 15, b, 15) == 1); + assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(b, 16, b, 16) == 0); + assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(b, -15, b, -15) == 1); + assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(b, -16, b, -16) == 0); +} + +/// Compare packed signed 64-bit integers in a and b for greater-than. +__m128i _mm_cmpgt_epi64 (__m128i a, __m128i b) pure @trusted +{ + long2 la = cast(long2)a; + long2 lb = cast(long2)b; + // PERF: with DMD, enabling this requires SSE4.2, hence D_AVX + /*static if (SIMD_COMPARISON_MASKS_16B) + { + return cast(__m128i)(la > lb); + } + else*/ + static if (GDC_with_SSE42) + { + return cast(__m128i) __builtin_ia32_pcmpgtq(la, lb); + } + else version(LDC) + { + // LDC x86: Optimized since LDC 1.1.0 -O1 + // arm64: Optimized since LDC 1.8.0 -O1 + // When SSE4.2 is disabled, this gives same sequence than below. + static if (SIMD_COMPARISON_MASKS_16B) + return cast(__m128i)(la > lb); + else + return cast(__m128i)( greaterMask!long2(la, lb)); + } + else + { + long2 r; + r.ptr[0] = (la.array[0] > lb.array[0]) ? 0xffffffff_ffffffff : 0; + r.ptr[1] = (la.array[1] > lb.array[1]) ? 0xffffffff_ffffffff : 0; + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi64(-3, 2); + __m128i B = _mm_setr_epi64(4, -2); + long[2] correct = [ 0, -1 ]; + long2 R = cast(long2)(_mm_cmpgt_epi32(A, B)); + assert(R.array == correct); +} + +/// Compare packed strings with implicit lengths in `a` and `b` using the control in `imm8`, +/// and returns 1 if `b` did not contain a null character and the resulting mask was zero, +/// and 0 otherwise. +int _mm_cmpistra(int imm8)(__m128i a, __m128i b) @trusted +{ + static if (GDC_with_SSE42) + { + return cast(int) __builtin_ia32_pcmpistria128(cast(ubyte16)a, cast(ubyte16)b, imm8); + } + else static if (LDC_with_SSE42) + { + return __builtin_ia32_pcmpistria128(cast(byte16)a, cast(byte16)b, imm8); + } + else + { + static if (imm8 & 1) + { + int la = findLengthShort(a); + int lb = findLengthShort(b); + } + else + { + int la = findLengthByte(a); + int lb = findLengthByte(b); + } + return _mm_cmpestra!imm8(a, la, b, lb); + } +} +unittest +{ + char[16] A = "Maximum\x00one"; + char[16] B = "Maximum\x00four"; + char[16] C = "Mbximum\x00length!"; + __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); + __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); + __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr); + + // string matching a-la strcmp, for 16-bytes of data + // Use _SIDD_NEGATIVE_POLARITY since mask must be null, and all match must be one + assert(0 == _mm_cmpistra!(_SIDD_UBYTE_OPS + | _SIDD_CMP_EQUAL_EACH + | _SIDD_MASKED_NEGATIVE_POLARITY)(mmA, mmB)); // match, but b is too short + + assert(0 == _mm_cmpistra!(_SIDD_UBYTE_OPS + | _SIDD_CMP_EQUAL_EACH + | _SIDD_NEGATIVE_POLARITY)(mmA, mmC)); // do not match +} + +/// Compare packed strings with implicit lengths in `a` and `b` using the control in `imm8`, +/// and returns 1 if the resulting mask was non-zero, and 0 otherwise. 
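+/// The implicit length of each operand is the position of its first zero element
+/// (16 bytes or 8 words when none is present), as computed by `findLengthByte` /
+/// `findLengthShort` in the fallback path.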
+int _mm_cmpistrc(int imm8)(__m128i a, __m128i b) @trusted +{ + static if (GDC_with_SSE42) + { + return cast(int) __builtin_ia32_pcmpistric128(cast(ubyte16)a, cast(ubyte16)b, imm8); + } + else static if (LDC_with_SSE42) + { + return cast(int) __builtin_ia32_pcmpistric128(cast(byte16)a, cast(byte16)b, imm8); + } + else + { + static if (imm8 & 1) + { + int la = findLengthShort(a); + int lb = findLengthShort(b); + } + else + { + int la = findLengthByte(a); + int lb = findLengthByte(b); + } + return _mm_cmpestrc!imm8(a, la, b, lb); + } +} +unittest +{ + // Compare two shorter strings + { + char[16] A = "Hello"; + char[16] B = "Hello moon"; + __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); + __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); + assert(0 == _mm_cmpistrc!(_SIDD_UBYTE_OPS // match gives 0 like strcmp + | _SIDD_CMP_EQUAL_EACH + | _SIDD_NEGATIVE_POLARITY)(mmA, mmA)); + assert(1 == _mm_cmpistrc!(_SIDD_UBYTE_OPS + | _SIDD_CMP_EQUAL_EACH + | _SIDD_NEGATIVE_POLARITY)(mmA, mmB)); + } +} + +/// Compare packed strings with implicit lengths in `a` and `b` using the control in `imm8` +/// and return the generated index. +/// Note: if the mask is all zeroes, the returned index is always `Count` +/// (8 or 16 depending on size). +int _mm_cmpistri(int imm8)(__m128i a, __m128i b) @trusted +{ + static if (GDC_with_SSE42) + { + return __builtin_ia32_pcmpistri128(cast(ubyte16)a, cast(ubyte16)b, imm8); + } + else static if (LDC_with_SSE42) + { + return __builtin_ia32_pcmpistri128(cast(byte16)a, cast(byte16)b, imm8); + } + else + { + static if (imm8 & 1) + { + int la = findLengthShort(a); + int lb = findLengthShort(b); + } + else + { + int la = findLengthByte(a); + int lb = findLengthByte(b); + } + return _mm_cmpestri!imm8(a, la, b, lb); + } +} +unittest +{ + // Identify the last character that isn't an identifier character. + // v (at index 7) + char[16] A = "my_i(en)ifie"; + char[16] identRanges = "__azAz09"; + __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); + __m128i mmI = _mm_loadu_si128(cast(__m128i*)identRanges.ptr); + byte16 mask = cast(byte16)_mm_cmpistrm!(_SIDD_UBYTE_OPS + | _SIDD_CMP_RANGES + | _SIDD_MASKED_NEGATIVE_POLARITY + | _SIDD_UNIT_MASK)(mmI, mmA); + byte[16] correctM = [0, 0, 0, 0, -1, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0]; + assert(mask.array == correctM); + + int index = _mm_cmpistri!(_SIDD_UBYTE_OPS + | _SIDD_CMP_RANGES + | _SIDD_MASKED_NEGATIVE_POLARITY + | _SIDD_MOST_SIGNIFICANT)(mmI, mmA); + assert(index == 7); // ')' is the last char not to be in [__azAz09] +} + +/// Compare packed strings with implicit lengths in `a` and `b` using the control in +/// `imm8`, and return the generated mask. +__m128i _mm_cmpistrm(int imm8)(__m128i a, __m128i b) @trusted +{ + static if (GDC_with_SSE42) + { + return cast(__m128i) __builtin_ia32_pcmpistrm128(cast(ubyte16)a, cast(ubyte16)b, imm8); + } + else static if (LDC_with_SSE42) + { + return cast(__m128i) __builtin_ia32_pcmpistrm128(cast(byte16)a, cast(byte16)b, imm8); + } + else + { + static if (imm8 & 1) + { + int la = findLengthShort(a); + int lb = findLengthShort(b); + } + else + { + int la = findLengthByte(a); + int lb = findLengthByte(b); + } + return _mm_cmpestrm!imm8(a, la, b, lb); + } +} +unittest +{ + char[16] A = "Hello world!"; + char[16] B = "aeiou!"; + __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); + __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); + + // Find which letters from B where found in A. 
+ byte16 R = cast(byte16)_mm_cmpistrm!(_SIDD_UBYTE_OPS + | _SIDD_CMP_EQUAL_ANY + | _SIDD_BIT_MASK)(mmA, mmB); + // because 'e', 'o', and '!' were found + byte[16] correctR = [42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + assert(R.array == correctR); + byte16 M = cast(byte16) _mm_cmpistrm!(_SIDD_UBYTE_OPS + | _SIDD_CMP_EQUAL_ANY + | _SIDD_UNIT_MASK)(mmA, mmB); + byte[16] correctM = [0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + assert(M.array == correctM); +} + +/// Compare packed strings in `a` and `b` with lengths `la` and `lb` using +/// the control in `imm8`, and returns bit 0 of the resulting bit mask. +int _mm_cmpistro(int imm8)(__m128i a, __m128i b) @trusted +{ + static if (GDC_with_SSE42) + { + return __builtin_ia32_pcmpistrio128(cast(ubyte16)a, cast(ubyte16)b, imm8); + } + else static if (LDC_with_SSE42) + { + return __builtin_ia32_pcmpistrio128(cast(byte16)a, cast(byte16)b, imm8); + } + else + { + static if (imm8 & 1) + { + int la = findLengthShort(a); + int lb = findLengthShort(b); + } + else + { + int la = findLengthByte(a); + int lb = findLengthByte(b); + } + return _mm_cmpestro!imm8(a, la, b, lb); + } +} +unittest +{ + char[16] A = "Hallo world!"; + char[16] B = "aeiou!"; + char[16] C = "Z"; + __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); + __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); + __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr); + + // Find which letters from B where found in A. + int res = _mm_cmpistro!(_SIDD_UBYTE_OPS + | _SIDD_CMP_EQUAL_ANY + | _SIDD_BIT_MASK)(mmA, mmB); + // because 'a' was found in "Hallo world!" + assert(res == 1); + res = _mm_cmpistro!(_SIDD_UBYTE_OPS + | _SIDD_CMP_EQUAL_ANY + | _SIDD_BIT_MASK)(mmA, mmC); + assert(res == 0); // because 'Z' wasn't found in A +} + +/// Returns 1 if any character in `a` was null, and 0 otherwise. +int _mm_cmpistrs(int imm8)(__m128i a, __m128i b) @trusted +{ + static if (GDC_with_SSE42) + { + return __builtin_ia32_pcmpistris128(cast(ubyte16)a, cast(ubyte16)b, imm8); + } + else static if (LDC_with_SSE42) + { + return __builtin_ia32_pcmpistris128(cast(byte16)a, cast(byte16)b, imm8); + } + else + { + static if (imm8 & 1) + { + int la = findLengthShort(a); + return la != 8; + } + else + { + int la = findLengthByte(a); + return la != 16; + } + } +} +unittest +{ + char[16] A = ""; + char[16] B = "hello"; + char[16] C = "Maximum length!!"; + __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); + __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); + __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr); + assert(_mm_cmpistrs!_SIDD_UBYTE_OPS(mmA, mmA) == 1); + assert(_mm_cmpistrs!_SIDD_SBYTE_OPS(mmB, mmB) == 1); + assert(_mm_cmpistrs!_SIDD_UWORD_OPS(mmC, mmC) == 0); +} + +/// Returns 1 if any character in `b` was null, and 0 otherwise. 
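+/// Equivalently, this reports whether the implicit length of `b` is shorter than the
+/// full 16 bytes (or 8 words in the 16-bit modes).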
+int _mm_cmpistrz(int imm8)(__m128i a, __m128i b) @trusted +{ + static if (GDC_with_SSE42) + { + return __builtin_ia32_pcmpistriz128(cast(ubyte16)a, cast(ubyte16)b, imm8); + } + else static if (LDC_with_SSE42) + { + return __builtin_ia32_pcmpistriz128(cast(byte16)a, cast(byte16)b, imm8); + } + else + { + static if (imm8 & 1) + { + int lb = findLengthShort(b); + return lb != 8; + } + else + { + int lb = findLengthByte(b); + return lb != 16; + } + } +} +unittest +{ + char[16] A = ""; + char[16] B = "hello"; + char[16] C = "Maximum length!!"; + __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); + __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); + __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr); + assert(_mm_cmpistrz!_SIDD_UBYTE_OPS(mmC, mmA) == 1); + assert(_mm_cmpistrz!_SIDD_SBYTE_OPS(mmC, mmB) == 1); + assert(_mm_cmpistrz!_SIDD_UWORD_OPS(mmA, mmC) == 0); +} + + +/// Starting with the initial value in `crc`, accumulates a CR32 value +/// for unsigned 16-bit integer `v`. +/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32. +uint _mm_crc32_u16 (uint crc, ushort v) @safe +{ + static if (GDC_with_SSE42) + { + return __builtin_ia32_crc32hi(crc, v); + } + else static if (LDC_with_CRC32) + { + return __builtin_ia32_crc32hi(crc, v); + } + else static if (LDC_with_ARM64_CRC) + { + return __crc32ch(crc, v); + } + else + { + crc = _mm_crc32_u8(crc, v & 0xff); + crc = _mm_crc32_u8(crc, v >> 8); + return crc; + } +} +unittest +{ + uint A = _mm_crc32_u16(0x12345678, 0x4512); + uint B = _mm_crc32_u16(0x76543210, 0xf50f); + uint C = _mm_crc32_u16(0xDEADBEEF, 0x0017); + assert(A == 0x39c3f0ff); + assert(B == 0xcffbcf07); + assert(C == 0xc7e3fe85); +} + +/// Starting with the initial value in `crc`, accumulates a CRC32 value +/// for unsigned 32-bit integer `v`. +/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32. +uint _mm_crc32_u32 (uint crc, uint v) @safe +{ + static if (GDC_with_SSE42) + { + return __builtin_ia32_crc32si(crc, v); + } + else static if (LDC_with_CRC32) + { + return __builtin_ia32_crc32si(crc, v); + } + else static if (LDC_with_ARM64_CRC) + { + return __crc32cw(crc, v); + } + else + { + crc = _mm_crc32_u8(crc, v & 0xff); + crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); + crc = _mm_crc32_u8(crc, (v >> 16) & 0xff); + crc = _mm_crc32_u8(crc, (v >> 24) & 0xff); + return crc; + } +} +unittest +{ + uint A = _mm_crc32_u32(0x12345678, 0x45123563); + uint B = _mm_crc32_u32(0x76543210, 0xf50f9993); + uint C = _mm_crc32_u32(0xDEADBEEF, 0x00170017); + assert(A == 0x22a6ec54); + assert(B == 0x7019a6cf); + assert(C == 0xbc552c27); +} + +/// Starting with the initial value in `crc`, accumulates a CRC32 +/// value for unsigned 64-bit integer `v`. +/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32. 
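+/// Note: CRC-32C uses the Castagnoli polynomial 0x1EDC6F41 (0x82F63B78 in the reflected
+/// form used by the lookup-table fallback below).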
+ulong _mm_crc32_u64 (ulong crc, ulong v) +{ + version(X86_64) + enum bool hasX86Intrin = GDC_with_SSE42 || LDC_with_CRC32; + else + enum bool hasX86Intrin = false; // intrinsics not available in 32-bit + + static if (hasX86Intrin) + { + return __builtin_ia32_crc32di(crc, v); + } + else static if (LDC_with_ARM64_CRC) + { + return __crc32cd(cast(uint)crc, v); + } + else + { + uint crc32 = cast(uint)crc; + crc32 = _mm_crc32_u8(crc32, (v >> 0) & 0xff); + crc32 = _mm_crc32_u8(crc32, (v >> 8) & 0xff); + crc32 = _mm_crc32_u8(crc32, (v >> 16) & 0xff); + crc32 = _mm_crc32_u8(crc32, (v >> 24) & 0xff); + crc32 = _mm_crc32_u8(crc32, (v >> 32) & 0xff); + crc32 = _mm_crc32_u8(crc32, (v >> 40) & 0xff); + crc32 = _mm_crc32_u8(crc32, (v >> 48) & 0xff); + crc32 = _mm_crc32_u8(crc32, (v >> 56) & 0xff); + return crc32; + } +} +unittest +{ + ulong A = _mm_crc32_u64(0x1234567812345678, 0x39C3F0FFCFFBCF07); + ulong B = _mm_crc32_u64(0x7654321001234567, 0xFACEFEED); + ulong C = _mm_crc32_u64(0xDEADBEEFCAFEBABE, 0x0017C7E3FE850017); + assert(A == 0xd66b1074); + assert(B == 0xac12f9c6); + assert(C == 0xa2d13dd8); +} + +/// Starting with the initial value in `crc`, accumulates a CRC32 value +/// for unsigned 8-bit integer `v`. +/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32. +uint _mm_crc32_u8 (uint crc, ubyte v) @safe +{ + static if (GDC_with_SSE42) + { + return __builtin_ia32_crc32qi(crc, v); + } + else static if (LDC_with_CRC32) + { + return __builtin_ia32_crc32qi(crc, v); + } + else static if (LDC_with_ARM64_CRC) + { + return __crc32cb(crc, v); + } + else + { + return CRC32cTable[(crc ^ v) & 0xFF] ^ (crc >> 8); + } +} +unittest +{ + uint A = _mm_crc32_u8(0x12345678, 0x45); + uint B = _mm_crc32_u8(0x76543210, 0xf5); + uint C = _mm_crc32_u8(0xDEADBEEF, 0x00); + assert(A == 0x8fd93134); + assert(B == 0xd6b7e834); + assert(C == 0xbdfd3980); +} + + +// Utilities for this file + +private: + +static if (GDC_with_SSE42) +{ + version(X86_64) + enum bool NeedCRC32CTable = false; + else + enum bool NeedCRC32CTable = true; +} +else static if (LDC_with_CRC32) +{ + version(X86_64) + enum bool NeedCRC32CTable = false; + else + enum bool NeedCRC32CTable = true; +} +else static if (LDC_with_ARM64_CRC) +{ + enum bool NeedCRC32CTable = false; +} +else +{ + enum bool NeedCRC32CTable = true; +} + +static if (NeedCRC32CTable) +{ + static immutable uint[256] CRC32cTable = + [ + 0x0, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, + 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, + 0x105ec76f, 0xe235446c, 0xf165b798, 0x30e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, + 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, 0xe72719c1, 0x154c9ac2, 0x61c6936, 0xf477ea35, + 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x5125dad, 0x1642ae59, 0xe4292d5a, + 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, + 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0xc38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, + 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, + 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0xf36e6f7, 
+ 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, + 0xeb1fcbad, 0x197448ae, 0xa24bb5a, 0xf84f3859, 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, + 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x92a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, + 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, + 0x82f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, + 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0xb21572c, 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, + 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, 0xe330a81, 0xfc588982, + 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, + 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0xd3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, + 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, + 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x7198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, + 0xe330a81a, 0x115b2b19, 0x20bd8ed, 0xf0605bee, 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, + 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, + 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351, + ]; +} + +int findLengthByte(__m128i a) pure @safe +{ + const __m128i zero = _mm_setzero_si128(); + const __m128i zeroMask = _mm_cmpeq_epi8(a, zero); // 0xff where a byte is zero + int mask = _mm_movemask_epi8(zeroMask); // the lowest set bit is the zero index + if (mask == 0) + return 16; + else + return bsf(mask); +} +unittest +{ + char[16] A = "Hel!o"; + char[16] B = "Maximum length!!"; + __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); + __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); + assert(findLengthByte(mmA) == 5); + assert(findLengthByte(mmB) == 16); +} + +int findLengthShort(__m128i a) pure @safe +{ + const __m128i zero = _mm_setzero_si128(); + const __m128i zeroMask = _mm_cmpeq_epi16(a, zero); // 0xffff where a short is zero + int mask = _mm_movemask_epi8(zeroMask); // the lowest set bit is the zero index + if (mask == 0) + return 8; + else + return bsf(mask) >> 1; +} +unittest +{ + short[8] A = [10, 5423, 475, 0, 1, 1, 1, 1 ]; + short[8] B = [-1, -2, -3, 4, 5, 6, -32768, 1]; + __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); + __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); + assert(findLengthShort(mmA) == 3); + assert(findLengthShort(mmB) == 8); +} + +static immutable byte[32] MASK_DATA = +[ + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +]; + +// Makes a byte validity mask with a given explicit length string. 
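+// MASK_DATA is 16 bytes of 0xFF followed by 16 zero bytes, so loading 16 bytes at
+// offset `16 - len` yields exactly `len` leading 0xFF bytes followed by zeroes.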
+__m128i validMask8e(int len) @trusted +{ + return _mm_loadu_si128(cast(__m128i*) &MASK_DATA[16-len]); +} +unittest +{ + char[16] A = ""; + char[16] B = "0123456789abcdef"; + byte[16] correctA = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + byte[16] correctB = [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1]; + byte16 MA = cast(byte16) validMask8e(0); + byte16 MB = cast(byte16) validMask8e(16); + assert(MA.array == correctA); + assert(MB.array == correctB); +} + +// Makes a short validity mask with a given explicit length string. +__m128i validMask16e(int len) @trusted +{ + return _mm_loadu_si128(cast(__m128i*) &MASK_DATA[16-len*2]); +} +unittest +{ + short[8] A = [3, 4, 5, 0, 3, 4, 5, 6]; + short[8] correctA = [-1, -1, -1, 0, 0, 0, 0, 0]; + short8 MA = cast(short8) validMask16e(3); + assert(MA.array == correctA); +} + +// Internal implementation for non-SSE4.2 +// Compare 8-bit or 16-bit strings, get a mask. +// `aValid` and `bValid` are byte-mask or word-mask of the valid +// zone in `a` and `b`. +__m128i cmpstrMaskExplicit(int imm8)(__m128i a, + ref int la, + __m128i b, + ref int lb) @safe +{ + // saturates lengths (the Intrinsics Guide doesn't tell this) + if (la < 0) la = -la; + if (lb < 0) lb = -lb; + if (la > 16) la = 16; + if (lb > 16) lb = 16; + + static if (imm8 & 1) + { + __m128i aValid = validMask16e(la); + __m128i bValid = validMask16e(lb); + } + else + { + __m128i aValid = validMask8e(la); + __m128i bValid = validMask8e(lb); + } + return cmpstrMask!imm8(a, aValid, b, bValid); +} + +//ditto +__m128i cmpstrMask(int imm8)(__m128i a, + __m128i aValid, + __m128i b, + const __m128i bValid) @safe +{ + enum bool chars16Bits = imm8 & 1; + enum int Mode = (imm8 >> 2) & 3; + + static if (Mode == 0) // equal any + { + __m128i R = _mm_setzero_si128(); + static if (chars16Bits) // 64 comparisons + { + for (int k = 0; k < 8; ++k) + { + __m128i eqMask = _mm_cmpeq_epi16(a, b); + eqMask = _mm_and_si128(eqMask, aValid); + R = _mm_or_si128(R, eqMask); + + // rotate a and aValid + a = _mm_or_si128(_mm_srli_si128!2(a), _mm_slli_si128!14(a)); + aValid = _mm_or_si128(_mm_srli_si128!2(aValid), _mm_slli_si128!14(aValid)); + } + } + else + { + for (int k = 0; k < 16; ++k) + { + __m128i eqMask = _mm_cmpeq_epi8(a, b); + eqMask = _mm_and_si128(eqMask, aValid); + R = _mm_or_si128(R, eqMask); + + // rotate a and aValid + a = _mm_or_si128(_mm_srli_si128!1(a), _mm_slli_si128!15(a)); + aValid = _mm_or_si128(_mm_srli_si128!1(aValid), _mm_slli_si128!15(aValid)); + } + } + R = _mm_and_si128(R, bValid); + } + else static if (Mode == 1) // ranges + { + enum bool signed = (imm8 & 2) != 0; + + // For each character in b, the returned mask says if it was found in a range-pair in `a`. 
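+ // Rough shape of this fallback: for each (min, max) pair taken from `a`, build a
+ // mask of the elements of `b` that fall inside [min, max] (flipping the sign bit
+ // first when an unsigned comparison is needed), ignore pairs lying beyond `la`,
+ // OR the per-pair masks together, then clear the lanes of `b` past `lb`.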
+ __m128i R = _mm_setzero_si128(); + static if (chars16Bits) + { + for (int pos = 0; pos < 8; pos += 2) + { + short min = (cast(short8)a).array[pos]; + short max = (cast(short8)a).array[pos+1]; + static if (signed) + { + __m128i ge = ~_mm_cmplt_epi16(b, _mm_set1_epi16(min)); + __m128i le = ~_mm_cmpgt_epi16(b, _mm_set1_epi16(max)); + } + else + { + // No SSE way to do 16-bit unsigned comparisons, + // but flipping the sign bit let us used signed comp + __m128i firstBits = _mm_set1_epi16(-32768); + __m128i reverseB = _mm_xor_si128(b, firstBits); + __m128i reverseMin = _mm_xor_si128(_mm_set1_epi16(min), firstBits); + __m128i reverseMax = _mm_xor_si128(_mm_set1_epi16(max), firstBits); + __m128i ge = ~_mm_cmplt_epi16(reverseB, reverseMin); + __m128i le = ~_mm_cmpgt_epi16(reverseB, reverseMax); + } + __m128i inRange = _mm_and_si128(le, ge); + + // Not considered in range a is invalid here. + short aValidHere = (cast(short8)aValid).array[pos+1]; + __m128i mmAValidHere = _mm_set1_epi16(aValidHere); + inRange = _mm_and_si128(inRange, mmAValidHere); + + R = _mm_or_si128(R, inRange); + } + } + else // 8-bits + { + for (int pos = 0; pos < 16; pos += 2) + { + byte min = (cast(byte16)a).array[pos]; + byte max = (cast(byte16)a).array[pos+1]; + static if (signed) + { + __m128i ge = _mm_xor_si128(_mm_cmplt_epi8(b, _mm_set1_epi8(min))); + __m128i le = _mm_xor_si128(_mm_cmpgt_epi8(b, _mm_set1_epi8(max))); + } + else + { + // No SSE way to do 16-bit unsigned comparisons, + // but flipping the sign bit let us used signed comp + __m128i firstBits = _mm_set1_epi8(-128); + __m128i reverseB = _mm_xor_si128(b, firstBits); + __m128i reverseMin = _mm_xor_si128(_mm_set1_epi8(min), firstBits); + __m128i reverseMax = _mm_xor_si128(_mm_set1_epi8(max), firstBits); + __m128i ge = ~_mm_cmplt_epi8(reverseB, reverseMin); + __m128i le = ~_mm_cmpgt_epi8(reverseB, reverseMax); + } + __m128i inRange = _mm_and_si128(le, ge); + + // Not considered in range a is invalid here. + byte aValidHere = (cast(byte16)aValid).array[pos+1]; + __m128i mmAValidHere = _mm_set1_epi8(aValidHere); + inRange = _mm_and_si128(inRange, mmAValidHere); + + R = _mm_or_si128(R, inRange); + } + } + // invalid b part is not in range + R = _mm_and_si128(R, bValid); + } + else static if (Mode == 2) // equal each, just 16 comparisons not 256 + { + static if (chars16Bits) + { + __m128i R = _mm_cmpeq_epi16(a, b); + } + else + { + __m128i R = _mm_cmpeq_epi8(a, b); + } + + // if only a or b is invalid, consider not equal + R = _mm_andnot_si128(_mm_xor_si128(aValid, bValid), R); + + // if a and b are both invalid, consider equal + R = _mm_or_si128(R, ~_mm_or_si128(aValid, bValid)); + } + else static if (Mode == 3) // equal ordered + { + // a is searched in b. + + __m128i bValidShift = bValid; + + __m128i R = _mm_set1_epi32(-1); // all b positions possible for containing a + static if (chars16Bits) + { + for (int pos = 0; pos < 8; ++pos) + { + // compare character k of a, where can it go in b? 
+ short charK = (cast(short8)a).array[pos]; + __m128i mmcharK = _mm_set1_epi16(charK); + + short aValidHere = (cast(short8)aValid).array[pos]; + __m128i mmAValidHere = _mm_set1_epi16(aValidHere); + __m128i mmAInvalidHere = _mm_xor_si128(mmAValidHere, _mm_set1_epi32(-1)); + __m128i eqMask = _mm_cmpeq_epi16(mmcharK, b); + + // Where A is invalid, the comparison always holds "equal" + eqMask = _mm_or_si128(eqMask, mmAInvalidHere); + + // Where B is invalid, and A is valid, the comparison is forced to false + eqMask = _mm_and_si128(eqMask, _mm_or_si128(bValidShift, mmAInvalidHere)); + + R = _mm_and_si128(eqMask); + + // drop first char of b + b = _mm_srli_si128!2(b); + bValidShift = _mm_srli_si128!2(bValidShift); + } + } + else + { + for (int pos = 0; pos < 16; ++pos) + { + // compare character k of a, where can it go in b? + byte charK = (cast(byte16)a).array[pos]; + __m128i mmcharK = _mm_set1_epi8(charK); + + byte aValidHere = (cast(byte16)aValid).array[pos]; + __m128i mmAValidHere = _mm_set1_epi8(aValidHere); + __m128i mmAInvalidHere = _mm_xor_si128(mmAValidHere, _mm_set1_epi32(-1)); + __m128i eqMask = _mm_cmpeq_epi8(mmcharK, b); + + // Where A is invalid, the comparison always holds "equal" + eqMask = _mm_or_si128(eqMask, mmAInvalidHere); + + // Where B is invalid, and A is valid, the comparison is forced to false + eqMask = _mm_and_si128(eqMask, _mm_or_si128(bValidShift, mmAInvalidHere)); + + R = _mm_and_si128(R, eqMask); + + // drop first char of b + b = _mm_srli_si128!1(b); + bValidShift = _mm_srli_si128!1(bValidShift); + } + } + } + else + static assert(0); + + // Optionally negate result + static if (imm8 & _SIDD_NEGATIVE_POLARITY) + { + static if (imm8 & _SIDD_MASKED_POSITIVE_POLARITY) + { + R = _mm_xor_si128(R, bValid); // only negate valid b + } + else + { + R = _mm_xor_si128(R, _mm_set1_epi32(-1)); // negate all + } + } + return R; +} \ No newline at end of file diff --git a/external/inteli/package.d b/external/inteli/package.d new file mode 100644 index 0000000..5b6d922 --- /dev/null +++ b/external/inteli/package.d @@ -0,0 +1,25 @@ +/** +* Public API. You can `import inteli;` if want access to all intrinsics, under any circumstances. +* That's the what intel-intrinsics enables. +* +* Copyright: Copyright Guillaume Piolat 2016-2020. +* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) +*/ +module inteli; + +// Importing with `import inteli;` simply imports all available intrinsics. +public import inteli.types; +public import inteli.mmx; // MMX +public import inteli.emmintrin; // SSE +public import inteli.xmmintrin; // SSE2 +public import inteli.pmmintrin; // SSE3 +public import inteli.tmmintrin; // SSSE3 +public import inteli.smmintrin; // SSE4.1 +public import inteli.nmmintrin; // SSE4.2 +public import inteli.shaintrin; // SHA +public import inteli.bmi2intrin; // BMI2 +public import inteli.avxintrin; // AVX +public import inteli.avx2intrin; // AVX2 + +public import inteli.math; // Bonus + diff --git a/external/inteli/pmmintrin.d b/external/inteli/pmmintrin.d new file mode 100644 index 0000000..e76af16 --- /dev/null +++ b/external/inteli/pmmintrin.d @@ -0,0 +1,294 @@ +/** +* SSE3 intrinsics. +* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE3 +* +* Copyright: Guillaume Piolat 2016-2020. +* Charles Gregory 2019. 
+* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) +*/ +module inteli.pmmintrin; + +public import inteli.types; +import inteli.internals; +public import inteli.emmintrin; + + +// Note: this header will work whether you have SSE3 enabled or not. +// With LDC, use "dflags-ldc": ["-mattr=+sse3"] or equivalent to actively +// generate SSE3 instruction (they are often enabled with -O1 or greater). +// With GDC, use "dflags-gdc": ["-msse3"] or equivalent to generate SSE3 instructions. + + +nothrow @nogc: + +/// Alternatively add and subtract packed double-precision (64-bit) +/// floating-point elements in `a` to/from packed elements in `b`. +__m128d _mm_addsub_pd (__m128d a, __m128d b) pure @trusted +{ + static if (DMD_with_DSIMD_and_SSE3) + { + return cast(__m128d) __simd(XMM.ADDSUBPD, cast(void16)a, cast(void16)b); + } + else static if (GDC_with_SSE3) + { + return __builtin_ia32_addsubpd(a, b); + } + else static if (LDC_with_SSE3) + { + return __builtin_ia32_addsubpd(a, b); + } + else + { + // ARM: well optimized starting with LDC 1.18.0 -O2, not disrupted by LLVM 13+ + a.ptr[0] = a.array[0] - b.array[0]; + a.ptr[1] = a.array[1] + b.array[1]; + return a; + } +} +unittest +{ + auto v1 =_mm_setr_pd(1.0,2.0); + auto v2 =_mm_setr_pd(1.0,2.0); + assert(_mm_addsub_pd(v1,v2).array == _mm_setr_pd(0.0,4.0).array); +} + +/// Alternatively add and subtract packed single-precision (32-bit) +/// floating-point elements in `a` to/from packed elements in `b`. +float4 _mm_addsub_ps (float4 a, float4 b) pure @trusted +{ + static if (DMD_with_DSIMD_and_SSE3) + { + return cast(__m128) __simd(XMM.ADDSUBPS, cast(void16)a, cast(void16)b); + } + else static if (GDC_with_SSE3) + { + return __builtin_ia32_addsubps(a, b); + } + else static if (LDC_with_SSE3) + { + return __builtin_ia32_addsubps(a, b); + } + else + { + a.ptr[0] -= b.array[0]; + a.ptr[1] += b.array[1]; + a.ptr[2] -= b.array[2]; + a.ptr[3] += b.array[3]; + return a; + } +} +unittest +{ + auto v1 =_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); + auto v2 =_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); + assert( _mm_addsub_ps(v1,v2).array == _mm_setr_ps(0.0f, 4.0f, 0.0f, 8.0f).array ); +} + + +/// Horizontally add adjacent pairs of double-precision (64-bit) +/// floating-point elements in `a` and `b`. +__m128d _mm_hadd_pd (__m128d a, __m128d b) pure @trusted +{ + // PERF: ARM64? + static if (DMD_with_DSIMD_and_SSE3) + { + return cast(__m128d) __simd(XMM.HADDPD, cast(void16)a, cast(void16)b); + } + else static if (GDC_or_LDC_with_SSE3) + { + return __builtin_ia32_haddpd(a, b); + } + else + { + __m128d res; + res.ptr[0] = a.array[1] + a.array[0]; + res.ptr[1] = b.array[1] + b.array[0]; + return res; + } +} +unittest +{ + auto A =_mm_setr_pd(1.5, 2.0); + auto B =_mm_setr_pd(1.0, 2.0); + assert( _mm_hadd_pd(A, B).array ==_mm_setr_pd(3.5, 3.0).array ); +} + +/// Horizontally add adjacent pairs of single-precision (32-bit) +/// floating-point elements in `a` and `b`. 
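+/// In terms of zero-based lanes, the result is `[a1+a0, a3+a2, b1+b0, b3+b2]`.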
+__m128 _mm_hadd_ps (__m128 a, __m128 b) pure @trusted +{ + static if (DMD_with_DSIMD_and_SSE3) + { + return cast(__m128) __simd(XMM.HADDPS, cast(void16)a, cast(void16)b); + } + else static if (GDC_or_LDC_with_SSE3) + { + return __builtin_ia32_haddps(a, b); + } + else static if (LDC_with_ARM64) + { + return vpaddq_f32(a, b); + } + else + { + __m128 res; + res.ptr[0] = a.array[1] + a.array[0]; + res.ptr[1] = a.array[3] + a.array[2]; + res.ptr[2] = b.array[1] + b.array[0]; + res.ptr[3] = b.array[3] + b.array[2]; + return res; + } +} +unittest +{ + __m128 A =_mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f); + __m128 B =_mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f); + assert( _mm_hadd_ps(A, B).array == _mm_setr_ps(3.0f, 8.0f, 3.5f, 7.5f).array ); +} + +/// Horizontally subtract adjacent pairs of double-precision (64-bit) +/// floating-point elements in `a` and `b`. +__m128d _mm_hsub_pd (__m128d a, __m128d b) pure @trusted +{ + static if (DMD_with_DSIMD_and_SSE3) + { + return cast(__m128d) __simd(XMM.HSUBPD, cast(void16)a, cast(void16)b); + } + else static if (GDC_or_LDC_with_SSE3) + { + return __builtin_ia32_hsubpd(a, b); + } + else + { + // yep, sounds optimal for ARM64 too. Strangely enough. + __m128d res; + res.ptr[0] = a.array[0] - a.array[1]; + res.ptr[1] = b.array[0] - b.array[1]; + return res; + } +} +unittest +{ + auto A =_mm_setr_pd(1.5, 2.0); + auto B =_mm_setr_pd(1.0, 2.0); + assert( _mm_hsub_pd(A, B).array ==_mm_setr_pd(-0.5, -1.0).array ); +} + +/// Horizontally subtract adjacent pairs of single-precision (32-bit) +/// floating-point elements in `a` and `b`. +__m128 _mm_hsub_ps (__m128 a, __m128 b) pure @trusted +{ + static if (DMD_with_DSIMD_and_SSE3) + { + return cast(__m128) __simd(XMM.HSUBPS, cast(void16)a, cast(void16)b); + } + else static if (GDC_or_LDC_with_SSE3) + { + return __builtin_ia32_hsubps(a, b); + } + else static if (LDC_with_ARM64) + { + int4 mask = [0, 0x80000000, 0, 0x80000000]; + a = cast(__m128)(cast(int4)a ^ mask); + b = cast(__m128)(cast(int4)b ^ mask); + return vpaddq_f32(a, b); + } + else + { + __m128 res; + res.ptr[0] = a.array[0] - a.array[1]; + res.ptr[1] = a.array[2] - a.array[3]; + res.ptr[2] = b.array[0] - b.array[1]; + res.ptr[3] = b.array[2] - b.array[3]; + return res; + } +} +unittest +{ + __m128 A =_mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f); + __m128 B =_mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f); + assert(_mm_hsub_ps(A, B).array == _mm_setr_ps(-1.0f, -2.0f, -0.5f, -0.5f).array); +} + +/// Load 128-bits of integer data from unaligned memory. +// Note: The saying is LDDQU was only ever useful around 2008 +// See_also: https://stackoverflow.com/questions/38370622/a-faster-integer-sse-unalligned-load-thats-rarely-used +alias _mm_lddqu_si128 = _mm_loadu_si128; + +/// Load a double-precision (64-bit) floating-point element from memory into both elements of result. +__m128d _mm_loaddup_pd (const(double)* mem_addr) pure @trusted +{ + // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3 + // Same for GDC with -O1 + double value = *mem_addr; + __m128d res; + res.ptr[0] = value; + res.ptr[1] = value; + return res; +} +unittest +{ + double a = 7.5; + __m128d A = _mm_loaddup_pd(&a); + double[2] correct = [7.5, 7.5]; + assert(A.array == correct); +} + +/// Duplicate the low double-precision (64-bit) floating-point element from `a`. 
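+/// The result is `[a0, a0]`; on x86 this typically lowers to `movddup` (see the note
+/// in the body).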
+__m128d _mm_movedup_pd (__m128d a) pure @trusted +{ + // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3 + // Something efficient with -01 for GDC + a.ptr[1] = a.array[0]; + return a; +} +unittest +{ + __m128d A = _mm_setr_pd(7.0, 2.5); + assert(_mm_movedup_pd(A).array == _mm_set_pd(7.0, 7.0).array); +} + +/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from `a`. +__m128 _mm_movehdup_ps (__m128 a) pure @trusted +{ + static if (GDC_with_SSE3) + { + return __builtin_ia32_movshdup (a); + } + else + { + // Generates movshdup since LDC 1.3 with -O1 -mattr=+sse3 + a.ptr[0] = a.array[1]; + a.ptr[2] = a.array[3]; + return a; + } + +} +unittest +{ + __m128 A = _mm_movehdup_ps(_mm_setr_ps(1, 2, 3, 4)); + float[4] correct = [2.0f, 2, 4, 4 ]; + assert(A.array == correct); +} + +/// Duplicate even-indexed single-precision (32-bit) floating-point elements from `a`. +__m128 _mm_moveldup_ps (__m128 a) pure @trusted +{ + static if (GDC_with_SSE3) + { + return __builtin_ia32_movsldup (a); + } + else + { + // Generates movsldup since LDC 1.3 with -O1 -mattr=+sse3 + a.ptr[1] = a.array[0]; + a.ptr[3] = a.array[2]; + return a; + } +} +unittest +{ + __m128 A = _mm_moveldup_ps(_mm_setr_ps(1, 2, 3, 4)); + float[4] correct = [1.0f, 1, 3, 3 ]; + assert(A.array == correct); +} \ No newline at end of file diff --git a/external/inteli/shaintrin.d b/external/inteli/shaintrin.d new file mode 100644 index 0000000..227873e --- /dev/null +++ b/external/inteli/shaintrin.d @@ -0,0 +1,268 @@ +/** +* SHA intrinsics. +* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=SHA +* +* Copyright: Guillaume Piolat 2021. +* Johan Engelen 2021. +* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) +*/ +module inteli.shaintrin; + +// SHA instructions +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#othertechs=SHA +// Note: this header will work whether you have SHA enabled or not. +// With LDC, use "dflags-ldc": ["-mattr=+sha"] or equivalent to actively +// generate SHA instructions. +// With GDC, use "dflags-gdc": ["-msha"] or equivalent to generate SHA instructions. + +public import inteli.types; +import inteli.internals; + + + +nothrow @nogc: + +/+ +/// Perform an intermediate calculation for the next four SHA1 message values (unsigned 32-bit integers) using previous message values from a and b, and store the result in dst. +__m128i _mm_sha1nexte_epu32(__m128i a, __m128i b) @trusted +{ + static if (SHA_builtins) + { + return __builtin_ia32_sha1nexte(cast(int4) a, cast(int4) b); + } + else + { + assert(0); + } +} +unittest +{ +} ++/ + +/+ +/// Perform the final calculation for the next four SHA1 message values (unsigned 32-bit integers) using the intermediate result in a and the previous message values in b, and store the result in dst. +__m128i _mm_sha1msg1_epu32(__m128i a, __m128i b) @trusted +{ + static if (SHA_builtins) + { + return __builtin_ia32_sha1msg1(cast(int4) a, cast(int4) b); + } + else + { + assert(0); + } +} +unittest +{ +} ++/ + +/+ +/// Calculate SHA1 state variable E after four rounds of operation from the current SHA1 state variable a, add that value to the scheduled values (unsigned 32-bit integers) in b, and store the result in dst. 
+__m128i _mm_sha1msg2_epu32(__m128i a, __m128i b) @trusted
+{
+    static if (SHA_builtins)
+    {
+        return __builtin_ia32_sha1msg2(cast(int4) a, cast(int4) b);
+    }
+    else
+    {
+        assert(0);
+    }
+}
+unittest
+{
+}
++/
+
+/+
+/// Perform four rounds of SHA1 operation using an initial SHA1 state (A,B,C,D) from a and some pre-computed sum of the next 4 round message values (unsigned 32-bit integers), and state variable E from b, and store the updated SHA1 state (A,B,C,D) in dst. func contains the logic functions and round constants.
+__m128i _mm_sha1rnds4_epu32(__m128i a, __m128i b, const int func) @trusted
+{
+    static if (SHA_builtins)
+    {
+        return __builtin_ia32_sha1rnds4(cast(int4) a, cast(int4) b, func);
+    }
+    else
+    {
+        assert(0);
+    }
+
+}
++/
+
+/// Perform an intermediate calculation for the next four SHA256 message values (unsigned 32-bit integers) using previous message values from `a` and `b`, and return the result.
+__m128i _mm_sha256msg1_epu32(__m128i a, __m128i b) @trusted
+{
+    static if (GDC_or_LDC_with_SHA)
+    {
+        return __builtin_ia32_sha256msg1(cast(int4) a, cast(int4) b);
+    }
+    else
+    {
+        static uint sigma0(uint x) nothrow @nogc @safe
+        {
+            return bitwiseRotateRight_uint(x, 7) ^ bitwiseRotateRight_uint(x, 18) ^ x >> 3;
+        }
+
+        int4 dst;
+        int4 a4 = cast(int4) a;
+        int4 b4 = cast(int4) b;
+        uint W4 = b4.array[0];
+        uint W3 = a4.array[3];
+        uint W2 = a4.array[2];
+        uint W1 = a4.array[1];
+        uint W0 = a4.array[0];
+        dst.ptr[3] = W3 + sigma0(W4);
+        dst.ptr[2] = W2 + sigma0(W3);
+        dst.ptr[1] = W1 + sigma0(W2);
+        dst.ptr[0] = W0 + sigma0(W1);
+        return cast(__m128i) dst;
+    }
+}
+unittest
+{
+    __m128i a = [15, 20, 130, 12345];
+    __m128i b = [15, 20, 130, 12345];
+    __m128i result = _mm_sha256msg1_epu32(a, b);
+    assert(result.array == [671416337, 69238821, 2114864873, 503574586]);
+}
+
+/// Perform the final calculation for the next four SHA256 message values (unsigned 32-bit integers) using previous message values from `a` and `b`, and return the result.
+__m128i _mm_sha256msg2_epu32(__m128i a, __m128i b) @trusted
+{
+    static if (GDC_or_LDC_with_SHA)
+    {
+        return __builtin_ia32_sha256msg2(cast(int4) a, cast(int4) b);
+    }
+    else
+    {
+        static uint sigma1(uint x) nothrow @nogc @safe
+        {
+            return bitwiseRotateRight_uint(x, 17) ^ bitwiseRotateRight_uint(x, 19) ^ x >> 10;
+        }
+
+        int4 dst;
+        int4 a4 = cast(int4) a;
+        int4 b4 = cast(int4) b;
+        uint W14 = b4.array[2];
+        uint W15 = b4.array[3];
+        uint W16 = a4.array[0] + sigma1(W14);
+        uint W17 = a4.array[1] + sigma1(W15);
+        uint W18 = a4.array[2] + sigma1(W16);
+        uint W19 = a4.array[3] + sigma1(W17);
+        dst.ptr[3] = W19;
+        dst.ptr[2] = W18;
+        dst.ptr[1] = W17;
+        dst.ptr[0] = W16;
+        return cast(__m128i) dst;
+    }
+}
+unittest
+{
+    __m128i a = [15, 20, 130, 12345];
+    __m128i b = [15, 20, 130, 12345];
+    __m128i result = _mm_sha256msg2_epu32(a, b);
+    assert(result.array == [5324815, 505126944, -2012842764, -1542210977]);
+}
+
+/// Perform 2 rounds of SHA256 operation using an initial SHA256 state (C,D,G,H) from `a`, an initial SHA256 state (A,B,E,F) from `b`, and a pre-computed sum of the next 2 round message values (unsigned 32-bit integers) and the corresponding round constants from `k`, and return the updated SHA256 state (A,B,E,F).
+__m128i _mm_sha256rnds2_epu32(__m128i a, __m128i b, __m128i k) @trusted +{ + // TODO: the pragma(inline) false prevent a DMD 1.100 + // regression in Linux + x86_64 + -b release-unittest, report that + + version(DigitalMars) + { + enum bool workaround = true; + } + else + { + enum bool workaround = false; + } + + static if (GDC_or_LDC_with_SHA) + { + return __builtin_ia32_sha256rnds2(cast(int4) a, cast(int4) b, cast(int4) k); + } + else + { + static uint Ch(uint x, uint y, uint z) nothrow @nogc @safe + { + static if (workaround) pragma (inline, false); + return z ^ (x & (y ^ z)); + } + + static uint Maj(uint x, uint y, uint z) nothrow @nogc @safe + { + static if (workaround) pragma (inline, false); + return (x & y) | (z & (x ^ y)); + } + + static uint sum0(uint x) nothrow @nogc @safe + { + static if (workaround) pragma (inline, false); + return bitwiseRotateRight_uint(x, 2) ^ bitwiseRotateRight_uint(x, 13) ^ bitwiseRotateRight_uint(x, 22); + } + + static uint sum1(uint x) nothrow @nogc @safe + { + static if (workaround) pragma (inline, false); + return bitwiseRotateRight_uint(x, 6) ^ bitwiseRotateRight_uint(x, 11) ^ bitwiseRotateRight_uint(x, 25); + } + + int4 dst; + int4 a4 = cast(int4) a; + int4 b4 = cast(int4) b; + int4 k4 = cast(int4) k; + + const A0 = b4.array[3]; + const B0 = b4.array[2]; + const C0 = a4.array[3]; + const D0 = a4.array[2]; + const E0 = b4.array[1]; + const F0 = b4.array[0]; + const G0 = a4.array[1]; + const H0 = a4.array[0]; + const W_K0 = k4.array[0]; + const W_K1 = k4.array[1]; + const A1 = Ch(E0, F0, G0) + sum1(E0) + W_K0 + H0 + Maj(A0, B0, C0) + sum0(A0); + const B1 = A0; + const C1 = B0; + const D1 = C0; + const E1 = Ch(E0, F0, G0) + sum1(E0) + W_K0 + H0 + D0; + const F1 = E0; + const G1 = F0; + const H1 = G0; + const A2 = Ch(E1, F1, G1) + sum1(E1) + W_K1 + H1 + Maj(A1, B1, C1) + sum0(A1); + const B2 = A1; + const C2 = B1; + const D2 = C1; + const E2 = Ch(E1, F1, G1) + sum1(E1) + W_K1 + H1 + D1; + const F2 = E1; + const G2 = F1; + const H2 = G1; + + dst.ptr[3] = A2; + dst.ptr[2] = B2; + dst.ptr[1] = E2; + dst.ptr[0] = F2; + + return cast(__m128i) dst; + } +} +unittest +{ + __m128i a = [15, 20, 130, 12345]; + __m128i b = [15, 20, 130, 12345]; + __m128i k = [15, 20, 130, 12345]; + __m128i result = _mm_sha256rnds2_epu32(a, b, k); + assert(result.array == [1384123044, -2050674062, 327754346, 956342016]); +} + +private uint bitwiseRotateRight_uint(const uint value, const uint count) @safe +{ + assert(count < 8 * uint.sizeof); + return cast(uint) ((value >> count) | (value << (uint.sizeof * 8 - count))); +} \ No newline at end of file diff --git a/external/inteli/smmintrin.d b/external/inteli/smmintrin.d new file mode 100644 index 0000000..16fdb47 --- /dev/null +++ b/external/inteli/smmintrin.d @@ -0,0 +1,2215 @@ +/** +* SSE4.1 intrinsics. +* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_1 +* +* Copyright: Guillaume Piolat 2021. +* Johan Engelen 2021. +* cet 2024. +* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) +*/ +module inteli.smmintrin; + +// SSE4.1 instructions +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1 +// Note: this header will work whether you have SSE4.1 enabled or not. +// With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively +// generate SSE4.1 instructions. +// With GDC, use "dflags-gdc": ["-msse4.1"] or equivalent to generate SSE4.1 instructions. 
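+// As an illustration (an assumed consumer configuration, not part of this repository's
+// dub.json), a package depending on this module could request SSE4.1 code generation
+// with build settings such as:
+//     "dflags-ldc": ["-mattr=+sse4.1"],
+//     "dflags-gdc": ["-msse4.1"]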
+ +public import inteli.types; +import inteli.internals; + +// smmintrin pulls in all previous instruction set intrinsics. +public import inteli.tmmintrin; + +nothrow @nogc: + +enum int _MM_FROUND_TO_NEAREST_INT = 0x00; /// SSE4.1 rounding modes +enum int _MM_FROUND_TO_NEG_INF = 0x01; /// ditto +enum int _MM_FROUND_TO_POS_INF = 0x02; /// ditto +enum int _MM_FROUND_TO_ZERO = 0x03; /// ditto +enum int _MM_FROUND_CUR_DIRECTION = 0x04; /// ditto +enum int _MM_FROUND_RAISE_EXC = 0x00; /// ditto +enum int _MM_FROUND_NO_EXC = 0x08; /// ditto + +enum int _MM_FROUND_NINT = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT); +enum int _MM_FROUND_FLOOR = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF); +enum int _MM_FROUND_CEIL = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF); +enum int _MM_FROUND_TRUNC = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO); +enum int _MM_FROUND_RINT = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION); +enum int _MM_FROUND_NEARBYINT = (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION); + +/// Add packed signed 32-bit integers in `a` and `b` using saturation. +/// #BONUS +__m128i _mm_adds_epi32(__m128i a, __m128i b) pure +{ + // PERF: ARM64 should use 2x vqadd_s32 + static if (LDC_with_saturated_intrinsics) + return cast(__m128i)inteli_llvm_adds!int4(cast(int4)a, cast(int4)b); + else + { + __m128i int_max = _mm_set1_epi32(0x7FFFFFFF); + __m128i res = _mm_add_epi32(a, b); + __m128i sign_bit = _mm_srli_epi32(a, 31); + __m128i sign_xor = _mm_xor_si128(a, b); + __m128i overflow = _mm_andnot_si128(sign_xor, _mm_xor_si128(a, res)); + __m128i saturated = _mm_add_epi32(int_max, sign_bit); + return cast(__m128i) _mm_blendv_ps(cast(__m128)res, + cast(__m128)saturated, + cast(__m128)overflow); + } +} +unittest +{ + __m128i a = _mm_setr_epi32(int.max, 1, 2, int.min); + __m128i b = _mm_setr_epi32(1, 2, 3, -4); + assert(_mm_adds_epi32(a, b).array == [int.max, 3, 5, int.min]); +} + +/// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`, and store the results. +// Note: changed signature, GDC needs a compile-time value for imm8. +__m128i _mm_blend_epi16(int imm8)(__m128i a, __m128i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + pragma(inline, true); // else wouldn't inline in _mm256_blend_epi16 + return cast(__m128i) __builtin_ia32_pblendw128(cast(short8)a, cast(short8)b, imm8); + } + else + { + // LDC x86 This generates pblendw since LDC 1.1 and -O2 + short8 r; + short8 sa = cast(short8)a; + short8 sb = cast(short8)b; + for (int n = 0; n < 8; ++n) + { + r.ptr[n] = (imm8 & (1 << n)) ? sb.array[n] : sa.array[n]; + } + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); + __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15); + short8 C = cast(short8) _mm_blend_epi16!147(A, B); // 10010011 + short[8] correct = [8, 9, 2, 3, 12, 5, 6, 15]; + assert(C.array == correct); +} + + +/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm8`. +// Note: changed signature, GDC needs a compile-time value for `imm8`. +__m128d _mm_blend_pd(int imm8)(__m128d a, __m128d b) @trusted +{ + static assert(imm8 >= 0 && imm8 < 4); + // PERF DMD + static if (GDC_with_SSE41) + { + return cast(double2) __builtin_ia32_blendpd(cast(double2)a, cast(double2)b, imm8); + } + else + { + // LDC x86: blendpd since LDC 1.1 -02, uses blendps after LDC 1.12 + double2 r; + for (int n = 0; n < 2; ++n) + { + r.ptr[n] = (imm8 & (1 << n)) ? 
b.array[n] : a.array[n]; + } + return cast(__m128d)r; + } +} +unittest +{ + __m128d A = _mm_setr_pd(0, 1); + __m128d B = _mm_setr_pd(8, 9); + double2 C = _mm_blend_pd!2(A, B); + double[2] correct = [0, 9]; + assert(C.array == correct); +} + + +/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control +/// mask `imm8`. +// Note: changed signature, GDC needs a compile-time value for imm8. +__m128 _mm_blend_ps(int imm8)(__m128 a, __m128 b) pure @trusted +{ + // PERF DMD + static assert(imm8 >= 0 && imm8 < 16); + static if (GDC_with_SSE41) + { + return __builtin_ia32_blendps(a, b, imm8); + } + else version(LDC) + { + // LDC x86: generates blendps since LDC 1.1 -O2 + // arm64: pretty good, two instructions worst case + return shufflevectorLDC!(float4, (imm8 & 1) ? 4 : 0, + (imm8 & 2) ? 5 : 1, + (imm8 & 4) ? 6 : 2, + (imm8 & 8) ? 7 : 3)(a, b); + } + else + { + // PERF GDC without SSE4.1 is quite bad + __m128 r; + for (int n = 0; n < 4; ++n) + { + r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n]; + } + return r; + } +} +unittest +{ + __m128 A = _mm_setr_ps(0, 1, 2, 3); + __m128 B = _mm_setr_ps(8, 9, 10, 11); + float4 C = cast(float4) _mm_blend_ps!13(A, B); // 1101 + float[4] correct = [8, 1, 10, 11]; + assert(C.array == correct); +} + +/// Blend packed 8-bit integers from `a` and `b` using `mask`. +/// Select from `b` if the high-order bit of the corresponding 8-bit element in `mask` is set, else select from `a`. +__m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) pure @trusted +{ + // PERF DMD + /*static if (GDC_with_SSE41) + { + // This intrinsic do nothing in GDC 12. + // TODO report to GDC. No problem in GCC. + return cast(__m128i) __builtin_ia32_pblendvb128 (cast(ubyte16)a, cast(ubyte16)b, cast(ubyte16)mask); + } + else*/ + static if (LDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_pblendvb(cast(byte16)a, cast(byte16)b, cast(byte16)mask); + } + else static if (LDC_with_ARM64) + { + // LDC arm64: two instructions since LDC 1.12 -O2 + byte16 maskSX = vshrq_n_s8(cast(byte16)mask, 7); + return cast(__m128i) vbslq_s8(maskSX, cast(byte16)b, cast(byte16)a); + } + else + { + __m128i m = _mm_cmpgt_epi8(_mm_setzero_si128(), mask); + return _mm_xor_si128(_mm_subs_epu8(_mm_xor_si128(a, b), m), b); + } +} +unittest +{ + __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31); + __m128i M = _mm_setr_epi8( 1, -1, 1, 1, -4, 1, -8, 127, + 1, 1, -1, -1, 4, 1, 8, -128); + byte16 R = cast(byte16) _mm_blendv_epi8(A, B, M); + byte[16] correct = [ 0, 17, 2, 3, 20, 5, 22, 7, + 8, 9, 26, 27, 12, 13, 14, 31 ]; + assert(R.array == correct); +} + + +/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`. +__m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted +{ + // PERF DMD + static if (GDC_with_SSE42) + { + // PERF Amazingly enough, GCC/GDC generates the blendvpd instruction + // with -msse4.2 but not -msse4.1. + // Not sure what is the reason, and there is a replacement sequence. + // Sounds like a bug. 
+ return __builtin_ia32_blendvpd(a, b, mask); + } + else static if (LDC_with_SSE41) + { + return __builtin_ia32_blendvpd(a, b, mask); + } + else static if (LDC_with_ARM64) + { + long2 shift; + shift = 63; + long2 lmask = cast(long2)mask >> shift; + return cast(__m128d) vbslq_s64(lmask, cast(long2)b, cast(long2)a); + } + else + { + __m128d r; // PERF =void; + long2 lmask = cast(long2)mask; + for (int n = 0; n < 2; ++n) + { + r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n]; + } + return r; + } +} +unittest +{ + __m128d A = _mm_setr_pd(1.0, 2.0); + __m128d B = _mm_setr_pd(3.0, 4.0); + __m128d M1 = _mm_setr_pd(-3.0, 2.0); + __m128d R1 = _mm_blendv_pd(A, B, M1); + double[2] correct1 = [3.0, 2.0]; + assert(R1.array == correct1); + + // Note: wouldn't work with -double.nan, since in some AArch64 archs the NaN sign bit is lost + // See Issue #78 + __m128d M2 = _mm_setr_pd(double.nan, double.infinity); + __m128d R2 = _mm_blendv_pd(A, B, M2); + double[2] correct2 = [1.0, 2.0]; + assert(R2.array == correct2); +} + + +/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`. +__m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) pure @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return __builtin_ia32_blendvps(a, b, mask); + } + else static if (LDC_with_SSE41) + { + return __builtin_ia32_blendvps(a, b, mask); + } + else static if (LDC_with_ARM64) + { + int4 shift; + shift = 31; + int4 lmask = cast(int4)mask >> shift; + return cast(__m128) vbslq_s32(lmask, cast(int4)b, cast(int4)a); + } + else + { + // LDC x86_64: Compiles to 5 instr since LDC 1.27 -O2 + // If lack of optimization, consider replacing by: + // __m128i overflow_mask = _mm_srai_epi32(overflow, 31); + // return _mm_or_si128( + // _mm_and_si128(overflow_mask, saturated), + // _mm_andnot_si128(overflow_mask, res) + // LLVM makes almost the same sequence when optimized. + __m128 r; + int4 lmask = cast(int4)mask; + for (int n = 0; n < 4; ++n) + { + r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n]; + } + return r; + } +} +unittest +{ + __m128 A = _mm_setr_ps( 0.0f, 1.0f, 2.0f, 3.0f); + __m128 B = _mm_setr_ps( 4.0f, 5.0f, 6.0f, 7.0f); + __m128 M1 = _mm_setr_ps(-3.0f, 2.0f, 1.0f, -10000.0f); + __m128 M2 = _mm_setr_ps(float.nan, float.nan, -0.0f, +0.0f); + __m128 R1 = _mm_blendv_ps(A, B, M1); + __m128 R2 = _mm_blendv_ps(A, B, M2); + float[4] correct1 = [ 4.0f, 1.0f, 2.0f, 7.0f]; + float[4] correct2 = [ 0.0f, 1.0f, 6.0f, 3.0f]; + assert(R1.array == correct1); + + // Note: wouldn't work with -float.nan, since in some AArch64 archs the NaN sign bit is lost + // See Issue #78 + assert(R2.array == correct2); +} + +/// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer value, +/// and store the results as packed double-precision floating-point elements. 
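+///
+/// Example (illustrative values, distinct from the unittest below):
+/// ---
+/// __m128d v = _mm_setr_pd(0.25, -7.5);
+/// __m128d c = _mm_ceil_pd(v); // [1.0, -7.0]
+/// ---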
+__m128d _mm_ceil_pd (__m128d a) @trusted +{ + static if (LDC_with_ARM64) + { + // LDC arm64 acceptable since 1.8 -O2 + // Unfortunately x86 intrinsics force a round-trip back to double2 + // ARM neon semantics wouldn't have that + long2 l = vcvtpq_s64_f64(a); + double2 r; + r.ptr[0] = l.array[0]; + r.ptr[1] = l.array[1]; + return r; + } + else + { + return _mm_round_pd!2(a); + } +} +unittest +{ + __m128d A = _mm_setr_pd(1.3f, -2.12f); + __m128d B = _mm_setr_pd(53.6f, -2.7f); + A = _mm_ceil_pd(A); + B = _mm_ceil_pd(B); + double[2] correctA = [2.0, -2.0]; + double[2] correctB = [54.0, -2.0]; + assert(A.array == correctA); + assert(B.array == correctB); +} + +/// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer value, +/// and store the results as packed single-precision floating-point elements. +__m128 _mm_ceil_ps (__m128 a) @trusted +{ + static if (LDC_with_ARM64) + { + // LDC arm64 acceptable since 1.8 -O1 + int4 l = vcvtpq_s32_f32(a); + float4 r; + r.ptr[0] = l.array[0]; + r.ptr[1] = l.array[1]; + r.ptr[2] = l.array[2]; + r.ptr[3] = l.array[3]; + return r; + } + else + { + return _mm_round_ps!2(a); + } +} +unittest +{ + __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f); + __m128 C = _mm_ceil_ps(A); + float[4] correct = [2.0f, -2.0f, 54.0f, -2.0f]; + assert(C.array == correct); +} + +/// Round the lower double-precision (64-bit) floating-point element in `b` up to an integer value, +/// store the result as a double-precision floating-point element in the lower element of result, +/// and copy the upper element from `a` to the upper element of dst. +__m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted +{ + static if (LDC_with_ARM64) + { + a[0] = vcvtps_s64_f64(b[0]); + return a; + } + else + { + return _mm_round_sd!2(a, b); + } +} +unittest +{ + __m128d A = _mm_setr_pd(1.3, -2.12); + __m128d B = _mm_setr_pd(53.6, -3.7); + __m128d C = _mm_ceil_sd(A, B); + double[2] correct = [54.0, -2.12]; + assert(C.array == correct); +} + +/// Round the lower single-precision (32-bit) floating-point element in `b` up to an integer value, +/// store the result as a single-precision floating-point element in the lower element of result, +/// and copy the upper 3 packed elements from `a` to the upper elements of result. +__m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted +{ + static if (LDC_with_ARM64) + { + a[0] = vcvtps_s32_f32(b[0]); + return a; + } + else + { + return _mm_round_ss!2(a, b); + } +} +unittest +{ + __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f); + __m128 B = _mm_setr_ps(53.6f, -3.7f, 8.0f, 7.0f); + __m128 C = _mm_ceil_ss(A, B); + float[4] correct = [54.0f, -2.12f, -4.5f, 1.1f]; + assert(C.array == correct); +} + +/// Compare packed 64-bit integers in `a` and `b` for equality. +__m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted +{ + static if (SIMD_COMPARISON_MASKS_16B) + { + version(DigitalMars) + { + // DMD doesn't recognize long2 == long2 + long2 la = cast(long2)a; + long2 lb = cast(long2)b; + long2 res; + res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0; + res.ptr[1] = (la.array[1] == lb.array[1]) ? 
-1 : 0; + return cast(__m128i)res; + } + else + { + return cast(__m128i)(cast(long2)a == cast(long2)b); + } + } + else static if (GDC_with_SSE41) + { + return cast(__m128i)__builtin_ia32_pcmpeqq(cast(long2)a, cast(long2)b); + } + else version(LDC) + { + // LDC x86: generates pcmpeqq since LDC 1.1 -O1 + // arm64: generates cmeq since LDC 1.8 -O1 + return cast(__m128i) equalMask!long2(cast(long2)a, cast(long2)b); + } + else + { + // Clever pcmpeqd + pand use with LDC 1.24 -O2 + long2 la = cast(long2)a; + long2 lb = cast(long2)b; + long2 res; + res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0; + res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0; + return cast(__m128i)res; + } +} +unittest +{ + __m128i A = _mm_setr_epi64(-1, -2); + __m128i B = _mm_setr_epi64(-3, -2); + __m128i C = _mm_setr_epi64(-1, -4); + long2 AB = cast(long2) _mm_cmpeq_epi64(A, B); + long2 AC = cast(long2) _mm_cmpeq_epi64(A, C); + long[2] correct1 = [0, -1]; + long[2] correct2 = [-1, 0]; + assert(AB.array == correct1); + assert(AC.array == correct2); +} + + +/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers. +__m128i _mm_cvtepi16_epi32 (__m128i a) @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return cast(__m128i)__builtin_ia32_pmovsxwd128(cast(short8)a); + } + else static if (LDC_with_optimizations) + { + // LDC x86: Generates pmovsxwd since LDC 1.1 -O0, also good in arm64 + enum ir = ` + %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> + %r = sext <4 x i16> %v to <4 x i32> + ret <4 x i32> %r`; + return cast(__m128d) LDCInlineIR!(ir, int4, short8)(cast(short8)a); + } + else + { + short8 sa = cast(short8)a; + int4 r; + r.ptr[0] = sa.array[0]; + r.ptr[1] = sa.array[1]; + r.ptr[2] = sa.array[2]; + r.ptr[3] = sa.array[3]; + return r; + } +} +unittest +{ + __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0); + int4 C = cast(int4) _mm_cvtepi16_epi32(A); + int[4] correct = [-1, 0, -32768, 32767]; + assert(C.array == correct); +} + +/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers. +__m128i _mm_cvtepi16_epi64 (__m128i a) @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return cast(__m128i)__builtin_ia32_pmovsxwq128(cast(short8)a); + } + else static if (LDC_with_optimizations) + { + // LDC x86: Generates pmovsxwq since LDC 1.1 -O0, also good in arm64 + enum ir = ` + %v = shufflevector <8 x i16> %0,<8 x i16> %0, <2 x i32> + %r = sext <2 x i16> %v to <2 x i64> + ret <2 x i64> %r`; + return cast(__m128i) LDCInlineIR!(ir, long2, short8)(cast(short8)a); + } + else + { + short8 sa = cast(short8)a; + long2 r; + r.ptr[0] = sa.array[0]; + r.ptr[1] = sa.array[1]; + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi16(-32768, 32767, 0, 0, 0, 0, 0, 0); + long2 C = cast(long2) _mm_cvtepi16_epi64(A); + long[2] correct = [-32768, 32767]; + assert(C.array == correct); +} + +/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers. 
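+///
+/// Example (illustrative; only the two low 32-bit lanes of the input participate):
+/// ---
+/// __m128i v = _mm_setr_epi32(-1, 7, 123, 456);
+/// long2 w = cast(long2) _mm_cvtepi32_epi64(v); // [-1L, 7L], upper lanes ignored
+/// ---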
+__m128i _mm_cvtepi32_epi64 (__m128i a) @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return cast(__m128i)__builtin_ia32_pmovsxdq128(cast(int4)a); + } + else static if (LDC_with_optimizations) + { + // LDC x86: Generates pmovsxdq since LDC 1.1 -O0, also good in arm64 + enum ir = ` + %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> + %r = sext <2 x i32> %v to <2 x i64> + ret <2 x i64> %r`; + return cast(__m128i) LDCInlineIR!(ir, long2, int4)(cast(int4)a); + } + else + { + int4 sa = cast(int4)a; + long2 r; + r.ptr[0] = sa.array[0]; + r.ptr[1] = sa.array[1]; + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi32(-4, 42, 0, 0); + long2 C = cast(long2) _mm_cvtepi32_epi64(A); + long[2] correct = [-4, 42]; + assert(C.array == correct); +} + + +/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers. +__m128i _mm_cvtepi8_epi16 (__m128i a) pure @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + alias ubyte16 = __vector(ubyte[16]); + return cast(__m128i)__builtin_ia32_pmovsxbw128(cast(ubyte16)a); + } + else static if (LDC_with_optimizations) + { + // LDC x86: pmovsxbw generated since LDC 1.1.0 -O0 + // LDC ARM64: sshll generated since LDC 1.8.0 -O1 + enum ir = ` + %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> + %r = sext <8 x i8> %v to <8 x i16> + ret <8 x i16> %r`; + return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a); + } + else + { + byte16 sa = cast(byte16)a; + short8 r; + foreach(n; 0..8) + r.ptr[n] = sa.array[n]; + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); + short8 C = cast(short8) _mm_cvtepi8_epi16(A); + short[8] correct = [127, -128, 1, -1, 0, 2, -4, -8]; + assert(C.array == correct); +} + + +/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers. +__m128i _mm_cvtepi8_epi32 (__m128i a) @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + alias ubyte16 = __vector(ubyte[16]); + return cast(__m128i)__builtin_ia32_pmovsxbd128(cast(ubyte16)a); + } + else static if (LDC_with_SSE41 && LDC_with_optimizations) + { + // LDC x86: Generates pmovsxbd since LDC 1.1 -O0 + enum ir = ` + %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> + %r = sext <4 x i8> %v to <4 x i32> + ret <4 x i32> %r`; + return cast(__m128i) LDCInlineIR!(ir, int4, byte16)(cast(byte16)a); + } + else + { + // LDC ARM64: this gives the same codegen than a vmovl_s16/vmovl_s8 sequence would + byte16 sa = cast(byte16)a; + int4 r; + r.ptr[0] = sa.array[0]; + r.ptr[1] = sa.array[1]; + r.ptr[2] = sa.array[2]; + r.ptr[3] = sa.array[3]; + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); + int4 C = cast(int4) _mm_cvtepi8_epi32(A); + int[4] correct = [127, -128, 1, -1]; + assert(C.array == correct); +} + + +/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers. 
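+///
+/// Example (illustrative; only the two lowest bytes of `a` are widened):
+/// ---
+/// __m128i v = _mm_setr_epi8(-5, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+/// long2 w = cast(long2) _mm_cvtepi8_epi64(v); // [-5L, 9L]
+/// ---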
+__m128i _mm_cvtepi8_epi64 (__m128i a) @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + alias ubyte16 = __vector(ubyte[16]); + return cast(__m128i)__builtin_ia32_pmovsxbq128(cast(ubyte16)a); + } + else static if (LDC_with_optimizations) + { + // LDC x86: Generates pmovsxbq since LDC 1.1 -O0, + // LDC arm64: it's ok since LDC 1.8 -O1 + enum ir = ` + %v = shufflevector <16 x i8> %0,<16 x i8> %0, <2 x i32> + %r = sext <2 x i8> %v to <2 x i64> + ret <2 x i64> %r`; + return cast(__m128i) LDCInlineIR!(ir, long2, byte16)(cast(byte16)a); + } + else + { + byte16 sa = cast(byte16)a; + long2 r; + foreach(n; 0..2) + r.ptr[n] = sa.array[n]; + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); + long2 C = cast(long2) _mm_cvtepi8_epi64(A); + long[2] correct = [127, -128]; + assert(C.array == correct); +} + + +/// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit integers. +__m128i _mm_cvtepu16_epi32 (__m128i a) @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_pmovzxwd128(cast(short8)a); + } + else + { + // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1 + // arm64: ushll since LDC 1.12 -O1 + short8 sa = cast(short8)a; + int4 r; + r.ptr[0] = cast(ushort)sa.array[0]; + r.ptr[1] = cast(ushort)sa.array[1]; + r.ptr[2] = cast(ushort)sa.array[2]; + r.ptr[3] = cast(ushort)sa.array[3]; + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0); + int4 C = cast(int4) _mm_cvtepu16_epi32(A); + int[4] correct = [65535, 0, 32768, 32767]; + assert(C.array == correct); +} + + +/// Zero extend packed unsigned 16-bit integers in `a` to packed 64-bit integers. +__m128i _mm_cvtepu16_epi64 (__m128i a) @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_pmovzxwq128(cast(short8)a); + } + else static if (LDC_with_ARM64) + { + // LDC arm64: a bit shorter than below, in -O2 + short8 sa = cast(short8)a; + long2 r; + for(int n = 0; n < 2; ++n) + r.ptr[n] = cast(ushort)sa.array[n]; + return cast(__m128i)r; + } + else + { + // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1 + short8 sa = cast(short8)a; + long2 r; + r.ptr[0] = cast(ushort)sa.array[0]; + r.ptr[1] = cast(ushort)sa.array[1]; + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0); + long2 C = cast(long2) _mm_cvtepu16_epi64(A); + long[2] correct = [65535, 0]; + assert(C.array == correct); +} + + +/// Zero extend packed unsigned 32-bit integers in `a` to packed 64-bit integers. +__m128i _mm_cvtepu32_epi64 (__m128i a) @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_pmovzxdq128(cast(short8)a); + } + else + { + // LDC x86: generates pmovzxdq since LDC 1.12 -O1 also good without SSE4.1 + // arm64: generates ushll since LDC 1.12 -O1 + int4 sa = cast(int4)a; + long2 r; + r.ptr[0] = cast(uint)sa.array[0]; + r.ptr[1] = cast(uint)sa.array[1]; + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi32(-1, 42, 0, 0); + long2 C = cast(long2) _mm_cvtepu32_epi64(A); + long[2] correct = [4294967295, 42]; + assert(C.array == correct); +} + + +/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers. 
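+///
+/// Example (illustrative; note that byte -1 zero-extends to 255, not -1):
+/// ---
+/// __m128i v = _mm_setr_epi8(-1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+/// short8 w = cast(short8) _mm_cvtepu8_epi16(v); // [255, 2, 0, 0, 0, 0, 0, 0]
+/// ---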
+__m128i _mm_cvtepu8_epi16 (__m128i a) pure @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_pmovzxbw128(cast(ubyte16)a); + } + else static if (LDC_with_optimizations) + { + enum ir = ` + %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> + %r = zext <8 x i8> %v to <8 x i16> + ret <8 x i16> %r`; + return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a); + } + else + { + return _mm_unpacklo_epi8(a, _mm_setzero_si128()); + } +} +unittest +{ + __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); + short8 C = cast(short8) _mm_cvtepu8_epi16(A); + short[8] correct = [127, 128, 1, 255, 0, 2, 252, 248]; + assert(C.array == correct); +} + + +/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers. +__m128i _mm_cvtepu8_epi32 (__m128i a) @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + alias ubyte16 = __vector(ubyte[16]); + return cast(__m128i) __builtin_ia32_pmovzxbd128(cast(ubyte16)a); + } + else static if (LDC_with_ARM64) + { + // LDC arm64: a bit better than below in -O2 + byte16 sa = cast(byte16)a; + int4 r; + for(int n = 0; n < 4; ++n) + r.ptr[n] = cast(ubyte)sa.array[n]; + return cast(__m128i)r; + } + else + { + // LDC x86: generates pmovzxbd since LDC 1.12 -O1 also good without SSE4.1 + // PERF: catastrophic with GDC without SSE4.1 + byte16 sa = cast(byte16)a; + int4 r; + r.ptr[0] = cast(ubyte)sa.array[0]; + r.ptr[1] = cast(ubyte)sa.array[1]; + r.ptr[2] = cast(ubyte)sa.array[2]; + r.ptr[3] = cast(ubyte)sa.array[3]; + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); + int4 C = cast(int4) _mm_cvtepu8_epi32(A); + int[4] correct = [127, 128, 1, 255]; + assert(C.array == correct); +} + +/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers. +__m128i _mm_cvtepu8_epi64 (__m128i a) @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + alias ubyte16 = __vector(ubyte[16]); + return cast(__m128i)__builtin_ia32_pmovzxbq128(cast(ubyte16)a); + } + else static if (LDC_with_ARM64) + { + // LDC arm64: this optimizes better than the loop below + byte16 sa = cast(byte16)a; + long2 r; + for (int n = 0; n < 2; ++n) + r.ptr[n] = cast(ubyte)sa.array[n]; + return cast(__m128i)r; + } + else + { + // LDC x86: Generates pmovzxbq since LDC 1.1 -O0, a pshufb without SSE4.1 + byte16 sa = cast(byte16)a; + long2 r; + r.ptr[0] = cast(ubyte)sa.array[0]; + r.ptr[1] = cast(ubyte)sa.array[1]; + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi8(127, -2, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); + long2 C = cast(long2) _mm_cvtepu8_epi64(A); + long[2] correct = [127, 254]; + assert(C.array == correct); +} + +/// Conditionally multiply the packed double-precision (64-bit) floating-point elements +/// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, and conditionally +/// store the sum in dst using the low 4 bits of `imm8`. 
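+///
+/// Worked example (illustrative; for the double-precision variant only bits 4-5 of `imm8`
+/// select the products and bits 0-1 select the destination lanes):
+/// ---
+/// __m128d x = _mm_setr_pd(2.0, 3.0);
+/// __m128d y = _mm_setr_pd(10.0, 100.0);
+/// __m128d d = _mm_dp_pd!0x31(x, y); // 2*10 + 3*100 stored in lane 0 only: [320.0, 0.0]
+/// ---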
+__m128d _mm_dp_pd(int imm8)(__m128d a, __m128d b) @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return __builtin_ia32_dppd(a, b, imm8 & 0x33); + } + else static if (LDC_with_SSE41) + { + return __builtin_ia32_dppd(a, b, imm8 & 0x33); + } + else + { + __m128d zero = _mm_setzero_pd(); + __m128d temp = _mm_blend_pd!( (imm8 >>> 4) & 3)(zero, a * b); + double sum = temp.array[0] + temp.array[1]; + return _mm_blend_pd!(imm8 & 3)(zero, _mm_set1_pd(sum)); + } +} +unittest +{ + __m128d A = _mm_setr_pd(1.0, 2.0); + __m128d B = _mm_setr_pd(4.0, 8.0); + double2 R1 = _mm_dp_pd!(0x10 + 0x3 + 0x44)(A, B); + double2 R2 = _mm_dp_pd!(0x20 + 0x1 + 0x88)(A, B); + double2 R3 = _mm_dp_pd!(0x30 + 0x2 + 0x00)(A, B); + double[2] correct1 = [ 4.0, 4.0]; + double[2] correct2 = [16.0, 0.0]; + double[2] correct3 = [ 0.0, 20.0]; + assert(R1.array == correct1); + assert(R2.array == correct2); + assert(R3.array == correct3); +} + +/// Conditionally multiply the packed single-precision (32-bit) floating-point elements +/// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, +/// and conditionally store the sum in result using the low 4 bits of `imm8`. +__m128 _mm_dp_ps(int imm8)(__m128 a, __m128 b) @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return __builtin_ia32_dpps(a, b, cast(ubyte)imm8); + } + else static if (LDC_with_SSE41) + { + return __builtin_ia32_dpps(a, b, cast(byte)imm8); + } + else + { + __m128 zero = _mm_setzero_ps(); + __m128 temp = _mm_blend_ps!( (imm8 >>> 4) & 15)(zero, a * b); + float sum = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3]; + return _mm_blend_ps!(imm8 & 15)(zero, _mm_set1_ps(sum)); + } +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2.0f, 4.0f, 8.0f); + __m128 B = _mm_setr_ps(9.0f, 7.0f, 5.0f, 3.0f); + float4 R1 = _mm_dp_ps!(0xf0 + 0xf)(A, B); + float4 R2 = _mm_dp_ps!(0x30 + 0x5)(A, B); + float4 R3 = _mm_dp_ps!(0x50 + 0xa)(A, B); + float[4] correct1 = [67.0f, 67.0f, 67.0f, 67.0f]; + float[4] correct2 = [23.0f, 0.0f, 23.0f, 0.0f]; + float[4] correct3 = [0.0f, 29.0f, 0.0f, 29.0f]; + assert(R1.array == correct1); + assert(R2.array == correct2); + assert(R3.array == correct3); +} + + +/// Extract a 32-bit integer from `a`, selected with `imm8`. +int _mm_extract_epi32 (__m128i a, const int imm8) pure @trusted +{ + return (cast(int4)a).array[imm8 & 3]; +} +unittest +{ + __m128i A = _mm_setr_epi32(1, 2, 3, 4); + assert(_mm_extract_epi32(A, 0) == 1); + assert(_mm_extract_epi32(A, 1 + 8) == 2); + assert(_mm_extract_epi32(A, 3 + 4) == 4); +} + +/// Extract a 64-bit integer from `a`, selected with `imm8`. +long _mm_extract_epi64 (__m128i a, const int imm8) pure @trusted +{ + long2 la = cast(long2)a; + return la.array[imm8 & 1]; +} +unittest +{ + __m128i A = _mm_setr_epi64(45, -67); + assert(_mm_extract_epi64(A, 0) == 45); + assert(_mm_extract_epi64(A, 1) == -67); + assert(_mm_extract_epi64(A, 2) == 45); +} + +/// Extract an 8-bit integer from `a`, selected with `imm8`. +/// Warning: the returned value is zero-extended to 32-bits. +int _mm_extract_epi8 (__m128i a, const int imm8) @trusted +{ + byte16 ba = cast(byte16)a; + return cast(ubyte) ba.array[imm8 & 15]; +} +unittest +{ + __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, 14, 15); + assert(_mm_extract_epi8(A, 7) == 7); + assert(_mm_extract_epi8(A, 13) == 255); + assert(_mm_extract_epi8(A, 7 + 16) == 7); +} + +/// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`. +/// Note: returns a 32-bit $(I integer). 
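+///
+/// Example (illustrative; the raw IEEE-754 bit pattern is returned, not a numeric conversion):
+/// ---
+/// int bits = _mm_extract_ps(_mm_setr_ps(1.0f, 2.5f, -3.0f, 4.0f), 1); // 0x40200000
+/// float f = *cast(float*)&bits; // reinterpret the bits to get 2.5f back
+/// ---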
+int _mm_extract_ps (__m128 a, const int imm8) @trusted +{ + return (cast(int4)a).array[imm8 & 3]; +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f); + assert(_mm_extract_ps(A, 0) == 0x3f800000); + assert(_mm_extract_ps(A, 1 + 8) == 0x40000000); + assert(_mm_extract_ps(A, 3 + 4) == cast(int)0xc0800000); +} + + + +/// Round the packed double-precision (64-bit) floating-point elements in `a` down to an +/// integer value, and store the results as packed double-precision floating-point elements. +__m128d _mm_floor_pd (__m128d a) @trusted +{ + static if (LDC_with_ARM64) + { + // LDC arm64 acceptable since 1.8 -O2 + long2 l = vcvtmq_s64_f64(a); + double2 r; + r.ptr[0] = l.array[0]; + r.ptr[1] = l.array[1]; + return r; + } + else + { + return _mm_round_pd!1(a); + } +} +unittest +{ + __m128d A = _mm_setr_pd(1.3f, -2.12f); + __m128d B = _mm_setr_pd(53.6f, -2.7f); + A = _mm_floor_pd(A); + B = _mm_floor_pd(B); + double[2] correctA = [1.0, -3.0]; + double[2] correctB = [53.0, -3.0]; + assert(A.array == correctA); + assert(B.array == correctB); +} + +/// Round the packed single-precision (32-bit) floating-point elements in `a` down to an +/// integer value, and store the results as packed single-precision floating-point elements. +__m128 _mm_floor_ps (__m128 a) @trusted +{ + static if (LDC_with_ARM64) + { + // LDC arm64 acceptable since 1.8 -O1 + int4 l = vcvtmq_s32_f32(a); + float4 r; + r.ptr[0] = l.array[0]; + r.ptr[1] = l.array[1]; + r.ptr[2] = l.array[2]; + r.ptr[3] = l.array[3]; + return r; + } + else + { + return _mm_round_ps!1(a); + } +} +unittest +{ + __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f); + __m128 C = _mm_floor_ps(A); + float[4] correct = [1.0f, -3.0f, 53.0f, -3.0f]; + assert(C.array == correct); +} + +/// Round the lower double-precision (64-bit) floating-point element in `b` down to an +/// integer value, store the result as a double-precision floating-point element in the +/// lower element, and copy the upper element from `a` to the upper element. +__m128d _mm_floor_sd (__m128d a, __m128d b) @trusted +{ + static if (LDC_with_ARM64) + { + a[0] = vcvtms_s64_f64(b[0]); + return a; + } + else + { + return _mm_round_sd!1(a, b); + } +} +unittest +{ + __m128d A = _mm_setr_pd(1.3, -2.12); + __m128d B = _mm_setr_pd(-53.1, -3.7); + __m128d C = _mm_floor_sd(A, B); + double[2] correct = [-54.0, -2.12]; + assert(C.array == correct); +} + +/// Round the lower single-precision (32-bit) floating-point element in `b` down to an +/// integer value, store the result as a single-precision floating-point element in the +/// lower element, and copy the upper 3 packed elements from `a` to the upper elements. +__m128 _mm_floor_ss (__m128 a, __m128 b) @trusted +{ + static if (LDC_with_ARM64) + { + a[0] = vcvtms_s32_f32(b[0]); + return a; + } + else + { + return _mm_round_ss!1(a, b); + } +} +unittest +{ + __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f); + __m128 B = _mm_setr_ps(-539.3f, -3.7f, 8.0f, 7.0f); + __m128 C = _mm_floor_ss(A, B); + float[4] correct = [-540.0f, -2.12f, -4.5f, 1.1f]; + assert(C.array == correct); +} + +/// Insert the 32-bit integer `i` into `a` at the location specified by `imm8[1:0]`. 
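+///
+/// Example (illustrative; the index is taken modulo 4, i.e. `imm8 & 3`):
+/// ---
+/// __m128i v = _mm_setr_epi32(1, 2, 3, 4);
+/// __m128i w = _mm_insert_epi32(v, 99, 1); // [1, 99, 3, 4]
+/// ---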
+__m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) pure @trusted +{ + // GDC: nothing special to do, pinsrd generated with -O1 -msse4.1 + // LDC x86: psinrd since LDC 1.1 -O2 with -mattr=+sse4.1 + // LDC arm64: ins.s since LDC 1.8 -O2 + int4 ia = cast(int4)a; + ia.ptr[imm8 & 3] = i; + return cast(__m128i)ia; +} +unittest +{ + __m128i A = _mm_setr_epi32(1, 2, 3, 4); + int4 C = cast(int4) _mm_insert_epi32(A, 5, 2 + 4); + int[4] result = [1, 2, 5, 4]; + assert(C.array == result); +} + +/// Insert the 64-bit integer `i` into `a` at the location specified by `imm8[0]`. +__m128i _mm_insert_epi64 (__m128i a, long i, const int imm8) pure @trusted +{ + // GDC: nothing special to do, psinrq generated with -O1 -msse4.1 + // LDC x86: always do something sensible. + long2 la = cast(long2)a; + la.ptr[imm8 & 1] = i; + return cast(__m128i)la; +} +unittest +{ + __m128i A = _mm_setr_epi64(1, 2); + long2 C = cast(long2) _mm_insert_epi64(A, 5, 1 + 2); + long[2] result = [1, 5]; + assert(C.array == result); +} + +/// Insert the 8-bit integer `i` into `a` at the location specified by `imm8[2:0]`. +/// Copy a to dst, and insert the lower 8-bit integer from i into dst at the location specified by imm8. +__m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) @trusted +{ + // GDC: nothing special to do, pinsrb generated with -O1 -msse4.1 + // LDC x86: doesn't do pinsrb, maybe it's slower. arm64 also spills to memory. + byte16 ba = cast(byte16)a; + ba.ptr[imm8 & 15] = cast(byte)i; + return cast(__m128i)ba; +} +unittest +{ + __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + byte16 C = cast(byte16) _mm_insert_epi8(A, 30, 4 + 16); + byte[16] result = [0, 1, 2, 3, 30, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; + assert(C.array == result); +} + + +/// Warning: of course it does something totally different from `_mm_insert_epi32`! +/// Copy `a` to `tmp`, then insert a single-precision (32-bit) floating-point element from `b` +/// into `tmp` using the control in `imm8`. Store `tmp` to result using the mask in `imm8[3:0]` +/// (elements are zeroed out when the corresponding bit is set). +__m128 _mm_insert_ps(int imm8)(__m128 a, __m128 b) @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return __builtin_ia32_insertps128(a, b, cast(ubyte)imm8); + } + else static if (LDC_with_SSE41) + { + return __builtin_ia32_insertps128(a, b, cast(byte)imm8); + } + else + { + float4 tmp2 = a; + float tmp1 = b.array[(imm8 >> 6) & 3]; + tmp2.ptr[(imm8 >> 4) & 3] = tmp1; + return _mm_blend_ps!(imm8 & 15)(tmp2, _mm_setzero_ps()); + } +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); + __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); + __m128 C = _mm_insert_ps!(128 + (32 + 16) + 4)(A, B); + float[4] correct = [1.0f, 2.0f, 0.0f, 7.0f]; + assert(C.array == correct); +} + + +/// Compare packed signed 32-bit integers in `a` and `b`, returns packed maximum values. 
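+///
+/// Example (illustrative values):
+/// ---
+/// __m128i m = _mm_max_epi32(_mm_setr_epi32(-2, 8, 0,  3),
+///                           _mm_setr_epi32( 1, 7, 0, -3)); // [1, 8, 0, 3]
+/// ---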
+__m128i _mm_max_epi32 (__m128i a, __m128i b) pure @trusted +{ + static if (GDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_pmaxsd128(cast(int4)a, cast(int4)b); + } + else version(LDC) + { + // x86: pmaxsd since LDC 1.1 -O1 + // ARM: smax.4s since LDC 1.8 -01 + int4 sa = cast(int4)a; + int4 sb = cast(int4)b; + static if (SIMD_COMPARISON_MASKS_16B) + int4 greater = sa > sb; + else + int4 greater = greaterMask!int4(sa, sb); + return cast(__m128i)( (greater & sa) | (~greater & sb) ); + } + else + { + __m128i higher = _mm_cmpgt_epi32(a, b); + __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b + __m128i mask = _mm_and_si128(aTob, higher); + return _mm_xor_si128(b, mask); + } +} +unittest +{ + int4 R = cast(int4) _mm_max_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7), + _mm_setr_epi32( -4,-8, 9, -8)); + int[4] correct = [0x7fffffff, 1, 9, 7]; + assert(R.array == correct); +} + +/// Compare packed signed 8-bit integers in `a` and `b`, +/// and return packed maximum values. +__m128i _mm_max_epi8 (__m128i a, __m128i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_pmaxsb128(cast(ubyte16)a, cast(ubyte16)b); + } + else version(LDC) + { + // x86: pmaxsb since LDC 1.1 -O1 + // ARM64: smax.16b since LDC 1.8.0 -O1 + byte16 sa = cast(byte16)a; + byte16 sb = cast(byte16)b; + static if (SIMD_COMPARISON_MASKS_16B) + byte16 greater = sa > sb; + else + byte16 greater = cast(byte16) greaterMask!byte16(sa, sb); + return cast(__m128i)( (greater & sa) | (~greater & sb) ); + } + else + { + __m128i lower = _mm_cmpgt_epi8(a, b); // ones where a should be selected, b else + __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b + __m128i mask = _mm_and_si128(aTob, lower); + return _mm_xor_si128(b, mask); + } +} +unittest +{ + __m128i A = _mm_setr_epi8(127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0); + __m128i B = _mm_setr_epi8( 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + byte16 R = cast(byte16) _mm_max_epi8(A, B); + byte[16] correct = [127, 1, 9, -7, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0]; + assert(R.array == correct); +} + +/// Compare packed unsigned 16-bit integers in `a` and `b`, returns packed maximum values. +__m128i _mm_max_epu16 (__m128i a, __m128i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_pmaxuw128(cast(short8)a, cast(short8)b); + } + else version(LDC) + { + // x86: pmaxuw since LDC 1.1 -O1 + // ARM64: umax.8h since LDC 1.8.0 -O1 + // PERF: without sse4.1, LLVM 12 produces a very interesting + // psubusw xmm0, xmm1 + // paddw xmm0, xmm1 + // sequence that maybe should go in other min/max intrinsics? + ushort8 sa = cast(ushort8)a; + ushort8 sb = cast(ushort8)b; + static if (SIMD_COMPARISON_MASKS_16B) + { + // Note: doesn't work well with GDC, which prefers the builtin. + ushort8 greater = sa > sb; + } + else + ushort8 greater = cast(ushort8) greaterMask!ushort8(sa, sb); + return cast(__m128i)( (greater & sa) | (~greater & sb) ); + } + else + { + b = _mm_subs_epu16(b, a); + b = _mm_add_epi16(b, a); + return b; + } +} +unittest +{ + short8 R = cast(short8) _mm_max_epu16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0, 57), + _mm_setr_epi16( -4, -8, 9, -7, 0,-32768, 0, 0)); + short[8] correct = [ -4, -8, -4, -7, 9,-32768, 0, 57]; + assert(R.array == correct); +} + +/// Compare packed unsigned 32-bit integers in `a` and `b`, returns packed maximum values. 
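+///
+/// Example (illustrative; the comparison is unsigned, so -1 == 0xFFFFFFFF is the largest value):
+/// ---
+/// __m128i m = _mm_max_epu32(_mm_setr_epi32(-1, 2, 3, 4),
+///                           _mm_setr_epi32( 3, 5, 3, 0)); // [-1, 5, 3, 4]
+/// ---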
+__m128i _mm_max_epu32 (__m128i a, __m128i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_pmaxud128(cast(int4)a, cast(int4)b); + } + else version(LDC) + { + // x86: pmaxud since LDC 1.1 -O1, also good without sse4.1 + // ARM64: umax.4s since LDC 1.8.0 -O1 + uint4 sa = cast(uint4)a; + uint4 sb = cast(uint4)b; + static if (SIMD_COMPARISON_MASKS_16B) + uint4 greater = sa > sb; + else + uint4 greater = cast(uint4) greaterMask!uint4(sa, sb); + return cast(__m128i)( (greater & sa) | (~greater & sb) ); + } + else + { + // PERF: LLVM suggests to replace the _mm_add_epi32 by _mm_xor_si128, and the last xor by an "_mm_or_si128" + /+ + movdqa xmm2, xmmword ptr [-0x80000000, -0x80000000, -0x80000000, -0x80000000] + movdqa xmm3, xmm1 + pxor xmm3, xmm2 + pxor xmm2, xmm0 + pcmpgtd xmm2, xmm3 + pand xmm0, xmm2 + pandn xmm2, xmm1 + por xmm0, xmm2 + +/ + __m128i valueShift = _mm_set1_epi32(-0x80000000); + __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(a, valueShift), _mm_add_epi32(b, valueShift)); + __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b + __m128i mask = _mm_and_si128(aTob, higher); + return _mm_xor_si128(b, mask); + } +} +unittest +{ + int4 R = cast(int4) _mm_max_epu32(_mm_setr_epi32(0x7fffffff, 1, 4, -7), + _mm_setr_epi32( -4,-8, 9, -8)); + int[4] correct = [ -4,-8, 9, -7]; + assert(R.array == correct); +} + +/// Compare packed signed 32-bit integers in `a` and `b`, returns packed maximum values. +__m128i _mm_min_epi32 (__m128i a, __m128i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_pminsd128(cast(int4)a, cast(int4)b); + } + else version(LDC) + { + // x86: pminsd since LDC 1.1 -O1, also good without sse4.1 + // ARM: smin.4s since LDC 1.8 -01 + int4 sa = cast(int4)a; + int4 sb = cast(int4)b; + static if (SIMD_COMPARISON_MASKS_16B) + int4 greater = sa > sb; + else + int4 greater = greaterMask!int4(sa, sb); + return cast(__m128i)( (~greater & sa) | (greater & sb) ); + } + else + { + __m128i higher = _mm_cmplt_epi32(a, b); + __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b + __m128i mask = _mm_and_si128(aTob, higher); + return _mm_xor_si128(b, mask); + } +} +unittest +{ + int4 R = cast(int4) _mm_min_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7), + _mm_setr_epi32( -4, -8, 9, -8)); + int[4] correct = [ -4, -8, -4, -8]; + assert(R.array == correct); +} + +/// Compare packed signed 8-bit integers in `a` and `b`, +/// and return packed minimum values. 
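+///
+/// Example (illustrative; the comparison is signed, so -128 is smaller than any other byte):
+/// ---
+/// __m128i m = _mm_min_epi8(_mm_setr_epi8(-128,  5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
+///                          _mm_setr_epi8(   4, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
+/// // lanes 0 and 1 hold -128 and -6
+/// ---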
+__m128i _mm_min_epi8 (__m128i a, __m128i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_pminsb128(cast(ubyte16)a, cast(ubyte16)b); + } + else version(LDC) + { + // x86: pminsb since LDC 1.1 -O1 + // ARM64: smin.16b since LDC 1.8.0 -O1 + byte16 sa = cast(byte16)a; + byte16 sb = cast(byte16)b; + static if (SIMD_COMPARISON_MASKS_16B) + byte16 greater = sa > sb; + else + byte16 greater = cast(byte16) greaterMask!byte16(sa, sb); + return cast(__m128i)( (~greater & sa) | (greater & sb) ); + } + else + { + __m128i lower = _mm_cmplt_epi8(a, b); // ones where a should be selected, b else + __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b + __m128i mask = _mm_and_si128(aTob, lower); + return _mm_xor_si128(b, mask); + } +} +unittest +{ + __m128i A = _mm_setr_epi8(127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0); + __m128i B = _mm_setr_epi8( 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + byte16 R = cast(byte16) _mm_min_epi8(A, B); + byte[16] correct = [ 4, -8, -4, -8, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + assert(R.array == correct); +} + +/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst. +__m128i _mm_min_epu16 (__m128i a, __m128i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_pminuw128(cast(short8)a, cast(short8)b); + } + else version(LDC) + { + // x86: pminuw since LDC 1.1 -O1, psubusw+psubw sequence without sse4.1 + // ARM64: umin.8h since LDC 1.8.0 -O1 + ushort8 sa = cast(ushort8)a; + ushort8 sb = cast(ushort8)b; + static if (SIMD_COMPARISON_MASKS_16B) + ushort8 greater = (sb > sa); + else + ushort8 greater = cast(ushort8) greaterMask!ushort8(sb, sa); + return cast(__m128i)( (greater & sa) | (~greater & sb) ); + } + else + { + __m128i c = _mm_subs_epu16(b, a); + b = _mm_sub_epi16(b, c); + return b; + } +} +unittest +{ + short8 R = cast(short8) _mm_min_epu16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0, 57), + _mm_setr_epi16( -4, -8, 9, -7, 0,-32768, 0, 0)); + short[8] correct = [32767, 1, 9, -8, 0, 7, 0, 0]; + assert(R.array == correct); +} + +/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst. +__m128i _mm_min_epu32 (__m128i a, __m128i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_pminud128(cast(int4)a, cast(int4)b); + } + else version(LDC) + { + // x86: pminud since LDC 1.1 -O1, also good without sse4.1 + // ARM64: umin.4s since LDC 1.8.0 -O1 + uint4 sa = cast(uint4)a; + uint4 sb = cast(uint4)b; + static if (SIMD_COMPARISON_MASKS_16B) + uint4 greater = sa > sb; + else + uint4 greater = cast(uint4) greaterMask!uint4(sa, sb); + return cast(__m128i)( (~greater & sa) | (greater & sb) ); + } + else + { + // PERF: same remark as in _mm_max_epu32 + __m128i valueShift = _mm_set1_epi32(-0x80000000); + __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(b, valueShift), _mm_add_epi32(a, valueShift)); + __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b + __m128i mask = _mm_and_si128(aTob, higher); + return _mm_xor_si128(b, mask); + } +} +unittest +{ + int4 R = cast(int4) _mm_min_epu32(_mm_setr_epi32(0x7fffffff, 1, 4, -7), + _mm_setr_epi32( -4,-8, 9, -8)); + int[4] correct = [0x7fffffff, 1, 4, -8]; + assert(R.array == correct); +} + +/// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in `a`, +/// store the minimum and index in return value, and zero the remaining bits. 
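+///
+/// Example (illustrative; lane 0 receives the minimum, lane 1 its index, the other lanes are zeroed):
+/// ---
+/// __m128i v = _mm_setr_epi16(9, 4, 7, 4, 30000, 100, 200, 300);
+/// short8 r = cast(short8) _mm_minpos_epu16(v); // [4, 1, 0, 0, 0, 0, 0, 0]; ties keep the lowest index
+/// ---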
+__m128i _mm_minpos_epu16 (__m128i a) @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a); + } + else static if (LDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a); + } + else static if (LDC_with_ARM64) + { + __m128i indices = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); + __m128i combinedLo = _mm_unpacklo_epi16(indices, a); + __m128i combinedHi = _mm_unpackhi_epi16(indices, a); + __m128i best = _mm_min_epu32(combinedLo, combinedHi); + best = _mm_min_epu32(best, _mm_srli_si128!8(best)); + best = _mm_min_epu32(best, _mm_srli_si128!4(best)); + short8 sbest = cast(short8)best; + short8 r; + r[0] = sbest[1]; + r[1] = sbest[0]; // Note: the search must have inverted index in order to prioritize lower index in case of tie + r[2] = 0; + r[3] = 0; + r[4] = 0; + r[5] = 0; + r[6] = 0; + r[7] = 0; + return cast(__m128i)r; + } + else + { + short8 sa = cast(short8)a; + ushort min = 0xffff; + int index = 0; + for(int n = 0; n < 8; ++n) + { + ushort c = sa.array[n]; + if (c < min) + { + min = c; + index = n; + } + } + short8 r; + r.ptr[0] = min; + r.ptr[1] = cast(short)index; + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi16(14, 15, 1, 2, -3, 4, 5, 6); + __m128i B = _mm_setr_epi16(14, 4, 4, 2, -3, 2, 5, 6); + short8 R1 = cast(short8) _mm_minpos_epu16(A); + short8 R2 = cast(short8) _mm_minpos_epu16(B); + short[8] correct1 = [1, 2, 0, 0, 0, 0, 0, 0]; + short[8] correct2 = [2, 3, 0, 0, 0, 0, 0, 0]; + assert(R1.array == correct1); + assert(R2.array == correct2); +} + +/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers +/// in `a` compared to those in `b`, and store the 16-bit results in dst. +/// Eight SADs are performed using one quadruplet from `b` and eight quadruplets from `a`. +/// One quadruplet is selected from `b` starting at on the offset specified in `imm8[1:0]`. +/// Eight quadruplets are formed from sequential 8-bit integers selected from `a` starting +/// at the offset specified in `imm8[2]`. +__m128i _mm_mpsadbw_epu8(int imm8)(__m128i a, __m128i b) @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_mpsadbw128(cast(ubyte16)a, cast(ubyte16)b, cast(ubyte)imm8); + } + else static if (LDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8); + } + else + { + int a_offset = ((imm8 & 4) >> 2) * 4; // Yes, the two high order quadruplet are unaddressable... 
+ int b_offset = (imm8 & 3) * 4; + + byte16 ba = cast(byte16)a; + byte16 bb = cast(byte16)b; + short8 r; + + __m128i comp_b = _mm_setr_epi32(b.array[imm8 & 3], 0, b.array[imm8 & 3], 0); + + for (int j = 0; j < 8; j += 2) + { + int k = a_offset + j; + __m128i comp_a = _mm_setr_epi8(ba[k+0], ba[k+1], ba[k+2], ba[k+3], + 0, 0, 0, 0, + ba[k+1], ba[k+2], ba[k+3], ba[k+4], + 0, 0, 0, 0); + short8 diffs = cast(short8) _mm_sad_epu8(comp_a, comp_b); // reusing this wins instructions in both x86 and arm64 + r.ptr[j] = diffs.array[0]; + r.ptr[j+1] = diffs.array[4]; + } + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5, 5, 5, 12, 13, 14, 15); + short[8] correct0 = [9, 11, 13, 15, 17, 19, 21, 23]; + short[8] correct1 = [763, 761, 759, 757, 755, 753, 751, 749]; + short[8] correct4 = [17, 19, 21, 23, 25, 27, 31, 35]; + short[8] correct5 = [755, 753, 751, 749, 747, 745, 743, 741]; + short[8] correct7 = [32, 28, 24, 20, 16, 12, 8, 4]; + short8 r1 = cast(short8) _mm_mpsadbw_epu8!1(A, B); + short8 r4 = cast(short8) _mm_mpsadbw_epu8!4(A, B); + short8 r5 = cast(short8) _mm_mpsadbw_epu8!5(A, B); + short8 r7 = cast(short8) _mm_mpsadbw_epu8!7(A, B); + short8 r8 = cast(short8) _mm_mpsadbw_epu8!8(A, B); + assert(r1.array == correct1); + assert(r4.array == correct4); + assert(r5.array == correct5); + assert(r7.array == correct7); + assert(r8.array == correct0); +} + +/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst. +__m128i _mm_mul_epi32 (__m128i a, __m128i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_pmuldq128(cast(int4)a, cast(int4)b); + } + else static if (LDC_with_SSE41 && LDC_with_optimizations) + { + // For some reason, clang has the builtin but it's not in IntrinsicsX86.td + // Use IR instead. + // This generates pmuldq with since LDC 1.2.0 -O0 + enum ir = ` + %ia = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> + %ib = shufflevector <4 x i32> %1,<4 x i32> %1, <2 x i32> + %la = sext <2 x i32> %ia to <2 x i64> + %lb = sext <2 x i32> %ib to <2 x i64> + %r = mul <2 x i64> %la, %lb + ret <2 x i64> %r`; + return cast(__m128i) LDCInlineIR!(ir, long2, int4, int4)(cast(int4)a, cast(int4)b); + } + else static if (LDC_with_ARM64) + { + // 3 instructions since LDC 1.8 -O2 + // But had to make vmull_s32 be a builtin else it wouldn't optimize to smull + int2 a_lo = vmovn_s64(cast(long2)a); + int2 b_lo = vmovn_s64(cast(long2)b); + return cast(__m128i) vmull_s32(a_lo, b_lo); + } + else + { + int4 ia = cast(int4)a; + int4 ib = cast(int4)b; + long2 r; + r.ptr[0] = cast(long)ia.array[0] * ib.array[0]; + r.ptr[1] = cast(long)ia.array[2] * ib.array[2]; + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3); + __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0); + long2 R = cast(long2) _mm_mul_epi32(A, B); + long[2] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144]; + assert(R.array == correct); +} + +/// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers, +/// return the low 32 bits of the intermediate integers. 
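+///
+/// Example (illustrative; the product wraps around modulo 2^32):
+/// ---
+/// __m128i p = _mm_mullo_epi32(_mm_set1_epi32(100_000), _mm_set1_epi32(100_000));
+/// // each lane holds 1_410_065_408, the low 32 bits of 10_000_000_000
+/// ---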
+__m128i _mm_mullo_epi32 (__m128i a, __m128i b) pure @trusted +{ + // PERF DMD + // PERF GDC without SSE4.1 could be better + static if (GDC_with_SSE41) + { + int4 ia = cast(int4)a; + int4 ib = cast(int4)b; + // Note: older GDC doesn't have that op, but older GDC + // also has no support for -msse4.1 detection + return cast(__m128i)(a * b); + } + else version(LDC) + { + int4 ia = cast(int4)a; + int4 ib = cast(int4)b; + return cast(__m128i)(a * b); + } + else + { + // DMD doesn't take the above + int4 ia = cast(int4)a; + int4 ib = cast(int4)b; + int4 r; + r.ptr[0] = ia.array[0] * ib.array[0]; + r.ptr[1] = ia.array[1] * ib.array[1]; + r.ptr[2] = ia.array[2] * ib.array[2]; + r.ptr[3] = ia.array[3] * ib.array[3]; + return r; + } +} +unittest +{ + __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3); + __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0); + int4 R = cast(int4) _mm_mullo_epi32(A, B); + int[4] correct = [cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 0]; + assert(R.array == correct); +} + + +/// Convert packed signed 32-bit integers from `a` and `b` +/// to packed 16-bit integers using unsigned saturation. +__m128i _mm_packus_epi32 (__m128i a, __m128i b) pure @trusted +{ + static if (GDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b); + } + else static if (LDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b); + } + else static if (LDC_with_ARM64) + { + int4 z; + z = 0; + return cast(__m128i) vcombine_u16(vqmovn_u32(vmaxq_s32(z, cast(int4)a)), + vqmovn_u32(vmaxq_s32(z, cast(int4)b))); + } + else + { + __m128i i32768 = _mm_set1_epi32(32768); + __m128i s32768 = _mm_set1_epi16(-32768); + a = _mm_sub_epi32(a, i32768); + b = _mm_sub_epi32(b, i32768); + __m128i clampedSigned = _mm_packs_epi32(a, b); + return _mm_add_epi16(clampedSigned, s32768); + } +} +unittest +{ + __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0); + short8 R = cast(short8) _mm_packus_epi32(A, A); + short[8] correct = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0]; + assert(R.array == correct); +} + + +/// Round the packed double-precision (64-bit) floating-point elements in `a` using the +/// rounding parameter, and store the results as packed double-precision floating-point elements. 
+/// Rounding is done according to the rounding[3:0] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +__m128d _mm_round_pd(int rounding)(__m128d a) @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return __builtin_ia32_roundpd(a, rounding); + } + else static if (LDC_with_SSE41) + { + return __builtin_ia32_roundpd(a, rounding); + } + else + { + static if (rounding & _MM_FROUND_CUR_DIRECTION) + { + // Convert to 64-bit integers + long lo = _mm_cvtsd_si64(a); + a.ptr[0] = a.array[1]; + long hi = _mm_cvtsd_si64(a); + return _mm_setr_pd(lo, hi); + } + else + { + version(GNU) pragma(inline, false); // else fail unittest with optimizations + + uint old = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE((rounding & 3) << 13); + + // Convert to 64-bit integers + long lo = _mm_cvtsd_si64(a); + a.ptr[0] = a.array[1]; + long hi = _mm_cvtsd_si64(a); + + // Convert back to double to achieve the rounding + // The problem is that a 64-bit double can't represent all the values + // a 64-bit integer can (and vice-versa). So this function won't work for + // large values. (MAYDO: what range exactly?) + _MM_SET_ROUNDING_MODE(old); + return _mm_setr_pd(lo, hi); + } + } +} +unittest +{ + // tested in other intrinsics +} + +/// Round the packed single-precision (32-bit) floating-point elements in `a` using the +/// rounding parameter, and store the results as packed single-precision floating-point elements. +/// Rounding is done according to the rounding[3:0] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +__m128 _mm_round_ps(int rounding)(__m128 a) @trusted +{ + // PERF ARM64: there is duplication because this isn't optimal for ARM64, so it is avoided externally + static if (GDC_or_LDC_with_SSE41) + { + return __builtin_ia32_roundps(a, rounding); + } + else + { + static if (rounding & _MM_FROUND_CUR_DIRECTION) + { + __m128i integers = _mm_cvtps_epi32(a); + return _mm_cvtepi32_ps(integers); + } + else + { + version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 gets shuffled + uint old = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE((rounding & 3) << 13); + scope(exit) _MM_SET_ROUNDING_MODE(old); + + // Convert to 64-bit integers + __m128i integers = _mm_cvtps_epi32(a); + + // Convert back to float to achieve the rounding + // The problem is that a 32-float can't represent all the values + // a 32-bit integer can (and vice-versa). So this function won't work for + // large values. (MAYDO: what range exactly?) 
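+            // About the mode switch above: _MM_FROUND_TO_NEAREST_INT.._MM_FROUND_TO_ZERO are
+            // 0..3, and (rounding & 3) << 13 places that value in the MXCSR rounding-control
+            // field (bits 14:13), which is the form _MM_SET_ROUNDING_MODE expects.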
+ __m128 result = _mm_cvtepi32_ps(integers); + + return result; + } + } +} +unittest +{ + // tested in other intrinsics +} + + +/// Round the lower double-precision (64-bit) floating-point element in `b` using the +/// rounding parameter, store the result as a double-precision floating-point element +/// in the lower element of result, and copy the upper element from `a` to the upper element of result. +/// Rounding is done according to the rounding[3:0] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +__m128d _mm_round_sd(int rounding)(__m128d a, __m128d b) @trusted +{ + static if (GDC_with_SSE41) + { + return __builtin_ia32_roundsd(a, b, rounding); + } + else static if (LDC_with_SSE41) + { + return __builtin_ia32_roundsd(a, b, rounding); + } + else + { + static if (rounding & _MM_FROUND_CUR_DIRECTION) + { + // Convert to 64-bit integer + long b0 = _mm_cvtsd_si64(b); + a.ptr[0] = b0; + return a; + } + else + { + version(GNU) pragma(inline, false); // else fail unittest with optimizations + + uint old = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE((rounding & 3) << 13); + + // Convert to 64-bit integer + long b0 = _mm_cvtsd_si64(b); + a.ptr[0] = b0; + + // Convert back to double to achieve the rounding + // The problem is that a 64-bit double can't represent all the values + // a 64-bit integer can (and vice-versa). So this function won't work for + // large values. (MAYDO: what range exactly?) + _MM_SET_ROUNDING_MODE(old); + return a; + } + } +} +unittest +{ + // tested in other intrinsics +} + + +/// Round the lower single-precision (32-bit) floating-point element in `b` using the +/// rounding parameter, store the result as a single-precision floating-point element +/// in the lower element of result, and copy the upper 3 packed elements from `a` +/// to the upper elements of result. +/// Rounding is done according to the rounding[3:0] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +__m128 _mm_round_ss(int rounding)(__m128 a, __m128 b) @trusted +{ + static if (GDC_with_SSE41) + { + return __builtin_ia32_roundss(a, b, rounding); + } + else static if (LDC_with_SSE41) + { + return __builtin_ia32_roundss(a, b, rounding); + } + else + { + static if (rounding & _MM_FROUND_CUR_DIRECTION) + { + int b0 = _mm_cvtss_si32(b); + a.ptr[0] = b0; + return a; + } + else version(GNU) + { + pragma(inline, false) + __m128 GDCworkaround() nothrow @nogc @trusted + { + uint old = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE((rounding & 3) << 13); + + // Convert to 32-bit integer + int b0 = _mm_cvtss_si32(b); + a.ptr[0] = b0; + + // Convert back to double to achieve the rounding + // The problem is that a 32-bit float can't represent all the values + // a 32-bit integer can (and vice-versa). 
So this function won't work for + // large values. (MAYDO: what range exactly?) + _MM_SET_ROUNDING_MODE(old); + return a; + } + return GDCworkaround(); + } + else + { + uint old = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE((rounding & 3) << 13); + + // Convert to 32-bit integer + int b0 = _mm_cvtss_si32(b); + a.ptr[0] = b0; + + // Convert back to double to achieve the rounding + // The problem is that a 32-bit float can't represent all the values + // a 32-bit integer can (and vice-versa). So this function won't work for + // large values. (MAYDO: what range exactly?) + _MM_SET_ROUNDING_MODE(old); + return a; + } + } +} +unittest +{ + // tested in other intrinsics +} + + +/// Load 128-bits of integer data from memory using a non-temporal memory hint. +/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection +/// exception may be generated. +__m128i _mm_stream_load_si128 (void* mem_addr) pure @trusted +{ + // PERF DMD D_SIMD + static if (GDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_movntdqa(cast(long2*)mem_addr); + } + else static if (LDC_with_InlineIREx && LDC_with_optimizations) + { + enum prefix = `!0 = !{ i32 1 }`; + enum ir = ` + %r = load <4 x i32>, <4 x i32>* %0, !nontemporal !0 + ret <4 x i32> %r`; + return cast(__m128i) LDCInlineIREx!(prefix, ir, "", int4, int4*)(cast(__m128i*)mem_addr); + } + else + { + return *cast(__m128i*)mem_addr; // regular move instead + } +} +unittest +{ + align(16) static immutable int[4] correct = [1, 2, 3, 4]; + __m128i A = _mm_stream_load_si128(cast(__m128i*)(correct.ptr)); + _mm_mfence(); + assert(A.array == correct); +} + +/// Return 1 if all bits in `a` are all 1's. Else return 0. +int _mm_test_all_ones (__m128i a) @safe +{ + return _mm_testc_si128(a, _mm_set1_epi32(-1)); +} +unittest +{ + __m128i A = _mm_set1_epi32(-1); + __m128i B = _mm_set_epi32(-1, -2, -1, -1); + assert(_mm_test_all_ones(A) == 1); + assert(_mm_test_all_ones(B) == 0); +} + +/// Return 1 if all bits in `a` are all 0's. Else return 0. +// This is a #BONUS since it was lacking in Intel Intrinsics API. +int _mm_test_all_zeros (__m128i a) @safe +{ + return _mm_testz_si128(a, _mm_set1_epi32(-1)); +} +unittest +{ + __m128i A = _mm_set1_epi32(0); + __m128i B = _mm_set_epi32(0, 8, 0, 0); + assert(_mm_test_all_zeros(A) == 1); + assert(_mm_test_all_zeros(B) == 0); +} + +/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`, +/// and return 1 if the result is zero, otherwise return 0. +int _mm_test_all_zeros (__m128i a, __m128i mask) @safe +{ + return _mm_testz_si128(a, mask); // it's really the same, but with a good name +} + +/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and mask, and set ZF to 1 +/// if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of a and then AND with +/// mask, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and +/// CF values are zero, otherwise return 0. +int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @trusted +{ + return _mm_testnzc_si128(a, mask); +} + +/// Compute the bitwise NOT of a and then AND with b, and return 1 if the +/// result is zero, otherwise return 0. +/// In other words, test if all bits masked by `b` are 1 in `a`. 
+int _mm_testc_si128 (__m128i a, __m128i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b); + } + else static if (LDC_with_SSE41) + { + return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b); + } + else static if (LDC_with_ARM64) + { + // Acceptable since LDC 1.8 -02 + long2 s64 = vbicq_s64(cast(long2)b, cast(long2)a); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); + } + else + { + __m128i c = ~a & b; + int[4] zero = [0, 0, 0, 0]; + return c.array == zero; + } +} +unittest +{ + __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8); + __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x00); + __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00); + assert(_mm_testc_si128(A, A) == 1); + assert(_mm_testc_si128(A, M1) == 0); + assert(_mm_testc_si128(A, M2) == 1); +} + +/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`, +/// and set ZF to 1 if the result is zero, otherwise set ZF to 0. +/// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the +/// result is zero, otherwise set CF to 0. +/// Return 1 if both the ZF and CF values are zero, otherwise return 0. +int _mm_testnzc_si128 (__m128i a, __m128i b) @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b); + } + else static if (LDC_with_SSE41) + { + return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b); + } + else static if (LDC_with_ARM64) + { + long2 s640 = vandq_s64(cast(long2)b, cast(long2)a); + long2 s641 = vbicq_s64(cast(long2)b, cast(long2)a); + + return !( !(vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1)) + | !(vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) ); + } + else + { + __m128i c = a & b; + __m128i d = ~a & b; + int[4] zero = [0, 0, 0, 0]; + return !( (c.array == zero) || (d.array == zero)); + } +} +unittest +{ + __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8); + __m128i M = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00); + __m128i Z = _mm_setzero_si128(); + assert(_mm_testnzc_si128(A, Z) == 0); + assert(_mm_testnzc_si128(A, M) == 1); + assert(_mm_testnzc_si128(A, A) == 0); +} + +/// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +/// and return 1 if the result is zero, otherwise return 0. +/// In other words, test if all bits masked by `b` are 0 in `a`. +int _mm_testz_si128 (__m128i a, __m128i b) @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b); + } + else static if (LDC_with_SSE41) + { + return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b); + } + else static if (LDC_with_ARM64) + { + // Acceptable since LDC 1.8 -02 + long2 s64 = vandq_s64(cast(long2)a, cast(long2)b); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); + } + else + { + __m128i c = a & b; + int[4] zero = [0, 0, 0, 0]; + return c.array == zero; + } +} +unittest +{ + __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8); + __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x07); + __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00); + assert(_mm_testz_si128(A, A) == 0); + assert(_mm_testz_si128(A, M1) == 1); + assert(_mm_testz_si128(A, M2) == 0); +} + diff --git a/external/inteli/tmmintrin.d b/external/inteli/tmmintrin.d new file mode 100644 index 0000000..ecc84d1 --- /dev/null +++ b/external/inteli/tmmintrin.d @@ -0,0 +1,1322 @@ +/** +* SSSE3 intrinsics. 
+* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSSE3 +* +* Copyright: Guillaume Piolat 2021. +* Johan Engelen 2021. +* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) +*/ +module inteli.tmmintrin; + +public import inteli.types; +import inteli.internals; + +public import inteli.pmmintrin; +import inteli.mmx; + +nothrow @nogc: + + +// SSSE3 instructions +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSSE3 +// Note: this header will work whether you have SSSE3 enabled or not. +// With LDC, use "dflags-ldc": ["-mattr=+ssse3"] or equivalent to actively +// generate SSE3 instructions. +// With GDC, use "dflags-gdc": ["-mssse3"] or equivalent to generate SSSE3 instructions. + +/// Compute the absolute value of packed signed 16-bit integers in `a`. +__m128i _mm_abs_epi16 (__m128i a) @trusted +{ + static if (DMD_with_DSIMD) + { + return cast(__m128i)__simd(XMM.PABSW, a); + } + else static if (GDC_with_SSSE3) + { + return cast(__m128i) __builtin_ia32_pabsw128(cast(short8)a); + } + else static if (LDC_with_ARM64) + { + return cast(__m128i) vabsq_s16(cast(short8)a); + } + else + { + // LDC x86: generate pabsw since LDC 1.1 -O2 + short8 sa = cast(short8)a; + for (int i = 0; i < 8; ++i) + { + short s = sa.array[i]; + sa.ptr[i] = s >= 0 ? s : cast(short)(-cast(int)(s)); + } + return cast(__m128i)sa; + } +} +unittest +{ + __m128i A = _mm_setr_epi16(0, -1, -32768, 32767, 10, -10, 1000, -1000); + short8 B = cast(short8) _mm_abs_epi16(A); + short[8] correct = [0, 1, -32768, 32767, 10, 10, 1000, 1000]; + assert(B.array == correct); +} + +/// Compute the absolute value of packed signed 32-bit integers in `a`. +__m128i _mm_abs_epi32 (__m128i a) @trusted +{ + static if (DMD_with_DSIMD) + { + return cast(__m128i)__simd(XMM.PABSD, cast(int4)a); + } + else static if (GDC_with_SSSE3) + { + return cast(__m128i) __builtin_ia32_pabsd128(cast(int4)a); + } + else static if (LDC_with_ARM64) + { + return cast(__m128i) vabsq_s32(cast(int4)a); + } + else + { + // LDC x86: generates pabsd since LDC 1.1 -O2 + int4 sa = cast(int4)a; + for (int i = 0; i < 4; ++i) + { + int s = sa.array[i]; + sa.ptr[i] = s >= 0 ? s : -s; + } + return cast(__m128i)sa; + } +} +unittest +{ + __m128i A = _mm_setr_epi32(0, -1, -2_147_483_648, -2_147_483_647); + int4 B = cast(int4) _mm_abs_epi32(A); + int[4] correct = [0, 1, -2_147_483_648, 2_147_483_647]; + assert(B.array == correct); +} + +/// Compute the absolute value of packed signed 8-bit integers in `a`. +__m128i _mm_abs_epi8 (__m128i a) @trusted +{ + static if (DMD_with_DSIMD) + { + return cast(__m128i)__simd(XMM.PABSB, cast(byte16)a); + } + else static if (GDC_with_SSSE3) + { + alias ubyte16 = __vector(ubyte[16]); + return cast(__m128i) __builtin_ia32_pabsb128(cast(ubyte16)a); + } + else static if (LDC_with_ARM64) + { + return cast(__m128i) vabsq_s8(cast(byte16)a); + } + else static if (LDC_with_optimizations) + { + // LDC x86: generates pabsb since LDC 1.1 -O1 + // arm64: generates abs since LDC 1.8 -O1 + enum ir = ` + %n = sub <16 x i8> , %0 + %s = icmp slt <16 x i8> , %0 + %r = select <16 x i1> %s, <16 x i8> %0, <16 x i8> %n + ret <16 x i8> %r`; + return cast(__m128i) LDCInlineIR!(ir, byte16, byte16)(cast(byte16)a); + } + else + { + // A loop version like in _mm_abs_epi16/_mm_abs_epi32 would be very slow + // in LDC x86 and wouldn't vectorize. Doesn't generate pabsb in LDC though. 
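+        // How the fallback below works: for any byte x, -x has the same unsigned value as
+        // 256 - x, so the unsigned minimum of x and -x is |x| (and -128 stays -128, as PABSB does).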
+ return _mm_min_epu8(a, _mm_sub_epi8(_mm_setzero_si128(), a)); + } +} +unittest +{ + __m128i A = _mm_setr_epi8(0, -1, -128, -127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + byte16 B = cast(byte16) _mm_abs_epi8(A); + byte[16] correct = [0, 1, -128, 127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + assert(B.array == correct); +} + +/// Compute the absolute value of packed 64-bit floating-point elements in `a`. +/// #BONUS. +__m128d _mm_abs_pd (__m128d a) @trusted +{ + long2 mask = 0x7fff_ffff_ffff_ffff; + return cast(__m128d)((cast(long2)a) & mask); +} +unittest +{ + __m128d A = _mm_setr_pd(-42.0f, -double.infinity); + __m128d R = _mm_abs_pd(A); + double[2] correct = [42.0f, +double.infinity]; + assert(R.array == correct); +} + +/// Compute the absolute value of packed signed 16-bit integers in `a`. +__m64 _mm_abs_pi16 (__m64 a) @trusted +{ + return to_m64(_mm_abs_epi16(to_m128i(a))); +} +unittest +{ + __m64 A = _mm_setr_pi16(0, -1, -32768, 32767); + short4 B = cast(short4) _mm_abs_pi16(A); + short[4] correct = [0, 1, -32768, 32767]; + assert(B.array == correct); +} + +/// Compute the absolute value of packed signed 32-bit integers in `a`. +__m64 _mm_abs_pi32 (__m64 a) @trusted +{ + return to_m64(_mm_abs_epi32(to_m128i(a))); +} +unittest +{ + __m64 A = _mm_setr_pi32(-1, -2_147_483_648); + int2 B = cast(int2) _mm_abs_pi32(A); + int[2] correct = [1, -2_147_483_648]; + assert(B.array == correct); +} + +/// Compute the absolute value of packed signed 8-bit integers in `a`. +__m64 _mm_abs_pi8 (__m64 a) @trusted +{ + return to_m64(_mm_abs_epi8(to_m128i(a))); +} +unittest +{ + __m64 A = _mm_setr_pi8(0, -1, -128, -127, 127, 0, 0, 0); + byte8 B = cast(byte8) _mm_abs_pi8(A); + byte[8] correct = [0, 1, -128, 127, 127, 0, 0, 0]; + assert(B.array == correct); +} + +/// Compute the absolute value of packed 32-bit floating-point elements in `a`. +/// #BONUS. +__m128 _mm_abs_ps (__m128 a) @trusted +{ + __m128i mask = 0x7fffffff; + return cast(__m128)((cast(__m128i)a) & mask); +} +unittest +{ + __m128 A = _mm_setr_ps(-0.0f, 10.0f, -42.0f, -float.infinity); + __m128 R = _mm_abs_ps(A); + float[4] correct = [0.0f, 10.0f, 42.0f, +float.infinity]; + assert(R.array == correct); +} + +/// Concatenate 16-byte blocks in `a` and `b` into a 32-byte temporary result, shift the result right by `count` bytes, and return the low 16 bytes. +__m128i _mm_alignr_epi8(ubyte count)(__m128i a, __m128i b) @trusted +{ + // PERF DMD + static if (GDC_with_SSSE3) + { + return cast(__m128i)__builtin_ia32_palignr128(cast(long2)a, cast(long2)b, count * 8); + } + else version(LDC) + { + static if (count >= 32) + { + return _mm_setzero_si128(); + } + else static if (count < 16) + { + // Generates palignr since LDC 1.1 -O1 + // Also generates a single ext instruction on arm64. 
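+            // The shuffle below selects bytes (count .. count+15) of the 32-byte concatenation
+            // [b | a]; e.g. count == 4 yields b[4..15] followed by a[0..3].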
+ return cast(__m128i) shufflevectorLDC!(byte16, ( 0 + count), + ( 1 + count), + ( 2 + count), + ( 3 + count), + ( 4 + count), + ( 5 + count), + ( 6 + count), + ( 7 + count), + ( 8 + count), + ( 9 + count), + (10 + count), + (11 + count), + (12 + count), + (13 + count), + (14 + count), + (15 + count))(cast(byte16)b, cast(byte16)a); + } + else + { + return cast(__m128i) shufflevectorLDC!(byte16, ( 0 + count) % 32, + ( 1 + count) % 32, + ( 2 + count) % 32, + ( 3 + count) % 32, + ( 4 + count) % 32, + ( 5 + count) % 32, + ( 6 + count) % 32, + ( 7 + count) % 32, + ( 8 + count) % 32, + ( 9 + count) % 32, + (10 + count) % 32, + (11 + count) % 32, + (12 + count) % 32, + (13 + count) % 32, + (14 + count) % 32, + (15 + count) % 32)(cast(byte16)_mm_setzero_si128(), cast(byte16)a); + } + } + else + { + byte16 ab = cast(byte16)a; + byte16 bb = cast(byte16)b; + byte16 r; + + for (int i = 0; i < 16; ++i) + { + const int srcpos = count + cast(int)i; + if (srcpos > 31) + { + r.ptr[i] = 0; + } + else if (srcpos > 15) + { + r.ptr[i] = ab.array[(srcpos) & 15]; + } + else + { + r.ptr[i] = bb.array[srcpos]; + } + } + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + __m128i B = _mm_setr_epi8(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + + { + byte16 C = cast(byte16)_mm_alignr_epi8!0(A ,B); + byte[16] correct = [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; + assert(C.array == correct); + } + { + byte16 C = cast(byte16)_mm_alignr_epi8!20(A ,B); + byte[16] correct = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0]; + assert(C.array == correct); + } + { + byte16 C = cast(byte16)_mm_alignr_epi8!34(A ,B); + byte[16] correct = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + assert(C.array == correct); + } + + __m128i D = _mm_setr_epi8(-123, -82, 103, -69, 103, -26, 9, 106, 58, -11, 79, -91, 114, -13, 110, 60); + __m128i E = _mm_setr_epi8(25, -51, -32, 91, -85, -39, -125, 31, -116, 104, 5, -101, 127, 82, 14, 81); + byte16 F = cast(byte16)_mm_alignr_epi8!8(D, E); + byte[16] correct = [-116, 104, 5, -101, 127, 82, 14, 81, -123, -82, 103, -69, 103, -26, 9, 106]; + assert(F.array == correct); +} + +/// Concatenate 8-byte blocks in `a` and `b` into a 16-byte temporary result, shift the result right by `count` bytes, and return the low 8 bytes. +__m64 _mm_alignr_pi8(ubyte count)(__m64 a, __m64 b) @trusted +{ + // PERF DMD + static if (GDC_with_SSSE3) + { + return cast(__m64)__builtin_ia32_palignr(cast(long1)a, cast(long1)b, count * 8); + } + else version(LDC) + { + static if (count >= 16) + { + return _mm_setzero_si64(); + } + else static if (count < 8) + { + // Note: in LDC x86 this uses a pshufb. + // Generates ext in arm64. 
+ return cast(__m64) shufflevectorLDC!(byte8, (0 + count), + (1 + count), + (2 + count), + (3 + count), + (4 + count), + (5 + count), + (6 + count), + (7 + count))(cast(byte8)b, cast(byte8)a); + } + else + { + return cast(__m64) shufflevectorLDC!(byte8, (0 + count)%16, + (1 + count)%16, + (2 + count)%16, + (3 + count)%16, + (4 + count)%16, + (5 + count)%16, + (6 + count)%16, + (7 + count)%16)(cast(byte8)_mm_setzero_si64(), cast(byte8)a); + } + } + else + { + byte8 ab = cast(byte8)a; + byte8 bb = cast(byte8)b; + byte8 r; + + for (int i = 0; i < 8; ++i) + { + const int srcpos = count + cast(int)i; + if (srcpos > 15) + { + r.ptr[i] = 0; + } + else if (srcpos > 7) + { + r.ptr[i] = ab.array[(srcpos) & 7]; + } + else + { + r.ptr[i] = bb.array[srcpos]; + } + } + return cast(__m64)r; + } +} +unittest +{ + __m64 A = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8); + __m64 B = _mm_setr_pi8(17, 18, 19, 20, 21, 22, 23, 24); + + { + byte8 C = cast(byte8)_mm_alignr_pi8!0(A ,B); + byte[8] correct = [17, 18, 19, 20, 21, 22, 23, 24]; + assert(C.array == correct); + } + + { + byte8 C = cast(byte8)_mm_alignr_pi8!3(A ,B); + byte[8] correct = [ 20, 21, 22, 23, 24, 1, 2, 3]; + assert(C.array == correct); + } + { + byte8 C = cast(byte8)_mm_alignr_pi8!11(A ,B); + byte[8] correct = [4, 5, 6, 7, 8, 0, 0, 0]; + assert(C.array == correct); + } + { + byte8 C = cast(byte8)_mm_alignr_pi8!17(A ,B); + byte[8] correct = [0, 0, 0, 0, 0, 0, 0, 0]; + assert(C.array == correct); + } +} + +/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results. +__m128i _mm_hadd_epi16 (__m128i a, __m128i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_SSSE3) + { + return cast(__m128i)__builtin_ia32_phaddw128(cast(short8)a, cast(short8)b); + } + else static if (LDC_with_SSSE3) + { + return cast(__m128i)__builtin_ia32_phaddw128(cast(short8)a, cast(short8)b); + } + else static if (LDC_with_ARM64) + { + return cast(__m128i)vpaddq_s16(cast(short8)a, cast(short8)b); + } + else + { + short8 sa = cast(short8)a; + short8 sb = cast(short8)b; + short8 r; + r.ptr[0] = cast(short)(sa.array[0] + sa.array[1]); + r.ptr[1] = cast(short)(sa.array[2] + sa.array[3]); + r.ptr[2] = cast(short)(sa.array[4] + sa.array[5]); + r.ptr[3] = cast(short)(sa.array[6] + sa.array[7]); + r.ptr[4] = cast(short)(sb.array[0] + sb.array[1]); + r.ptr[5] = cast(short)(sb.array[2] + sb.array[3]); + r.ptr[6] = cast(short)(sb.array[4] + sb.array[5]); + r.ptr[7] = cast(short)(sb.array[6] + sb.array[7]); + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768); + short8 C = cast(short8) _mm_hadd_epi16(A, A); + short[8] correct = [ -1, 12, 48, 32767, -1, 12, 48, 32767]; + assert(C.array == correct); +} + +/// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results. 
+__m128i _mm_hadd_epi32 (__m128i a, __m128i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_SSSE3) + { + return cast(__m128i)__builtin_ia32_phaddd128(cast(int4)a, cast(int4)b); + } + else static if (LDC_with_SSSE3) + { + return cast(__m128i)__builtin_ia32_phaddd128(cast(int4)a, cast(int4)b); + } + else static if (LDC_with_ARM64) + { + return cast(__m128i)vpaddq_s32(cast(int4)a, cast(int4)b); + } + else + { + int4 ia = cast(int4)a; + int4 ib = cast(int4)b; + int4 r; + r.ptr[0] = ia.array[0] + ia.array[1]; + r.ptr[1] = ia.array[2] + ia.array[3]; + r.ptr[2] = ib.array[0] + ib.array[1]; + r.ptr[3] = ib.array[2] + ib.array[3]; + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi32(1, -2, int.min, -1); + __m128i B = _mm_setr_epi32(1, int.max, 4, -4); + int4 C = cast(int4) _mm_hadd_epi32(A, B); + int[4] correct = [ -1, int.max, int.min, 0 ]; + assert(C.array == correct); +} + +/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results. +__m64 _mm_hadd_pi16 (__m64 a, __m64 b) @trusted +{ + // PERF DMD + static if (GDC_with_SSSE3) + { + return cast(__m64) __builtin_ia32_phaddw(cast(short4)a, cast(short4)b); + } + else static if (LDC_with_ARM64) + { + return cast(__m64) vpadd_s16(cast(short4)a, cast(short4)b); + } + else + { + // LDC x86: generates phaddw since LDC 1.24 -O2. + short4 r; + short4 sa = cast(short4)a; + short4 sb = cast(short4)b; + r.ptr[0] = cast(short)(sa.array[0] + sa.array[1]); + r.ptr[1] = cast(short)(sa.array[2] + sa.array[3]); + r.ptr[2] = cast(short)(sb.array[0] + sb.array[1]); + r.ptr[3] = cast(short)(sb.array[2] + sb.array[3]); + return cast(__m64)r; + } +} +unittest +{ + __m64 A = _mm_setr_pi16(1, -2, 4, 8); + __m64 B = _mm_setr_pi16(16, 32, -1, -32768); + short4 C = cast(short4) _mm_hadd_pi16(A, B); + short[4] correct = [ -1, 12, 48, 32767 ]; + assert(C.array == correct); +} + +/// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`, +/// and pack the signed 32-bit results. +__m64 _mm_hadd_pi32 (__m64 a, __m64 b) @trusted +{ + // PERF DMD + static if (GDC_with_SSSE3) + { + return cast(__m64) __builtin_ia32_phaddd(cast(int2)a, cast(int2)b); + } + else static if (LDC_with_ARM64) + { + return cast(__m64)vpadd_s32(cast(int2)a, cast(int2)b); + } + else + { + // LDC x86: generates phaddd since LDC 1.24 -O2 + int2 ia = cast(int2)a; + int2 ib = cast(int2)b; + int2 r; + r.ptr[0] = ia.array[0] + ia.array[1]; + r.ptr[1] = ib.array[0] + ib.array[1]; + return cast(__m64)r; + } +} +unittest +{ + __m64 A = _mm_setr_pi32(int.min, -1); + __m64 B = _mm_setr_pi32(1, int.max); + int2 C = cast(int2) _mm_hadd_pi32(A, B); + int[2] correct = [ int.max, int.min ]; + assert(C.array == correct); +} + +/// Horizontally add adjacent pairs of signed 16-bit integers in `a` and `b` using saturation, +/// and pack the signed 16-bit results. +__m128i _mm_hadds_epi16 (__m128i a, __m128i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_SSSE3) + { + return cast(__m128i)__builtin_ia32_phaddsw128(cast(short8)a, cast(short8)b); + } + else static if (LDC_with_SSSE3) + { + return cast(__m128i)__builtin_ia32_phaddsw128(cast(short8)a, cast(short8)b); + } + else static if (LDC_with_ARM64) + { + // uzp1/uzp2/sqadd sequence + short8 sa = cast(short8)a; + short8 sb = cast(short8)b; + short8 c = shufflevectorLDC!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb); + short8 d = shufflevectorLDC!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb); + return cast(__m128i)vqaddq_s16(c, d); + } + else + { + // PERF well that doesn't look very fast? 
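+        // saturateSignedIntToSignedShort clamps each 32-bit pairwise sum to [-32768, 32767],
+        // e.g. (-1) + (-32768) saturates to -32768, matching PHADDSW.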
+        short8 sa = cast(short8)a;
+        short8 sb = cast(short8)b;
+        short8 r;
+        r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] + sa.array[1]);
+        r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] + sa.array[3]);
+        r.ptr[2] = saturateSignedIntToSignedShort(sa.array[4] + sa.array[5]);
+        r.ptr[3] = saturateSignedIntToSignedShort(sa.array[6] + sa.array[7]);
+        r.ptr[4] = saturateSignedIntToSignedShort(sb.array[0] + sb.array[1]);
+        r.ptr[5] = saturateSignedIntToSignedShort(sb.array[2] + sb.array[3]);
+        r.ptr[6] = saturateSignedIntToSignedShort(sb.array[4] + sb.array[5]);
+        r.ptr[7] = saturateSignedIntToSignedShort(sb.array[6] + sb.array[7]);
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768);
+    short8 C = cast(short8) _mm_hadds_epi16(A, A);
+    short[8] correct = [ -1, 12, 48, -32768, -1, 12, 48, -32768];
+    assert(C.array == correct);
+}
+
+/// Horizontally add adjacent pairs of signed 16-bit integers in `a` and `b` using saturation,
+/// and pack the signed 16-bit results.
+__m64 _mm_hadds_pi16 (__m64 a, __m64 b) @trusted
+{
+    static if (GDC_with_SSSE3)
+    {
+        return cast(__m64)__builtin_ia32_phaddsw(cast(short4)a, cast(short4)b);
+    }
+    else static if (LDC_with_SSSE3)
+    {
+        // Note: LDC doesn't have __builtin_ia32_phaddsw
+        long2 la;
+        la.ptr[0] = a.array[0];
+        long2 lb;
+        lb.ptr[0] = b.array[0];
+        int4 sum = cast(int4)__builtin_ia32_phaddsw128(cast(short8)la, cast(short8)lb);
+        int2 r;
+        r.ptr[0] = sum.array[0];
+        r.ptr[1] = sum.array[2];
+        return cast(__m64)r;
+    }
+    else static if (LDC_with_ARM64)
+    {
+        // uzp1/uzp2/sqadd sequence
+        short4 sa = cast(short4)a;
+        short4 sb = cast(short4)b;
+        short4 c = shufflevectorLDC!(short4, 0, 2, 4, 6)(sa, sb);
+        short4 d = shufflevectorLDC!(short4, 1, 3, 5, 7)(sa, sb);
+        return cast(__m64)vqadd_s16(c, d);
+    }
+    else
+    {
+        short4 sa = cast(short4)a;
+        short4 sb = cast(short4)b;
+        short4 r;
+        r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] + sa.array[1]);
+        r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] + sa.array[3]);
+        r.ptr[2] = saturateSignedIntToSignedShort(sb.array[0] + sb.array[1]);
+        r.ptr[3] = saturateSignedIntToSignedShort(sb.array[2] + sb.array[3]);
+        return cast(__m64)r;
+    }
+}
+unittest
+{
+    __m64 A = _mm_setr_pi16(-16, 32, -100, -32768);
+    __m64 B = _mm_setr_pi16( 64, 32, 1, 32767);
+    short4 C = cast(short4) _mm_hadds_pi16(A, B);
+    short[4] correct = [ 16, -32768, 96, 32767];
+    assert(C.array == correct);
+}
+
+
+/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
+__m128i _mm_hsub_epi16 (__m128i a, __m128i b) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSSE3)
+    {
+        return cast(__m128i)__builtin_ia32_phsubw128(cast(short8)a, cast(short8)b);
+    }
+    else static if (LDC_with_SSSE3)
+    {
+        return cast(__m128i)__builtin_ia32_phsubw128(cast(short8)a, cast(short8)b);
+    }
+    else static if (LDC_with_ARM64)
+    {
+        // Produce uzp1 uzp2 sub sequence since LDC 1.8 -O1
+        short8 sa = cast(short8)a;
+        short8 sb = cast(short8)b;
+        short8 c = shufflevectorLDC!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
+        short8 d = shufflevectorLDC!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
+        return cast(__m128i)(c - d);
+    }
+    else
+    {
+        short8 sa = cast(short8)a;
+        short8 sb = cast(short8)b;
+        short8 r;
+        r.ptr[0] = cast(short)(sa.array[0] - sa.array[1]);
+        r.ptr[1] = cast(short)(sa.array[2] - sa.array[3]);
+        r.ptr[2] = cast(short)(sa.array[4] - sa.array[5]);
+        r.ptr[3] = cast(short)(sa.array[6] - sa.array[7]);
+        r.ptr[4] = cast(short)(sb.array[0] - sb.array[1]);
+        r.ptr[5] = cast(short)(sb.array[2] - sb.array[3]);
+        r.ptr[6] = cast(short)(sb.array[4] - sb.array[5]);
+        r.ptr[7] = cast(short)(sb.array[6] - sb.array[7]);
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi16(short.min, 1, 4, 8, 16, 32, 1, -32768);
+    short8 C = cast(short8) _mm_hsub_epi16(A, A);
+    short[8] correct = [ short.max, -4, -16, -32767, short.max, -4, -16, -32767];
+    assert(C.array == correct);
+}
+
+/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
+__m128i _mm_hsub_epi32 (__m128i a, __m128i b) pure @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSSE3)
+    {
+        return cast(__m128i)__builtin_ia32_phsubd128(cast(int4)a, cast(int4)b);
+    }
+    else static if (LDC_with_SSSE3)
+    {
+        return cast(__m128i)__builtin_ia32_phsubd128(cast(int4)a, cast(int4)b);
+    }
+    else static if (LDC_with_ARM64)
+    {
+        // Produce uzp1 uzp2 sub sequence since LDC 1.8 -O1
+        int4 ia = cast(int4)a;
+        int4 ib = cast(int4)b;
+        int4 c = shufflevectorLDC!(int4, 0, 2, 4, 6)(ia, ib);
+        int4 d = shufflevectorLDC!(int4, 1, 3, 5, 7)(ia, ib);
+        return cast(__m128i)(c - d);
+    }
+    else
+    {
+        int4 ia = cast(int4)a;
+        int4 ib = cast(int4)b;
+        int4 r;
+        r.ptr[0] = ia.array[0] - ia.array[1];
+        r.ptr[1] = ia.array[2] - ia.array[3];
+        r.ptr[2] = ib.array[0] - ib.array[1];
+        r.ptr[3] = ib.array[2] - ib.array[3];
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi32(1, 2, int.min, 1);
+    __m128i B = _mm_setr_epi32(int.max, -1, 4, 4);
+    int4 C = cast(int4) _mm_hsub_epi32(A, B);
+    int[4] correct = [ -1, int.max, int.min, 0 ];
+    assert(C.array == correct);
+}
+
+/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`,
+/// and pack the signed 16-bit results.
+__m64 _mm_hsub_pi16 (__m64 a, __m64 b) @trusted +{ + // PERF DMD + static if (GDC_with_SSSE3) + { + return cast(__m64)__builtin_ia32_phsubw(cast(short4)a, cast(short4)b); + } + else static if (LDC_with_ARM64) + { + // Produce uzp1 uzp2 sub sequence since LDC 1.3 -O1 + short4 sa = cast(short4)a; + short4 sb = cast(short4)b; + short4 c = shufflevectorLDC!(short4, 0, 2, 4, 6)(sa, sb); + short4 d = shufflevectorLDC!(short4, 1, 3, 5, 7)(sa, sb); + return cast(__m64)(c - d); + } + else + { + // LDC x86: generates phsubw since LDC 1.24 -O2 + short4 sa = cast(short4)a; + short4 sb = cast(short4)b; + short4 r; + r.ptr[0] = cast(short)(sa.array[0] - sa.array[1]); + r.ptr[1] = cast(short)(sa.array[2] - sa.array[3]); + r.ptr[2] = cast(short)(sb.array[0] - sb.array[1]); + r.ptr[3] = cast(short)(sb.array[2] - sb.array[3]); + return cast(__m64)r; + } +} +unittest +{ + __m64 A = _mm_setr_pi16(short.min, 1, 4, 8); + __m64 B = _mm_setr_pi16(16, 32, 1, -32768); + short4 C = cast(short4) _mm_hsub_pi16(A, B); + short[4] correct = [ short.max, -4, -16, -32767]; + assert(C.array == correct); +} + +/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`, +/// and pack the signed 32-bit results. +__m64 _mm_hsub_pi32 (__m64 a, __m64 b) @trusted +{ + // PERF DMD + static if (GDC_with_SSSE3) + { + return cast(__m64)__builtin_ia32_phsubd(cast(int2)a, cast(int2)b); + } + else static if (LDC_with_ARM64) + { + // LDC arm64: generates zip1+zip2+sub sequence since LDC 1.8 -O1 + int2 ia = cast(int2)a; + int2 ib = cast(int2)b; + int2 c = shufflevectorLDC!(int2, 0, 2)(ia, ib); + int2 d = shufflevectorLDC!(int2, 1, 3)(ia, ib); + return cast(__m64)(c - d); + } + else + { + // LDC x86: generates phsubd since LDC 1.24 -O2 + int2 ia = cast(int2)a; + int2 ib = cast(int2)b; + int2 r; + r.ptr[0] = ia.array[0] - ia.array[1]; + r.ptr[1] = ib.array[0] - ib.array[1]; + return cast(__m64)r; + } +} +unittest +{ + __m64 A = _mm_setr_pi32(int.min, 1); + __m64 B = _mm_setr_pi32(int.max, -1); + int2 C = cast(int2) _mm_hsub_pi32(A, B); + int[2] correct = [ int.max, int.min ]; + assert(C.array == correct); +} + +/// Horizontally subtract adjacent pairs of signed 16-bit integers in `a` and `b` using saturation, +/// and pack the signed 16-bit results. 
+__m128i _mm_hsubs_epi16 (__m128i a, __m128i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_SSSE3) + { + return cast(__m128i)__builtin_ia32_phsubsw128(cast(short8)a, cast(short8)b); + } + else static if (LDC_with_SSSE3) + { + return cast(__m128i)__builtin_ia32_phsubsw128(cast(short8)a, cast(short8)b); + } + else static if (LDC_with_ARM64) + { + // uzp1/uzp2/sqsub sequence + short8 sa = cast(short8)a; + short8 sb = cast(short8)b; + short8 c = shufflevectorLDC!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb); + short8 d = shufflevectorLDC!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb); + return cast(__m128i)vqsubq_s16(c, d); + } + else + { + short8 sa = cast(short8)a; + short8 sb = cast(short8)b; + short8 r; + r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] - sa.array[1]); + r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] - sa.array[3]); + r.ptr[2] = saturateSignedIntToSignedShort(sa.array[4] - sa.array[5]); + r.ptr[3] = saturateSignedIntToSignedShort(sa.array[6] - sa.array[7]); + r.ptr[4] = saturateSignedIntToSignedShort(sb.array[0] - sb.array[1]); + r.ptr[5] = saturateSignedIntToSignedShort(sb.array[2] - sb.array[3]); + r.ptr[6] = saturateSignedIntToSignedShort(sb.array[4] - sb.array[5]); + r.ptr[7] = saturateSignedIntToSignedShort(sb.array[6] - sb.array[7]); + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi16(1, -2, 4, 8, 32767, -1, -10, 32767); + short8 C = cast(short8) _mm_hsubs_epi16(A, A); + short[8] correct = [ 3, -4, 32767, -32768, 3, -4, 32767, -32768 ]; + assert(C.array == correct); +} + + +/// Horizontally subtract adjacent pairs of signed 16-bit integers in `a` and `b` using saturation, +/// and pack the signed 16-bit results. +__m64 _mm_hsubs_pi16 (__m64 a, __m64 b) @trusted +{ + static if (GDC_with_SSSE3) + { + return cast(__m64)__builtin_ia32_phsubsw(cast(short4)a, cast(short4)b); + } + else static if (LDC_with_SSSE3) + { + // Note: LDC doesn't have __builtin_ia32_phsubsw + long2 la; + la.ptr[0] = a.array[0]; + long2 lb; + lb.ptr[0] = b.array[0]; + int4 sum = cast(int4)__builtin_ia32_phsubsw128(cast(short8)la, cast(short8)lb); + int2 r; + r.ptr[0] = sum.array[0]; + r.ptr[1] = sum.array[2]; + return cast(__m64)r; + } + else static if (LDC_with_ARM64) + { + // uzp1/uzp2/sqsub sequence in -O1 + short4 sa = cast(short4)a; + short4 sb = cast(short4)b; + short4 c = shufflevectorLDC!(short4, 0, 2, 4, 6)(sa, sb); + short4 d = shufflevectorLDC!(short4, 1, 3, 5, 7)(sa, sb); + return cast(__m64)vqsub_s16(c, d); + } + else + { + short4 sa = cast(short4)a; + short4 sb = cast(short4)b; + short4 r; + r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] - sa.array[1]); + r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] - sa.array[3]); + r.ptr[2] = saturateSignedIntToSignedShort(sb.array[0] - sb.array[1]); + r.ptr[3] = saturateSignedIntToSignedShort(sb.array[2] - sb.array[3]); + return cast(__m64)r; + } +} +unittest +{ + __m64 A = _mm_setr_pi16(-16, 32, 100, -32768); + __m64 B = _mm_setr_pi16( 64, 30, -9, 32767); + short4 C = cast(short4) _mm_hsubs_pi16(A, B); + short[4] correct = [ -48, 32767, 34, -32768]; + assert(C.array == correct); +} + + +/// Vertically multiply each unsigned 8-bit integer from `a` with the corresponding +/// signed 8-bit integer from `b`, producing intermediate signed 16-bit integers. +/// Horizontally add adjacent pairs of intermediate signed 16-bit integers, +/// and pack the saturated results. 
+__m128i _mm_maddubs_epi16 (__m128i a, __m128i b) @trusted +{ + static if (GDC_with_SSSE3) + { + return cast(__m128i)__builtin_ia32_pmaddubsw128(cast(ubyte16)a, cast(ubyte16)b); + } + else static if (LDC_with_SSSE3) + { + return cast(__m128i)__builtin_ia32_pmaddubsw128(cast(byte16)a, cast(byte16)b); + } + else + { + // zero-extend a to 16-bit + __m128i zero = _mm_setzero_si128(); + __m128i a_lo = _mm_unpacklo_epi8(a, zero); + __m128i a_hi = _mm_unpackhi_epi8(a, zero); + + // sign-extend b to 16-bit + __m128i b_lo = _mm_unpacklo_epi8(b, zero); + __m128i b_hi = _mm_unpackhi_epi8(b, zero); + b_lo = _mm_srai_epi16( _mm_slli_epi16(b_lo, 8), 8); + b_hi = _mm_srai_epi16( _mm_slli_epi16(b_hi, 8), 8); + + // Multiply element-wise, no overflow can occur + __m128i c_lo = _mm_mullo_epi16(a_lo, b_lo); + __m128i c_hi = _mm_mullo_epi16(a_hi, b_hi); + + // Add pairwise with saturating horizontal add + return _mm_hadds_epi16(c_lo, c_hi); + } +} +unittest +{ + __m128i A = _mm_setr_epi8( -1, 10, 100, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); // u8 + __m128i B = _mm_setr_epi8(-128, -30, 100, 127, -1, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0); // i8 + short8 C = cast(short8) _mm_maddubs_epi16(A, B); + short[8] correct = [ -32768, 26256, 0, 0, 0, 0, 0, 0]; + assert(C.array == correct); +} + +/// Vertically multiply each unsigned 8-bit integer from `a` with the corresponding +/// signed 8-bit integer from `b`, producing intermediate signed 16-bit integers. +/// Horizontally add adjacent pairs of intermediate signed 16-bit integers, +/// and pack the saturated results. +__m64 _mm_maddubs_pi16 (__m64 a, __m64 b) @trusted +{ + static if (GDC_with_SSSE3) + { + return cast(__m64)__builtin_ia32_pmaddubsw(cast(ubyte8)a, cast(ubyte8)b); + } + else static if (LDC_with_SSSE3) + { + __m128i A = to_m128i(a); + __m128i B = to_m128i(b); + return to_m64( cast(__m128i)__builtin_ia32_pmaddubsw128(cast(byte16) to_m128i(a), cast(byte16) to_m128i(b))); + } + else + { + // zero-extend a to 16-bit + __m128i zero = _mm_setzero_si128(); + __m128i A = _mm_unpacklo_epi8(to_m128i(a), zero); + + // sign-extend b to 16-bit + __m128i B = _mm_unpacklo_epi8(to_m128i(b), zero); + B = _mm_srai_epi16( _mm_slli_epi16(B, 8), 8); + + // Multiply element-wise, no overflow can occur + __m128i c = _mm_mullo_epi16(A, B); + + // Add pairwise with saturating horizontal add + return to_m64( _mm_hadds_epi16(c, zero)); + } +} +unittest +{ + __m64 A = _mm_setr_pi8( -1, 10, 100, -128, 0, 0, 0, 0); // u8 + __m64 B = _mm_setr_pi8(-128, -30, 100, 127, -1, 2, 4, 6); // i8 + short4 C = cast(short4) _mm_maddubs_pi16(A, B); + short[4] correct = [ -32768, 26256, 0, 0]; + assert(C.array == correct); +} + +/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate signed 32-bit integers. +/// Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and return bits `[16:1]`. 
+__m128i _mm_mulhrs_epi16 (__m128i a, __m128i b) @trusted +{ + // PERF DMD + static if (GDC_with_SSSE3) + { + return cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8)a, cast(short8)b); + } + else static if (LDC_with_SSSE3) + { + return cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8)a, cast(short8)b); + } + else static if (LDC_with_ARM64) + { + int4 mul_lo = vmull_s16(vget_low_s16(cast(short8)a), + vget_low_s16(cast(short8)b)); + int4 mul_hi = vmull_s16(vget_high_s16(cast(short8)a), + vget_high_s16(cast(short8)b)); + + // Rounding narrowing shift right + // narrow = (int16_t)((mul + 16384) >> 15); + short4 narrow_lo = vrshrn_n_s32(mul_lo, 15); + short4 narrow_hi = vrshrn_n_s32(mul_hi, 15); + + // Join together. + return cast(__m128i) vcombine_s16(narrow_lo, narrow_hi); + } + else + { + short8 sa = cast(short8)a; + short8 sb = cast(short8)b; + short8 r; + + for (int i = 0; i < 8; ++i) + { + // I doubted it at first, but an exhaustive search show this to be equivalent to Intel pseudocode. + r.ptr[i] = cast(short) ( (sa.array[i] * sb.array[i] + 0x4000) >> 15); + } + + return cast(__m128i)r; + } +} + +unittest +{ + __m128i A = _mm_setr_epi16(12345, -32768, 32767, 0, 1, 845, -6999, -1); + __m128i B = _mm_setr_epi16(8877, -24487, 15678, 32760, 1, 0, -149, -1); + short8 C = cast(short8) _mm_mulhrs_epi16(A, B); + short[8] correct = [3344, 24487, 15678, 0, 0, 0, 32, 0]; + assert(C.array == correct); +} + +/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate signed 32-bit integers. +/// Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and return bits `[16:1]`. +__m64 _mm_mulhrs_pi16 (__m64 a, __m64 b) @trusted +{ + // PERF DMD + static if (GDC_with_SSSE3) + { + return cast(__m64) __builtin_ia32_pmulhrsw(cast(short4)a, cast(short4)b); + } + else static if (LDC_with_SSSE3) + { + return cast(__m64) to_m64( cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8) to_m128i(a), cast(short8) to_m128i(b))); + } + else static if (LDC_with_ARM64) + { + int4 mul = vmull_s16(cast(short4)a, cast(short4)b); + + // Rounding narrowing shift right + // (int16_t)((mul + 16384) >> 15); + return cast(__m64) vrshrn_n_s32(mul, 15); + } + else + { + short4 sa = cast(short4)a; + short4 sb = cast(short4)b; + short4 r; + + for (int i = 0; i < 4; ++i) + { + r.ptr[i] = cast(short) ( (sa.array[i] * sb.array[i] + 0x4000) >> 15); + } + return cast(__m64)r; + } +} +unittest +{ + __m64 A = _mm_setr_pi16(12345, -32768, 32767, 0); + __m64 B = _mm_setr_pi16(8877, -24487, 15678, 32760); + short4 C = cast(short4) _mm_mulhrs_pi16(A, B); + short[4] correct = [3344, 24487, 15678, 0]; + assert(C.array == correct); +} + + +/// Shuffle packed 8-bit integers in `a` according to shuffle control mask in the corresponding 8-bit element of `b`. +__m128i _mm_shuffle_epi8 (__m128i a, __m128i b) pure @trusted +{ + // This is the lovely pshufb. + // PERF DMD + static if (GDC_with_SSSE3) + { + return cast(__m128i) __builtin_ia32_pshufb128(cast(ubyte16) a, cast(ubyte16) b); + } + else static if (LDC_with_SSSE3) + { + return cast(__m128i) __builtin_ia32_pshufb128(cast(byte16) a, cast(byte16) b); + } + else static if (LDC_with_ARM64) + { + byte16 bb = cast(byte16)b; + byte16 mask; + mask = cast(byte)(0x8F); + bb = bb & mask; + byte16 r = vqtbl1q_s8(cast(byte16)a, bb); + return cast(__m128i)r; + } + else + { + byte16 r; + byte16 ba = cast(byte16)a; + byte16 bb = cast(byte16)b; + for (int i = 0; i < 16; ++i) + { + byte s = bb.array[i]; + r.ptr[i] = (s < 0) ? 
0 : ba.array[ s & 15 ]; + } + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __m128i B = _mm_setr_epi8(15, -128, 13 + 16, -12, 11, -10, 9, 8, 7, 6, -5, 4, 3, -2, 1, 0); + byte16 C = cast(byte16) _mm_shuffle_epi8(A, B); + byte[16] correct = [0, 0, 2, 0, 4, 0, 6, 7, 8, 9, 0, 11, 12, 0, 14, 15]; + assert(C.array == correct); +} + +/// Shuffle packed 8-bit integers in `a` according to shuffle control mask in the corresponding 8-bit element of `b`. +__m64 _mm_shuffle_pi8 (__m64 a, __m64 b) @trusted +{ + // PERF DMD + static if (GDC_with_SSSE3) + { + alias ubyte8 =__vector(ubyte[8]); + return cast(__m64) __builtin_ia32_pshufb(cast(ubyte8) a, cast(ubyte8) b); + } + else static if (LDC_with_SSSE3) + { + // GDC does proper dance to avoid mmx registers, do it manually in LDC since __builtin_ia32_pshufb doesn't exist there + __m128i A = to_m128i(a); + __m128i index = to_m128i(b); + index = index & _mm_set1_epi32(0xF7F7F7F7); + return to_m64( cast(__m128i) __builtin_ia32_pshufb128(cast(byte16)A, cast(byte16) index) ); + } + else static if (LDC_with_ARM64) + { + byte8 bb = cast(byte8)b; + byte8 mask; + mask = cast(byte)(0x87); + bb = bb & mask; + __m128i l = to_m128i(a); + byte8 r = vtbl1_s8(cast(byte16)l, cast(byte8)bb); + return cast(__m64)r; + } + else + { + byte8 r; + byte8 ba = cast(byte8)a; + byte8 bb = cast(byte8)b; + for (int i = 0; i < 8; ++i) + { + byte s = bb.array[i]; + r.ptr[i] = (s < 0) ? 0 : ba.array[ s & 7 ]; + } + return cast(__m64)r; + } +} +unittest +{ + __m64 A = _mm_setr_pi8(7, 6, 5, 4, 3, 2, 1, 0); + __m64 B = _mm_setr_pi8(7, 6, -5, 4, 3 + 8, -2, 1, 0); + byte8 C = cast(byte8) _mm_shuffle_pi8(A, B); + byte[8] correct = [0, 1, 0, 3, 4, 0, 6, 7]; + assert(C.array == correct); +} + +/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit integer in `b` is negative. +/// Elements in result are zeroed out when the corresponding element in `b` is zero. +__m128i _mm_sign_epi16 (__m128i a, __m128i b) pure @safe +{ + // PERF DMD + static if (GDC_with_SSSE3) + { + return cast(__m128i) __builtin_ia32_psignw128(cast(short8)a, cast(short8)b); + } + else static if (LDC_with_SSSE3) + { + return cast(__m128i) __builtin_ia32_psignw128(cast(short8)a, cast(short8)b); + } + else + { + // LDC arm64: 5 instructions + __m128i mask = _mm_srai_epi16(b, 15); + __m128i zeromask = _mm_cmpeq_epi16(b, _mm_setzero_si128()); + return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi16(a, mask), mask)); + } +} +unittest +{ + __m128i A = _mm_setr_epi16(-2, -1, 0, 1, 2, short.min, short.min, short.min); + __m128i B = _mm_setr_epi16(-1, 0,-1, 1, -2, -50, 0, 50); + short8 C = cast(short8) _mm_sign_epi16(A, B); + short[8] correct = [ 2, 0, 0, 1, -2, short.min, 0, short.min]; + assert(C.array == correct); +} + +/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit integer in `b` is negative. +/// Elements in result are zeroed out when the corresponding element in `b` is zero. 
+__m128i _mm_sign_epi32 (__m128i a, __m128i b) pure @safe
+{
+    // PERF DMD
+    static if (GDC_with_SSSE3)
+    {
+        return cast(__m128i) __builtin_ia32_psignd128(cast(int4)a, cast(int4)b);
+    }
+    else static if (LDC_with_SSSE3)
+    {
+        return cast(__m128i) __builtin_ia32_psignd128(cast(int4)a, cast(int4)b);
+    }
+    else
+    {
+        __m128i mask = _mm_srai_epi32(b, 31);
+        __m128i zeromask = _mm_cmpeq_epi32(b, _mm_setzero_si128());
+        return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi32(a, mask), mask));
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi32(-2, -1, 0, int.max);
+    __m128i B = _mm_setr_epi32(-1, 0, -1, 1);
+    int4 C = cast(int4) _mm_sign_epi32(A, B);
+    int[4] correct = [ 2, 0, 0, int.max];
+    assert(C.array == correct);
+}
+
+/// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative.
+/// Elements in result are zeroed out when the corresponding element in `b` is zero.
+__m128i _mm_sign_epi8 (__m128i a, __m128i b) pure @safe
+{
+    // PERF DMD
+    static if (GDC_with_SSSE3)
+    {
+        return cast(__m128i) __builtin_ia32_psignb128(cast(ubyte16)a, cast(ubyte16)b);
+    }
+    else static if (LDC_with_SSSE3)
+    {
+        return cast(__m128i) __builtin_ia32_psignb128(cast(byte16)a, cast(byte16)b);
+    }
+    else
+    {
+        __m128i mask = _mm_cmplt_epi8(b, _mm_setzero_si128()); // extend sign bit
+        __m128i zeromask = _mm_cmpeq_epi8(b, _mm_setzero_si128());
+        return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi8(a, mask), mask));
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi8(-2, -1, 0, 1, 2, byte.min, byte.min, byte.min, -1, 0,-1, 1, -2, -50, 0, 50);
+    __m128i B = _mm_setr_epi8(-1, 0,-1, 1, -2, -50, 0, 50, -2, -1, 0, 1, 2, byte.min, byte.min, byte.min);
+    byte16 C = cast(byte16) _mm_sign_epi8(A, B);
+    byte[16] correct = [ 2, 0, 0, 1, -2, byte.min, 0, byte.min, 1, 0, 0, 1, -2, 50, 0, -50];
+    assert(C.array == correct);
+}
+
+/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit integer in `b` is negative.
+/// Elements in result are zeroed out when the corresponding element in `b` is zero.
+__m64 _mm_sign_pi16 (__m64 a, __m64 b) @trusted
+{
+    return to_m64( _mm_sign_epi16( to_m128i(a), to_m128i(b)) );
+}
+unittest
+{
+    __m64 A = _mm_setr_pi16( 2, short.min, short.min, short.min);
+    __m64 B = _mm_setr_pi16(-2, -50, 0, 50);
+    short4 C = cast(short4) _mm_sign_pi16(A, B);
+    short[4] correct = [-2, short.min, 0, short.min];
+    assert(C.array == correct);
+}
+
+/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit integer in `b` is negative.
+/// Elements in result are zeroed out when the corresponding element in `b` is zero.
+__m64 _mm_sign_pi32 (__m64 a, __m64 b) @trusted
+{
+    return to_m64( _mm_sign_epi32( to_m128i(a), to_m128i(b)) );
+}
+unittest
+{
+    __m64 A = _mm_setr_pi32(-2, -100);
+    __m64 B = _mm_setr_pi32(-1, 0);
+    int2 C = cast(int2) _mm_sign_pi32(A, B);
+    int[2] correct = [ 2, 0];
+    assert(C.array == correct);
+}
+
+/// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative.
+/// Elements in result are zeroed out when the corresponding element in `b` is zero.
+__m64 _mm_sign_pi8 (__m64 a, __m64 b) @trusted +{ + return to_m64( _mm_sign_epi8( to_m128i(a), to_m128i(b)) ); +} +unittest +{ + __m64 A = _mm_setr_pi8(-2, -1, 0, 1, 2, byte.min, byte.min, byte.min); + __m64 B = _mm_setr_pi8(-1, 0,-1, 1, -2, -50, 0, 50); + byte8 C = cast(byte8) _mm_sign_pi8(A, B); + byte[8] correct = [ 2, 0, 0, 1, -2, byte.min, 0, byte.min]; + assert(C.array == correct); +} diff --git a/external/inteli/types.d b/external/inteli/types.d new file mode 100644 index 0000000..0e6aad2 --- /dev/null +++ b/external/inteli/types.d @@ -0,0 +1,456 @@ +/** +* `core.simd` emulation layer. +* +* Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019. +* cet 2024. +* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) +*/ +module inteli.types; + + +pure: +nothrow: +@nogc: + +version(GNU) +{ + // Note: for GDC support, be sure to use https://explore.dgnu.org/ + + // Future: just detect vectors, do not base upon arch. + + version(X86_64) + { + enum MMXSizedVectorsAreEmulated = false; + enum SSESizedVectorsAreEmulated = false; + + // Does GDC support AVX-sized vectors? + static if (__VERSION__ >= 2100) // Starting at GDC 12.1 only. + { + enum AVXSizedVectorsAreEmulated = !(is(__vector(double[4]))); + } + else + { + enum AVXSizedVectorsAreEmulated = true; + } + + import gcc.builtins; + } + else + { + enum MMXSizedVectorsAreEmulated = true; + enum SSESizedVectorsAreEmulated = true; + enum AVXSizedVectorsAreEmulated = true; + } +} +else version(LDC) +{ + public import ldc.simd; + + // Use this alias to mention it should only be used with LDC, + // for example when emulated shufflevector would just be wasteful. + alias shufflevectorLDC = shufflevector; + + enum MMXSizedVectorsAreEmulated = false; + enum SSESizedVectorsAreEmulated = false; + enum AVXSizedVectorsAreEmulated = false; +} +else version(DigitalMars) +{ + public import core.simd; + + static if (__VERSION__ >= 2100) + { + // Note: turning this true is very desirable for DMD performance, + // but also leads to many bugs being discovered upstream. + // The fact that it works at all relies on many workardounds. + // In particular intel-intrinsics with this "on" is a honeypot for DMD backend bugs, + // and a very strong DMD codegen test suite. + // What happens typically is that contributors end up on a DMD bug in their PR. + // But finally, in 2022 D_SIMD has been activated, at least for SSE and some instructions. + enum bool tryToEnableCoreSimdWithDMD = true; + } + else + { + enum bool tryToEnableCoreSimdWithDMD = false; + } + + version(D_SIMD) + { + enum MMXSizedVectorsAreEmulated = true; + enum SSESizedVectorsAreEmulated = !tryToEnableCoreSimdWithDMD; + + // Note: with DMD, AVX-sized vectors can't be enabled yet. + // On linux + x86_64, this will fail since a few operands seem to be missing. + // FUTURE: enable AVX-sized vectors in DMD. :) + // + // Blockers: https://issues.dlang.org/show_bug.cgi?id=24283 and 24284 + // Probably other, unreported issues. 
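+        // Both branches below currently evaluate to true; the version(D_AVX) split is
+        // presumably kept only so it can be flipped once the blockers above are fixed.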
+ version(D_AVX) + enum AVXSizedVectorsAreEmulated = true; + else + enum AVXSizedVectorsAreEmulated = true; + } + else + { + // Some DMD 32-bit targets don't have D_SIMD + enum MMXSizedVectorsAreEmulated = true; + enum SSESizedVectorsAreEmulated = true; + enum AVXSizedVectorsAreEmulated = true; + } +} + +enum CoreSimdIsEmulated = MMXSizedVectorsAreEmulated || SSESizedVectorsAreEmulated || AVXSizedVectorsAreEmulated; + +static if (CoreSimdIsEmulated) +{ + // core.simd is emulated in some capacity: introduce `VectorOps` + + mixin template VectorOps(VectorType, ArrayType: BaseType[N], BaseType, size_t N) + { + enum Count = N; + alias Base = BaseType; + + BaseType* ptr() return pure nothrow @nogc + { + return array.ptr; + } + + // Unary operators + VectorType opUnary(string op)() pure nothrow @safe @nogc + { + VectorType res = void; + mixin("res.array[] = " ~ op ~ "array[];"); + return res; + } + + // Binary operators + VectorType opBinary(string op)(VectorType other) pure const nothrow @safe @nogc + { + VectorType res = void; + mixin("res.array[] = array[] " ~ op ~ " other.array[];"); + return res; + } + + // Assigning a BaseType value + void opAssign(BaseType e) pure nothrow @safe @nogc + { + array[] = e; + } + + // Assigning a static array + void opAssign(ArrayType v) pure nothrow @safe @nogc + { + array[] = v[]; + } + + void opOpAssign(string op)(VectorType other) pure nothrow @safe @nogc + { + mixin("array[] " ~ op ~ "= other.array[];"); + } + + // Assigning a dyn array + this(ArrayType v) pure nothrow @safe @nogc + { + array[] = v[]; + } + + // Broadcast constructor + this(BaseType x) pure nothrow @safe @nogc + { + array[] = x; + } + + /// We can't support implicit conversion but do support explicit casting. + /// "Vector types of the same size can be implicitly converted among each other." + /// Casting to another vector type is always just a raw copy. + VecDest opCast(VecDest)() pure const nothrow @trusted @nogc + if (VecDest.sizeof == VectorType.sizeof) + { + VecDest dest = void; + // Copy + dest.array[] = (cast(typeof(dest.array))cast(void[VectorType.sizeof])array)[]; + return dest; + } + + ref inout(BaseType) opIndex(size_t i) inout return pure nothrow @safe @nogc + { + return array[i]; + } + + } +} +else +{ + public import core.simd; + + // GDC cannot convert implicitely __vector from signed to unsigned, but LDC can + // And GDC sometimes need those unsigned vector types for some intrinsics. + // For internal use only. + package alias ushort8 = Vector!(ushort[8]); + package alias ubyte8 = Vector!(ubyte[8]); + package alias ubyte16 = Vector!(ubyte[16]); + + static if (!AVXSizedVectorsAreEmulated) + { + package alias ushort16 = Vector!(ushort[16]); + package alias ubyte32 = Vector!(ubyte[32]); + } +} + +// Emulate ldc.simd cmpMask and other masks. +// Note: these should be deprecated on non-LDC, +// since it's slower to generate that code. 
+version(LDC) +{} +else +{ + // TODO: deprecated and write plain versions instead + + private template BaseType(V) + { + alias typeof( ( { V v; return v; }()).array[0]) BaseType; + } + + private template TrueMask(V) + { + alias Elem = BaseType!V; + + static if (is(Elem == float)) + { + immutable uint m1 = 0xffffffff; + enum Elem TrueMask = *cast(float*)(&m1); + } + else static if (is(Elem == double)) + { + immutable ulong m1 = 0xffffffff_ffffffff; + enum Elem TrueMask = *cast(double*)(&m1); + } + else // integer case + { + enum Elem TrueMask = -1; + } + } + + Vec equalMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "oeq" comparison + { + enum size_t Count = Vec.array.length; + Vec result; + foreach(int i; 0..Count) + { + bool cond = a.array[i] == b.array[i]; + result.ptr[i] = cond ? TrueMask!Vec : 0; + } + return result; + } + + Vec greaterMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "ogt" comparison + { + enum size_t Count = Vec.array.length; + Vec result; + foreach(int i; 0..Count) + { + bool cond = a.array[i] > b.array[i]; + result.ptr[i] = cond ? TrueMask!Vec : 0; + } + return result; + } +} + +unittest +{ + float4 a = [1, 3, 5, 7]; + float4 b = [2, 3, 4, 5]; + int4 c = cast(int4)(greaterMask!float4(a, b)); + static immutable int[4] correct = [0, 0, 0xffff_ffff, 0xffff_ffff]; + assert(c.array == correct); +} + +static if (MMXSizedVectorsAreEmulated) +{ + /// MMX-like SIMD types + struct float2 + { + float[2] array; + mixin VectorOps!(float2, float[2]); + } + + struct byte8 + { + byte[8] array; + mixin VectorOps!(byte8, byte[8]); + } + + struct short4 + { + short[4] array; + mixin VectorOps!(short4, short[4]); + } + + struct int2 + { + int[2] array; + mixin VectorOps!(int2, int[2]); + } + + struct long1 + { + long[1] array; + mixin VectorOps!(long1, long[1]); + } +} +else +{ + // For this compiler, defining MMX-sized vectors is working. 
+ public import core.simd; + alias long1 = Vector!(long[1]); + alias float2 = Vector!(float[2]); + alias int2 = Vector!(int[2]); + alias short4 = Vector!(short[4]); + alias byte8 = Vector!(byte[8]); +} + +static assert(float2.sizeof == 8); +static assert(byte8.sizeof == 8); +static assert(short4.sizeof == 8); +static assert(int2.sizeof == 8); +static assert(long1.sizeof == 8); + + +static if (SSESizedVectorsAreEmulated) +{ + /// SSE-like SIMD types + + struct float4 + { + float[4] array; + mixin VectorOps!(float4, float[4]); + } + + struct byte16 + { + byte[16] array; + mixin VectorOps!(byte16, byte[16]); + } + + struct short8 + { + short[8] array; + mixin VectorOps!(short8, short[8]); + } + + struct int4 + { + int[4] array; + mixin VectorOps!(int4, int[4]); + } + + struct long2 + { + long[2] array; + mixin VectorOps!(long2, long[2]); + } + + struct double2 + { + double[2] array; + mixin VectorOps!(double2, double[2]); + } +} + +static assert(float4.sizeof == 16); +static assert(byte16.sizeof == 16); +static assert(short8.sizeof == 16); +static assert(int4.sizeof == 16); +static assert(long2.sizeof == 16); +static assert(double2.sizeof == 16); + + +static if (AVXSizedVectorsAreEmulated) +{ + /// AVX-like SIMD types + + struct float8 + { + float[8] array; + mixin VectorOps!(float8, float[8]); + } + + struct byte32 + { + byte[32] array; + mixin VectorOps!(byte32, byte[32]); + } + + struct short16 + { + short[16] array; + mixin VectorOps!(short16, short[16]); + } + + struct int8 + { + int[8] array; + mixin VectorOps!(int8, int[8]); + } + + struct long4 + { + long[4] array; + mixin VectorOps!(long4, long[4]); + } + + struct double4 + { + double[4] array; + mixin VectorOps!(double4, double[4]); + } +} +else +{ + public import core.simd; +} +static assert(float8.sizeof == 32); +static assert(byte32.sizeof == 32); +static assert(short16.sizeof == 32); +static assert(int8.sizeof == 32); +static assert(long4.sizeof == 32); +static assert(double4.sizeof == 32); + + + + +alias __m256 = float8; +alias __m256i = long4; // long long __vector with ICC, GCC, and clang +alias __m256d = double4; +alias __m128 = float4; +alias __m128i = int4; +alias __m128d = double2; +alias __m64 = long1; // like in Clang, __m64 is a vector of 1 long + +int _MM_SHUFFLE2(int x, int y) pure @safe +{ + assert(x >= 0 && x <= 1); + assert(y >= 0 && y <= 1); + return (x << 1) | y; +} + +int _MM_SHUFFLE(int z, int y, int x, int w) pure @safe +{ + assert(x >= 0 && x <= 3); + assert(y >= 0 && y <= 3); + assert(z >= 0 && z <= 3); + assert(w >= 0 && w <= 3); + return (z<<6) | (y<<4) | (x<<2) | w; +} + +// test assignment from scalar to vector type +unittest +{ + float4 A = 3.0f; + float[4] correctA = [3.0f, 3.0f, 3.0f, 3.0f]; + assert(A.array == correctA); + + int2 B = 42; + int[2] correctB = [42, 42]; + assert(B.array == correctB); +} \ No newline at end of file diff --git a/external/inteli/xmmintrin.d b/external/inteli/xmmintrin.d new file mode 100644 index 0000000..e53caff --- /dev/null +++ b/external/inteli/xmmintrin.d @@ -0,0 +1,3219 @@ +/** +* SSE intrinsics. +* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE +* +* Copyright: Copyright Guillaume Piolat 2016-2020. 
+* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) +*/ +module inteli.xmmintrin; + +public import inteli.types; + +import inteli.internals; + +import inteli.mmx; +import inteli.emmintrin; + +version(D_InlineAsm_X86) + version = InlineX86Asm; +else version(D_InlineAsm_X86_64) + version = InlineX86Asm; + + +// SSE1 + +nothrow @nogc: + + +enum int _MM_EXCEPT_INVALID = 0x0001; /// MXCSR Exception states. +enum int _MM_EXCEPT_DENORM = 0x0002; ///ditto +enum int _MM_EXCEPT_DIV_ZERO = 0x0004; ///ditto +enum int _MM_EXCEPT_OVERFLOW = 0x0008; ///ditto +enum int _MM_EXCEPT_UNDERFLOW = 0x0010; ///ditto +enum int _MM_EXCEPT_INEXACT = 0x0020; ///ditto +enum int _MM_EXCEPT_MASK = 0x003f; /// MXCSR Exception states mask. + +enum int _MM_MASK_INVALID = 0x0080; /// MXCSR Exception masks. +enum int _MM_MASK_DENORM = 0x0100; ///ditto +enum int _MM_MASK_DIV_ZERO = 0x0200; ///ditto +enum int _MM_MASK_OVERFLOW = 0x0400; ///ditto +enum int _MM_MASK_UNDERFLOW = 0x0800; ///ditto +enum int _MM_MASK_INEXACT = 0x1000; ///ditto +enum int _MM_MASK_MASK = 0x1f80; /// MXCSR Exception masks mask. + +enum int _MM_ROUND_NEAREST = 0x0000; /// MXCSR Rounding mode. +enum int _MM_ROUND_DOWN = 0x2000; ///ditto +enum int _MM_ROUND_UP = 0x4000; ///ditto +enum int _MM_ROUND_TOWARD_ZERO = 0x6000; ///ditto +enum int _MM_ROUND_MASK = 0x6000; /// MXCSR Rounding mode mask. + +enum int _MM_FLUSH_ZERO_MASK = 0x8000; /// MXCSR Denormal flush to zero mask. +enum int _MM_FLUSH_ZERO_ON = 0x8000; /// MXCSR Denormal flush to zero modes. +enum int _MM_FLUSH_ZERO_OFF = 0x0000; ///ditto + +/// Add packed single-precision (32-bit) floating-point elements in `a` and `b`. +__m128 _mm_add_ps(__m128 a, __m128 b) pure @safe +{ + pragma(inline, true); + return a + b; +} +unittest +{ + __m128 a = [1, 2, 3, 4]; + a = _mm_add_ps(a, a); + assert(a.array[0] == 2); + assert(a.array[1] == 4); + assert(a.array[2] == 6); + assert(a.array[3] == 8); +} + +/// Add the lower single-precision (32-bit) floating-point element +/// in `a` and `b`, store the result in the lower element of result, +/// and copy the upper 3 packed elements from `a` to the upper elements of result. +__m128 _mm_add_ss(__m128 a, __m128 b) pure @safe +{ + static if (GDC_with_SSE) + { + return __builtin_ia32_addss(a, b); + } + else static if (DMD_with_DSIMD) + { + return cast(__m128) __simd(XMM.ADDSS, a, b); + } + else + { + a[0] += b[0]; + return a; + } +} +unittest +{ + __m128 a = [1, 2, 3, 4]; + a = _mm_add_ss(a, a); + assert(a.array == [2.0f, 2, 3, 4]); +} + +/// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`. +__m128 _mm_and_ps (__m128 a, __m128 b) pure @safe +{ + pragma(inline, true); + return cast(__m128)(cast(__m128i)a & cast(__m128i)b); +} +unittest +{ + float a = 4.32f; + float b = -78.99f; + int correct = (*cast(int*)(&a)) & (*cast(int*)(&b)); + __m128 A = _mm_set_ps(a, b, a, b); + __m128 B = _mm_set_ps(b, a, b, a); + int4 R = cast(int4)( _mm_and_ps(A, B) ); + assert(R.array[0] == correct); + assert(R.array[1] == correct); + assert(R.array[2] == correct); + assert(R.array[3] == correct); +} + +/// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a` and then AND with `b`. 
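+/// In other words, the result is `(~a) & b`; the complement applies to the first operand.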
+__m128 _mm_andnot_ps (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + return cast(__m128) __simd(XMM.ANDNPS, a, b); + else + return cast(__m128)( (~cast(__m128i)a) & cast(__m128i)b ); +} +unittest +{ + float a = 4.32f; + float b = -78.99f; + int correct = ~(*cast(int*)(&a)) & (*cast(int*)(&b)); + int correct2 = (*cast(int*)(&a)) & ~(*cast(int*)(&b)); + __m128 A = _mm_set_ps(a, b, a, b); + __m128 B = _mm_set_ps(b, a, b, a); + int4 R = cast(int4)( _mm_andnot_ps(A, B) ); + assert(R.array[0] == correct2); + assert(R.array[1] == correct); + assert(R.array[2] == correct2); + assert(R.array[3] == correct); +} + +/// Average packed unsigned 16-bit integers in ``a` and `b`. +__m64 _mm_avg_pu16 (__m64 a, __m64 b) pure @safe +{ + return to_m64(_mm_avg_epu16(to_m128i(a), to_m128i(b))); +} + +/// Average packed unsigned 8-bit integers in ``a` and `b`. +__m64 _mm_avg_pu8 (__m64 a, __m64 b) pure @safe +{ + return to_m64(_mm_avg_epu8(to_m128i(a), to_m128i(b))); +} + +/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for equality. +__m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + return cast(__m128) __simd(XMM.CMPPS, a, b, 0); + else + return cast(__m128) cmpps!(FPComparison.oeq)(a, b); +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); + __m128 B = _mm_setr_ps(3.0f, 2.0f, float.nan, float.nan); + __m128i R = cast(__m128i) _mm_cmpeq_ps(A, B); + int[4] correct = [0, -1, 0, 0]; + assert(R.array == correct); +} + +/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for equality, +/// and copy the upper 3 packed elements from `a` to the upper elements of result. +__m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + return cast(__m128) __simd(XMM.CMPSS, a, b, 0); + else + return cast(__m128) cmpss!(FPComparison.oeq)(a, b); +} +unittest +{ + __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); + __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); + __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); + __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); + __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); + __m128i R1 = cast(__m128i) _mm_cmpeq_ss(A, B); + __m128i R2 = cast(__m128i) _mm_cmpeq_ss(A, C); + __m128i R3 = cast(__m128i) _mm_cmpeq_ss(A, D); + __m128i R4 = cast(__m128i) _mm_cmpeq_ss(A, E); + int[4] correct1 = [-1, 0, 0, 0]; + int[4] correct2 = [0, 0, 0, 0]; + int[4] correct3 = [0, 0, 0, 0]; + int[4] correct4 = [0, 0, 0, 0]; + assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); +} + +/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for greater-than-or-equal. +__m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + return cast(__m128) __simd(XMM.CMPPS, b, a, 2); + else + return cast(__m128) cmpps!(FPComparison.oge)(a, b); +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); + __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); + __m128i R = cast(__m128i) _mm_cmpge_ps(A, B); + int[4] correct = [0, -1,-1, 0]; + assert(R.array == correct); +} + +/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for greater-than-or-equal, +/// and copy the upper 3 packed elements from `a` to the upper elements of result. 
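+/// The lower element of the result is a full-width bit mask (all ones when the comparison holds, all zeroes otherwise), not a 0/1 boolean.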
+__m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + { + __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 2); + a[0] = c[0]; + return a; + } + else + return cast(__m128) cmpss!(FPComparison.oge)(a, b); +} +unittest +{ + __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); + __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); + __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); + __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); + __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); + __m128i R1 = cast(__m128i) _mm_cmpge_ss(A, B); + __m128i R2 = cast(__m128i) _mm_cmpge_ss(A, C); + __m128i R3 = cast(__m128i) _mm_cmpge_ss(A, D); + __m128i R4 = cast(__m128i) _mm_cmpge_ss(A, E); + int[4] correct1 = [-1, 0, 0, 0]; + int[4] correct2 = [-1, 0, 0, 0]; + int[4] correct3 = [0, 0, 0, 0]; + int[4] correct4 = [0, 0, 0, 0]; + assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); +} + +/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for greater-than. +__m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + return cast(__m128) __simd(XMM.CMPPS, b, a, 1); + else + return cast(__m128) cmpps!(FPComparison.ogt)(a, b); +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); + __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); + __m128i R = cast(__m128i) _mm_cmpgt_ps(A, B); + int[4] correct = [0, 0,-1, 0]; + assert(R.array == correct); +} + +/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for greater-than, +/// and copy the upper 3 packed elements from `a` to the upper elements of result. +__m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + { + __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 1); + a[0] = c[0]; + return a; + } + else + return cast(__m128) cmpss!(FPComparison.ogt)(a, b); +} +unittest +{ + __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); + __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); + __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); + __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); + __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); + __m128i R1 = cast(__m128i) _mm_cmpgt_ss(A, B); + __m128i R2 = cast(__m128i) _mm_cmpgt_ss(A, C); + __m128i R3 = cast(__m128i) _mm_cmpgt_ss(A, D); + __m128i R4 = cast(__m128i) _mm_cmpgt_ss(A, E); + int[4] correct1 = [0, 0, 0, 0]; + int[4] correct2 = [-1, 0, 0, 0]; + int[4] correct3 = [0, 0, 0, 0]; + int[4] correct4 = [0, 0, 0, 0]; + assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); +} + +/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for less-than-or-equal. +__m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + return cast(__m128) __simd(XMM.CMPPS, a, b, 2); + else + return cast(__m128) cmpps!(FPComparison.ole)(a, b); +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); + __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); + __m128i R = cast(__m128i) _mm_cmple_ps(A, B); + int[4] correct = [-1, -1, 0, 0]; + assert(R.array == correct); +} + +/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for less-than-or-equal, +/// and copy the upper 3 packed elements from `a` to the upper elements of result. 
+__m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + return cast(__m128) __simd(XMM.CMPSS, a, b, 2); + else + return cast(__m128) cmpss!(FPComparison.ole)(a, b); +} +unittest +{ + __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); + __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); + __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); + __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); + __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); + __m128i R1 = cast(__m128i) _mm_cmple_ss(A, B); + __m128i R2 = cast(__m128i) _mm_cmple_ss(A, C); + __m128i R3 = cast(__m128i) _mm_cmple_ss(A, D); + __m128i R4 = cast(__m128i) _mm_cmple_ss(A, E); + int[4] correct1 = [-1, 0, 0, 0]; + int[4] correct2 = [0, 0, 0, 0]; + int[4] correct3 = [0, 0, 0, 0]; + int[4] correct4 = [-1, 0, 0, 0]; + assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); +} + +/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for less-than. +__m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + return cast(__m128) __simd(XMM.CMPPS, a, b, 1); + else + return cast(__m128) cmpps!(FPComparison.olt)(a, b); +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); + __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); + __m128i R = cast(__m128i) _mm_cmplt_ps(A, B); + int[4] correct = [-1, 0, 0, 0]; + assert(R.array == correct); +} + +/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for less-than, +/// and copy the upper 3 packed elements from `a` to the upper elements of result. +__m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + return cast(__m128) __simd(XMM.CMPSS, a, b, 1); + else + return cast(__m128) cmpss!(FPComparison.olt)(a, b); +} +unittest +{ + __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); + __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); + __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); + __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); + __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); + __m128i R1 = cast(__m128i) _mm_cmplt_ss(A, B); + __m128i R2 = cast(__m128i) _mm_cmplt_ss(A, C); + __m128i R3 = cast(__m128i) _mm_cmplt_ss(A, D); + __m128i R4 = cast(__m128i) _mm_cmplt_ss(A, E); + int[4] correct1 = [0, 0, 0, 0]; + int[4] correct2 = [0, 0, 0, 0]; + int[4] correct3 = [0, 0, 0, 0]; + int[4] correct4 = [-1, 0, 0, 0]; + assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); +} + +/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-equal. +__m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + return cast(__m128) __simd(XMM.CMPPS, a, b, 4); + else + return cast(__m128) cmpps!(FPComparison.une)(a, b); +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); + __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); + __m128i R = cast(__m128i) _mm_cmpneq_ps(A, B); + int[4] correct = [-1, 0, -1, -1]; + assert(R.array == correct); +} + +/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-equal, +/// and copy the upper 3 packed elements from `a` to the upper elements of result. 
+__m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + return cast(__m128) __simd(XMM.CMPSS, a, b, 4); + else + return cast(__m128) cmpss!(FPComparison.une)(a, b); +} +unittest +{ + __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); + __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); + __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); + __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); + __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); + __m128i R1 = cast(__m128i) _mm_cmpneq_ss(A, B); + __m128i R2 = cast(__m128i) _mm_cmpneq_ss(A, C); + __m128i R3 = cast(__m128i) _mm_cmpneq_ss(A, D); + __m128i R4 = cast(__m128i) _mm_cmpneq_ss(A, E); + int[4] correct1 = [0, 0, 0, 0]; + int[4] correct2 = [-1, 0, 0, 0]; + int[4] correct3 = [-1, 0, 0, 0]; + int[4] correct4 = [-1, 0, 0, 0]; + assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); +} + +/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than-or-equal. +__m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + return cast(__m128) __simd(XMM.CMPPS, b, a, 6); + else + return cast(__m128) cmpps!(FPComparison.ult)(a, b); +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); + __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); + __m128i R = cast(__m128i) _mm_cmpnge_ps(A, B); + int[4] correct = [-1, 0, 0, -1]; + assert(R.array == correct); +} + +/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than-or-equal, +/// and copy the upper 3 packed elements from `a` to the upper elements of result. +__m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + { + __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 6); + a[0] = c[0]; + return a; + } + else + return cast(__m128) cmpss!(FPComparison.ult)(a, b); +} +unittest +{ + __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); + __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); + __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); + __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); + __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); + __m128i R1 = cast(__m128i) _mm_cmpnge_ss(A, B); + __m128i R2 = cast(__m128i) _mm_cmpnge_ss(A, C); + __m128i R3 = cast(__m128i) _mm_cmpnge_ss(A, D); + __m128i R4 = cast(__m128i) _mm_cmpnge_ss(A, E); + int[4] correct1 = [0, 0, 0, 0]; + int[4] correct2 = [0, 0, 0, 0]; + int[4] correct3 = [-1, 0, 0, 0]; + int[4] correct4 = [-1, 0, 0, 0]; + assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); +} + +/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than. +__m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + return cast(__m128) __simd(XMM.CMPPS, b, a, 5); + else + return cast(__m128) cmpps!(FPComparison.ule)(a, b); +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); + __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); + __m128i R = cast(__m128i) _mm_cmpngt_ps(A, B); + int[4] correct = [-1, -1, 0, -1]; + assert(R.array == correct); +} + +/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than, +/// and copy the upper 3 packed elements from `a` to the upper elements of result. 
+__m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + { + __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 5); + a[0] = c[0]; + return a; + } + else + return cast(__m128) cmpss!(FPComparison.ule)(a, b); +} +unittest +{ + __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); + __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); + __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); + __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); + __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); + __m128i R1 = cast(__m128i) _mm_cmpngt_ss(A, B); + __m128i R2 = cast(__m128i) _mm_cmpngt_ss(A, C); + __m128i R3 = cast(__m128i) _mm_cmpngt_ss(A, D); + __m128i R4 = cast(__m128i) _mm_cmpngt_ss(A, E); + int[4] correct1 = [-1, 0, 0, 0]; + int[4] correct2 = [0, 0, 0, 0]; + int[4] correct3 = [-1, 0, 0, 0]; + int[4] correct4 = [-1, 0, 0, 0]; + assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); +} + +/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than-or-equal. +__m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + return cast(__m128) __simd(XMM.CMPPS, a, b, 6); + else + return cast(__m128) cmpps!(FPComparison.ugt)(a, b); +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); + __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); + __m128i R = cast(__m128i) _mm_cmpnle_ps(A, B); + int[4] correct = [0, 0, -1, -1]; + assert(R.array == correct); +} + + +/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than-or-equal, +/// and copy the upper 3 packed elements from `a` to the upper elements of result. +__m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + return cast(__m128) __simd(XMM.CMPSS, a, b, 6); + else + return cast(__m128) cmpss!(FPComparison.ugt)(a, b); +} +unittest +{ + __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); + __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); + __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); + __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); + __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); + __m128i R1 = cast(__m128i) _mm_cmpnle_ss(A, B); + __m128i R2 = cast(__m128i) _mm_cmpnle_ss(A, C); + __m128i R3 = cast(__m128i) _mm_cmpnle_ss(A, D); + __m128i R4 = cast(__m128i) _mm_cmpnle_ss(A, E); + int[4] correct1 = [0, 0, 0, 0]; + int[4] correct2 = [-1, 0, 0, 0]; + int[4] correct3 = [-1, 0, 0, 0]; + int[4] correct4 = [0, 0, 0, 0]; + assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); +} + +/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than. +__m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + return cast(__m128) __simd(XMM.CMPPS, a, b, 5); + else + return cast(__m128) cmpps!(FPComparison.uge)(a, b); +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); + __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); + __m128i R = cast(__m128i) _mm_cmpnlt_ps(A, B); + int[4] correct = [0, -1, -1, -1]; + assert(R.array == correct); +} + +/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than, +/// and copy the upper 3 packed elements from `a` to the upper elements of result. 
+__m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + return cast(__m128) __simd(XMM.CMPSS, a, b, 5); + else + return cast(__m128) cmpss!(FPComparison.uge)(a, b); +} +unittest +{ + __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); + __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); + __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); + __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); + __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); + __m128i R1 = cast(__m128i) _mm_cmpnlt_ss(A, B); + __m128i R2 = cast(__m128i) _mm_cmpnlt_ss(A, C); + __m128i R3 = cast(__m128i) _mm_cmpnlt_ss(A, D); + __m128i R4 = cast(__m128i) _mm_cmpnlt_ss(A, E); + int[4] correct1 = [-1, 0, 0, 0]; + int[4] correct2 = [-1, 0, 0, 0]; + int[4] correct3 = [-1, 0, 0, 0]; + int[4] correct4 = [0, 0, 0, 0]; + assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); +} + +/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` to see if neither is NaN. +__m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + return cast(__m128) __simd(XMM.CMPPS, a, b, 7); + else + return cast(__m128) cmpps!(FPComparison.ord)(a, b); +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); + __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); + __m128i R = cast(__m128i) _mm_cmpord_ps(A, B); + int[4] correct = [-1, -1, -1, 0]; + assert(R.array == correct); +} + +/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` to see if neither is NaN, +/// and copy the upper 3 packed elements from `a` to the upper elements of result. +__m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + return cast(__m128) __simd(XMM.CMPSS, a, b, 7); + else + return cast(__m128) cmpss!(FPComparison.ord)(a, b); +} +unittest +{ + __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); + __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); + __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); + __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); + __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); + __m128i R1 = cast(__m128i) _mm_cmpord_ss(A, B); + __m128i R2 = cast(__m128i) _mm_cmpord_ss(A, C); + __m128i R3 = cast(__m128i) _mm_cmpord_ss(A, D); + __m128i R4 = cast(__m128i) _mm_cmpord_ss(A, E); + int[4] correct1 = [-1, 0, 0, 0]; + int[4] correct2 = [-1, 0, 0, 0]; + int[4] correct3 = [0, 0, 0, 0]; + int[4] correct4 = [-1, 0, 0, 0]; + assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); +} + +/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` to see if either is NaN. +__m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + return cast(__m128) __simd(XMM.CMPPS, a, b, 3); + else + return cast(__m128) cmpps!(FPComparison.uno)(a, b); +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); + __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); + __m128i R = cast(__m128i) _mm_cmpunord_ps(A, B); + int[4] correct = [0, 0, 0, -1]; + assert(R.array == correct); +} + +/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` to see if either is NaN. +/// and copy the upper 3 packed elements from `a` to the upper elements of result. 
+__m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + return cast(__m128) __simd(XMM.CMPSS, a, b, 3); + else return cast(__m128) cmpss!(FPComparison.uno)(a, b); +} +unittest +{ + __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); + __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); + __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); + __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); + __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); + __m128i R1 = cast(__m128i) _mm_cmpunord_ss(A, B); + __m128i R2 = cast(__m128i) _mm_cmpunord_ss(A, C); + __m128i R3 = cast(__m128i) _mm_cmpunord_ss(A, D); + __m128i R4 = cast(__m128i) _mm_cmpunord_ss(A, E); + int[4] correct1 = [0, 0, 0, 0]; + int[4] correct2 = [0, 0, 0, 0]; + int[4] correct3 = [-1, 0, 0, 0]; + int[4] correct4 = [0, 0, 0, 0]; + assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); +} + + +/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for equality, +/// and return the boolean result (0 or 1). +int _mm_comieq_ss (__m128 a, __m128 b) pure @safe +{ + return a.array[0] == b.array[0]; +} +unittest +{ + assert(1 == _mm_comieq_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f))); + assert(0 == _mm_comieq_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f))); + assert(0 == _mm_comieq_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan))); + assert(0 == _mm_comieq_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f))); + assert(1 == _mm_comieq_ss(_mm_set_ss(0.0), _mm_set_ss(-0.0))); +} + +/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for greater-than-or-equal, +/// and return the boolean result (0 or 1). +int _mm_comige_ss (__m128 a, __m128 b) pure @safe +{ + return a.array[0] >= b.array[0]; +} +unittest +{ + assert(1 == _mm_comige_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f))); + assert(1 == _mm_comige_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f))); + assert(0 == _mm_comige_ss(_mm_set_ss(-78.0f), _mm_set_ss(78.0f))); + assert(0 == _mm_comige_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan))); + assert(0 == _mm_comige_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f))); + assert(1 == _mm_comige_ss(_mm_set_ss(-0.0f), _mm_set_ss(0.0f))); +} + +/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for greater-than, +/// and return the boolean result (0 or 1). +int _mm_comigt_ss (__m128 a, __m128 b) pure @safe // comiss + seta +{ + return a.array[0] > b.array[0]; +} +unittest +{ + assert(0 == _mm_comigt_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f))); + assert(1 == _mm_comigt_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f))); + assert(0 == _mm_comigt_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan))); + assert(0 == _mm_comigt_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f))); + assert(0 == _mm_comigt_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f))); +} + +/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for less-than-or-equal, +/// and return the boolean result (0 or 1). 
+int _mm_comile_ss (__m128 a, __m128 b) pure @safe // comiss + setbe +{ + return a.array[0] <= b.array[0]; +} +unittest +{ + assert(1 == _mm_comile_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f))); + assert(0 == _mm_comile_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f))); + assert(1 == _mm_comile_ss(_mm_set_ss(-78.0f), _mm_set_ss(78.0f))); + assert(0 == _mm_comile_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan))); + assert(0 == _mm_comile_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f))); + assert(1 == _mm_comile_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f))); +} + +/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for less-than, +/// and return the boolean result (0 or 1). +int _mm_comilt_ss (__m128 a, __m128 b) pure @safe // comiss + setb +{ + return a.array[0] < b.array[0]; +} +unittest +{ + assert(0 == _mm_comilt_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f))); + assert(0 == _mm_comilt_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f))); + assert(1 == _mm_comilt_ss(_mm_set_ss(-78.0f), _mm_set_ss(78.0f))); + assert(0 == _mm_comilt_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan))); + assert(0 == _mm_comilt_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f))); + assert(0 == _mm_comilt_ss(_mm_set_ss(-0.0f), _mm_set_ss(0.0f))); +} + +/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for not-equal, +/// and return the boolean result (0 or 1). +int _mm_comineq_ss (__m128 a, __m128 b) pure @safe // comiss + setne +{ + return a.array[0] != b.array[0]; +} +unittest +{ + assert(0 == _mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f))); + assert(1 == _mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f))); + assert(1 == _mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan))); + assert(1 == _mm_comineq_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f))); + assert(0 == _mm_comineq_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f))); +} + +/// Convert packed signed 32-bit integers in `b` to packed single-precision (32-bit) +/// floating-point elements, store the results in the lower 2 elements, +/// and copy the upper 2 packed elements from `a` to the upper elements of result. +alias _mm_cvt_pi2ps = _mm_cvtpi32_ps; + +/// Convert 2 lower packed single-precision (32-bit) floating-point elements in `a` +/// to packed 32-bit integers. +__m64 _mm_cvt_ps2pi (__m128 a) @safe +{ + return to_m64(_mm_cvtps_epi32(a)); +} + +/// Convert the signed 32-bit integer `b` to a single-precision (32-bit) floating-point element, +/// store the result in the lower element, and copy the upper 3 packed elements from `a` to the +/// upper elements of the result. +__m128 _mm_cvt_si2ss (__m128 v, int x) pure @trusted +{ + v.ptr[0] = cast(float)x; + return v; +} +unittest +{ + __m128 a = _mm_cvt_si2ss(_mm_set1_ps(0.0f), 42); + assert(a.array == [42f, 0, 0, 0]); +} + +/// Convert packed 16-bit integers in `a` to packed single-precision (32-bit) floating-point elements. 
+__m128 _mm_cvtpi16_ps (__m64 a) pure @safe +{ + __m128i ma = to_m128i(a); + ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit + ma = _mm_srai_epi32(_mm_slli_epi32(ma, 16), 16); // Replicate sign bit + return _mm_cvtepi32_ps(ma); +} +unittest +{ + __m64 A = _mm_setr_pi16(-1, 2, -3, 4); + __m128 R = _mm_cvtpi16_ps(A); + float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f]; + assert(R.array == correct); +} + +/// Convert packed signed 32-bit integers in `b` to packed single-precision (32-bit) +/// floating-point elements, store the results in the lower 2 elements, +/// and copy the upper 2 packed elements from `a` to the upper elements of result. +__m128 _mm_cvtpi32_ps (__m128 a, __m64 b) pure @trusted +{ + __m128 fb = _mm_cvtepi32_ps(to_m128i(b)); + a.ptr[0] = fb.array[0]; + a.ptr[1] = fb.array[1]; + return a; +} +unittest +{ + __m128 R = _mm_cvtpi32_ps(_mm_set1_ps(4.0f), _mm_setr_pi32(1, 2)); + float[4] correct = [1.0f, 2.0f, 4.0f, 4.0f]; + assert(R.array == correct); +} + +/// Convert packed signed 32-bit integers in `a` to packed single-precision (32-bit) floating-point elements, +/// store the results in the lower 2 elements, then covert the packed signed 32-bit integers in `b` to +/// single-precision (32-bit) floating-point element, and store the results in the upper 2 elements. +__m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b) pure @trusted +{ + long2 l; + l.ptr[0] = a.array[0]; + l.ptr[1] = b.array[0]; + return _mm_cvtepi32_ps(cast(__m128i)l); +} +unittest +{ + __m64 A = _mm_setr_pi32(-45, 128); + __m64 B = _mm_setr_pi32(0, 1000); + __m128 R = _mm_cvtpi32x2_ps(A, B); + float[4] correct = [-45.0f, 128.0f, 0.0f, 1000.0f]; + assert(R.array == correct); +} + +/// Convert the lower packed 8-bit integers in `a` to packed single-precision (32-bit) floating-point elements. +__m128 _mm_cvtpi8_ps (__m64 a) pure @safe +{ + __m128i b = to_m128i(a); + + // Zero extend to 32-bit + b = _mm_unpacklo_epi8(b, _mm_setzero_si128()); + b = _mm_unpacklo_epi16(b, _mm_setzero_si128()); + + // Replicate sign bit + b = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24); // Replicate sign bit + return _mm_cvtepi32_ps(b); +} +unittest +{ + __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0); + __m128 R = _mm_cvtpi8_ps(A); + float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f]; + assert(R.array == correct); +} + +/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 16-bit integers. +/// Note: this intrinsic will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and 0x7FFFFFFF. +__m64 _mm_cvtps_pi16 (__m128 a) @safe +{ + // The C++ version of this intrinsic convert to 32-bit float, then use packssdw + // Which means the 16-bit integers should be saturated + __m128i b = _mm_cvtps_epi32(a); + b = _mm_packs_epi32(b, b); + return to_m64(b); +} +unittest +{ + __m128 A = _mm_setr_ps(-1.0f, 2.0f, -33000.0f, 70000.0f); + short4 R = cast(short4) _mm_cvtps_pi16(A); + short[4] correct = [-1, 2, -32768, 32767]; + assert(R.array == correct); +} + +/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers. +__m64 _mm_cvtps_pi32 (__m128 a) @safe +{ + return to_m64(_mm_cvtps_epi32(a)); +} +unittest +{ + __m128 A = _mm_setr_ps(-33000.0f, 70000.0f, -1.0f, 2.0f, ); + int2 R = cast(int2) _mm_cvtps_pi32(A); + int[2] correct = [-33000, 70000]; + assert(R.array == correct); +} + +/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 8-bit integers, +/// and store the results in lower 4 elements. 
+/// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values between 0x7F and 0x7FFFFFFF. +__m64 _mm_cvtps_pi8 (__m128 a) @safe +{ + // The C++ version of this intrinsic convert to 32-bit float, then use packssdw + packsswb + // Which means the 8-bit integers should be saturated + __m128i b = _mm_cvtps_epi32(a); + b = _mm_packs_epi32(b, _mm_setzero_si128()); + b = _mm_packs_epi16(b, _mm_setzero_si128()); + return to_m64(b); +} +unittest +{ + __m128 A = _mm_setr_ps(-1.0f, 2.0f, -129.0f, 128.0f); + byte8 R = cast(byte8) _mm_cvtps_pi8(A); + byte[8] correct = [-1, 2, -128, 127, 0, 0, 0, 0]; + assert(R.array == correct); +} + +/// Convert packed unsigned 16-bit integers in `a` to packed single-precision (32-bit) floating-point elements. +__m128 _mm_cvtpu16_ps (__m64 a) pure @safe +{ + __m128i ma = to_m128i(a); + ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit + return _mm_cvtepi32_ps(ma); +} +unittest +{ + __m64 A = _mm_setr_pi16(-1, 2, -3, 4); + __m128 R = _mm_cvtpu16_ps(A); + float[4] correct = [65535.0f, 2.0f, 65533.0f, 4.0f]; + assert(R.array == correct); +} + +/// Convert the lower packed unsigned 8-bit integers in `a` to packed single-precision (32-bit) floating-point element. +__m128 _mm_cvtpu8_ps (__m64 a) pure @safe +{ + __m128i b = to_m128i(a); + + // Zero extend to 32-bit + b = _mm_unpacklo_epi8(b, _mm_setzero_si128()); + b = _mm_unpacklo_epi16(b, _mm_setzero_si128()); + return _mm_cvtepi32_ps(b); +} +unittest +{ + __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0); + __m128 R = _mm_cvtpu8_ps(A); + float[4] correct = [255.0f, 2.0f, 253.0f, 4.0f]; + assert(R.array == correct); +} + +/// Convert the signed 32-bit integer `b` to a single-precision (32-bit) floating-point element, +/// store the result in the lower element, and copy the upper 3 packed elements from `a` to the +/// upper elements of result. +__m128 _mm_cvtsi32_ss(__m128 v, int x) pure @trusted +{ + v.ptr[0] = cast(float)x; + return v; +} +unittest +{ + __m128 a = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42); + assert(a.array == [42.0f, 0, 0, 0]); +} + + +/// Convert the signed 64-bit integer `b` to a single-precision (32-bit) floating-point element, +/// store the result in the lower element, and copy the upper 3 packed elements from `a` to the +/// upper elements of result. +__m128 _mm_cvtsi64_ss(__m128 v, long x) pure @trusted +{ + v.ptr[0] = cast(float)x; + return v; +} +unittest +{ + __m128 a = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42); + assert(a.array == [42.0f, 0, 0, 0]); +} + +/// Take the lower single-precision (32-bit) floating-point element of `a`. +float _mm_cvtss_f32(__m128 a) pure @safe +{ + return a.array[0]; +} + +/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit integer. +int _mm_cvtss_si32 (__m128 a) @safe // PERF GDC +{ + static if (GDC_with_SSE) + { + return __builtin_ia32_cvtss2si(a); + } + else static if (LDC_with_SSE) + { + return __builtin_ia32_cvtss2si(a); + } + else static if (DMD_with_DSIMD) + { + __m128 b; + __m128i r = cast(__m128i) __simd(XMM.CVTPS2DQ, a); // Note: converts 4 integers. + return r.array[0]; + } + else + { + return convertFloatToInt32UsingMXCSR(a.array[0]); + } +} +unittest +{ + assert(1 == _mm_cvtss_si32(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f))); +} + +/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer. 
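+/// The conversion honours the current (possibly emulated) MXCSR rounding mode; the unittest below exercises all four modes.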
+long _mm_cvtss_si64 (__m128 a) @safe +{ + static if (LDC_with_SSE2) + { + version(X86_64) + { + return __builtin_ia32_cvtss2si64(a); + } + else + { + // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer + // using SSE instructions only. So the builtin doesn't exit for this arch. + return convertFloatToInt64UsingMXCSR(a.array[0]); + } + } + else + { + return convertFloatToInt64UsingMXCSR(a.array[0]); + } +} +unittest +{ + assert(1 == _mm_cvtss_si64(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f))); + + uint savedRounding = _MM_GET_ROUNDING_MODE(); + + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.49f))); + + _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + assert(-86187 == _mm_cvtss_si64(_mm_set1_ps(-86186.1f))); + + _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + assert(86187 == _mm_cvtss_si64(_mm_set1_ps(86186.1f))); + + _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.9f))); + + _MM_SET_ROUNDING_MODE(savedRounding); +} + + +/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit +/// integer with truncation. +int _mm_cvtt_ss2si (__m128 a) pure @safe +{ + // x86: cvttss2si always generated, even in -O0 + return cast(int)(a.array[0]); +} +alias _mm_cvttss_si32 = _mm_cvtt_ss2si; ///ditto +unittest +{ + assert(1 == _mm_cvtt_ss2si(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f))); +} + + +/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit +/// integers with truncation. +__m64 _mm_cvtt_ps2pi (__m128 a) pure @safe +{ + return to_m64(_mm_cvttps_epi32(a)); +} + +/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit +/// integer with truncation. +long _mm_cvttss_si64 (__m128 a) pure @safe +{ + return cast(long)(a.array[0]); +} +unittest +{ + assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f))); +} + +/// Divide packed single-precision (32-bit) floating-point elements in `a` by packed elements in `b`. +__m128 _mm_div_ps(__m128 a, __m128 b) pure @safe +{ + pragma(inline, true); + return a / b; +} +unittest +{ + __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; + a = _mm_div_ps(a, a); + float[4] correct = [1.0f, 1.0f, 1.0f, 1.0f]; + assert(a.array == correct); +} + +/// Divide the lower single-precision (32-bit) floating-point element in `a` by the lower +/// single-precision (32-bit) floating-point element in `b`, store the result in the lower +/// element of result, and copy the upper 3 packed elements from `a` to the upper elements of result. +__m128 _mm_div_ss(__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + return cast(__m128) __simd(XMM.DIVSS, a, b); + else static if (GDC_with_SSE) + return __builtin_ia32_divss(a, b); + else + { + a[0] /= b[0]; + return a; + } +} +unittest +{ + __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; + a = _mm_div_ss(a, a); + float[4] correct = [1.0f, -2.0, 3.0f, 1.0f]; + assert(a.array == correct); +} + +/// Extract a 16-bit unsigned integer from `a`, selected with `imm8`. Zero-extended. +int _mm_extract_pi16 (__m64 a, int imm8) +{ + short4 sa = cast(short4)a; + return cast(ushort)(sa.array[imm8]); +} +unittest +{ + __m64 A = _mm_setr_pi16(-1, 6, 0, 4); + assert(_mm_extract_pi16(A, 0) == 65535); + assert(_mm_extract_pi16(A, 1) == 6); + assert(_mm_extract_pi16(A, 2) == 0); + assert(_mm_extract_pi16(A, 3) == 4); +} + +/// Free aligned memory that was allocated with `_mm_malloc` or `_mm_realloc`. 
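+/// Passing `null` is accepted and is a no-op, mirroring `free(null)`.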
+void _mm_free(void * mem_addr) @trusted
+{
+    // support for free(NULL)
+    if (mem_addr is null)
+        return;
+
+    // Technically we don't need to store size and alignment in the chunk, but we do in case we
+    // have to implement _mm_realloc
+
+    size_t pointerSize = (void*).sizeof;
+    void** rawLocation = cast(void**)(cast(char*)mem_addr - size_t.sizeof);
+    size_t* alignmentLocation = cast(size_t*)(cast(char*)mem_addr - 3 * pointerSize);
+    size_t alignment = *alignmentLocation;
+    assert(alignment != 0);
+    assert(isPointerAligned(mem_addr, alignment));
+    free(*rawLocation);
+}
+
+/// Get the exception mask bits from the MXCSR control and status register.
+/// The exception mask may contain any of the following flags: `_MM_MASK_INVALID`,
+/// `_MM_MASK_DIV_ZERO`, `_MM_MASK_DENORM`, `_MM_MASK_OVERFLOW`, `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
+/// Note: won't correspond to reality on non-x86, where MXCSR is emulated.
+uint _MM_GET_EXCEPTION_MASK() @safe
+{
+    return _mm_getcsr() & _MM_MASK_MASK;
+}
+
+/// Get the exception state bits from the MXCSR control and status register.
+/// The exception state may contain any of the following flags: `_MM_EXCEPT_INVALID`,
+/// `_MM_EXCEPT_DIV_ZERO`, `_MM_EXCEPT_DENORM`, `_MM_EXCEPT_OVERFLOW`, `_MM_EXCEPT_UNDERFLOW`, `_MM_EXCEPT_INEXACT`.
+/// Note: won't correspond to reality on non-x86, where MXCSR is emulated. No exception is ever reported.
+uint _MM_GET_EXCEPTION_STATE() @safe
+{
+    return _mm_getcsr() & _MM_EXCEPT_MASK;
+}
+
+/// Get the flush zero bits from the MXCSR control and status register.
+/// The flush zero mode may contain any of the following flags: `_MM_FLUSH_ZERO_ON` or `_MM_FLUSH_ZERO_OFF`.
+uint _MM_GET_FLUSH_ZERO_MODE() @safe
+{
+    return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
+}
+
+/// Get the rounding mode bits from the MXCSR control and status register. The rounding mode may
+/// contain any of the following flags: `_MM_ROUND_NEAREST`, `_MM_ROUND_DOWN`, `_MM_ROUND_UP`, `_MM_ROUND_TOWARD_ZERO`.
+uint _MM_GET_ROUNDING_MODE() @safe
+{
+    return _mm_getcsr() & _MM_ROUND_MASK;
+}
+
+/// Get the unsigned 32-bit value of the MXCSR control and status register.
+/// Note: this is emulated on ARM, because there is no MXCSR register there.
+uint _mm_getcsr() @trusted
+{
+    static if (LDC_with_ARM)
+    {
+        // Note: we convert the ARM FPSCR into an x86 SSE control word.
+        // However, only rounding mode and flush to zero are actually set.
+        // The returned control word will have all exceptions masked, and no exception detected.
+
+        uint fpscr = arm_get_fpcr();
+
+        uint cw = 0; // No exception detected
+        if (fpscr & _MM_FLUSH_ZERO_MASK_ARM)
+        {
+            // ARM has a single flush-to-zero flag,
+            // which covers both x86 bits.
+ // https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode + cw |= _MM_FLUSH_ZERO_ON; + cw |= 0x40; // set "denormals are zeros" + } + cw |= _MM_MASK_MASK; // All exception maske + + // Rounding mode + switch(fpscr & _MM_ROUND_MASK_ARM) + { + default: + case _MM_ROUND_NEAREST_ARM: cw |= _MM_ROUND_NEAREST; break; + case _MM_ROUND_DOWN_ARM: cw |= _MM_ROUND_DOWN; break; + case _MM_ROUND_UP_ARM: cw |= _MM_ROUND_UP; break; + case _MM_ROUND_TOWARD_ZERO_ARM: cw |= _MM_ROUND_TOWARD_ZERO; break; + } + return cw; + } + else version(GNU) + { + static if (GDC_with_SSE) + { + return __builtin_ia32_stmxcsr(); + } + else version(X86) + { + uint sseRounding = 0; + asm pure nothrow @nogc @trusted + { + "stmxcsr %0;\n" + : "=m" (sseRounding) + : + : ; + } + return sseRounding; + } + else return __warn_noop_ret!uint(); + } + else version (InlineX86Asm) + { + uint controlWord; + asm nothrow @nogc pure @trusted + { + stmxcsr controlWord; + } + return controlWord; + } + else + static assert(0, "Not yet supported"); +} +unittest +{ + uint csr = _mm_getcsr(); +} + +/// Insert a 16-bit integer `i` inside `a` at the location specified by `imm8`. +__m64 _mm_insert_pi16 (__m64 v, int i, int imm8) pure @trusted +{ + short4 r = cast(short4)v; + r.ptr[imm8 & 3] = cast(short)i; + return cast(__m64)r; +} +unittest +{ + __m64 A = _mm_set_pi16(3, 2, 1, 0); + short4 R = cast(short4) _mm_insert_pi16(A, 42, 1 | 4); + short[4] correct = [0, 42, 2, 3]; + assert(R.array == correct); +} + +/// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory. +// `p` must be aligned on a 16-byte boundary or a general-protection exception may be generated. +__m128 _mm_load_ps(const(float)*p) pure @trusted // FUTURE shouldn't be trusted, see #62 +{ + pragma(inline, true); + return *cast(__m128*)p; +} +unittest +{ + static immutable align(16) float[4] correct = [1.0f, 2.0f, 3.0f, 4.0f]; + __m128 A = _mm_load_ps(correct.ptr); + assert(A.array == correct); +} + +/// Load a single-precision (32-bit) floating-point element from memory into all elements. +__m128 _mm_load_ps1(const(float)*p) pure @trusted +{ + return __m128(*p); +} +unittest +{ + float n = 2.5f; + float[4] correct = [2.5f, 2.5f, 2.5f, 2.5f]; + __m128 A = _mm_load_ps1(&n); + assert(A.array == correct); +} + +/// Load a single-precision (32-bit) floating-point element from memory into the lower of dst, and zero the upper 3 +/// elements. `mem_addr` does not need to be aligned on any particular boundary. +__m128 _mm_load_ss (const(float)* mem_addr) pure @trusted +{ + pragma(inline, true); + static if (DMD_with_DSIMD) + { + return cast(__m128)__simd(XMM.LODSS, *cast(__m128*)mem_addr); + } + else + { + __m128 r; // PERf =void; + r.ptr[0] = *mem_addr; + r.ptr[1] = 0; + r.ptr[2] = 0; + r.ptr[3] = 0; + return r; + } +} +unittest +{ + float n = 2.5f; + float[4] correct = [2.5f, 0.0f, 0.0f, 0.0f]; + __m128 A = _mm_load_ss(&n); + assert(A.array == correct); +} + +/// Load a single-precision (32-bit) floating-point element from memory into all elements. +alias _mm_load1_ps = _mm_load_ps1; + +/// Load 2 single-precision (32-bit) floating-point elements from memory into the upper 2 elements of result, +/// and copy the lower 2 elements from `a` to result. `mem_addr does` not need to be aligned on any particular boundary. 
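+/// Illustrative values (matching the unittest below): with `a = [1, 2, 3, 4]` and `*mem_addr = [5, 6]`, the result is `[1, 2, 5, 6]`.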
+__m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @trusted +{ + pragma(inline, true); + static if (DMD_with_DSIMD) + { + return cast(__m128) __simd(XMM.LODHPS, a, *cast(const(__m128)*)mem_addr); + } + else + { + // x86: movlhps generated since LDC 1.9.0 -O1 + long2 la = cast(long2)a; + la.ptr[1] = (*mem_addr).array[0]; + return cast(__m128)la; + } +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); + __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); + __m64 M = to_m64(cast(__m128i)B); + __m128 R = _mm_loadh_pi(A, &M); + float[4] correct = [1.0f, 2.0f, 5.0f, 6.0f]; + assert(R.array == correct); +} + +/// Load 2 single-precision (32-bit) floating-point elements from memory into the lower 2 elements of result, +/// and copy the upper 2 elements from `a` to result. `mem_addr` does not need to be aligned on any particular boundary. +__m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @trusted +{ + pragma(inline, true); + + // Disabled because of https://issues.dlang.org/show_bug.cgi?id=23046 + /* + static if (DMD_with_DSIMD) + { + return cast(__m128) __simd(XMM.LODLPS, a, *cast(const(__m128)*)mem_addr); + } + else */ + { + // x86: movlpd/movlps generated with all LDC -01 + long2 la = cast(long2)a; + la.ptr[0] = (*mem_addr).array[0]; + return cast(__m128)la; + } +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); + __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); + __m64 M = to_m64(cast(__m128i)B); + __m128 R = _mm_loadl_pi(A, &M); + float[4] correct = [5.0f, 6.0f, 3.0f, 4.0f]; + assert(R.array == correct); +} + +/// Load 4 single-precision (32-bit) floating-point elements from memory in reverse order. +/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. +__m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted // FUTURE shouldn't be trusted, see #62 +{ + __m128* aligned = cast(__m128*)mem_addr; // x86: movaps + shups since LDC 1.0.0 -O1 + __m128 a = *aligned; + static if (DMD_with_DSIMD) + { + return cast(__m128) __simd(XMM.SHUFPS, a, a, 27); + } + else + { + __m128 r; // PERF =void; + r.ptr[0] = a.array[3]; + r.ptr[1] = a.array[2]; + r.ptr[2] = a.array[1]; + r.ptr[3] = a.array[0]; + return r; + } +} +unittest +{ + align(16) static immutable float[4] arr = [ 1.0f, 2.0f, 3.0f, 8.0f ]; + __m128 A = _mm_loadr_ps(arr.ptr); + float[4] correct = [ 8.0f, 3.0f, 2.0f, 1.0f ]; + assert(A.array == correct); +} + +/// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory. +/// `mem_addr` does not need to be aligned on any particular boundary. +__m128 _mm_loadu_ps(const(float)* mem_addr) pure @trusted +{ + pragma(inline, true); + static if (GDC_with_SSE2) + { + return __builtin_ia32_loadups(mem_addr); + } + else static if (LDC_with_optimizations) + { + static if (LDC_with_optimizations) + { + return loadUnaligned!(__m128)(mem_addr); + } + else + { + __m128 result; + result.ptr[0] = mem_addr[0]; + result.ptr[1] = mem_addr[1]; + result.ptr[2] = mem_addr[2]; + result.ptr[3] = mem_addr[3]; + return result; + } + } + else version(DigitalMars) + { + static if (DMD_with_DSIMD) + { + return cast(__m128)__simd(XMM.LODUPS, *cast(const(float4*))mem_addr); + } + else static if (SSESizedVectorsAreEmulated) + { + // Since this vector is emulated, it doesn't have alignement constraints + // and as such we can just cast it. 
+ return *cast(__m128*)(mem_addr); + } + else + { + __m128 result; + result.ptr[0] = mem_addr[0]; + result.ptr[1] = mem_addr[1]; + result.ptr[2] = mem_addr[2]; + result.ptr[3] = mem_addr[3]; + return result; + } + } + else + { + __m128 result; + result.ptr[0] = mem_addr[0]; + result.ptr[1] = mem_addr[1]; + result.ptr[2] = mem_addr[2]; + result.ptr[3] = mem_addr[3]; + return result; + } +} +unittest +{ + align(16) static immutable float[5] arr = [ 1.0f, 2.0f, 3.0f, 8.0f, 9.0f ]; // force unaligned load + __m128 A = _mm_loadu_ps(&arr[1]); + float[4] correct = [ 2.0f, 3.0f, 8.0f, 9.0f ]; + assert(A.array == correct); +} + +/// Allocate size bytes of memory, aligned to the alignment specified in align, +/// and return a pointer to the allocated memory. `_mm_free` should be used to free +/// memory that is allocated with `_mm_malloc`. +void* _mm_malloc(size_t size, size_t alignment) @trusted +{ + assert(alignment != 0); + size_t request = requestedSize(size, alignment); + void* raw = malloc(request); + if (request > 0 && raw == null) // malloc(0) can validly return anything + onOutOfMemoryError(); + return storeRawPointerPlusInfo(raw, size, alignment); // PERF: no need to store size +} + +/// Conditionally store 8-bit integer elements from a into memory using mask (elements are not stored when the highest +/// bit is not set in the corresponding element) and a non-temporal memory hint. +void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr) @trusted +{ + // this works since mask is zero-extended + return _mm_maskmoveu_si128 (to_m128i(a), to_m128i(mask), mem_addr); +} + +deprecated("Use _mm_maskmove_si64 instead") alias _m_maskmovq = _mm_maskmove_si64;/// + +/// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum value. +__m64 _mm_max_pi16 (__m64 a, __m64 b) pure @safe +{ + return to_m64(_mm_max_epi16(to_m128i(a), to_m128i(b))); +} + +/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return packed maximum values. +__m128 _mm_max_ps(__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + { + return cast(__m128) __simd(XMM.MAXPS, a, b); + } + else static if (GDC_with_SSE) + { + return __builtin_ia32_maxps(a, b); + } + else static if (LDC_with_SSE) + { + return __builtin_ia32_maxps(a, b); + } + else + { + // ARM: Optimized into fcmgt + bsl since LDC 1.8 -02 + __m128 r; // PERF =void; + r[0] = (a[0] > b[0]) ? a[0] : b[0]; + r[1] = (a[1] > b[1]) ? a[1] : b[1]; + r[2] = (a[2] > b[2]) ? a[2] : b[2]; + r[3] = (a[3] > b[3]) ? a[3] : b[3]; + return r; + } +} +unittest +{ + __m128 A = _mm_setr_ps(1, 2, float.nan, 4); + __m128 B = _mm_setr_ps(4, 1, 4, float.nan); + __m128 M = _mm_max_ps(A, B); + assert(M.array[0] == 4); + assert(M.array[1] == 2); + assert(M.array[2] == 4); // in case of NaN, second operand prevails (as it seems) + assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems) +} + +/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values. +__m64 _mm_max_pu8 (__m64 a, __m64 b) pure @safe +{ + return to_m64(_mm_max_epu8(to_m128i(a), to_m128i(b))); +} + +/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b`, store the maximum value in the +/// lower element of result, and copy the upper 3 packed elements from `a` to the upper element of result. 
+__m128 _mm_max_ss(__m128 a, __m128 b) pure @safe
+{
+    static if (DMD_with_DSIMD)
+    {
+        return cast(__m128) __simd(XMM.MAXSS, a, b);
+    }
+    else static if (GDC_with_SSE)
+    {
+        return __builtin_ia32_maxss(a, b);
+    }
+    else static if (LDC_with_SSE)
+    {
+        return __builtin_ia32_maxss(a, b);
+    }
+    else
+    {
+        __m128 r = a;
+        r[0] = (a[0] > b[0]) ? a[0] : b[0];
+        return r;
+    }
+}
+unittest
+{
+    __m128 A = _mm_setr_ps(1, 2, 3, 4);
+    __m128 B = _mm_setr_ps(4, 1, 4, 1);
+    __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
+    __m128 M = _mm_max_ss(A, B);
+    assert(M.array[0] == 4);
+    assert(M.array[1] == 2);
+    assert(M.array[2] == 3);
+    assert(M.array[3] == 4);
+    M = _mm_max_ps(A, C); // in case of NaN, second operand prevails
+    assert(M.array[0] != M.array[0]);
+    M = _mm_max_ps(C, A); // in case of NaN, second operand prevails
+    assert(M.array[0] == 1);
+}
+
+/// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
+__m64 _mm_min_pi16 (__m64 a, __m64 b) pure @safe
+{
+    return to_m64(_mm_min_epi16(to_m128i(a), to_m128i(b)));
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return packed minimum values.
+__m128 _mm_min_ps(__m128 a, __m128 b) pure @safe
+{
+    static if (DMD_with_DSIMD)
+    {
+        return cast(__m128) __simd(XMM.MINPS, a, b);
+    }
+    else static if (GDC_with_SSE)
+    {
+        return __builtin_ia32_minps(a, b);
+    }
+    else static if (LDC_with_SSE)
+    {
+        // not technically needed, but better perf in debug mode
+        return __builtin_ia32_minps(a, b);
+    }
+    else
+    {
+        // ARM: Optimized into fcmgt + bsl since LDC 1.8 -O2
+        __m128 r; // PERF =void;
+        r[0] = (a[0] < b[0]) ? a[0] : b[0];
+        r[1] = (a[1] < b[1]) ? a[1] : b[1];
+        r[2] = (a[2] < b[2]) ? a[2] : b[2];
+        r[3] = (a[3] < b[3]) ? a[3] : b[3];
+        return r;
+    }
+}
+unittest
+{
+    __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
+    __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
+    __m128 M = _mm_min_ps(A, B);
+    assert(M.array[0] == 1);
+    assert(M.array[1] == 1);
+    assert(M.array[2] == 4); // in case of NaN, second operand prevails (as it seems)
+    assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems)
+}
+
+/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
+__m64 _mm_min_pu8 (__m64 a, __m64 b) pure @safe
+{
+    return to_m64(_mm_min_epu8(to_m128i(a), to_m128i(b)));
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b`, store the minimum value in the
+/// lower element of result, and copy the upper 3 packed elements from `a` to the upper element of result.
+__m128 _mm_min_ss(__m128 a, __m128 b) pure @safe
+{
+    static if (DMD_with_DSIMD)
+    {
+        return cast(__m128) __simd(XMM.MINSS, a, b);
+    }
+    else static if (GDC_with_SSE)
+    {
+        return __builtin_ia32_minss(a, b);
+    }
+    else static if (LDC_with_SSE)
+    {
+        return __builtin_ia32_minss(a, b);
+    }
+    else
+    {
+        // Generates minss since LDC 1.3 -O1
+        __m128 r = a;
+        r[0] = (a[0] < b[0]) ?
a[0] : b[0]; + return r; + } +} +unittest +{ + __m128 A = _mm_setr_ps(1, 2, 3, 4); + __m128 B = _mm_setr_ps(4, 1, 4, 1); + __m128 C = _mm_setr_ps(float.nan, 1, 4, 1); + __m128 M = _mm_min_ss(A, B); + assert(M.array[0] == 1); + assert(M.array[1] == 2); + assert(M.array[2] == 3); + assert(M.array[3] == 4); + M = _mm_min_ps(A, C); // in case of NaN, second operand prevails + assert(M.array[0] != M.array[0]); + M = _mm_min_ps(C, A); // in case of NaN, second operand prevails + assert(M.array[0] == 1); +} + +/// Move the lower single-precision (32-bit) floating-point element from `b` to the lower element of result, and copy +/// the upper 3 packed elements from `a` to the upper elements of result. +__m128 _mm_move_ss (__m128 a, __m128 b) pure @trusted +{ + // Workaround https://issues.dlang.org/show_bug.cgi?id=21673 + // inlining of this function fails. + version(DigitalMars) asm nothrow @nogc pure { nop; } + + a.ptr[0] = b.array[0]; + return a; +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); + __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); + __m128 R = _mm_move_ss(A, B); + float[4] correct = [5.0f, 2.0f, 3.0f, 4.0f]; + assert(R.array == correct); +} + +/// Move the upper 2 single-precision (32-bit) floating-point elements from `b` to the lower 2 elements of result, and +/// copy the upper 2 elements from `a` to the upper 2 elements of dst. +__m128 _mm_movehl_ps (__m128 a, __m128 b) pure @trusted +{ + // PERF DMD + // Disabled because of https://issues.dlang.org/show_bug.cgi?id=19443 + /* + static if (DMD_with_DSIMD) + { + + return cast(__m128) __simd(XMM.MOVHLPS, a, b); + } + else */ + { + a.ptr[0] = b.array[2]; + a.ptr[1] = b.array[3]; + return a; + } +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); + __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); + __m128 R = _mm_movehl_ps(A, B); + float[4] correct = [7.0f, 8.0f, 3.0f, 4.0f]; + assert(R.array == correct); +} + +/// Move the lower 2 single-precision (32-bit) floating-point elements from `b` to the upper 2 elements of result, and +/// copy the lower 2 elements from `a` to the lower 2 elements of result +__m128 _mm_movelh_ps (__m128 a, __m128 b) pure @trusted +{ + // Was disabled because of https://issues.dlang.org/show_bug.cgi?id=19443 + static if (DMD_with_DSIMD && __VERSION__ >= 2101) + { + return cast(__m128) __simd(XMM.MOVLHPS, a, b); + } + else + { + a.ptr[2] = b.array[0]; + a.ptr[3] = b.array[1]; + return a; + } +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); + __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); + __m128 R = _mm_movelh_ps(A, B); + float[4] correct = [1.0f, 2.0f, 5.0f, 6.0f]; + assert(R.array == correct); +} + +/// Create mask from the most significant bit of each 8-bit element in `a`. +int _mm_movemask_pi8 (__m64 a) pure @safe +{ + return _mm_movemask_epi8(to_m128i(a)); +} +unittest +{ + assert(0x9C == _mm_movemask_pi8(_mm_set_pi8(-1, 0, 0, -1, -1, -1, 0, 0))); +} + +/// Set each bit of result based on the most significant bit of the corresponding packed single-precision (32-bit) +/// floating-point element in `a`. 
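+///
+/// Illustrative sketch: because the most significant bit of a float is its sign bit,
+/// the mask gives a branch-free "any lane negative / all lanes negative" test:
+/// ---
+/// __m128 x = _mm_setr_ps(1.0f, -2.0f, 3.0f, -4.0f);
+/// int mask = _mm_movemask_ps(x); // == 0b1010, bit i set when element i is negative
+/// bool anyNegative = (mask != 0);
+/// bool allNegative = (mask == 0xF);
+/// ---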
+int _mm_movemask_ps (__m128 a) pure @trusted +{ + // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047 + static if (GDC_with_SSE) + { + return __builtin_ia32_movmskps(a); + } + else static if (LDC_with_SSE) + { + return __builtin_ia32_movmskps(a); + } + else static if (LDC_with_ARM) + { + int4 ai = cast(int4)a; + int4 shift31 = [31, 31, 31, 31]; + ai = ai >>> shift31; + int4 shift = [0, 1, 2, 3]; + ai = ai << shift; // 4-way shift, only efficient on ARM. + int r = ai.array[0] + (ai.array[1]) + (ai.array[2]) + (ai.array[3]); + return r; + } + else + { + int4 ai = cast(int4)a; + int r = 0; + if (ai.array[0] < 0) r += 1; + if (ai.array[1] < 0) r += 2; + if (ai.array[2] < 0) r += 4; + if (ai.array[3] < 0) r += 8; + return r; + } +} +unittest +{ + int4 A = [-1, 0, -43, 0]; + assert(5 == _mm_movemask_ps(cast(float4)A)); +} + +/// Multiply packed single-precision (32-bit) floating-point elements in `a` and `b`. +__m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe +{ + pragma(inline, true); + return a * b; +} +unittest +{ + __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; + a = _mm_mul_ps(a, a); + float[4] correct = [2.25f, 4.0f, 9.0f, 1.0f]; + assert(a.array == correct); +} + +/// Multiply the lower single-precision (32-bit) floating-point element in `a` and `b`, store the result in the lower +/// element of result, and copy the upper 3 packed elements from `a` to the upper elements of result. +__m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + return cast(__m128) __simd(XMM.MULSS, a, b); + else static if (GDC_with_SSE) + return __builtin_ia32_mulss(a, b); + else + { + a[0] *= b[0]; + return a; + } +} +unittest +{ + __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; + a = _mm_mul_ss(a, a); + float[4] correct = [2.25f, -2.0f, 3.0f, 1.0f]; + assert(a.array == correct); +} + +/// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, +/// and return the high 16 bits of the intermediate integers. +__m64 _mm_mulhi_pu16 (__m64 a, __m64 b) pure @safe +{ + return to_m64(_mm_mulhi_epu16(to_m128i(a), to_m128i(b))); +} +unittest +{ + __m64 A = _mm_setr_pi16(0, -16, 2, 3); + __m64 B = _mm_set1_pi16(16384); + short4 R = cast(short4)_mm_mulhi_pu16(A, B); + short[4] correct = [0, 0x3FFC, 0, 0]; + assert(R.array == correct); +} + +/// Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in `a` and `b`, and +/// return the result. 
+__m128 _mm_or_ps (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + return cast(__m128)__simd(XMM.ORPS, a, b); + else + return cast(__m128)(cast(__m128i)a | cast(__m128i)b); +} +unittest +{ + __m128 A = cast(__m128) _mm_set1_epi32(0x80000000); + __m128 B = _mm_setr_ps(4.0f, -5.0, -9.5f, float.infinity); + __m128 C = _mm_or_ps(A, B); + float[4] correct = [-4.0f, -5.0, -9.5f, -float.infinity]; + assert(C.array == correct); +} + +deprecated("Use _mm_avg_pu8 instead") alias _m_pavgb = _mm_avg_pu8;/// +deprecated("Use _mm_avg_pu16 instead") alias _m_pavgw = _mm_avg_pu16;/// +deprecated("Use _mm_extract_pi16 instead") alias _m_pextrw = _mm_extract_pi16;/// +deprecated("Use _mm_insert_pi16 instead") alias _m_pinsrw = _mm_insert_pi16;/// +deprecated("Use _mm_max_pi16 instead") alias _m_pmaxsw = _mm_max_pi16;/// +deprecated("Use _mm_max_pu8 instead") alias _m_pmaxub = _mm_max_pu8;/// +deprecated("Use _mm_min_pi16 instead") alias _m_pminsw = _mm_min_pi16;/// +deprecated("Use _mm_min_pu8 instead") alias _m_pminub = _mm_min_pu8;/// +deprecated("Use _mm_movemask_pi8 instead") alias _m_pmovmskb = _mm_movemask_pi8;/// +deprecated("Use _mm_mulhi_pu16 instead") alias _m_pmulhuw = _mm_mulhi_pu16;/// + +enum _MM_HINT_T0 = 3; /// +enum _MM_HINT_T1 = 2; /// +enum _MM_HINT_T2 = 1; /// +enum _MM_HINT_NTA = 0; /// + + +version(LDC) +{ + // Starting with LLVM 10, it seems llvm.prefetch has changed its name. + // Was reported at: https://github.com/ldc-developers/ldc/issues/3397 + static if (__VERSION__ >= 2091) + { + pragma(LDC_intrinsic, "llvm.prefetch.p0i8") // was "llvm.prefetch" + void llvm_prefetch_fixed(void* ptr, uint rw, uint locality, uint cachetype) pure @safe; + } +} + +/// Fetch the line of data from memory that contains address `p` to a location in the +/// cache hierarchy specified by the locality hint i. +/// +/// Warning: `locality` is a compile-time parameter, unlike in Intel Intrinsics API. +void _mm_prefetch(int locality)(const(void)* p) pure @trusted +{ + static if (GDC_with_SSE) + { + return __builtin_prefetch(p, (locality & 0x4) >> 2, locality & 0x3); + } + else static if (DMD_with_DSIMD) + { + enum bool isWrite = (locality & 0x4) != 0; + enum level = locality & 3; + return prefetch!(isWrite, level)(p); + } + else version(LDC) + { + static if ((__VERSION__ >= 2091) && (__VERSION__ < 2106)) + { + // const_cast here. `llvm_prefetch` wants a mutable pointer + llvm_prefetch_fixed( cast(void*)p, 0, locality, 1); + } + else + { + // const_cast here. 
`llvm_prefetch` wants a mutable pointer + llvm_prefetch( cast(void*)p, 0, locality, 1); + } + } + else version(D_InlineAsm_X86_64) + { + static if (locality == _MM_HINT_NTA) + { + asm pure nothrow @nogc @trusted + { + mov RAX, p; + prefetchnta [RAX]; + } + } + else static if (locality == _MM_HINT_T0) + { + asm pure nothrow @nogc @trusted + { + mov RAX, p; + prefetcht0 [RAX]; + } + } + else static if (locality == _MM_HINT_T1) + { + asm pure nothrow @nogc @trusted + { + mov RAX, p; + prefetcht1 [RAX]; + } + } + else static if (locality == _MM_HINT_T2) + { + asm pure nothrow @nogc @trusted + { + mov RAX, p; + prefetcht2 [RAX]; + } + } + else + assert(false); // invalid locality hint + } + else version(D_InlineAsm_X86) + { + static if (locality == _MM_HINT_NTA) + { + asm pure nothrow @nogc @trusted + { + mov EAX, p; + prefetchnta [EAX]; + } + } + else static if (locality == _MM_HINT_T0) + { + asm pure nothrow @nogc @trusted + { + mov EAX, p; + prefetcht0 [EAX]; + } + } + else static if (locality == _MM_HINT_T1) + { + asm pure nothrow @nogc @trusted + { + mov EAX, p; + prefetcht1 [EAX]; + } + } + else static if (locality == _MM_HINT_T2) + { + asm pure nothrow @nogc @trusted + { + mov EAX, p; + prefetcht2 [EAX]; + } + } + else + assert(false); // invalid locality hint + } + else + { + // Generic version: do nothing. From bitter experience, + // it's unlikely you get ANY speed-up with manual prefetching. + // Prefetching or not doesn't change program behaviour. + } +} +unittest +{ + // From Intel documentation: + // "The amount of data prefetched is also processor implementation-dependent. It will, however, be a minimum of + // 32 bytes." + ubyte[256] cacheline; // though it seems it cannot generate GP fault + _mm_prefetch!_MM_HINT_T0(cacheline.ptr); + _mm_prefetch!_MM_HINT_T1(cacheline.ptr); + _mm_prefetch!_MM_HINT_T2(cacheline.ptr); + _mm_prefetch!_MM_HINT_NTA(cacheline.ptr); +} + +deprecated("Use _mm_sad_pu8 instead") alias _m_psadbw = _mm_sad_pu8;/// +deprecated("Use _mm_shuffle_pi16 instead") alias _m_pshufw = _mm_shuffle_pi16;/// + + +/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a`` , +/// and return the results. The maximum relative error for this approximation is less than 1.5*2^-12. +__m128 _mm_rcp_ps (__m128 a) pure @trusted +{ + static if (DMD_with_DSIMD) + { + return cast(__m128) __simd(XMM.RCPPS, a); + } + else static if (GDC_with_SSE) + { + return __builtin_ia32_rcpps(a); + } + else static if (LDC_with_SSE) + { + return __builtin_ia32_rcpps(a); + } + else + { + a.ptr[0] = 1.0f / a.array[0]; + a.ptr[1] = 1.0f / a.array[1]; + a.ptr[2] = 1.0f / a.array[2]; + a.ptr[3] = 1.0f / a.array[3]; + return a; + } +} +unittest +{ + __m128 A = _mm_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f); + __m128 groundTruth = _mm_set1_ps(1.0f) / A; + __m128 result = _mm_rcp_ps(A); + foreach(i; 0..4) + { + double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1; + assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093 + } +} + +/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in `a`, store it +/// in the lower element of the result, and copy the upper 3 packed elements from `a` to the upper elements of result. +/// The maximum relative error for this approximation is less than 1.5*2^-12. 
+__m128 _mm_rcp_ss (__m128 a) pure @trusted +{ + // Disabled, see https://issues.dlang.org/show_bug.cgi?id=23049 + /*static if (DMD_with_DSIMD) + { + return cast(__m128) __simd(XMM.RCPSS, a); + } + else*/ + static if (GDC_with_SSE) + { + return __builtin_ia32_rcpss(a); + } + else static if (LDC_with_SSE) + { + return __builtin_ia32_rcpss(a); + } + else + { + a.ptr[0] = 1.0f / a.array[0]; + return a; + } +} +unittest +{ + __m128 A = _mm_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f); + __m128 correct = _mm_setr_ps(1 / 2.34f, -70000.0f, 0.00001f, 345.5f); + __m128 R = _mm_rcp_ss(A); + double relError = (cast(double)(correct.array[0]) / R.array[0]) - 1; + assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093 + assert(R.array[1] == correct.array[1]); + assert(R.array[2] == correct.array[2]); + assert(R.array[3] == correct.array[3]); +} + +/// Reallocate `size` bytes of memory, aligned to the alignment specified in `alignment`, and +/// return a pointer to the newly allocated memory. +/// Previous data is preserved if any. +/// +/// IMPORTANT: `size` MUST be > 0. +/// +/// `_mm_free` MUST be used to free memory that is allocated with `_mm_malloc` or `_mm_realloc`. +/// Do NOT call _mm_realloc with size = 0. +void* _mm_realloc(void* aligned, size_t size, size_t alignment) nothrow @nogc // #BONUS +{ + return alignedReallocImpl!true(aligned, size, alignment); +} +unittest +{ + enum NALLOC = 8; + enum size_t[8] ALIGNMENTS = [1, 2, 4, 8, 16, 32, 64, 128]; + + void*[NALLOC] alloc; + + foreach(t; 0..100) + { + foreach(n; 0..NALLOC) + { + size_t alignment = ALIGNMENTS[n]; + size_t s = 1 + ( (n + t * 69096) & 0xffff ); + alloc[n] = _mm_realloc(alloc[n], s, alignment); + assert(isPointerAligned(alloc[n], alignment)); + foreach(b; 0..s) + (cast(ubyte*)alloc[n])[b] = cast(ubyte)n; + } + } + foreach(n; 0..NALLOC) + { + _mm_free(alloc[n]); + } +} + +/// Reallocate `size` bytes of memory, aligned to the alignment specified in `alignment`, and +/// return a pointer to the newly allocated memory. +/// Previous data is discarded. +/// +/// IMPORTANT: `size` MUST be > 0. +/// +/// `_mm_free` MUST be used to free memory that is allocated with `_mm_malloc` or `_mm_realloc`. +void* _mm_realloc_discard(void* aligned, size_t size, size_t alignment) nothrow @nogc // #BONUS +{ + return alignedReallocImpl!false(aligned, size, alignment); +} + +/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in `a`. +/// The maximum relative error for this approximation is less than 1.5*2^-12. 
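+///
+/// Note: 1.5*2^-12 is about 3.7e-4, i.e. roughly 12 bits of precision. When more accuracy
+/// is needed, a single Newton-Raphson step over the estimate is the usual refinement
+/// (illustrative sketch, with `x` standing for the input vector):
+/// ---
+/// __m128 y = _mm_rsqrt_ps(x); // ~12-bit estimate of 1/sqrt(x)
+/// // one refinement step: y = y * (1.5 - 0.5 * x * y * y)
+/// __m128 yy = _mm_mul_ps(y, y);
+/// __m128 h  = _mm_mul_ps(_mm_set1_ps(0.5f), _mm_mul_ps(x, yy));
+/// y = _mm_mul_ps(y, _mm_sub_ps(_mm_set1_ps(1.5f), h));
+/// ---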
+__m128 _mm_rsqrt_ps (__m128 a) pure @trusted +{ + static if (DMD_with_DSIMD) + { + return cast(__m128) __simd(XMM.RSQRTPS, a); + } + else static if (GDC_with_SSE) + { + return __builtin_ia32_rsqrtps(a); + } + else static if (LDC_with_SSE) + { + return __builtin_ia32_rsqrtps(a); + } + else version(LDC) + { + a[0] = 1.0f / llvm_sqrt(a[0]); + a[1] = 1.0f / llvm_sqrt(a[1]); + a[2] = 1.0f / llvm_sqrt(a[2]); + a[3] = 1.0f / llvm_sqrt(a[3]); + return a; + } + else + { + a.ptr[0] = 1.0f / sqrt(a.array[0]); + a.ptr[1] = 1.0f / sqrt(a.array[1]); + a.ptr[2] = 1.0f / sqrt(a.array[2]); + a.ptr[3] = 1.0f / sqrt(a.array[3]); + return a; + } +} +unittest +{ + __m128 A = _mm_setr_ps(2.34f, 70000.0f, 0.00001f, 345.5f); + __m128 groundTruth = _mm_setr_ps(0.65372045f, 0.00377964473f, 316.227766f, 0.05379921937f); + __m128 result = _mm_rsqrt_ps(A); + foreach(i; 0..4) + { + double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1; + assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093 + } +} + +/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in `a`, +/// store the result in the lower element. Copy the upper 3 packed elements from `a` to the upper elements of result. +/// The maximum relative error for this approximation is less than 1.5*2^-12. +__m128 _mm_rsqrt_ss (__m128 a) pure @trusted +{ + static if (DMD_with_DSIMD) + { + return cast(__m128) __simd(XMM.RSQRTSS, a); + } + else static if (GDC_with_SSE) + { + return __builtin_ia32_rsqrtss(a); + } + else static if (LDC_with_SSE) + { + return __builtin_ia32_rsqrtss(a); + } + else version(LDC) + { + a[0] = 1.0f / llvm_sqrt(a[0]); + return a; + } + else + { + a[0] = 1.0f / sqrt(a[0]); + return a; + } +} +unittest // this one test 4 different intrinsics: _mm_rsqrt_ss, _mm_rsqrt_ps, _mm_rcp_ps, _mm_rcp_ss +{ + double maxRelativeError = 0.000245; // -72 dB, stuff is apparently more precise than said in the doc? + void testApproximateSSE(float number) nothrow @nogc + { + __m128 A = _mm_set1_ps(number); + + // test _mm_rcp_ps + __m128 B = _mm_rcp_ps(A); + foreach(i; 0..4) + { + double exact = 1.0f / A.array[i]; + double ratio = cast(double)(B.array[i]) / cast(double)(exact); + assert(abs_double(ratio - 1) <= maxRelativeError); + } + + // test _mm_rcp_ss + { + B = _mm_rcp_ss(A); + double exact = 1.0f / A.array[0]; + double ratio = cast(double)(B.array[0]) / cast(double)(exact); + assert(abs_double(ratio - 1) <= maxRelativeError); + } + + // test _mm_rsqrt_ps + B = _mm_rsqrt_ps(A); + foreach(i; 0..4) + { + double exact = 1.0f / sqrt(A.array[i]); + double ratio = cast(double)(B.array[i]) / cast(double)(exact); + assert(abs_double(ratio - 1) <= maxRelativeError); + } + + // test _mm_rsqrt_ss + { + B = _mm_rsqrt_ss(A); + double exact = 1.0f / sqrt(A.array[0]); + double ratio = cast(double)(B.array[0]) / cast(double)(exact); + assert(abs_double(ratio - 1) <= maxRelativeError); + } + } + + testApproximateSSE(0.00001f); + testApproximateSSE(1.1f); + testApproximateSSE(345.0f); + testApproximateSSE(2.45674864151f); + testApproximateSSE(700000.0f); + testApproximateSSE(10000000.0f); + testApproximateSSE(27841456468.0f); +} + +/// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each +/// consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the +/// low 16 bits of result. 
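+///
+/// Worked example: for `a = [1, 2, 3, 4, 5, 6, 7, 8]` and `b = [8, 7, 6, 5, 4, 3, 2, 1]`
+/// the absolute differences are [7, 5, 3, 1, 1, 3, 5, 7], which sum to 32, so the low
+/// 16-bit lane of the result holds 32 and the remaining bits are zero (illustrative sketch):
+/// ---
+/// __m64 a = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
+/// __m64 b = _mm_setr_pi8(8, 7, 6, 5, 4, 3, 2, 1);
+/// short4 s = cast(short4) _mm_sad_pu8(a, b); // s.array[0] == 32, other lanes == 0
+/// ---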
+__m64 _mm_sad_pu8 (__m64 a, __m64 b) pure @safe +{ + return to_m64(_mm_sad_epu8(to_m128i(a), to_m128i(b))); +} + +/// Set the exception mask bits of the MXCSR control and status register to the value in unsigned 32-bit integer +/// `_MM_MASK_xxxx`. The exception mask may contain any of the following flags: `_MM_MASK_INVALID`, `_MM_MASK_DIV_ZERO`, +/// `_MM_MASK_DENORM`, `_MM_MASK_OVERFLOW`, `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`. +void _MM_SET_EXCEPTION_MASK(int _MM_MASK_xxxx) @safe +{ + // Note: unsupported on ARM + _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | _MM_MASK_xxxx); +} + +/// Set the exception state bits of the MXCSR control and status register to the value in unsigned 32-bit integer +/// `_MM_EXCEPT_xxxx`. The exception state may contain any of the following flags: `_MM_EXCEPT_INVALID`, +/// `_MM_EXCEPT_DIV_ZERO`, `_MM_EXCEPT_DENORM`, `_MM_EXCEPT_OVERFLOW`, `_MM_EXCEPT_UNDERFLOW`, `_MM_EXCEPT_INEXACT`. +void _MM_SET_EXCEPTION_STATE(int _MM_EXCEPT_xxxx) @safe +{ + // Note: unsupported on ARM + _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | _MM_EXCEPT_xxxx); +} + +/// Set the flush zero bits of the MXCSR control and status register to the value in unsigned 32-bit integer +/// `_MM_FLUSH_xxxx`. The flush zero may contain any of the following flags: `_MM_FLUSH_ZERO_ON` or `_MM_FLUSH_ZERO_OFF`. +void _MM_SET_FLUSH_ZERO_MODE(int _MM_FLUSH_xxxx) @safe +{ + _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_xxxx); +} + +/// Set packed single-precision (32-bit) floating-point elements with the supplied values. +__m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted +{ + __m128 r; + r.ptr[0] = e0; + r.ptr[1] = e1; + r.ptr[2] = e2; + r.ptr[3] = e3; + return r; +} +unittest +{ + __m128 A = _mm_set_ps(3, 2, 1, 546); + float[4] correct = [546.0f, 1.0f, 2.0f, 3.0f]; + assert(A.array == correct); + + // Very old LDC, like 1.17, cannot case __vector at CT + static if (__VERSION__ >= 2094) + { + static immutable B = _mm_set_ps(3, 2, 1, 546); + enum C = _mm_set_ps(3, 2, 1, 546); + } +} + +deprecated("Use _mm_set1_ps instead") alias _mm_set_ps1 = _mm_set1_ps; /// + +/// Set the rounding mode bits of the MXCSR control and status register to the value in unsigned 32-bit integer +/// `_MM_ROUND_xxxx`. The rounding mode may contain any of the following flags: `_MM_ROUND_NEAREST`, `_MM_ROUND_DOWN`, +/// `_MM_ROUND_UP`, `_MM_ROUND_TOWARD_ZERO`. +void _MM_SET_ROUNDING_MODE(int _MM_ROUND_xxxx) @safe +{ + // Work-around for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607 + version(GNU) asm nothrow @nogc @trusted { "" : : : "memory"; } + _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_xxxx); +} + +/// Copy single-precision (32-bit) floating-point element `a` to the lower element of result, and zero the upper 3 elements. +__m128 _mm_set_ss (float a) pure @trusted +{ + static if (DMD_with_DSIMD) + { + return cast(__m128) __simd(XMM.LODSS, a); + } + else + { + __m128 r = _mm_setzero_ps(); + r.ptr[0] = a; + return r; + } +} +unittest +{ + float[4] correct = [42.0f, 0.0f, 0.0f, 0.0f]; + __m128 A = _mm_set_ss(42.0f); + assert(A.array == correct); +} + +/// Broadcast single-precision (32-bit) floating-point value `a` to all elements. 
+__m128 _mm_set1_ps (float a) pure @trusted +{ + pragma(inline, true); + __m128 r = a; + return r; +} +unittest +{ + float[4] correct = [42.0f, 42.0f, 42.0f, 42.0f]; + __m128 A = _mm_set1_ps(42.0f); + assert(A.array == correct); + + static if (__VERSION__ >= 2094) + { + enum __m128 B = _mm_set1_ps(2.4f); + } +} + +/// Set the MXCSR control and status register with the value in unsigned 32-bit integer `controlWord`. +void _mm_setcsr(uint controlWord) @trusted +{ + static if (LDC_with_ARM) + { + // Convert from SSE to ARM control word. This is done _partially_ + // and only support rounding mode changes. + + // "To alter some bits of a VFP system register without + // affecting other bits, use a read-modify-write procedure" + uint fpscr = arm_get_fpcr(); + + // Bits 23 to 22 are rounding modes, however not used in NEON + fpscr = fpscr & ~_MM_ROUND_MASK_ARM; + switch(controlWord & _MM_ROUND_MASK) + { + default: + case _MM_ROUND_NEAREST: fpscr |= _MM_ROUND_NEAREST_ARM; break; + case _MM_ROUND_DOWN: fpscr |= _MM_ROUND_DOWN_ARM; break; + case _MM_ROUND_UP: fpscr |= _MM_ROUND_UP_ARM; break; + case _MM_ROUND_TOWARD_ZERO: fpscr |= _MM_ROUND_TOWARD_ZERO_ARM; break; + } + fpscr = fpscr & ~_MM_FLUSH_ZERO_MASK_ARM; + if (controlWord & _MM_FLUSH_ZERO_MASK) + fpscr |= _MM_FLUSH_ZERO_MASK_ARM; + arm_set_fpcr(fpscr); + } + else version(GNU) + { + static if (GDC_with_SSE) + { + // Work-around for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607 + version(GNU) asm nothrow @nogc @trusted { "" : : : "memory"; } + __builtin_ia32_ldmxcsr(controlWord); + } + else version(X86) + { + asm nothrow @nogc @trusted + { + "ldmxcsr %0;\n" + : + : "m" (controlWord) + : ; + } + } + else return __warn_noop(); + } + else version (InlineX86Asm) + { + asm nothrow @nogc @trusted + { + ldmxcsr controlWord; + } + } + else + static assert(0, "Not yet supported"); +} +unittest +{ + _mm_setcsr(_mm_getcsr()); +} + +/// Set packed single-precision (32-bit) floating-point elements with the supplied values in reverse order. +__m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted +{ + pragma(inline, true); + + if (__ctfe) + { + __m128 r; + r.ptr[0] = e3; + r.ptr[1] = e2; + r.ptr[2] = e1; + r.ptr[3] = e0; + return r; + } + else + { + // This small = void here wins a bit in all optimization levels in GDC + // and in -O0 in LDC. + __m128 r = void; + r.ptr[0] = e3; + r.ptr[1] = e2; + r.ptr[2] = e1; + r.ptr[3] = e0; + return r; + } +} +unittest +{ + __m128 A = _mm_setr_ps(3, 2, 1, 546); + float[4] correct = [3.0f, 2.0f, 1.0f, 546.0f]; + assert(A.array == correct); + + // Very old LDC, like 1.17, cannot case __vector at CT + static if (__VERSION__ >= 2094) + { + static immutable B = _mm_setr_ps(3, 2, 1, 546); + enum C = _mm_setr_ps(3, 2, 1, 546); + } +} + +/// Return vector of type `__m128` with all elements set to zero. +__m128 _mm_setzero_ps() pure @trusted +{ + pragma(inline, true); + + // Note: for all compilers, this works best in debug builds, and in DMD -O + int4 r; + return cast(__m128)r; +} +unittest +{ + __m128 R = _mm_setzero_ps(); + float[4] correct = [0.0f, 0, 0, 0]; + assert(R.array == correct); +} + +/// Do a serializing operation on all store-to-memory instructions that were issued prior +/// to this instruction. Guarantees that every store instruction that precedes, in program order, +/// is globally visible before any store instruction which follows the fence in program order. 
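+///
+/// Typical pattern (illustrative sketch; `dest` is assumed to be a 16-byte aligned
+/// `float*` shared with a consumer thread):
+/// ---
+/// _mm_stream_ps(dest, _mm_set1_ps(1.0f)); // non-temporal store, bypasses the cache
+/// _mm_sfence();                           // publish the streamed data before signalling readers
+/// ---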
+void _mm_sfence() @trusted +{ + version(GNU) + { + static if (GDC_with_SSE) + { + __builtin_ia32_sfence(); + } + else version(X86) + { + asm pure nothrow @nogc @trusted + { + "sfence;\n" : : : ; + } + } + else return __warn_noop(); + } + else static if (LDC_with_SSE) + { + __builtin_ia32_sfence(); + } + else static if (DMD_with_asm) + { + // PERF: can't be inlined in DMD, probably because of that assembly. + asm nothrow @nogc pure @trusted + { + sfence; + } + } + else static if (LDC_with_ARM64) + { + __builtin_arm_dmb(10); // dmb ishst + } + else version(LDC) + { + // When the architecture is unknown, generate a full memory barrier, + // as the semantics of sfence do not really match those of atomics. + llvm_memory_fence(); + } + else + static assert(false); +} +unittest +{ + _mm_sfence(); +} + + +__m64 _mm_shuffle_pi16(int imm8)(__m64 a) pure @trusted +{ + // PERF DMD + D_SIMD + version(LDC) + { + return cast(__m64) shufflevectorLDC!(short4, ( (imm8 >> 0) & 3 ), + ( (imm8 >> 2) & 3 ), + ( (imm8 >> 4) & 3 ), + ( (imm8 >> 6) & 3 ))(cast(short4)a, cast(short4)a); + } + else + { + // GDC optimizes that correctly starting with -O2 + short4 sa = cast(short4)a; + short4 r = void; + r.ptr[0] = sa.array[ (imm8 >> 0) & 3 ]; + r.ptr[1] = sa.array[ (imm8 >> 2) & 3 ]; + r.ptr[2] = sa.array[ (imm8 >> 4) & 3 ]; + r.ptr[3] = sa.array[ (imm8 >> 6) & 3 ]; + return cast(__m64)r; + } +} +unittest +{ + __m64 A = _mm_setr_pi16(0, 1, 2, 3); + enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); + short4 B = cast(short4) _mm_shuffle_pi16!SHUFFLE(A); + short[4] expectedB = [ 3, 2, 1, 0 ]; + assert(B.array == expectedB); +} + +/// Shuffle single-precision (32-bit) floating-point elements in `a` and `b` using the control in `imm8`, +/// Warning: the immediate shuffle value `imm` is given at compile-time instead of runtime. +__m128 _mm_shuffle_ps(ubyte imm8)(__m128 a, __m128 b) pure @trusted +{ + static if (GDC_with_SSE) + { + return __builtin_ia32_shufps(a, b, imm8); + } + else static if (DMD_with_DSIMD) + { + return cast(__m128) __simd(XMM.SHUFPS, a, b, imm8); + } + else static if (LDC_with_optimizations) + { + return shufflevectorLDC!(__m128, imm8 & 3, (imm8>>2) & 3, + 4 + ((imm8>>4) & 3), 4 + ((imm8>>6) & 3) )(a, b); + } + else + { + float4 r = void; + r.ptr[0] = a.array[ (imm8 >> 0) & 3 ]; + r.ptr[1] = a.array[ (imm8 >> 2) & 3 ]; + r.ptr[2] = b.array[ (imm8 >> 4) & 3 ]; + r.ptr[3] = b.array[ (imm8 >> 6) & 3 ]; + return r; + } +} +unittest +{ + __m128 A = _mm_setr_ps(0, 1, 2, 3); + __m128 B = _mm_setr_ps(4, 5, 6, 7); + __m128 C = _mm_shuffle_ps!0x9c(A, B); + float[4] correct = [0.0f, 3, 5, 6]; + assert(C.array == correct); +} + +/// Compute the square root of packed single-precision (32-bit) floating-point elements in `a`. 
+__m128 _mm_sqrt_ps(__m128 a) @trusted +{ + static if (GDC_with_SSE) + { + return __builtin_ia32_sqrtps(a); + } + else static if (DMD_with_DSIMD) + { + return cast(__m128) __simd(XMM.SQRTPS, a); + } + else version(LDC) + { + // Disappeared with LDC 1.11 + static if (__VERSION__ < 2081) + return __builtin_ia32_sqrtps(a); + else + { + // PERF: use llvm_sqrt on the vector, works better + a[0] = llvm_sqrt(a[0]); + a[1] = llvm_sqrt(a[1]); + a[2] = llvm_sqrt(a[2]); + a[3] = llvm_sqrt(a[3]); + return a; + } + } + else + { + a.ptr[0] = sqrt(a.array[0]); + a.ptr[1] = sqrt(a.array[1]); + a.ptr[2] = sqrt(a.array[2]); + a.ptr[3] = sqrt(a.array[3]); + return a; + } +} +unittest +{ + __m128 A = _mm_sqrt_ps(_mm_set1_ps(4.0f)); + assert(A.array[0] == 2.0f); + assert(A.array[1] == 2.0f); + assert(A.array[2] == 2.0f); + assert(A.array[3] == 2.0f); +} + +/// Compute the square root of the lower single-precision (32-bit) floating-point element in `a`, store it in the lower +/// element, and copy the upper 3 packed elements from `a` to the upper elements of result. +__m128 _mm_sqrt_ss(__m128 a) @trusted +{ + static if (GDC_with_SSE) + { + return __builtin_ia32_sqrtss(a); + } + // PERF DMD + // TODO: enable when https://issues.dlang.org/show_bug.cgi?id=23437 is fixed for good + /*else static if (DMD_with_DSIMD) + { + return cast(__m128) __simd(XMM.SQRTSS, a); + }*/ + else version(LDC) + { + a.ptr[0] = llvm_sqrt(a.array[0]); + return a; + } + else + { + a.ptr[0] = sqrt(a.array[0]); + return a; + } +} +unittest +{ + __m128 A = _mm_sqrt_ss(_mm_set1_ps(4.0f)); + assert(A.array[0] == 2.0f); + assert(A.array[1] == 4.0f); + assert(A.array[2] == 4.0f); + assert(A.array[3] == 4.0f); +} + +/// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory. +/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. +void _mm_store_ps (float* mem_addr, __m128 a) pure +{ + pragma(inline, true); + __m128* aligned = cast(__m128*)mem_addr; + *aligned = a; +} + +deprecated("Use _mm_store1_ps instead") alias _mm_store_ps1 = _mm_store1_ps; /// + +/// Store the lower single-precision (32-bit) floating-point element from `a` into memory. +/// `mem_addr` does not need to be aligned on any particular boundary. +void _mm_store_ss (float* mem_addr, __m128 a) pure @safe +{ + pragma(inline, true); + *mem_addr = a.array[0]; +} +unittest +{ + float a; + _mm_store_ss(&a, _mm_set_ps(3, 2, 1, 546)); + assert(a == 546); +} + +/// Store the lower single-precision (32-bit) floating-point element from `a` into 4 contiguous elements in memory. +/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. +void _mm_store1_ps(float* mem_addr, __m128 a) pure @trusted // FUTURE: shouldn't be trusted, see #62 +{ + __m128* aligned = cast(__m128*)mem_addr; + static if (DMD_with_DSIMD) + { + __m128 r = cast(__m128) __simd(XMM.SHUFPS, a, a, 0); + } + else + { + __m128 r; // PERF =void; + r.ptr[0] = a.array[0]; + r.ptr[1] = a.array[0]; + r.ptr[2] = a.array[0]; + r.ptr[3] = a.array[0]; + } + *aligned = r; +} +unittest +{ + align(16) float[4] A; + _mm_store1_ps(A.ptr, _mm_set_ss(42.0f)); + float[4] correct = [42.0f, 42, 42, 42]; + assert(A == correct); +} + +/// Store the upper 2 single-precision (32-bit) floating-point elements from `a` into memory. 
+void _mm_storeh_pi(__m64* p, __m128 a) pure @trusted +{ + pragma(inline, true); + long2 la = cast(long2)a; + (*p).ptr[0] = la.array[1]; +} +unittest +{ + __m64 R = _mm_setzero_si64(); + long2 A = [13, 25]; + _mm_storeh_pi(&R, cast(__m128)A); + assert(R.array[0] == 25); +} + +/// Store the lower 2 single-precision (32-bit) floating-point elements from `a` into memory. +void _mm_storel_pi(__m64* p, __m128 a) pure @trusted +{ + pragma(inline, true); + long2 la = cast(long2)a; + (*p).ptr[0] = la.array[0]; +} +unittest +{ + __m64 R = _mm_setzero_si64(); + long2 A = [13, 25]; + _mm_storel_pi(&R, cast(__m128)A); + assert(R.array[0] == 13); +} + +/// Store 4 single-precision (32-bit) floating-point elements from `a` into memory in reverse order. +/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. +void _mm_storer_ps(float* mem_addr, __m128 a) pure @trusted // FUTURE should not be trusted +{ + __m128* aligned = cast(__m128*)mem_addr; + static if (DMD_with_DSIMD) + { + __m128 r = cast(__m128) __simd(XMM.SHUFPS, a, a, 27); + } + else + { + __m128 r; // PERF =void; + r.ptr[0] = a.array[3]; + r.ptr[1] = a.array[2]; + r.ptr[2] = a.array[1]; + r.ptr[3] = a.array[0]; + } + *aligned = r; +} +unittest +{ + align(16) float[4] A; + _mm_storer_ps(A.ptr, _mm_setr_ps(1.0f, 2, 3, 4)); + float[4] correct = [4.0f, 3.0f, 2.0f, 1.0f]; + assert(A == correct); +} + +/// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory. +/// `mem_addr` does not need to be aligned on any particular boundary. +void _mm_storeu_ps(float* mem_addr, __m128 a) pure @trusted // FUTURE should not be trusted, see #62 +{ + pragma(inline, true); + static if (DMD_with_DSIMD) + { + cast(void) __simd_sto(XMM.STOUPS, *cast(void16*)(cast(float*)mem_addr), a); + } + else static if (GDC_with_SSE) + { + __builtin_ia32_storeups(mem_addr, a); // better in -O0 + } + else static if (LDC_with_optimizations) + { + storeUnaligned!(float4)(a, mem_addr); + } + else + { + mem_addr[0] = a.array[0]; + mem_addr[1] = a.array[1]; + mem_addr[2] = a.array[2]; + mem_addr[3] = a.array[3]; + } +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2, 3, 4); + align(16) float[6] R = [0.0f, 0, 0, 0, 0, 0]; + float[4] correct = [1.0f, 2, 3, 4]; + _mm_storeu_ps(&R[1], A); + assert(R[1..5] == correct); +} + +/// Store 64-bits of integer data from `a` into memory using a non-temporal memory hint. +/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads. +void _mm_stream_pi (__m64* mem_addr, __m64 a) pure @trusted +{ + _mm_stream_si64(cast(long*)mem_addr, a.array[0]); +} + +/// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from +/// `a`s into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 16-byte +/// boundary or a general-protection exception may be generated. +/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads. +void _mm_stream_ps (float* mem_addr, __m128 a) +{ + // TODO report this bug: DMD generates no stream instruction when using D_SIMD + static if (GDC_with_SSE) + { + return __builtin_ia32_movntps(mem_addr, a); + } + else static if (LDC_with_InlineIREx && LDC_with_optimizations) + { + enum prefix = `!0 = !{ i32 1 }`; + enum ir = ` + store <4 x float> %1, <4 x float>* %0, align 16, !nontemporal !0 + ret void`; + LDCInlineIREx!(prefix, ir, "", void, __m128*, float4)(cast(__m128*)mem_addr, a); + + } + else + { + // Regular store instead. 
+ __m128* dest = cast(__m128*)mem_addr; + *dest = a; // it's a regular move instead + } +} +unittest +{ + align(16) float[4] A; + _mm_stream_ps(A.ptr, _mm_set1_ps(78.0f)); + assert(A[0] == 78.0f && A[1] == 78.0f && A[2] == 78.0f && A[3] == 78.0f); +} + +/// Subtract packed single-precision (32-bit) floating-point elements in `b` from packed single-precision (32-bit) +/// floating-point elements in `a`. +__m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe +{ + pragma(inline, true); + return a - b; +} +unittest +{ + __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; + a = _mm_sub_ps(a, a); + float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f]; + assert(a.array == correct); +} + +/// Subtract the lower single-precision (32-bit) floating-point element in `b` from the lower single-precision (32-bit) +/// floating-point element in `a`, store the subtration result in the lower element of result, and copy the upper 3 +/// packed elements from a to the upper elements of result. +__m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + return cast(__m128) __simd(XMM.SUBSS, a, b); + else static if (GDC_with_SSE) + return __builtin_ia32_subss(a, b); + else + { + a[0] -= b[0]; + return a; + } +} +unittest +{ + __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; + a = _mm_sub_ss(a, a); + float[4] correct = [0.0f, -2.0, 3.0f, 1.0f]; + assert(a.array == correct); +} + +/// Transpose the 4x4 matrix formed by the 4 rows of single-precision (32-bit) floating-point elements in row0, row1, +/// row2, and row3, and store the transposed matrix in these vectors (row0 now contains column 0, etc.). +void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe +{ + __m128 tmp3, tmp2, tmp1, tmp0; + tmp0 = _mm_unpacklo_ps(row0, row1); + tmp2 = _mm_unpacklo_ps(row2, row3); + tmp1 = _mm_unpackhi_ps(row0, row1); + tmp3 = _mm_unpackhi_ps(row2, row3); + row0 = _mm_movelh_ps(tmp0, tmp2); + row1 = _mm_movehl_ps(tmp2, tmp0); + row2 = _mm_movelh_ps(tmp1, tmp3); + row3 = _mm_movehl_ps(tmp3, tmp1); +} +unittest +{ + __m128 l0 = _mm_setr_ps(0, 1, 2, 3); + __m128 l1 = _mm_setr_ps(4, 5, 6, 7); + __m128 l2 = _mm_setr_ps(8, 9, 10, 11); + __m128 l3 = _mm_setr_ps(12, 13, 14, 15); + _MM_TRANSPOSE4_PS(l0, l1, l2, l3); + float[4] r0 = [0.0f, 4, 8, 12]; + float[4] r1 = [1.0f, 5, 9, 13]; + float[4] r2 = [2.0f, 6, 10, 14]; + float[4] r3 = [3.0f, 7, 11, 15]; + assert(l0.array == r0); + assert(l1.array == r1); + assert(l2.array == r2); + assert(l3.array == r3); +} + +// Note: the only difference between these intrinsics is the signalling +// behaviour of quiet NaNs. This is incorrect but the case where +// you would want to differentiate between qNaN and sNaN and then +// treat them differently on purpose seems extremely rare. +alias _mm_ucomieq_ss = _mm_comieq_ss; +alias _mm_ucomige_ss = _mm_comige_ss; +alias _mm_ucomigt_ss = _mm_comigt_ss; +alias _mm_ucomile_ss = _mm_comile_ss; +alias _mm_ucomilt_ss = _mm_comilt_ss; +alias _mm_ucomineq_ss = _mm_comineq_ss; + +/// Return vector of type `__m128` with undefined elements. +__m128 _mm_undefined_ps() pure @safe +{ + pragma(inline, true); + __m128 undef = void; + return undef; +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half `a` and `b`. 
+__m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @trusted +{ + // PERF GDC use intrinsic + static if (DMD_with_DSIMD) + { + return cast(__m128) __simd(XMM.UNPCKHPS, a, b); + } + else static if (LDC_with_optimizations) + { + enum ir = `%r = shufflevector <4 x float> %0, <4 x float> %1, <4 x i32> + ret <4 x float> %r`; + return LDCInlineIR!(ir, float4, float4, float4)(a, b); + } + else + { + __m128 r; // PERF =void; + r.ptr[0] = a.array[2]; + r.ptr[1] = b.array[2]; + r.ptr[2] = a.array[3]; + r.ptr[3] = b.array[3]; + return r; + } +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); + __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); + __m128 R = _mm_unpackhi_ps(A, B); + float[4] correct = [3.0f, 7.0f, 4.0f, 8.0f]; + assert(R.array == correct); +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of `a` and `b`. +__m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @trusted +{ + // PERF GDC use intrinsic + static if (DMD_with_DSIMD) + { + return cast(__m128) __simd(XMM.UNPCKLPS, a, b); + } + else static if (LDC_with_optimizations) + { + enum ir = `%r = shufflevector <4 x float> %0, <4 x float> %1, <4 x i32> + ret <4 x float> %r`; + return LDCInlineIR!(ir, float4, float4, float4)(a, b); + } + else + { + __m128 r; // PERF =void; + r.ptr[0] = a.array[0]; + r.ptr[1] = b.array[0]; + r.ptr[2] = a.array[1]; + r.ptr[3] = b.array[1]; + return r; + } +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); + __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); + __m128 R = _mm_unpacklo_ps(A, B); + float[4] correct = [1.0f, 5.0f, 2.0f, 6.0f]; + assert(R.array == correct); +} + +/// Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in `a` and `b`. +__m128 _mm_xor_ps (__m128 a, __m128 b) pure @safe +{ + static if (DMD_with_DSIMD) + { + return cast(__m128) __simd(XMM.XORPS, cast(void16) a, cast(void16) b); + } + else + { + return cast(__m128)(cast(__m128i)a ^ cast(__m128i)b); + } +} +unittest +{ + __m128 A = cast(__m128) _mm_set1_epi32(0x80000000); + __m128 B = _mm_setr_ps(4.0f, -5.0, -9.5f, float.infinity); + __m128 C = _mm_xor_ps(A, B); + float[4] correct = [-4.0f, 5.0, 9.5f, -float.infinity]; + assert(C.array == correct); +} + +private +{ + // Returns: `true` if the pointer is suitably aligned. + bool isPointerAligned(void* p, size_t alignment) pure + { + assert(alignment != 0); + return ( cast(size_t)p & (alignment - 1) ) == 0; + } + + // Returns: next pointer aligned with alignment bytes. 
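+    // For example, with alignment == 16 a start address of 0x1003 is rounded up to
+    // 0x1010, while an already aligned 0x1000 is returned unchanged.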
+ void* nextAlignedPointer(void* start, size_t alignment) pure + { + return cast(void*)nextMultipleOf(cast(size_t)(start), alignment); + } + + // Returns number of bytes to actually allocate when asking + // for a particular alignment + @nogc size_t requestedSize(size_t askedSize, size_t alignment) pure + { + enum size_t pointerSize = size_t.sizeof; + return askedSize + alignment - 1 + pointerSize * 3; + } + + // Store pointer given by malloc + size + alignment + @nogc void* storeRawPointerPlusInfo(void* raw, size_t size, size_t alignment) pure + { + enum size_t pointerSize = size_t.sizeof; + char* start = cast(char*)raw + pointerSize * 3; + void* aligned = nextAlignedPointer(start, alignment); + void** rawLocation = cast(void**)(cast(char*)aligned - pointerSize); + *rawLocation = raw; + size_t* sizeLocation = cast(size_t*)(cast(char*)aligned - 2 * pointerSize); + *sizeLocation = size; + size_t* alignmentLocation = cast(size_t*)(cast(char*)aligned - 3 * pointerSize); + *alignmentLocation = alignment; + assert( isPointerAligned(aligned, alignment) ); + return aligned; + } + + // Returns: x, multiple of powerOfTwo, so that x >= n. + @nogc size_t nextMultipleOf(size_t n, size_t powerOfTwo) pure nothrow + { + // check power-of-two + assert( (powerOfTwo != 0) && ((powerOfTwo & (powerOfTwo - 1)) == 0)); + + size_t mask = ~(powerOfTwo - 1); + return (n + powerOfTwo - 1) & mask; + } + + void* alignedReallocImpl(bool PreserveDataIfResized)(void* aligned, size_t size, size_t alignment) + { + // Calling `_mm_realloc`, `_mm_realloc_discard` or `realloc` with size 0 is + // Undefined Behavior, and not only since C23. + // Moreover, alignedReallocImpl was buggy about it. + assert(size != 0); + + if (aligned is null) + return _mm_malloc(size, alignment); + + assert(alignment != 0); + assert(isPointerAligned(aligned, alignment)); + + size_t previousSize = *cast(size_t*)(cast(char*)aligned - size_t.sizeof * 2); + size_t prevAlignment = *cast(size_t*)(cast(char*)aligned - size_t.sizeof * 3); + + // It is illegal to change the alignment across calls. + assert(prevAlignment == alignment); + + void* raw = *cast(void**)(cast(char*)aligned - size_t.sizeof); + size_t request = requestedSize(size, alignment); + size_t previousRequest = requestedSize(previousSize, alignment); + assert(previousRequest - request == previousSize - size); + + // Heuristic: if a requested size is within 50% to 100% of what is already allocated + // then exit with the same pointer + // PERF it seems like `realloc` should do that, not us. + if ( (previousRequest < request * 4) && (request <= previousRequest) ) + return aligned; + + void* newRaw = malloc(request); + if (request > 0 && newRaw == null) // realloc(0) can validly return anything + onOutOfMemoryError(); + + void* newAligned = storeRawPointerPlusInfo(newRaw, size, alignment); + + static if (PreserveDataIfResized) + { + size_t minSize = size < previousSize ? 
size : previousSize; + memcpy(newAligned, aligned, minSize); // ok to use memcpy: newAligned is into new memory, always different from aligned + } + + // Free previous data + _mm_free(aligned); + assert(isPointerAligned(newAligned, alignment)); + return newAligned; + } +} + +unittest +{ + assert(nextMultipleOf(0, 4) == 0); + assert(nextMultipleOf(1, 4) == 4); + assert(nextMultipleOf(2, 4) == 4); + assert(nextMultipleOf(3, 4) == 4); + assert(nextMultipleOf(4, 4) == 4); + assert(nextMultipleOf(5, 4) == 8); + + { + void* p = _mm_malloc(23, 16); + assert(p !is null); + assert(((cast(size_t)p) & 0xf) == 0); + _mm_free(p); + } + + void* nullAlloc = _mm_malloc(0, 32); + assert(nullAlloc != null); + _mm_free(nullAlloc); +} + +unittest +{ + // In C23, it is UB to call realloc with 0 size. + // Ensure this is not the case, ever. + + int alignment = 1; + void* alloc = _mm_malloc(18, alignment); + + // DO NOT DO THAT: + //_mm_realloc(alloc, 0, alignment); + + // DO THAT: + _mm_free(alloc); +} + + +// For some reason, order of declaration is important for this one +// so it is misplaced. +// Note: is just another name for _mm_cvtss_si32 +alias _mm_cvt_ss2si = _mm_cvtss_si32; diff --git a/src/gears/main.d b/src/gears/main.d index c500f64..0c59ddd 100644 --- a/src/gears/main.d +++ b/src/gears/main.d @@ -14,22 +14,6 @@ void main() r.Renderer rd = r.Init(&window); scope(exit) r.Destroy(&rd); - /* - Vec4 f1 = Vec4(r: 2.0, a: 5.5); - Vec4 f2; - - Vec4* f = &f1; - - asm - { - mov R8, f; - movups XMM0, f1.r.offsetof[R8]; - movups f2, XMM0; - } - - writeln(f2); - */ - while (true) { p.HandleEvents(&window); diff --git a/src/gears/renderer.d b/src/gears/renderer.d index 5cf0baf..667738c 100644 --- a/src/gears/renderer.d +++ b/src/gears/renderer.d @@ -32,6 +32,7 @@ enum Format : VkFormat RGBA_F32 = VK_FORMAT_R32G32B32A32_SFLOAT, RGBA_UINT = VK_FORMAT_B8G8R8A8_UINT, RGBA_UNORM = VK_FORMAT_R8G8B8A8_UNORM, + RGBA_SRGB = VK_FORMAT_R8G8B8A8_SRGB, } alias FMT = Format; @@ -101,9 +102,17 @@ struct Renderer PushConst push_const; + Vec3 camera_pos = Vec3(0.0); + Model yoder; } +struct Camera +{ + Vec3 pos = Vec3(0.0); + Vec3 target = Vec3(0.0); +} + struct GlobalUniforms { Vec2 res; @@ -128,17 +137,6 @@ extern(C) struct Material f32 shininess = 0.0; } -static assert(Material.ambient.offsetof == 0, "ambient offset incorrect"); -static assert(Material.diffuse.offsetof == 16, "ambient offset incorrect"); -static assert(Material.specular.offsetof == 32, "ambient offset incorrect"); -static assert(Material.albedo_texture.offsetof == 48, "ambient offset incorrect"); -static assert(Material.ambient_texture.offsetof == 52, "ambient offset incorrect"); -static assert(Material.specular_texture.offsetof == 56, "ambient offset incorrect"); -static assert(Material.albedo_has_texture.offsetof == 60, "ambient offset incorrect"); -static assert(Material.ambient_has_texture.offsetof == 64, "ambient offset incorrect"); -static assert(Material.specular_has_texture.offsetof == 68, "ambient offset incorrect"); -static assert(Material.shininess.offsetof == 72, "ambient offset incorrect"); - struct UIVertex { Vec2 p0; @@ -244,7 +242,7 @@ Cycle(Renderer* rd) SetUniform(rd, &rd.globals); - DrawRect(rd, 150.0, 300.0, 500.0, 700.0, Vec4(r: 0.0, g: 0.0, b: 1.0, a: 1.0)); + DrawRect(rd, 150.0, 300.0, 500.0, 700.0, Vec4(0.0, 0.0, 1.0, 1.0)); PrepComputeDrawImage(rd); diff --git a/src/gears/vulkan.d b/src/gears/vulkan.d index d26bf6b..8b97560 100644 --- a/src/gears/vulkan.d +++ b/src/gears/vulkan.d @@ -732,7 +732,7 @@ CreateImageView(Vulkan* vk, 
ImageView* view, u32 w, u32 h, u32 ch, u8[] data) assert(Transfer(vk, &buf, data), "CreateImageView failure: Buffer Transfer error"); ImageView conv_view; - CreateImageView(vk, &conv_view, w, h, VK_FORMAT_R32G32B32A32_SFLOAT); + CreateImageView(vk, &conv_view, w, h, FMT.RGBA_F32); WriteConvDescriptor(vk, &buf); WriteConvDescriptor(vk, &conv_view); @@ -783,10 +783,12 @@ CreateImageView(Vulkan* vk, ImageView* view, u32 w, u32 h, u32 ch, u8[] data) FinishComputePass(vk); - vkWaitForFences(vk.device, 1, &vk.comp_fence, VK_TRUE, 1000000000); + vkWaitForFences(vk.device, 1, &vk.comp_fence, VK_TRUE, u64.max); - //Destroy(vk, &buf); - //Destroy(&conv_view, vk.device, vk.vma); + vkQueueWaitIdle(vk.tfer_queue); + + Destroy(vk, &buf); + Destroy(&conv_view, vk.device, vk.vma); } } @@ -834,7 +836,7 @@ FinishComputePass(Vulkan* vk) } pragma(inline): void -CreateImageView(Vulkan* vk, ImageView* view, u32 w, u32 h, VkFormat format = VK_FORMAT_R8G8B8A8_SRGB) +CreateImageView(Vulkan* vk, ImageView* view, u32 w, u32 h, Format format = FMT.RGBA_UNORM) { VmaAllocationCreateInfo alloc_info = { usage: VMA_MEMORY_USAGE_GPU_ONLY, @@ -849,7 +851,7 @@ CreateImageView(Vulkan* vk, ImageView* view, u32 w, u32 h, VkFormat format = VK_ format: format, tiling: VK_IMAGE_TILING_OPTIMAL, initialLayout: VK_IMAGE_LAYOUT_UNDEFINED, - usage: format == VK_FORMAT_R8G8B8A8_SRGB ? (VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT) : (VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT), + usage: format == FMT.RGBA_F32 ? (VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT) : (VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT), samples: VK_SAMPLE_COUNT_1_BIT, extent: { width: w, diff --git a/src/shared/aliases.d b/src/shared/aliases.d index b6d1136..1e89946 100644 --- a/src/shared/aliases.d +++ b/src/shared/aliases.d @@ -1,5 +1,6 @@ import core.memory; import std.stdint; +import dplug.math; debug { @@ -28,3 +29,10 @@ alias b32 = uint; alias intptr = intptr_t; alias uintptr = uintptr_t; +alias Vec2 = vec2f; +alias Vec3 = vec3f; +alias Vec4 = vec4f; + +alias Mat2 = mat2f; +alias Mat3 = mat3f; +alias Mat4 = mat4f; diff --git a/src/shared/util.d b/src/shared/util.d index a29e7b5..95ee887 100644 --- a/src/shared/util.d +++ b/src/shared/util.d @@ -359,38 +359,3 @@ Hash(string str) return xxh3_64bits_withSeed(str.ptr, str.length, HASH_SEED); } -struct Matrix(T, int S) -{ - T[S][S] m; - alias m this; -} - -alias Mat2 = Matrix!(f32, 2); -alias Mat3 = Matrix!(f32, 3); -alias Mat4 = Matrix!(f32, 4); - -struct Vector(T, int S) -{ - union - { - struct - { - T r = 0.0; - T g = 0.0; - static if (S > 2) T b = 0.0; - static if (S > 3) T a = 0.0; - }; - struct - { - T x; - T y; - static if (S > 2) T z; - static if (S > 3) T w; - }; - T[S] v; - } -} - -alias Vec2 = Vector!(f32, 2); -alias Vec3 = Vector!(f32, 3); -alias Vec4 = Vector!(f32, 4);