/**
* `core.simd` emulation layer.
*
* Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
*            cet 2024.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.types;


pure:
nothrow:
@nogc:

// Compile-time capability flags.
//
// For each vector width (MMX-sized = 8 bytes, SSE-sized = 16 bytes,
// AVX-sized = 32 bytes) the matching `*SizedVectorsAreEmulated` enum tells
// the rest of this module whether the vector types of that width are plain
// structs defined here (`true`) or vectors provided by the compiler (`false`).
// Every branch below must define all three enums.
version(GNU)
{
    // Note: for GDC support, be sure to use https://explore.dgnu.org/

    // Future: just detect vectors, do not base upon arch.

    version(X86_64)
    {
        enum MMXSizedVectorsAreEmulated = false;
        enum SSESizedVectorsAreEmulated = false;

        // Does GDC support AVX-sized vectors?
        static if (__VERSION__ >= 2100) // Starting at GDC 12.1 only.
        {
            // Probe the compiler: emulate only if a 32-byte vector type does not exist.
            enum AVXSizedVectorsAreEmulated = !(is(__vector(double[4])));
        }
        else
        {
            enum AVXSizedVectorsAreEmulated = true;
        }

        import gcc.builtins;
    }
    else
    {
        // GDC on any other architecture: everything is emulated.
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = true;
        enum AVXSizedVectorsAreEmulated = true;
    }
}
else version(LDC)
{
    public import ldc.simd;

    // Use this alias to mention it should only be used with LDC,
    // for example when emulated shufflevector would just be wasteful.
    alias shufflevectorLDC = shufflevector;

    enum MMXSizedVectorsAreEmulated = false;
    enum SSESizedVectorsAreEmulated = false;
    enum AVXSizedVectorsAreEmulated = false;
}
else version(DigitalMars)
{
    public import core.simd;

    static if (__VERSION__ >= 2100)
    {
        // Note: turning this true is very desirable for DMD performance,
        // but also leads to many bugs being discovered upstream.
        // The fact that it works at all relies on many workarounds.
        // In particular intel-intrinsics with this "on" is a honeypot for DMD backend bugs,
        // and a very strong DMD codegen test suite.
        // What happens typically is that contributors end up on a DMD bug in their PR.
        // But finally, in 2022 D_SIMD has been activated, at least for SSE and some instructions.
        enum bool tryToEnableCoreSimdWithDMD = true;
    }
    else
    {
        enum bool tryToEnableCoreSimdWithDMD = false;
    }

    version(D_SIMD)
    {
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = !tryToEnableCoreSimdWithDMD;

        // Note: with DMD, AVX-sized vectors can't be enabled yet.
        // On linux + x86_64, this will fail since a few operands seem to be missing.
        // FUTURE: enable AVX-sized vectors in DMD. :)
        //
        // Blockers: https://issues.dlang.org/show_bug.cgi?id=24283 and 24284
        //           Probably other, unreported issues.
        //
        // Both branches are deliberately `true` for now; the D_AVX split is the
        // place to flip once the blockers above are resolved.
        version(D_AVX)
            enum AVXSizedVectorsAreEmulated = true;
        else
            enum AVXSizedVectorsAreEmulated = true;
    }
    else
    {
        // Some DMD 32-bit targets don't have D_SIMD
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = true;
        enum AVXSizedVectorsAreEmulated = true;
    }
}
else
{
    // Unknown compiler: nothing is known about its native vector support, so
    // emulate every vector size. Without this branch the three enums would be
    // undeclared and the module would fail with an "undefined identifier" error
    // where they are first used.
    enum MMXSizedVectorsAreEmulated = true;
    enum SSESizedVectorsAreEmulated = true;
    enum AVXSizedVectorsAreEmulated = true;
}

// True as soon as at least one of the three vector sizes is emulated.
// Used right below to decide whether the `VectorOps` mixin template is needed.
enum CoreSimdIsEmulated = MMXSizedVectorsAreEmulated || SSESizedVectorsAreEmulated || AVXSizedVectorsAreEmulated;

static if (CoreSimdIsEmulated)
{
    // core.simd is emulated in some capacity: introduce `VectorOps`

    /// Operators and constructors shared by every emulated vector struct.
    ///
    /// The host struct must declare a member named `array` of type `ArrayType`;
    /// every method below reads or writes that member.
    ///
    /// Params:
    ///     VectorType = the struct this template is mixed into.
    ///     ArrayType  = the static array type of the `array` member (`BaseType[N]`).
    ///     BaseType   = element type, deduced from `ArrayType`.
    ///     N          = number of elements, deduced from `ArrayType`.
    mixin template VectorOps(VectorType, ArrayType: BaseType[N], BaseType, size_t N)
    {
        /// Number of elements in the vector.
        enum Count = N;

        /// Element type of the vector.
        alias Base = BaseType;

        /// Returns: a pointer to the first element of `array`.
        BaseType* ptr() return pure nothrow @nogc
        {
            return array.ptr;
        }

        // Unary operators
        // Applies `op` to the whole `array` with an array operation and returns
        // the result as a new vector.
        VectorType opUnary(string op)() pure nothrow @safe @nogc
        {
            // No default initialization: the slice assignment below writes every element.
            VectorType res = void;
            mixin("res.array[] = " ~ op ~ "array[];");
            return res;
        }

        // Binary operators
        // Combines `array` and `other.array` element-wise with `op` and returns
        // the result as a new vector; neither operand is modified.
        VectorType opBinary(string op)(VectorType other) pure const nothrow @safe @nogc
        {
            // No default initialization: the slice assignment below writes every element.
            VectorType res = void;
            mixin("res.array[] = array[] " ~ op ~ " other.array[];");
            return res;
        }

        // Assigning a BaseType value
        // Every element is set to `e`.
        void opAssign(BaseType e) pure nothrow @safe @nogc
        {
            array[] = e;
        }

        // Assigning a static array
        void opAssign(ArrayType v) pure nothrow @safe @nogc
        {
            array[] = v[];
        }

        // Compound assignment (`+=`, `-=`, ...): updates `array` in place,
        // element-wise, using `other.array` as the right-hand operand.
        void opOpAssign(string op)(VectorType other) pure nothrow @safe @nogc
        {
            mixin("array[] "  ~ op ~ "= other.array[];");
        }

        // Constructing from a static array (`ArrayType`): elements are copied one-to-one.
        this(ArrayType v) pure nothrow @safe @nogc
        {
            array[] = v[];
        }

        // Broadcast constructor
        // Every element is set to `x`.
        this(BaseType x) pure nothrow @safe @nogc
        {
            array[] = x;
        }

        /// We can't support implicit conversion but do support explicit casting.
        /// "Vector types of the same size can be implicitly converted among each other."
        /// Casting to another vector type is always just a raw copy.
        ///
        /// Only instantiable when `VecDest` has the same size as `VectorType`.
        VecDest opCast(VecDest)() pure const nothrow @trusted @nogc
            if (VecDest.sizeof == VectorType.sizeof)
            {
                VecDest dest = void;
                // Copy
                // `array` is first viewed as raw bytes (`void[VectorType.sizeof]`), then as
                // the destination's array type, so the bits are moved without conversion.
                dest.array[] = (cast(typeof(dest.array))cast(void[VectorType.sizeof])array)[];
                return dest;
            }

        /// Returns: a reference to element `i` of `array`, with the same
        /// mutability as the vector it is called on.
        ref inout(BaseType) opIndex(size_t i) inout return pure nothrow @safe @nogc
        {
            return array[i];
        }

    }
}
else
{
    // No vector size is emulated: re-export the vector types of core.simd.
    public import core.simd;

    // GDC cannot implicitly convert a __vector from signed to unsigned, but LDC can.
    // And GDC sometimes needs those unsigned vector types for some intrinsics.
    // For internal use only.
    package alias ushort8 = Vector!(ushort[8]);
    package alias ubyte8  = Vector!(ubyte[8]);
    package alias ubyte16 = Vector!(ubyte[16]);

    // The 32-byte unsigned variants are only declared when AVX-sized vectors are native.
    static if (!AVXSizedVectorsAreEmulated)
    {
        package alias ushort16 = Vector!(ushort[16]);
        package alias ubyte32  = Vector!(ubyte[32]);
    }
}

// Emulate ldc.simd cmpMask and other masks.
// Note: these should be deprecated on non-LDC, 
// since it's slower to generate that code.
version(LDC)
{} 
else
{
    // TODO: deprecate these and write plain versions instead

    /// Element type of the vector type `V`, i.e. the type of `v.array[0]` for a `V v`.
    private template BaseType(V)
    {
        // The immediately-called function literal produces a value of type `V`
        // whose `array` member can be inspected inside `typeof`.
        alias BaseType = typeof(({ V v; return v; }()).array[0]);
    }

    /// Per-element "true" value for the vector type `V`, as stored by the mask
    /// functions of this module when a comparison succeeds.
    /// For `float` and `double` elements it is the value whose bit pattern is all
    /// ones; for every other element type it is `-1`.
    private template TrueMask(V)
    {
        alias Elem = BaseType!V;

        static if (is(Elem == float))
        {
            // Reinterpret the 32-bit pattern 0xffffffff as a float (no numeric conversion).
            immutable uint m1 = 0xffffffff;
            enum Elem TrueMask = *cast(float*)(&m1);
        }
        else static if (is(Elem == double))
        {
            // Same idea with a 64-bit pattern reinterpreted as a double.
            immutable ulong m1 = 0xffffffff_ffffffff;
            enum Elem TrueMask = *cast(double*)(&m1);
        }
        else // integer case
        {
            enum Elem TrueMask = -1;
        }
    }

    /// Element-wise equality test.
    ///
    /// Each element of the result is `TrueMask!Vec` where the matching elements
    /// of `a` and `b` compare equal, and zero otherwise.
    /// For floats, equivalent to an "oeq" comparison.
    Vec equalMask(Vec)(Vec a, Vec b) @trusted
    {
        enum size_t laneCount = Vec.array.length;
        Vec mask;
        for (int lane = 0; lane < laneCount; ++lane)
        {
            if (a.array[lane] == b.array[lane])
                mask.ptr[lane] = TrueMask!Vec;
            else
                mask.ptr[lane] = 0;
        }
        return mask;
    }

    /// Element-wise "greater than" test.
    ///
    /// Each element of the result is `TrueMask!Vec` where the element of `a` is
    /// greater than the matching element of `b`, and zero otherwise.
    /// For floats, equivalent to an "ogt" comparison.
    Vec greaterMask(Vec)(Vec a, Vec b) @trusted
    {
        enum size_t laneCount = Vec.array.length;
        Vec mask;
        for (int lane = 0; lane < laneCount; ++lane)
        {
            if (a.array[lane] > b.array[lane])
                mask.ptr[lane] = TrueMask!Vec;
            else
                mask.ptr[lane] = 0;
        }
        return mask;
    }
}

// greaterMask must produce all-ones elements where a > b and zero elements elsewhere.
unittest
{
    float4 lhs = [1, 3, 5, 7];
    float4 rhs = [2, 3, 4, 5];
    int4 mask = cast(int4)(greaterMask!float4(lhs, rhs));

    // Only the last two elements satisfy lhs > rhs; as an int, all bits set reads -1.
    static immutable int[4] expected = [0, 0, -1, -1];
    assert(mask.array == expected);
}

// 8-byte ("MMX-sized") vector types.
// When emulated, each type is a struct wrapping a fixed-size `array` and gets
// its operators from the `VectorOps` mixin; otherwise each name aliases a
// `core.simd` `Vector` instantiation.
static if (MMXSizedVectorsAreEmulated)
{
    /// MMX-like SIMD types
    struct float2
    {
        float[2] array;
        mixin VectorOps!(float2, float[2]);
    }

    struct byte8
    {
        byte[8] array;
        mixin VectorOps!(byte8, byte[8]);
    }

    struct short4
    {
        short[4] array;
        mixin VectorOps!(short4, short[4]);
    }

    struct int2
    {
        int[2] array;
        mixin VectorOps!(int2, int[2]);
    }

    struct long1
    {
        long[1] array;
        mixin VectorOps!(long1, long[1]);
    }
}
else
{
    // For this compiler, defining MMX-sized vectors is working.
    public import core.simd;
    alias long1 = Vector!(long[1]);
    alias float2 = Vector!(float[2]);
    alias int2 = Vector!(int[2]);
    alias short4 = Vector!(short[4]);
    alias byte8 = Vector!(byte[8]);
}

// Whichever definition was selected, every MMX-sized type must be exactly 8 bytes.
static assert(float2.sizeof == 8);
static assert(byte8.sizeof == 8);
static assert(short4.sizeof == 8);
static assert(int2.sizeof == 8);
static assert(long1.sizeof == 8);


// 16-byte ("SSE-sized") vector types.
// When emulated, each type is a struct wrapping a fixed-size `array` and gets
// its operators from the `VectorOps` mixin.
// NOTE(review): unlike the MMX and AVX sections there is no `else` branch here;
// when SSE-sized vectors are native, these names presumably come from a
// `core.simd` / `ldc.simd` import made earlier in this file -- confirm.
static if (SSESizedVectorsAreEmulated)
{
    /// SSE-like SIMD types

    struct float4
    {
        float[4] array;
        mixin VectorOps!(float4, float[4]);
    }

    struct byte16
    {
        byte[16] array;
        mixin VectorOps!(byte16, byte[16]);
    }

    struct short8
    {
        short[8] array;
        mixin VectorOps!(short8, short[8]);
    }

    struct int4
    {
        int[4] array;
        mixin VectorOps!(int4, int[4]);
    }

    struct long2
    {
        long[2] array;
        mixin VectorOps!(long2, long[2]);
    }

    struct double2
    {
        double[2] array;
        mixin VectorOps!(double2, double[2]);
    }
}

// Whichever definition is in scope, every SSE-sized type must be exactly 16 bytes.
static assert(float4.sizeof == 16);
static assert(byte16.sizeof == 16);
static assert(short8.sizeof == 16);
static assert(int4.sizeof == 16);
static assert(long2.sizeof == 16);
static assert(double2.sizeof == 16);


// 32-byte ("AVX-sized") vector types.
// When emulated, each type is a struct wrapping a fixed-size `array` and gets
// its operators from the `VectorOps` mixin; otherwise `core.simd` is
// re-exported and is expected to provide these names.
static if (AVXSizedVectorsAreEmulated)
{
    /// AVX-like SIMD types

    struct float8
    {
        float[8] array;
        mixin VectorOps!(float8, float[8]);
    }

    struct byte32
    {
        byte[32] array;
        mixin VectorOps!(byte32, byte[32]);
    }

    struct short16
    {
        short[16] array;
        mixin VectorOps!(short16, short[16]);
    }

    struct int8
    {
        int[8] array;
        mixin VectorOps!(int8, int[8]);
    }

    struct long4
    {
        long[4] array;
        mixin VectorOps!(long4, long[4]);
    }

    struct double4
    {
        double[4] array;
        mixin VectorOps!(double4, double[4]);
    }
}
else
{
    public import core.simd;
}
// Whichever definition is in scope, every AVX-sized type must be exactly 32 bytes.
static assert(float8.sizeof == 32);
static assert(byte32.sizeof == 32);
static assert(short16.sizeof == 32);
static assert(int8.sizeof == 32);
static assert(long4.sizeof == 32);
static assert(double4.sizeof == 32);


// `__m*` names for the vector types above: the 256-bit group maps to the
// 32-byte types, the 128-bit group to the 16-byte types, and `__m64` to the
// 8-byte `long1`.
alias __m256 = float8;
alias __m256i = long4; // long long __vector with ICC, GCC, and clang
alias __m256d = double4;
alias __m128 = float4;
alias __m128i = int4;
alias __m128d = double2;
alias __m64 = long1; // like in Clang, __m64 is a vector of 1 long

/// Packs two 1-bit selectors into a 2-bit immediate.
///
/// Params:
///     x = selector stored in bit 1; must be 0 or 1.
///     y = selector stored in bit 0; must be 0 or 1.
/// Returns: `(x << 1) | y`.
int _MM_SHUFFLE2(int x, int y) pure @safe
{
    assert(0 <= x && x <= 1);
    assert(0 <= y && y <= 1);
    immutable int highBit = x << 1;
    return highBit | y;
}

/// Packs four 2-bit selectors into an 8-bit immediate.
///
/// Params:
///     z = selector stored in bits 7..6; must be in 0..3.
///     y = selector stored in bits 5..4; must be in 0..3.
///     x = selector stored in bits 3..2; must be in 0..3.
///     w = selector stored in bits 1..0; must be in 0..3.
/// Returns: `(z<<6) | (y<<4) | (x<<2) | w`.
int _MM_SHUFFLE(int z, int y, int x, int w) pure @safe
{
    assert(0 <= x && x <= 3);
    assert(0 <= y && y <= 3);
    assert(0 <= z && z <= 3);
    assert(0 <= w && w <= 3);

    // Assemble from the lowest field upwards; the fields do not overlap.
    int imm = w;
    imm |= x << 2;
    imm |= y << 4;
    imm |= z << 6;
    return imm;
}

// Initializing a vector from a scalar must copy that scalar into every element.
unittest
{
    // 16-byte float vector
    float4 broadcastF = 3.0f;
    float[4] expectedF = [3.0f, 3.0f, 3.0f, 3.0f];
    assert(broadcastF.array == expectedF);

    // 8-byte int vector
    int2 broadcastI = 42;
    int[2] expectedI = [42, 42];
    assert(broadcastI.array == expectedI);
}