module dlib.math;

import dlib.aliases;
import dlib.util;

import std.traits;
import std.math.traits;
import std.meta;

public import std.math.rounding : round;
public import core.math : sqrt, cos, sin;

enum PI = 3.14159L;

const u16[128] _RSQRT_TABLE = [
	0xb451, 0xb2f0, 0xb196, 0xb044, 0xaef9, 0xadb6, 0xac79, 0xab43,
	0xaa14, 0xa8eb, 0xa7c8, 0xa6aa, 0xa592, 0xa480, 0xa373, 0xa26b,
	0xa168, 0xa06a, 0x9f70, 0x9e7b, 0x9d8a, 0x9c9d, 0x9bb5, 0x9ad1,
	0x99f0, 0x9913, 0x983a, 0x9765, 0x9693, 0x95c4, 0x94f8, 0x9430,
	0x936b, 0x92a9, 0x91ea, 0x912e, 0x9075, 0x8fbe, 0x8f0a, 0x8e59,
	0x8daa, 0x8cfe, 0x8c54, 0x8bac, 0x8b07, 0x8a64, 0x89c4, 0x8925,
	0x8889, 0x87ee, 0x8756, 0x86c0, 0x862b, 0x8599, 0x8508, 0x8479,
	0x83ec, 0x8361, 0x82d8, 0x8250, 0x81c9, 0x8145, 0x80c2, 0x8040,
	0xff02, 0xfd0e, 0xfb25, 0xf947, 0xf773, 0xf5aa, 0xf3ea, 0xf234,
	0xf087, 0xeee3, 0xed47, 0xebb3, 0xea27, 0xe8a3, 0xe727, 0xe5b2,
	0xe443, 0xe2dc, 0xe17a, 0xe020, 0xdecb, 0xdd7d, 0xdc34, 0xdaf1,
	0xd9b3, 0xd87b, 0xd748, 0xd61a, 0xd4f1, 0xd3cd, 0xd2ad, 0xd192,
	0xd07b, 0xcf69, 0xce5b, 0xcd51, 0xcc4a, 0xcb48, 0xca4a, 0xc94f,
	0xc858, 0xc764, 0xc674, 0xc587, 0xc49d, 0xc3b7, 0xc2d4, 0xc1f4,
	0xc116, 0xc03c, 0xbf65, 0xbe90, 0xbdbe, 0xbcef, 0xbc23, 0xbb59,
	0xba91, 0xb9cc, 0xb90a, 0xb84a, 0xb78c, 0xb6d0, 0xb617, 0xb560,
];

union Union64
{
	f64 f;
	u64 u;
}

union Union32
{
	f32 f;
	u32 u;
}

auto
AsUint(T)(T v) if(isFloatingPoint!(T))
{
	static if(is(T == f32))
	{
		Union32 u = { f: v };
	}
	else
	{
		Union64 u = { f: v };
	}

	return u.u;
}

auto
AsFloat(T)(T v) if(isIntegral!(T))
{
	static if(T.sizeof == 4)
	{
		Union32 u = { u: v };
	}
	else
	{
		Union64 u = { u: v };
	}

	return u.f;
}

T
_MathInvalid(T)(T v) if(isFloatingPoint!(T))
{
	return (v - v) / (v - v);
}

u32
Mul32(T)(T a, T b) if(isIntegral!(T))
{
	return cast(u32)(cast(u64)(a*b) >> 32);
}

u64
Mul64(u64 a, u64 b)
{
	u64 ahi = a>>32;
	u64 alo = a&0xffffffff;
	u64 bhi = b>>32;
	u64 blo = b&0xffffffff;
	return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32);
}

/*
T
Sqrt(T)(T x) if(isFloatingPoint!(T))
{
	u64 ix, top, m;

	ix  = AsUint(x);
	top = ix >> 52;
	if (top - 0x001 >= 0x7ff - 0x001) {
		if (ix * 2 == 0)
		{
			return x;
		}
		if (ix == 0x7ff0000000000000)
		{
			return x;
		}
		if (ix > 0x7ff0000000000000)
		{
			return _MathInvalid(x);
		}
		ix   = AsUint(x * 0x1p52);
		top  = ix >> 52;
		top -= 52;
	}

	i32 even = top & 1;
	m = (ix << 11) | 0x8000000000000000;
	if (even)
	{
		m >>= 1;
	} 

	top = (top + 0x3ff) >> 1;

	static const u64 three = 0xc0000000;
	u64 r, s, d, u, i;

	i = (ix >> 46) % 128;
	r = cast(u32)(_RSQRT_TABLE[cast(usize)i] << 16);
	s = Mul32(m>>32, r);
	d = Mul32(s, r);
	u = three - d;
	r = Mul32(r, u) << 1;
	s = Mul32(s, u) << 1;
	d = Mul32(s, r);
	u = three - d;
	r = Mul32(r, u) << 1;
	r = r << 32;
	s = Mul64(m, r);
	d = Mul64(s, r);
	u = (three<<32) - d;
	s = Mul64(s, u);  
	s = (s - 2) >> 9; 

	u64 d0, d1, d2;
	f64 y, t;
	d0 = (m << 42) - s*s;
	d1 = s - d0;
	d2 = d1 + s + 1;
	s += d1 >> 63;
	s &= 0x000fffffffffffff;
	s |= top << 52;
	y = AsFloat(s);
	
	static if(true) 
	{
		u64 tiny = d2==0 ? 0 : 0x0010000000000000;
		tiny |= (d1^d2) & 0x8000000000000000;
		t = AsFloat(tiny);
		y = cast(f64)(y + t);
	}

	return y;
}
*/

T
Abs(T)(T x) if(isSigned!(T))
{
	static if(isFloatingPoint!(T))
	{
		static if(is(T == f32)) 
		{
			Union32 f = { f: x };
		}
		else
		{
			Union64 f = { f: x };
		}
		
		f.u &= cast(typeof(f.u))-1 / 2;

		x = f.f;
	}
	else
	{
		x = x >= 0 ? x : -x;
	}

	return x;
}

T
Ceil(T)(T x) if(is(T == f32) || is(T == f64))
{
	if(!isNaN(x) && !isInfinity(x) && x != 0.0)
	{
		x = _Floor(x+1.0);
	}

	return x;
}

T
Floor(T)(T x) if(is(T == f32) || is(T == f64))
{
	if(!isNaN(x) && !isInfinity(x) && x != 0.0)
	{
		x = _Floor(x);
	}

	return x;
}

T 
_Floor(T)(const T x) @trusted pure nothrow @nogc
{
	alias F = floatTraits!(T);
	// Take care not to trigger library calls from the compiler,
	// while ensuring that we don't get defeated by some optimizers.
	union floatBits
	{
		T rv;
		ushort[T.sizeof/2] vu;

		// Other kinds of extractors for real formats.
		static if (F.realFormat == RealFormat.ieeeSingle)
			uint vi;
		else static if (F.realFormat == RealFormat.ieeeDouble)
			ulong vi;
	}

	floatBits y = void;
	y.rv = x;

	// Find the exponent (power of 2)
	// Do this by shifting the raw value so that the exponent lies in the low bits,
	// then mask out the sign bit, and subtract the bias.
	static if (F.realFormat == RealFormat.ieeeSingle)
	{
		int exp = ((y.vi >> (T.mant_dig - 1)) & 0xff) - 0x7f;
		enum mantissa_mask = F.MANTISSAMASK_INT;
		enum sign_shift = 31;
	}
	else static if (F.realFormat == RealFormat.ieeeDouble)
	{
		long exp = ((y.vi >> (T.mant_dig - 1)) & 0x7ff) - 0x3ff;
		enum mantissa_mask = F.MANTISSAMASK_LONG;
		enum sign_shift = 63;
	}
	else static if (F.realFormat == RealFormat.ieeeExtended ||
									F.realFormat == RealFormat.ieeeExtended53)
	{
		int exp = (y.vu[F.EXPPOS_SHORT] & 0x7fff) - 0x3fff;

		version (LittleEndian)
			int pos = 0;
		else
			int pos = 4;
	}
	else static if (F.realFormat == RealFormat.ieeeQuadruple)
	{
		int exp = (y.vu[F.EXPPOS_SHORT] & 0x7fff) - 0x3fff;

		version (LittleEndian)
			int pos = 0;
		else
			int pos = 7;
	}
	else
			static assert(false, "Not implemented for this architecture");

	if (exp < 0)
	{
		if (x < 0.0)
			return -1.0;
		else
			return 0.0;
	}

	static if (F.realFormat == RealFormat.ieeeSingle ||
						 F.realFormat == RealFormat.ieeeDouble)
	{
		if (exp < (T.mant_dig - 1))
		{
			// Clear all bits representing the fraction part.
			// Note: the fraction mask represents the floating point number 0.999999...
			// i.e: `2.0 ^^ (exp - T.mant_dig + 1) * (fraction_mask + 1) == 1.0`
			const fraction_mask = mantissa_mask >> exp;

			if ((y.vi & fraction_mask) != 0)
			{
				// If 'x' is negative, then first substract (1.0 - T.epsilon) from the value.
				if (y.vi >> sign_shift)
					y.vi += fraction_mask;
				y.vi &= ~fraction_mask;
			}
		}
	}
	else
	{
		static if (F.realFormat == RealFormat.ieeeExtended53)
			exp = (T.mant_dig + 11 - 1) - exp; // mant_dig is really 64
		else
			exp = (T.mant_dig - 1) - exp;

		// Zero 16 bits at a time.
		while (exp >= 16)
		{
			version (LittleEndian)
				y.vu[pos++] = 0;
			else
				y.vu[pos--] = 0;
			exp -= 16;
		}

		// Clear the remaining bits.
		if (exp > 0)
			y.vu[pos] &= 0xffff ^ ((1 << exp) - 1);

		if ((x < 0.0) && (x != y.rv))
			y.rv -= 1.0;
	}

	return y.rv;
}

T
_MinOrMaxInner(T, bool max, Args...)(T first, Args args)
{
	T value = first;
	static foreach(i; 0 .. args.length)
	{
		static if(max)  if(cast(T)(args[i]) > value) value = cast(T)(args[i]);
		static if(!max) if(cast(T)(args[i]) < value) value = cast(T)(args[i]);
	}

	return value;
}

T
Max(T, Args...)(T first, Args args)
{
	return _MinOrMaxInner!(T, true, Args)(first, args);
}

T
Min(T, Args...)(T first, Args args)
{
	return _MinOrMaxInner!(T, false, Args)(first, args);
}

T
RoundUp(T)(T a, T b)
{
	a += b - 1;
	a -= a%b;
	return a;
}

T AlignPow2(T)(T v, T a)
{
	return (v + a - 1) & ~(a - 1);
}

f32 Radians(f32 deg)
{
	return deg * (PI / 180.0);
}

struct Vector(T, int N)
{
	static assert(N > 1 && N <= 4);

	enum _N = N;
	alias T _T;

	union
	{
		T[N] v = 0;
		struct
		{
			T x;
			alias x r;

			T y;
			alias y g;

			static if(N > 2)
			{
				T z;
				alias z b;
			}

			static if(N > 3)
			{
				T w;
				alias w a;
			}
		};
	}

	this(Vec3, f32)(Vec3 v3, f32 f) if(N == 4 && is(T: f32))
	{
		x = v3.x;
		y = v3.y;
		z = v3.z;
		w = f;
	}

	this(Arr)(Arr arr) if(is(Arr: typeof(v)))
	{
		this.v = arr;
	}

	this(Args...)(Args args) 
	{
		static if(args.length == 1)
		{
			enum bool is_array          = isArray!(Args[0]);
			enum bool is_assignable     = isAssignable!(T, Args[0]);
			enum bool types_are_numeric = TypesAreNumeric!(T, Args[0]);

			static if(is_array || is_assignable || types_are_numeric)
			{
				opAssign!(Args[0])(args[0]);
			}
			else static assert(false, "Invalid Vector constructor");
		}
		else static if(args.length == N)
		{
			static foreach(i; 0 .. N)
			{
				v[i] = cast(T)args[i];
			}
		}
		else static if(args.length == 2 && N == 4)
		{
			v[0] = args[0];
			v[1] = args[0];
			v[2] = args[0];
			v[3] = args[1];
		}
		else
		{
			static assert(false, "Invalid Vector constructor");
		}
	}

	ref Vector opAssign(U)(U x) if(is(U: T) || isAssignable!(T, U) || TypesAreNumeric!(T, U)) 
	{
		v[] = cast(T)x;
		return this;
	}

	ref Vector opAssign(U)(U u) if(isStaticArray!(U) && U.length == N && isAssignable!(T, typeof(*U.init.ptr)))
	{
		static foreach(i; 0 .. N)
		{
			v[i] = cast(T)u[i];
		}
		return this;
	}

	ref Vector opAssign(U)(U u) if(is(U : Vector))
	{
		v[] = u.v[];
		return this;
	}

	inout(T)* ptr() inout @property
	{
		return v.ptr;
	}

	bool opEquals(U)(U other) if(is(U: Vector!(T, N)))
	{
		bool result = true;

		foreach(i; 0 .. N)
		{
			if(Abs(v[i] - other.v[i]) > 0.0000009)
			{
				result = false;
				break;
			}
		}

		return result;
	}

	int opDollar()
	{
		return N;
	}

	T[] opSlice()
	{
		return v[];
	}

	Vector opUnary(string op)() if(op == "+" || op == "-" || op == "~" || op == "!")
	{
		Vector res;
		static foreach(i; 0 .. N)
		{
			mixin("res.v[", i, "] = " ~ op ~ "v[", i, "];\n");
		}
		return res;
	}

	ref Vector opOpAssign(string op, U)(U value) if(is(U: Vector))
	{
		static foreach(i; 0 .. N)
		{
			mixin("v[", i, "] " ~op~ "= value.v[", i, "];");
		}
		return this;
	}

	ref Vector opOpAssign(string op, U)(U value) if(IsConvertible!(U))
	{
		Vector conv = value;
		return opOpAssign!(op)(conv);
	}

	@property auto opDispatch(string op, U = void)() if(ValidSwizzle!(op) && op.length <= 4)
	{
		Vector!(T, op.length) result;
		enum index_tuple = SwizzleTuple!(op);
		static foreach(i, index; index_tuple)
		{
			result.v[i] = v[index];
		}
		return result;
	}

	@property void opDispatch(string op, U)(U x) if((op.length > 1) && ValidUniqueSwizzle!(op) && is(typeof(Vector!(T, op.length)(x))))
	{
		Vector!(T, op.length) conv = x;
		enum index_tuple = SwizzleTuple!(op);
		static foreach(i, index; index_tuple)
		{
			v[index] = conv[i];
		}
	}

	Vector
	opBinaryRight(string op, U)(U operand) if(IsConvertible!(U) && (op == "*" || op == "+" || op == "-" || op == "/"))
	{
		Vector res;
		static foreach(i; 0 .. N)
		{
			mixin("res.v[", i, "] = v[", i, "] " ~op~ " operand;");
		}
		return res;
	}

	static if(N == 4)
	{
		Vector opBinary(string op, U)(U operand) if((is(U: Vector!(f32, 4)) && is(T: f32)) && (op == "*" || op == "+" || op == "-" || op == "/"))
		{
			Vector result;
	
			version(X86_64)
			{
				f32* l   = &x;
				f32* r   = &operand.x;
				f32* res = &result.x;

				asm 
				{
					mov    R8,   l;
					mov    R9,   r;
					mov    R10,  res;
					movups XMM0, x.offsetof[R8];
					movups XMM1, operand.x.offsetof[R9];
				}
				static      if(op == "*") asm  { mulps XMM0, XMM1; }
				else static if(op == "-") asm  { subps XMM0, XMM1; }
				else static if(op == "+") asm  { addps XMM0, XMM1; }
				else static if(op == "/") asm  { divps XMM0, XMM1; }
				asm 
				{
					movups result.x.offsetof[R10], XMM0;
				}
			}
			else
			{
				static foreach(i; 0 .. N)
				{
					result.v[i] = mixin("this.v[", i, "] " ~op~ " operand.v[", i, "]");
				}
			}

			return result;
		}

		Vector opBinary(string op, U)(U operand) if(IsConvertible!(U) && (op == "*" || op == "+" || op == "-" || op == "/")) 
		{
			Vector result;

			version(X86_64)
			{
				Vector other = operand;
				f32*   l     = &x;
				f32*   r     = &other.x;
				f32*   res   = &result.x;

				asm 
				{
					mov    R8,   l;
					mov    R9,   r;
					mov    R10,  res;
					movups XMM0, x.offsetof[R8];
					movups XMM1, other.x.offsetof[R9];
				}
				static      if(op == "*") asm { mulps XMM0, XMM1; }
				else static if(op == "-") asm { subps XMM0, XMM1; }
				else static if(op == "+") asm { addps XMM0, XMM1; }
				else static if(op == "/") asm { divps XMM0, XMM1; }
				asm 
				{
					movups result.x.offsetof[R10], XMM0;
				}
			}
			else
			{
				static foreach(i; 0 .. N)
				{
					mixin("result.v[", i, "] = this.v[", i, "] " ~op~ " operand;");
				}
			}

			return result;
		}
	}
	else
	{
		Vector opBinary(string op, U)(U operand) if(is(U: Vector) && U._N == N && (op == "*" || op == "+" || op == "-" || op == "/"))
		{
			Vector res;
			
			static foreach(i; 0 .. N)
			{
				mixin("res.v[", i, "] = v[", i, "] " ~ op ~ " operand.v[", i, "];");
			}

			return res;
		}

		Vector opBinary(string op, U)(U operand) if(IsConvertible!(U) && (op == "*" || op == "+" || op == "-" || op == "/"))
		{
			Vector res;
			Vector other = operand;

			static foreach(i; 0 .. N)
			{
				mixin("res.v[", i, "] = v[", i, "] " ~ op ~ " other.v[", i, "];");
			}

			return res;
		}
	}

	
	ref T opIndex(size_t i)
	{
		return v[i];
	}

	T opIndexAssign(U : T)(U x, size_t i)
	{
		return v[i] = x;
	}

	U opCast(U)() if(IsVector!(U) && (U._N == _N))
	{
		U result;

		static foreach(i; 0 .. N)
		{
			rev.v[i] = cast(U._T)v[i];
		}

		return result;
	}

	template IsConvertible(U)
	{
		enum bool IsConvertible = !is(U == Vector) && __traits(compiles, { U u; T t = cast(T)u; } );
	}

	template TypesAreNumeric(U, V)
	{
		enum bool TypesAreNumeric = isNumeric!(U) && isNumeric!(V);
	}

	template SwizzleIndex(char c)
	{
		static if((c == 'x' || c == 'r') && N > 0)
			enum SwizzleIndex = 0;
		else static if((c == 'y' || c == 'g') && N > 1)
			enum SwizzleIndex = 1;
		else static if((c == 'z' || c == 'b') && N > 2)
			enum SwizzleIndex = 2;
		else static if((c == 'w' || c == 'a') && N > 3)
			enum SwizzleIndex = 3;
		else
			enum SwizzleIndex = -1;
	}

	template SwizzleSet(char c)
	{
		static if(c == 'x' || c == 'y' || c == 'z' || c == 'w')
			enum SwizzleSet = 0;
		else static if(c == 'r' || c == 'g' || c == 'b' || c == 'a')
			enum SwizzleSet = 1;
		else
			enum SwizzleSet = -1;
	}

	template SwizzleTuple(string op)
	{
		enum op_length = op.length;
		static if(op.length == 0)
			enum SwizzleTuple = [];
		else
			enum SwizzleTuple = [ SwizzleIndex!(op[0])] ~ SwizzleTuple!(op[1 .. op.length]);
	}

	template SearchString(char c, string s)
	{
		static if(s.length == 0)
		{
			enum bool result = false;
		}
		else
		{
			enum string tail = s[1 .. s.length];
			enum bool result = (s[0] == c) || SearchString!(c, tail).result;
		}
	}

	template UniqueChars(string s)
	{
		static if(s.length == 1)
		{
			enum bool result = true;
		}
		else
		{
			enum tail = s[1 .. s.length];
			enum bool result = !(SearchString!(s[0], tail).result) && UniqueChars!(tail).result;
		}
	}

	template ValidSwizzle(string op, int last_swizzle = -1)
	{
		static if(op.length == 0)
		{
			enum bool ValidSwizzle = true;
		}
		else
		{
			enum length = op.length;
			enum int swizzle_set = SwizzleSet!(op[0]);
			enum bool valid_swizzle_set = (last_swizzle == -1 || (swizzle_set == last_swizzle));
			enum bool ValidSwizzle = (SwizzleIndex!(op[0]) != -1) && valid_swizzle_set && ValidSwizzle!(op[1 .. length], swizzle_set);
		}
	}

	template ValidUniqueSwizzle(string op)
	{
		static if(ValidSwizzle!(op))
		{
			enum ValidUniqueSwizzle = UniqueChars!(op).result;
		}
		else
		{
			enum ValidUniqueSwizzle = false;
		}
	}
}

// TODO: fix alignment for non mat4s
align(16) struct Matrix(T, int D)
{
	static assert(D > 0 && D <= 4);

	alias Vector!(T, D) MatrixVec;
	alias T             _T;
	alias T[4]          Row;

	enum N  = D*D;
	enum _D = D;

	union
	{
		T[N]         v = 0;
		Row[D]       rows;
		MatrixVec[D] vec;
	}

	// TODO: setup 

	this(U...)(U values)
	{
		static if((U.length == N) && allSatisfy!(IsTypeAssignable, U))
		{
			static foreach(i, x; values)
			{
				v[i] = x;
			}
		}
		else static if((U.length == 1) && (isAssignable!(U[0])) && (!is(U[0] : Matrix)))
		{
			v[] = values[0];
		}
		else static assert(false, "Cannot construct matrix with provided parameters");
	}

	this(U)(T x)
	{
		static foreach(i; 0 .. N)
		{
			v[i] = x;
		}
	}

	this(U)(U x) if(is(U: typeof(this.v)))
	{
		this.v = x;
	}

	@property inout(T)* ptr() inout
	{
		return v.ptr;
	}

	ref Matrix opAssign(U : T)(U x)
	{
		static foreach(i; 0 .. N)
		{
			v[i] = x;
		}

		return this;
	}

	ref Matrix opAssign(U : Matrix)(U x)
	{
		static foreach(i; 0 .. N)
		{
			v[i] = x.v[i];
		}

		return this;
	}

	ref Matrix opAssign(U)(U x) if(IsMatrixInstantiation!(U) && is(U._T : _T) && (!is(U: Matrix) && (U.N != N)))
	{
		static foreach(i; 0 .. N)
		{
			v[i] = x.v[i];
		}

		return this;
	}

	ref T opIndex(size_t i, size_t j)
	{
		return v[(i * D) + j];
	}

	T opIndexAssign(U: T)(U x, size_t i, size_t j)
	{
		return v[(i * D) + j] = x;
	}

	bool opEquals(U)(U other) if(is(U: Matrix!(T, D)))
	{
		bool result = true;

		static foreach(i; 0 .. N)
		{
			if(Abs(this.v[i] - other.v[i]) > 0.0000009)
			{
				result = false;
			}
		}

		return result;
	}

	Matrix opBinary(string op)(T scalar) if(op == "*")
	{
		Matrix result;

		static foreach(i; 0 .. N)
		{
			result.v[i] = v[i] * scalar;
		}

		return result;
	}

	static if(D == 4)
	{
		Vec4 opBinary(string op)(Vec4 x) if(is(T: f32) && (op == "*"))
		{
			Vec4 result = 0.0;
			
			// TODO: optimize
			Mat4MulVec4(&this, &x, &result);

			return result;
		}

		Matrix opBinary(string op, U)(U x) if(is(U: Matrix!(T, D)) && is(T: f32) && D == 4 && (op == "*")) 
		{
			Matrix result;

			MatZero(&result);
			Mat4Mul(&this, &x, &result);

			return result;
		}
	}

	template IsTypeAssignable(U)
	{
		enum bool IsTypeAssignable = std.traits.isAssignable!(T, U);
	}

	template IsMatrixInstantiation(U)
	{
		static void IsMatrix(T, int D)(Matrix!(T, D) x) {}
		
		enum bool IsMatrixInstantiation = is(typeof(IsMatrix(U.init)));
	}
}

struct Quat
{
	union
	{
		f32[4] v   = 0;
		Vec4   vec;
		struct
		{
			f32 x;
			f32 y;
			f32 z;
			f32 w;
		};
	};

	this(f32 w, f32 x, f32 y, f32 z)
	{
		vec.x = x;
		vec.y = y;
		vec.z = z;
		vec.w = w;
	}

	U opCast(U)() if(is(U: Mat4))
	{
		Mat4 result;

		// TODO: optimize
		Mat4FromQuat(&this, &result);

		return result;
	}

	Quat opBinary(string op, U)(U r) if(op == "*" && is(U: Quat))
	{
		Quat q;

		q.x = this.w * r.x + this.x * r.w + this.y * r.z - this.z * r.y;
		q.y = this.w * r.y - this.x * r.z + this.y * r.w + this.z * r.x;
		q.z = this.w * r.z + this.x * r.y - this.y * r.x + this.z * r.w;
		q.w = this.w * r.w - this.x * r.x - this.y * r.y - this.z * r.z;

		return q;
	}
}

void
Mat4Mul(Mat4* l, Mat4* r, Mat4* result)
{
	version(X86_64)
	{
		asm @trusted 
		{
			mov    R8,   l;
			mov    R9,   r;
			mov    R10,  result;

			movups XMM0, [R8];
			movups XMM1, [R9+00];
			movups XMM2, [R9+16];
			movups XMM3, [R9+32];
			movups XMM4, [R9+48];

			movups XMM6, XMM1;
			shufps XMM6, XMM6, 0;   // XMM5 = vec.xxxx;
			mulps  XMM6, XMM0;      // XMM6 = col1;

			movups XMM7, XMM2;
			shufps XMM7, XMM7, 0;
			mulps  XMM7, XMM0;      // XMM7 = col2;

			movups XMM8, XMM3;
			shufps XMM8, XMM8, 0;
			mulps  XMM8, XMM0;      // XMM8 = col3;

			movups XMM9, XMM4;
			shufps XMM9, XMM9, 0;
			mulps  XMM9, XMM0;      // XMM9 = col4;

			movups XMM0, [R8+16];

			movups XMM5, XMM1;
			shufps XMM5, XMM5, 85;  // XMM5 = vec.yyyy;
			mulps  XMM5, XMM0;
			addps  XMM6, XMM5;

			movups XMM5, XMM2;
			shufps XMM5, XMM5, 85;
			mulps  XMM5, XMM0;
			addps  XMM7, XMM5;

			movups XMM5, XMM3;
			shufps XMM5, XMM5, 85;
			mulps  XMM5, XMM0;
			addps  XMM8, XMM5;

			movups XMM5, XMM4;
			shufps XMM5, XMM5, 85;
			mulps  XMM5, XMM0;
			addps  XMM9, XMM5;

			movups XMM0, [R8+32];

			movups XMM5, XMM1;
			shufps XMM5, XMM5, 170; // XMM5 = vec.zzzz;
			mulps  XMM5, XMM0;
			addps  XMM6, XMM5;

			movups XMM5, XMM2;
			shufps XMM5, XMM5, 170;
			mulps  XMM5, XMM0;
			addps  XMM7, XMM5;

			movups XMM5, XMM3;
			shufps XMM5, XMM5, 170;
			mulps  XMM5, XMM0;
			addps  XMM8, XMM5;

			movups XMM5, XMM4;
			shufps XMM5, XMM5, 170;
			mulps  XMM5, XMM0;
			addps  XMM9, XMM5;

			movups XMM0, [R8+48];

			movups XMM5, XMM1;
			shufps XMM5, XMM5, 255; // XMM5 = vec.wwww;
			mulps  XMM5, XMM0;
			addps  XMM6, XMM5;

			movups XMM5, XMM2;
			shufps XMM5, XMM5, 255;
			mulps  XMM5, XMM0;
			addps  XMM7, XMM5;

			movups XMM5, XMM3;
			shufps XMM5, XMM5, 255;
			mulps  XMM5, XMM0;
			addps  XMM8, XMM5;

			movups XMM5, XMM4;
			shufps XMM5, XMM5, 255;
			mulps  XMM5, XMM0;
			addps  XMM9, XMM5;

			movups [R10+00], XMM6;
			movups [R10+16], XMM7;
			movups [R10+32], XMM8;
			movups [R10+48], XMM9;
		}
	}
}

void
Vec4MulAddScale(Vec4* a, f32 s, Vec4* dst) // glm_vec4_muladds
{
	(*dst) += (*a)*s;
}

void
Mat4Inv(Mat4* mat, Mat4* dst) // glm_mat4_inv
{
  f32 a   = mat.rows[0][0], b   = mat.rows[0][1], c   = mat.rows[0][2], d   = mat.rows[0][3],
      e   = mat.rows[1][0], f   = mat.rows[1][1], g   = mat.rows[1][2], h   = mat.rows[1][3],
      i   = mat.rows[2][0], j   = mat.rows[2][1], k   = mat.rows[2][2], l   = mat.rows[2][3],
      m   = mat.rows[3][0], n   = mat.rows[3][1], o   = mat.rows[3][2], p   = mat.rows[3][3],

      c1  = k * p - l * o,  c2  = c * h - d * g,  c3  = i * p - l * m,
      c4  = a * h - d * e,  c5  = j * p - l * n,  c6  = b * h - d * f, 
      c7  = i * n - j * m,  c8  = a * f - b * e,  c9  = j * o - k * n,
      c10 = b * g - c * f,  c11 = i * o - k * m,  c12 = a * g - c * e,

      idt = 1.0f/(c8*c1+c4*c9+c10*c3+c2*c7-c12*c5-c6*c11), ndt = -idt;

  dst.rows[0][0] = (f * c1  - g * c5  + h * c9)  * idt;
  dst.rows[0][1] = (b * c1  - c * c5  + d * c9)  * ndt;
  dst.rows[0][2] = (n * c2  - o * c6  + p * c10) * idt;
  dst.rows[0][3] = (j * c2  - k * c6  + l * c10) * ndt;

  dst.rows[1][0] = (e * c1  - g * c3  + h * c11) * ndt;
  dst.rows[1][1] = (a * c1  - c * c3  + d * c11) * idt;
  dst.rows[1][2] = (m * c2  - o * c4  + p * c12) * ndt;
  dst.rows[1][3] = (i * c2  - k * c4  + l * c12) * idt;

  dst.rows[2][0] = (e * c5  - f * c3  + h * c7)  * idt;
  dst.rows[2][1] = (a * c5  - b * c3  + d * c7)  * ndt;
  dst.rows[2][2] = (m * c6  - n * c4  + p * c8)  * idt;
  dst.rows[2][3] = (i * c6  - j * c4  + l * c8)  * ndt;

  dst.rows[3][0] = (e * c9  - f * c11 + g * c7)  * ndt;
  dst.rows[3][1] = (a * c9  - b * c11 + c * c7)  * idt;
  dst.rows[3][2] = (m * c10 - n * c12 + o * c8)  * ndt;
  dst.rows[3][3] = (i * c10 - j * c12 + k * c8)  * idt;
}

void
Mat4MulVec4(Mat4* m, Vec4* v, Vec4* dst) // glm_mat4_mulv
{
  dst.v[0] = m.rows[0][0] * v.v[0] + m.rows[1][0] * v.v[1] + m.rows[2][0] * v.v[2] + m.rows[3][0] * v.v[3];
  dst.v[1] = m.rows[0][1] * v.v[0] + m.rows[1][1] * v.v[1] + m.rows[2][1] * v.v[2] + m.rows[3][1] * v.v[3];
  dst.v[2] = m.rows[0][2] * v.v[0] + m.rows[1][2] * v.v[1] + m.rows[2][2] * v.v[2] + m.rows[3][2] * v.v[3];
  dst.v[3] = m.rows[0][3] * v.v[0] + m.rows[1][3] * v.v[1] + m.rows[2][3] * v.v[2] + m.rows[3][3] * v.v[3];
}

void
Mat4FromQuat(Quat* q, Mat4* dst) // glm_quat_mat4
{
	f32 w,  x,  y,  z,
			xx, yy, zz,
			xy, yz, xz, 
			wx, wy, wz, norm, s;

	norm = Norm(q);
	s    = norm > 0.0f ? 2.0f/norm : 0.0f;

  x = q.v[0];
  y = q.v[1];
  z = q.v[2];
  w = q.v[3];

  xx = s * x * x;  xy = s * x * y;  wx = s * w * x;
  yy = s * y * y;  yz = s * y * z;  wy = s * w * y;
  zz = s * z * z;  xz = s * x * z;  wz = s * w * z;

  dst.rows[0][0] = 1.0f - yy - zz;
  dst.rows[1][1] = 1.0f - xx - zz;
  dst.rows[2][2] = 1.0f - xx - yy;

  dst.rows[0][1] = xy + wz;
  dst.rows[1][2] = yz + wx;
  dst.rows[2][0] = xz + wy;

  dst.rows[1][0] = xy - wz;
  dst.rows[2][1] = yz - wx;
  dst.rows[0][2] = xz - wy;

  dst.rows[0][3] = 0.0f;
  dst.rows[1][3] = 0.0f;
  dst.rows[2][3] = 0.0f;
  dst.rows[3][0] = 0.0f;
  dst.rows[3][1] = 0.0f;
  dst.rows[3][2] = 0.0f;
  dst.rows[3][3] = 1.0f;
}

pragma(inline) Mat4 
Mat4Identity()
{
	return Mat4(
		1.0, 0.0, 0.0, 0.0,
		0.0, 1.0, 0.0, 0.0,
		0.0, 0.0, 1.0, 0.0,
		0.0, 0.0, 0.0, 1.0
	);
}

pragma(inline) void
Mat4Identity(Mat4* mat)
{
	MatZero(mat);

	(*mat)[0, 0] = 1.0;
	(*mat)[1, 1] = 1.0;
	(*mat)[2, 2] = 1.0;
	(*mat)[3, 3] = 1.0;
}

 pragma(inline) Mat3 
Mat3Identity()
{
	return Mat3(
		1.0, 0.0, 0.0,
		0.0, 1.0, 0.0,
		0.0, 0.0, 1.0
	);
}

 pragma(inline) Mat2
Mat2Identity()
{
	return Mat2(
		1.0, 0.0,
		0.0, 1.0
	);
}

pragma(inline) Quat
QuatFromAxis(f32 angle, Vec3 axis) // glm_quatv
{
	Quat q;

  f32 a = angle * 0.5f;
  f32 c = cos(a);
  f32 s = sin(a);

	Vec3 k = Normalize(axis);

  q.x = s * k.x;
  q.y = s * k.y;
  q.z = s * k.z;
  q.w = c;

	return q;
}

pragma(inline) f32
Dot(Vec2* l, Vec2* r)
{
	return l.x * r.x + l.y * r.y;
}

pragma(inline) f32
Dot(Vec3* l, Vec3* r)
{
	return l.x * r.x + l.y * r.y + l.z * r.z;
}

pragma(inline) f32
Dot(Vec4* l, Vec4* r)
{
	// TODO: SIMD this
	return l.x * r.x + l.y * r.y + l.z * r.z + l.w * r.w;
}

pragma(inline) f32
Norm(Quat* q)
{
	return Norm(&q.vec);
}

pragma(inline) f32
Norm(Vec3* v)
{
	return sqrt(Dot(v, v));
}

pragma(inline) f32
Norm(Vec4* v)
{
	// TODO: SIMD this
	return sqrt(Dot(v, v));
}

pragma(inline) void
Normalize(T)(T* vec) if(is(T: Vec2) || is(T: Vec3) || is(T: Vec4))
{
	f32 length = Norm(vec);

	if(length < f32.epsilon)
	{
		static foreach(i; 0 .. vec._N)
		{
			vec.v[i] = 0.0;
		}
	}
	else
	{
		static foreach(i; 0 .. vec._N)
		{
			vec.v[i] *= (1.0 / length);
		}
	}
}

pragma(inline) T
Normalize(T)(T vec) if(is(T: Vec2) || is(T: Vec3) || is(T: Vec4))
{
	Normalize(&vec);
	return vec;
}

pragma(inline) Quat
Normalize(Quat q)
{
	f32 dot = Norm(&q.vec);
 
	if(dot <= 0.0)
	{
		q = Quat(1.0, 0.0, 0.0, 0.0);
	}

	q.vec *= 1.0 / sqrt(dot);

	return q;
}

pragma(inline) Mat4
Perspective(bool flip_y = true)(f32 fov, f32 aspect, f32 near, f32 far) // glm_perspective_lh_zo
{
	Mat4 mat;

  f32 f  = 1.0f / tanf(fovy * 0.5f);
  f32 fn = 1.0f / (nearZ - farZ);

  mat.rows[0][0] = f / aspect;
  mat.rows[1][1] = f;
  mat.rows[2][2] = -far * fn;
  mat.rows[2][3] = 1.0f;
  mat.rows[3][2] = near * far * fn;

	static if(flip_y)
	{
		mat[1, 1] *= -1.0;
	}

	return res;
}

void
Ortho(Mat4* mat, f32 left, f32 bottom, f32 right, f32 top, f32 near, f32 far) // glm_ortho_lh_zo
{
	MatZero(mat);

  f32 rl = 1.0f / (right  - left);
  f32 tb = 1.0f / (top    - bottom);
  f32 fn =-1.0f / (far    - near);

  mat.rows[0][0] = 2.0f * rl;
  mat.rows[1][1] = 2.0f * tb;
  mat.rows[2][2] =-fn;
  mat.rows[3][0] =-(right  + left)    * rl;
  mat.rows[3][1] =-(top    + bottom)  * tb;
  mat.rows[3][2] = near * fn;
  mat.rows[3][3] = 1.0f;
}

Mat4
Ortho(f32 left, f32 bottom, f32 right, f32 top, f32 near, f32 far)
{
	Mat4 mat;
	Ortho(&mat, left, bottom, right, top, near, far);
	return mat;
}

T
Mix(T, U)(T a, T b, U t) if((IsVector!(T) || is(T == f32) || is(T == f64)) && (is(U == f32) || is(U == f64)))
{
	return a + t * (b - a);
}

pragma(inline) Vec3
Rotate(Quat q, Vec3 vec)
{
	Quat p = Normalize(q);

	Vec3 i = p.vec.xyz;
	f32 r = p.vec.w;

	Vec3 v1 = i * (2.0 * Dot(&i, &vec));
	Vec3 v2 = i * (r * r - Dot(&i, &i));
	v1 += v2;

	v2 = i * vec;
	v2 = v2 * (2.0 * r);

	return v1 + v2;
}

pragma(inline) Mat4
LookAt(Vec3 eye, Vec3 center, Vec3 up) // glm_lookat_lh
{
	Mat4 result;

	Vec3 f = Normalize(center - eye);

	Vec3 s = CrossN(up, f);
	Vec3 u = Cross(f, s);

  result.rows[0][0] = s[0];
  result.rows[0][1] = u[0];
  result.rows[0][2] = f[0];
  result.rows[1][0] = s[1];
  result.rows[1][1] = u[1];
  result.rows[1][2] = f[1];
  result.rows[2][0] = s[2];
  result.rows[2][1] = u[2];
  result.rows[2][2] = f[2];
  result.rows[3][0] =-Dot(&s, &eye);
  result.rows[3][1] =-Dot(&u, &eye);
  result.rows[3][2] =-Dot(&f, &eye);
  result.rows[0][3] = result.rows[1][3] = result.rows[2][3] = 0.0f;
  result.rows[3][3] = 1.0f;

	return result;
}

pragma(inline) Mat4
Look(Vec3 eye, Vec3 dir, Vec3 up)
{
	return LookAt(eye, eye + dir, up);
}

pragma(inline) Vec3
Cross(Vec3 a, Vec3 b)
{
	Vec3 c;
	Cross(a, b, &c);
	return c;
}

pragma(inline) Vec3
CrossN(Vec3 a, Vec3 b)
{
	Vec3 c;
	Cross(a, b, &c);
	Normalize(&c);
	return c;
}

pragma(inline) void
CrossN(Vec3 a, Vec3 b, Vec3* dst)
{
	Cross(a, b, dst);
	Normalize(dst);
}

pragma(inline) void
Cross(Vec3 a, Vec3 b, Vec3* dst)
{
  dst.x = a.y * b.z - a.z * b.y;
  dst.y = a.z * b.x - a.x * b.z;
  dst.z = a.x * b.y - a.y * b.x;
}

pragma(inline) void
MatZero(Mat4* mat)
{
	mat.v[] = 0.0;
}

pragma(inline) void
Translate(Mat4* mat, Vec3 vec)
{
	Vec4MulAddScale(&mat.vec[0], vec.x, &mat.vec[3]);
	Vec4MulAddScale(&mat.vec[1], vec.y, &mat.vec[3]);
	Vec4MulAddScale(&mat.vec[2], vec.z, &mat.vec[3]);
}

pragma(inline) Mat4
Inverse(Mat4 mat)
{
	Mat4 res;
	MatZero(&res);

	// TODO: optimize
	Mat4Inv(&mat, &res);

	return res;
}

void
Transform(Vec3* vec, Mat4* mat, f32 last = 1.0)
{
	Vec4 tvec = Vec4(*vec, last);
	tvec = *mat * tvec;
	*vec = tvec.xyz;
}

pragma(inline) T
Lerp(T)(T x, T y, T a)
{
	return x * (1 - a) + y * a;
}

pragma(inline) T
InverseLerp(T)(T v, T min, T max)
{
	return (v - min) / (max - min);
}

pragma(inline) T
Remap(T)(T v, T in_min, T in_max, T out_min, T out_max)
{
	T t = InverseLerp(v, in_min, in_max);
	return Lerp(out_min, out_max, t);
}

template 
IsVector(T)
{
//	import std.traits : isInstanceOf;
	enum bool IsVector = isInstanceOf!(Vector, T);
}

T
Clamp(T)(T value, T min, T max) if(isNumeric!(T))
{
	assert(min <= max, "Clamp assertion failed: min is higher than max");
	T x = value < min ? min : value;
	return x > max ? max : x;
}

T
Clamp(T)(T value, T min, T max) if(IsVector!(T))
{
	static foreach(i; 0 .. value._N)
	{
		value.v[i] = Clamp(value.v[i], min.v[i], max.v[i]);
	}

	return value;
}

version(DLIB_TEST) unittest
{
	enum FLOAT_MAX = f32.max;
	enum FLOAT_MIN = -f32.max;

	/*

	void PrintMatrix(Mat4 mat)
	{
		foreach(i; 0 .. mat.N)
		{
			if(i % 4 == 0)
			{
				printf("\n");
			}
			printf("%.08f ", mat.v[i]);
		}
		printf("\n");
	}

	srand(cast(u32)time(null));

	f32 RandomFloat()
	{
		return cast(f32)(rand())/cast(f32)(RAND_MAX + 1.0);	
	}

	{ // Vec2 arithmetic
		Vec2 v1 = Vec2(5.0, 10.0);
		Vec2 v2 = Vec2(2.5, 5.0);

		Vec2 result = v1 * v2;

		assert(result == Vec2(12.5, 50.0), "Vec2 mul failure");

		result = v1 + v2;

		assert(result == Vec2(7.5, 15.0), "Vec2 add failure");

		result = v1 - v2;

		assert(result == Vec2(2.5, 5.0), "Vec2 sub failure");

		result = v1 / v2;

		assert(result == Vec2(2.0), "Vec2 div failure");
	}

	{ // Vec3 Arithmetic
		Vec3 v1 = Vec3(5.0, 10.0, 15.0);
		Vec3 v2 = Vec3(2.5, 5.0, 7.5);

		Vec3 result = v1 * v2;

		assert(result == Vec3(12.5, 50.0, 112.5), "Vec3 mul failure");

		result = v1 + v2;

		assert(result == Vec3(7.5, 15.0, 22.5), "Vec3 add failure");

		result = v1 - v2;

		assert(result == Vec3(2.5, 5.0, 7.5), "Vec3 sub failure");

		result = v1 / v2;

		assert(result == Vec3(2.0), "Vec3 div failure");
	}

	{ // Vec3 Arithmetic
		Vec4 v1 = Vec4(5.0, 10.0, 15.0, 20.0);
		Vec4 v2 = Vec4(2.5, 5.0, 7.5, 10.0);

		Vec4 result = v1 * v2;

		assert(result == Vec4(12.5, 50.0, 112.5, 200.0), "Vec4 mul failure");

		result = v1 + v2;

		assert(result == Vec4(7.5, 15.0, 22.5, 30.0), "Vec4 add failure");

		result = v1 - v2;

		assert(result == Vec4(2.5, 5.0, 7.5, 10.0), "Vec4 sub failure");

		result = v1 / v2;

		assert(result == Vec4(2.0), "Vec4 div failure");
	}

	{ // Mat4 Arithmetic
		Mat4 m1 = RandomMat4();
		Mat4 m2 = RandomMat4();
		Mat4 m3 = m1 * m2;
		Mat4 m4;
		
		MatZero(&m4);

		for(u32 i = 0; i < 4; i += 1)
		{
			for(u32 j = 0; j < 4; j += 1)
			{
				for(u32 k = 0; k < 4; k += 1)
				{
					m4.rows[i][j] += m1.rows[k][j] * m2.rows[i][k];
				}
			}
		}

		assert(m3 == m4, "Mat4 mul failure");
	}

	{ // Translate
		Mat4 mat = Mat4Identity();
		Vec4 vec = Vec4(1.0, 2.0, 3.0, 1.0);

		Translate(&mat, Vec3(13.0, 11.0, 7.0));
		Vec4 result = mat * vec;

		assert(result == Vec4(14.0, 13.0, 10.0, 1.0));

		mat = Mat4Identity();
		Translate(&mat, Vec3(1.0, -1.0, -5.0));
		result = mat * result;

		assert(result == Vec4(15.0, 12.0, 5.0, 1.0));
	}

	{ // Identity
		Mat4 identity = Mat4(
				1.0, 0.0, 0.0, 0.0,
				0.0, 1.0, 0.0, 0.0,
				0.0, 0.0, 1.0, 0.0,
				0.0, 0.0, 0.0, 1.0
				);
		Mat4 mat = Mat4Identity();

		assert(identity == mat);
	}


	{ // Inverse
		foreach(i; 0 .. 1000)
		{
			Mat4 m1 = RandomMat4();

			Mat4 m1_inv = Inverse(m1);
			Mat4 m1_reinv = Inverse(m1_inv);

			assert(m1 == m1_reinv, "Inverse test failed");
		}
	}

	{ // Cross
		Vec3 v1 = Vec3(2.0, -3.0, 4.0);
		Vec3 v2 = Vec3(12.0, -31.0, 43.0);

		Vec3 v3 = Cross(v1, v2);

		Vec3 v4 = Vec3(
			v1.y * v2.z - v1.z * v2.y,
			v1.z * v2.x - v1.x * v2.z,
			v1.x * v2.y - v1.y * v2.x
		);

		assert(v3 == v4, "Vec3 Cross failure");

		v3 = CrossN(v1, v2);

		Normalize(&v4);

		assert(v3 == v4, "Vec3 CrossN failure");
	}
	*/

	{ // Initializers
		u32[4] arr = [1, 2, 3, 4];
		Vec4 vec = Vec4(arr);
		assert(vec == Vec4(1.0, 2.0, 3.0, 4.0));

		Mat2 mat = Mat2(1.0, 0.0, 0.0, 1.0);

		f32[16] floats = 22.0;
		Mat4 mat4 = Mat4(floats);
		assert(mat4.v == floats);

		Quat quat = Quat(1.0, 1.0, 1.0, 1.0);

		Quat quat2;
	}

	{
		Vec3 vec = Vec3(1.0, 2.0, 3.0);
		Mat4 mat = Mat4(
			1.0, 1.0, 1.0, 1.0,
			2.0, 2.0, 2.0, 2.0,
			3.0, 3.0, 3.0, 3.0,
			4.0, 4.0, 4.0, 4.0
		);

		Transform(&vec, &mat, 1.0);

		assert(vec == Vec3(18.0));

		Vec4 low  = Vec4(-5.0);
		Vec4 high = Vec4(+5.0);
		Vec4 test = Vec4(-500.0);

		test = Clamp(test, low, high);

		assert(test == low);

		test = Vec4(+500.0);
		test = Clamp(test, low, high);

		assert(test == high);

	}

	{
		Vec4 m0 = Vec4(0.0);
		Vec4 m1 = Vec4(10.0);

		assert(Mix(m0, m1, 0.0)  == m0);
		assert(Mix(m0, m1, 1.0)  == m1);
		assert(Mix(m0, m1, 0.5)  == Vec4(5.0));
		assert(Mix(m0, m1, 0.75) == Vec4(7.5));

		void TestModify(Vec4 v)
		{
			v.r = 55;
		}
	}

	{
		u32 max = Max(50, 33, 123.0);
		assert(max == 123);
	}

	{
		Vec2 v0 = 50U;

		assert(Abs(v0.x-50.0) < 0.0009 && Abs(v0.y-50.0) < 0.0009);

		U8Vec2 v1 = U8Vec2(55U);

		assert(v1.x == 55 && v1.y == 55);
	}

	{
		assert(Ceil(10.5) == 11.0);
		assert(Floor(100.33) == 100.0);
		assert(Abs(-500) == 500);
		assert(Abs(-500.0) == 500.0);

		assert(Clamp(55, 10, 40)  == 40);
		assert(Clamp(-20, -5, 55) == -5);
	}
}