diff --git a/math.d b/math.d
index 098a328..78cc76c 100644
--- a/math.d
+++ b/math.d
@@ -521,7 +521,7 @@ align(16) struct Matrix(T, int D)
         Matrix result;
         MatZero(&result);
 
-        glm_mat4_mul(glm_mat.ptr, x.glm_mat.ptr, result.glm_mat.ptr);
+        Mat4MulASM(&this, &x, &result);
 
         return result;
     }
@@ -583,21 +583,14 @@ struct Quat
     }
 }
 
-Mat4
-Mat4MulASM(Mat4 l, Mat4 r)
+void
+Mat4MulASM(Mat4* l, Mat4* r, Mat4* result)
 {
-    Mat4 result;
-
-    auto lp = &l;
-    auto rp = &r;
-    auto res = &result;
-
-    // TODO: fix this asm
     asm @trusted
     {
-        mov R8, lp;
-        mov R9, rp;
-        mov R10, res;
+        mov R8, l;
+        mov R9, r;
+        mov R10, result;
 
         movups XMM0, [R8];
         movups XMM1, [R9+00];
@@ -607,93 +600,91 @@ Mat4MulASM(Mat4 l, Mat4 r)
         movups XMM6, XMM1;
         shufps XMM6, XMM6, 0; // XMM5 = vec.xxxx;
-        mulps XMM6, XMM0; // XMM6 = col1; 
+        mulps XMM6, XMM0; // XMM6 = col1;
 
         movups XMM7, XMM2;
         shufps XMM7, XMM7, 0;
-        mulps XMM7, XMM0; // XMM7 = col2; 
+        mulps XMM7, XMM0; // XMM7 = col2;
 
         movups XMM8, XMM3;
         shufps XMM8, XMM8, 0;
-        mulps XMM8, XMM0; // XMM8 = col3; 
+        mulps XMM8, XMM0; // XMM8 = col3;
 
-        movups XMM9, XMM3;
+        movups XMM9, XMM4;
         shufps XMM9, XMM9, 0;
-        mulps XMM9, XMM0; // XMM9 = col4; 
+        mulps XMM9, XMM0; // XMM9 = col4;
 
         movups XMM0, [R8+16];
 
         movups XMM5, XMM1;
         shufps XMM5, XMM5, 85; // XMM5 = vec.yyyy;
-        mulps XMM5, XMM0; 
-        addps XMM6, XMM5; 
+        mulps XMM5, XMM0;
+        addps XMM6, XMM5;
 
         movups XMM5, XMM2;
         shufps XMM5, XMM5, 85;
-        mulps XMM5, XMM0; 
-        addps XMM7, XMM5; 
+        mulps XMM5, XMM0;
+        addps XMM7, XMM5;
 
         movups XMM5, XMM3;
         shufps XMM5, XMM5, 85;
-        mulps XMM5, XMM0; 
-        addps XMM8, XMM5; 
+        mulps XMM5, XMM0;
+        addps XMM8, XMM5;
 
         movups XMM5, XMM4;
         shufps XMM5, XMM5, 85;
-        mulps XMM5, XMM0; 
-        addps XMM9, XMM5; 
+        mulps XMM5, XMM0;
+        addps XMM9, XMM5;
 
         movups XMM0, [R8+32];
 
         movups XMM5, XMM1;
         shufps XMM5, XMM5, 170; // XMM5 = vec.zzzz;
-        mulps XMM5, XMM0; 
-        addps XMM6, XMM5; 
+        mulps XMM5, XMM0;
+        addps XMM6, XMM5;
 
         movups XMM5, XMM2;
         shufps XMM5, XMM5, 170;
-        mulps XMM5, XMM0; 
-        addps XMM7, XMM5; 
+        mulps XMM5, XMM0;
+        addps XMM7, XMM5;
 
         movups XMM5, XMM3;
         shufps XMM5, XMM5, 170;
-        mulps XMM5, XMM0; 
-        addps XMM8, XMM5; 
- 
+        mulps XMM5, XMM0;
+        addps XMM8, XMM5;
+
         movups XMM5, XMM4;
         shufps XMM5, XMM5, 170;
-        mulps XMM5, XMM0; 
-        addps XMM9, XMM5; 
+        mulps XMM5, XMM0;
+        addps XMM9, XMM5;
 
         movups XMM0, [R8+48];
 
         movups XMM5, XMM1;
         shufps XMM5, XMM5, 255; // XMM5 = vec.wwww;
-        mulps XMM5, XMM0; 
-        addps XMM6, XMM5; 
+        mulps XMM5, XMM0;
+        addps XMM6, XMM5;
 
         movups XMM5, XMM2;
         shufps XMM5, XMM5, 255;
-        mulps XMM5, XMM0; 
-        addps XMM7, XMM5; 
+        mulps XMM5, XMM0;
+        addps XMM7, XMM5;
 
         movups XMM5, XMM3;
         shufps XMM5, XMM5, 255;
-        mulps XMM5, XMM0; 
-        addps XMM8, XMM5; 
+        mulps XMM5, XMM0;
+        addps XMM8, XMM5;
 
         movups XMM5, XMM4;
         shufps XMM5, XMM5, 255;
-        mulps XMM5, XMM0; 
-        addps XMM9, XMM5; 
+        mulps XMM5, XMM0;
+        addps XMM9, XMM5;
 
         movups [R10+00], XMM6;
         movups [R10+16], XMM7;
         movups [R10+32], XMM8;
         movups [R10+48], XMM9;
     }
-
-    return result;
 }
 
 pragma(inline)
 Mat4