fix asm mat4 multiply

Matthew 2025-10-25 13:21:31 +11:00
parent a97309cb04
commit f5fdd05336

math.d

@@ -521,7 +521,7 @@ align(16) struct Matrix(T, int D)
         Matrix result;
         MatZero(&result);
-        glm_mat4_mul(glm_mat.ptr, x.glm_mat.ptr, result.glm_mat.ptr);
+        Mat4MulASM(&this, &x, &result);
         return result;
     }
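For context, this first hunk is the body of the struct's matrix-product routine: instead of delegating to cglm's glm_mat4_mul, it now calls the hand-written Mat4MulASM defined further down. A minimal caller sketch, assuming Mat4 is the float 4x4 instantiation of Matrix and that the patched body belongs to the multiply operator (neither the alias nor the operator spelling is visible in this hunk):

    // Illustrative only: variable names and the operator spelling are assumptions.
    Mat4 view;                 // default-initialised transforms
    Mat4 model;
    Mat4 mv = view * model;    // the body above runs MatZero(&result); Mat4MulASM(&view, &model, &result);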
@@ -583,21 +583,14 @@ struct Quat
     }
 }
-Mat4
-Mat4MulASM(Mat4 l, Mat4 r)
+void
+Mat4MulASM(Mat4* l, Mat4* r, Mat4* result)
 {
-    Mat4 result;
-    auto lp = &l;
-    auto rp = &r;
-    auto res = &result;
-    // TODO: fix this
     asm @trusted
     {
-        mov R8, lp;
-        mov R9, rp;
-        mov R10, res;
+        mov R8, l;
+        mov R9, r;
+        mov R10, result;
         movups XMM0, [R8];
         movups XMM1, [R9+00];
@@ -607,93 +600,91 @@ Mat4MulASM(Mat4 l, Mat4 r)
         movups XMM6, XMM1;
         shufps XMM6, XMM6, 0;    // XMM5 = vec.xxxx;
         mulps XMM6, XMM0;        // XMM6 = col1;
         movups XMM7, XMM2;
         shufps XMM7, XMM7, 0;
         mulps XMM7, XMM0;        // XMM7 = col2;
         movups XMM8, XMM3;
         shufps XMM8, XMM8, 0;
         mulps XMM8, XMM0;        // XMM8 = col3;
-        movups XMM9, XMM3;
+        movups XMM9, XMM4;
         shufps XMM9, XMM9, 0;
         mulps XMM9, XMM0;        // XMM9 = col4;
         movups XMM0, [R8+16];
         movups XMM5, XMM1;
         shufps XMM5, XMM5, 85;   // XMM5 = vec.yyyy;
         mulps XMM5, XMM0;
         addps XMM6, XMM5;
         movups XMM5, XMM2;
         shufps XMM5, XMM5, 85;
         mulps XMM5, XMM0;
         addps XMM7, XMM5;
         movups XMM5, XMM3;
         shufps XMM5, XMM5, 85;
         mulps XMM5, XMM0;
         addps XMM8, XMM5;
         movups XMM5, XMM4;
         shufps XMM5, XMM5, 85;
         mulps XMM5, XMM0;
         addps XMM9, XMM5;
         movups XMM0, [R8+32];
         movups XMM5, XMM1;
         shufps XMM5, XMM5, 170;  // XMM5 = vec.zzzz;
         mulps XMM5, XMM0;
         addps XMM6, XMM5;
         movups XMM5, XMM2;
         shufps XMM5, XMM5, 170;
         mulps XMM5, XMM0;
         addps XMM7, XMM5;
         movups XMM5, XMM3;
         shufps XMM5, XMM5, 170;
         mulps XMM5, XMM0;
         addps XMM8, XMM5;
         movups XMM5, XMM4;
         shufps XMM5, XMM5, 170;
         mulps XMM5, XMM0;
         addps XMM9, XMM5;
         movups XMM0, [R8+48];
         movups XMM5, XMM1;
         shufps XMM5, XMM5, 255;  // XMM5 = vec.wwww;
         mulps XMM5, XMM0;
         addps XMM6, XMM5;
         movups XMM5, XMM2;
         shufps XMM5, XMM5, 255;
         mulps XMM5, XMM0;
         addps XMM7, XMM5;
         movups XMM5, XMM3;
         shufps XMM5, XMM5, 255;
         mulps XMM5, XMM0;
         addps XMM8, XMM5;
         movups XMM5, XMM4;
         shufps XMM5, XMM5, 255;
         mulps XMM5, XMM0;
         addps XMM9, XMM5;
         movups [R10+00], XMM6;
         movups [R10+16], XMM7;
         movups [R10+32], XMM8;
         movups [R10+48], XMM9;
     }
-    return result;
 }
 pragma(inline) Mat4
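Taken together, the asm block computes the usual 4x4 product in the cglm-style column-major layout the old glm_mat4_mul call implies: the right operand's four columns sit in XMM1..XMM4 (only the first load is visible in the hunk above), each component of a column is broadcast in turn (shufps immediates 0, 85, 170, 255 give xxxx, yyyy, zzzz, wwww), multiplied with the matching column of the left operand ([R8], [R8+16], [R8+32], [R8+48]), and accumulated into one result column (XMM6..XMM9). The corrected movups makes the fourth result column start from the right operand's fourth column (XMM4) rather than its third. A scalar sketch of the same computation, with a hypothetical helper name and raw float pointers standing in for the matrix type (both are assumptions, not part of the commit):

    // Reference-only sketch: a, b and d each point at 16 column-major floats,
    // and d = a * b, matching what the SSE version stores through R10.
    void mat4MulScalar(const(float)* a, const(float)* b, float* d)
    {
        foreach (col; 0 .. 4)              // one result column per XMM accumulator
        {
            foreach (row; 0 .. 4)
            {
                d[col * 4 + row] =
                    a[0 * 4 + row] * b[col * 4 + 0] +  // b[col].x broadcast (shufps 0)
                    a[1 * 4 + row] * b[col * 4 + 1] +  // b[col].y broadcast (shufps 85)
                    a[2 * 4 + row] * b[col * 4 + 2] +  // b[col].z broadcast (shufps 170)
                    a[3 * 4 + row] * b[col * 4 + 3];   // b[col].w broadcast (shufps 255)
            }
        }
    }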