commit fb45fd78ebb8e2c59de3ff89f932a50dba628638 Author: Matthew Date: Sat Aug 16 10:39:28 2025 +1000 first commit diff --git a/aliases.d b/aliases.d new file mode 100644 index 0000000..e183734 --- /dev/null +++ b/aliases.d @@ -0,0 +1,42 @@ +import core.memory; +import std.stdint; +import math; + +debug +{ + const BUILD_DEBUG = true; +} +else +{ + const BUILD_DEBUG = false; +} + +alias i8 = byte; +alias i16 = short; +alias i32 = int; +alias i64 = long; + +alias u8 = ubyte; +alias u16 = ushort; +alias u32 = uint; +alias u64 = ulong; + +alias f32 = float; +alias f64 = double; + +alias b32 = uint; + +alias intptr = i64; +alias uintptr = u64; + +alias usize = size_t; + +alias Vec2 = Vector!(f32, 2); +alias Vec3 = Vector!(f32, 3); +alias Vec4 = Vector!(f32, 4); + +alias UVec2 = Vector!(u32, 2); + +alias Mat2 = Matrix!(f32, 2); +alias Mat3 = Matrix!(f32, 3); +alias Mat4 = Matrix!(f32, 4); diff --git a/alloc.d b/alloc.d new file mode 100644 index 0000000..e3354ab --- /dev/null +++ b/alloc.d @@ -0,0 +1,133 @@ +import aliases; +import math; +import std.stdio; +import core.stdc.string : memset; +import core.memory; +import platform; + +const DEFAULT_ALIGNMENT = (void *).sizeof * 2; + +struct Arena +{ + u8* mem; + u64 length; + u64 pos; +}; + +T* +MAlloc(T)() +{ + void* mem = MemAlloc(T.sizeof); + return cast(T*)mem; +} + +T[] +MAllocArray(T)(u64 count) +{ + void* mem = MemAlloc(T.sizeof * count); + return cast(T*)(mem)[0 .. count]; +} + +void +MFree(T)(T* ptr) +{ + MemFree(cast(void*)ptr, T.sizeof); +} + +void +MFreeArray(T)(T[] slice) +{ + MemFree(cast(void*)slice.ptr, cast(u64)slice.length); +} + +T* +Alloc(T)() +{ + void* mem = pureMalloc(T.sizeof); + memset(mem, 0, T.sizeof); + return (cast(T*)mem); +} + +T[] +AllocArray(T)(u64 count) +{ + void* mem = pureMalloc(T.sizeof * count); + memset(mem, 0, T.sizeof * count); + return (cast(T*)mem)[0 .. count]; +} + +Arena +CreateArena(u64 size) +{ + Arena arena = { + mem: cast(u8 *)pureMalloc(size), + length: size, + pos: 0, + }; + + assert(arena.mem != null, "Unable to allocate memory for arena"); + + return arena; +}; + +T[] +AllocArray(T)(Arena* arena, u64 count) +{ + void* mem = AllocAlign(arena, T.sizeof * count, DEFAULT_ALIGNMENT); + memset(mem, 0, T.sizeof * count); + return (cast(T*)mem)[0 .. 
count]; +} + +T* +Alloc(T)(Arena* arena) +{ + void* mem = AllocAlign(arena, T.sizeof, DEFAULT_ALIGNMENT); + memset(mem, 0, T.sizeof); + return cast(T*)mem; +}; + +void* +AllocAlign(Arena* arena, u64 size, u64 alignment) +{ + void* ptr = null; + + uintptr mem_pos = cast(uintptr)arena.mem; + uintptr current = mem_pos + arena.pos; + uintptr offset = AlignPow2(current, alignment) - mem_pos; + + if (offset+size <= arena.length) + { + ptr = &arena.mem[offset]; + arena.pos = offset+size; + } + else + { + writefln("AllocAlign failure: out of memory, size requested: %llu", size); + assert(0); + } + + return ptr; +}; + +void +Reset(Arena* arena) +{ + arena.pos = 0; +} + +void +Free(Arena* arena) +{ + pureFree(arena.mem); +} + +void +FreeArray(T)(T[] arr) +{ + pureFree(arr.ptr); +} + +void Free(T)(T* ptr) +{ + pureFree(ptr); +} diff --git a/assets.d b/assets.d new file mode 100644 index 0000000..bf68e9d --- /dev/null +++ b/assets.d @@ -0,0 +1,317 @@ +import aliases; +import std.file; +import std.stdio; +import util; +import std.exception; +import alloc; + +File Asset_File; + +FileHeader Asset_Header; + +AssetInfo[] Asset_Info; + +u8[][] Asset_Data; + +const u32 FILE_VERSION = 2; +const u32 MODEL_VERSION = 1; + +enum AssetType : u32 +{ + None, + ModelM3D, + Shader, + Texture, +} + +alias AT = AssetType; + +struct FileHeader +{ + u32 magic; + u32 file_version; + u64 asset_count; + u64 asset_info_offset; +} + +struct ModelHeader +{ + u32 magic; + u32 model_version; + u64 vertex_count; + u64 vertex_offset; + u64 index_count; + u64 index_offset; + u64 material_count; + u64 material_offset; + u64 texture_count; + u64 texture_offset; +} + +struct Vertex +{ + Vec4 color; + Vec4 tangent; + Vec3 pos; + Vec3 normal; + Vec2 uv; +} + +struct ModelData +{ + Vertex[] vertices; + u32[] indices; + Material[] materials; + TextureInfo[] textures; +} + +struct Material +{ + Vec4 ambient; + Vec4 albedo; + Vec4 specular; + u32 albedo_texture; + u32 ambient_texture; + u32 specular_texture; + u32 alpha_texture; + b32 albedo_has_texture; + b32 ambient_has_texture; + b32 specular_has_texture; + b32 alpha_has_texture; + f32 shininess = 0.0; + f32 alpha = 0.0; +} + +struct TextureInfo +{ + string name; + u32 id; +} + +struct TextureHeader +{ + u64 str_length; + u64 str_offset; + u32 texture_id; +} + +struct ModelMeta +{ + u64 index_count; +} + +struct TexData +{ + void* data; + TexMeta meta; +} + +struct TexMeta +{ + u32 w; + u32 h; + u32 ch; +} + +struct AssetInfo +{ + u64 hash; + u64 offset; + u64 length; + AssetType type; +} + +bool Asset_Pack_Opened = false; + +debug +{ + +bool g_DIR_SET = false; + +void +SetDir() +{ + if (exists("assets")) + { + chdir("./assets"); + } + else if (exists("Gears") || exists("Gears.exe")) + { + chdir("../assets"); + } + else + { + assert(false, "Unable to set directory"); + } + + g_DIR_SET = true; +} + +u8[] +LoadAssetData(Arena* arena, string name) +{ + if (!g_DIR_SET) + { + SetDir(); + } + + File f; + try + { + f = File(name, "rb"); + } + catch (ErrnoException e) + { + assert(false, "Unable to open file"); + } + + u8[] mem = AllocArray!(u8)(arena, f.size()); + return f.rawRead(mem); +} + +} +else +{ + +void +OpenAssetPack() +{ + if (!Asset_Pack_Opened) + { + bool success = true; + string file_path = exists("build/assets.sgp") ? 
"build/assets.sgp" : "assets.sgp"; + + // TODO: replace this with something that doesn't throw an exception and figure out if this is the best way to handle thing (probably isnt) + try + { + Asset_File = File(file_path, "rb"); + } + catch (ErrnoException e) + { + Logf("OpenAssetPack failure: Unable to open file %s", file_path); + assert(false, "Unable to open asset pack file"); + } + + FileHeader[1] header_arr; + + Asset_File.rawRead(header_arr); + + Asset_Header = header_arr[0]; + + Asset_Info = AllocArray!(AssetInfo)(Asset_Header.asset_count); + Asset_Data = AllocArray!(u8[])(Asset_Header.asset_count); + + assert(Asset_Header.file_version == FILE_VERSION, "OpenAssetPack failure: file version incorrect"); + + Asset_File.seek(Asset_Header.asset_info_offset); + + Asset_File.rawRead(Asset_Info); + } +} + +pragma(inline): void +CheckAssetPack() +{ + if (!Asset_Pack_Opened) + { + OpenAssetPack(); + } +} + +AssetInfo +GetAssetInfo(string name) +{ + CheckAssetPack(); + + u64 hash = Hash(name); + + AssetInfo asset_info; + foreach(i, info; Asset_Info) + { + if (info.hash == hash) + { + asset_info = info; + break; + } + } + + assert(asset_info.hash != 0, "GetAssetInfo failure: unable to find matching asset"); + + return asset_info; +} + +u8[] +LoadAssetData(Arena* arena, string name) +{ + CheckAssetPack(); + + u64 hash = Hash(name); + u8[] data = null; + + foreach(i, info; Asset_Info) + { + if (info.hash == hash) + { + data = AllocArray!(u8)(arena, info.length); + Asset_File.seek(info.offset); + Asset_File.rawRead(data); + assert(data != null && data.length == info.length, "LoadAssetData failure: Asset data loaded incorrectly"); + + break; + } + } + + return data; +} + +u8[] +LoadAssetData(string name) +{ + CheckAssetPack(); + + u64 hash = Hash(name); + u8[] data = null; + + foreach(i, info; Asset_Info) + { + if (info.hash == hash) + { + if (Asset_Data[i].ptr == null) + { + Asset_Data[i] = AllocArray!(u8)(info.length); + Asset_File.seek(info.offset); + Asset_File.rawRead(Asset_Data[i]); + assert(Asset_Data[i] != null && Asset_Data[i].length == info.length, "LoadAssetData failure: Asset data loaded incorrectly."); + } + + data = Asset_Data[i]; + break; + } + } + + return data; +} + +void +UnloadAssetData(string name) +{ + u64 hash = Hash(name); + + foreach(i, info; Asset_Info) + { + if (info.hash == hash) + { + if (Asset_Data[i] != null) + { + FreeArray(Asset_Data[i]); + break; + } + } + } +} + +} + + + + diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..b75ab7a --- /dev/null +++ b/build.sh @@ -0,0 +1,56 @@ +#!/bin/bash +set -eu + +if [ -z "$1" ]; then + echo "No output parameter named, please pass the build directory to this script." + exit 1 +fi + +script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +build="$1" + +if [ -x "$(command -v g++)" ]; then cpp_compiler="g++"; c_compiler="gcc"; +elif [ -x "$(command -v clang++)" ]; then cpp_compiler="clang++"; c_compiler="clang"; +else echo "Unable to find c++ cpp_compiler"; exit -1; fi; + +if [ -x "$(command -v mold)" ]; then linker_cmd="-fuse-ld=mold"; +elif [ -x "$(command -v lld)" ]; then linker_cmd="-fuse-ld=lld"; +elif [ -x "$(command -v ld)" ]; then linker_cmd="-fuse-ld=ld"; +else echo "Unable to find c/c++ linker"; exit -1; fi; + +# STB_IMAGE +src="${script_dir}/external/stb/stb.c" +flags="-std=c99 -Wno-everything -Iexternal/stb -c -static" +obj="${build}/stb.o" +lib="${build}/libstb.a" + +if ! 
[ -f "${build}/libstb.a" ]; then + $c_compiler $flags $src $out $obj + ar rcs $lib $obj + rm $obj +fi + +# M3D +src="${script_dir}/external/m3d/m3d.c" +flags="-std=c99 -Wno-everything -Iexternal/m3d -c -static" +obj="${build}/m3d.o" +lib="${build}/libm3d.a" + +if ! [ -f "${build}/libm3d.a" ]; then + $c_compiler $flags $src $out $obj + ar rcs $lib $obj + rm $obj +fi + +# CGLM +src="${script_dir}/external/cglm/cglm.c" +flags="-std=c99 -Wno-everything -Iexternal/cglm -c -static" +obj="${build}/cglm.o" +lib="${build}/libcglm.a" + +if ! [ -f "${build}/libcglm.a" ]; then + $c_compiler $flags $src $out $obj + ar rcs $lib $obj + rm $obj +fi + diff --git a/external/cglm/aabb2d.h b/external/cglm/aabb2d.h new file mode 100644 index 0000000..6369d08 --- /dev/null +++ b/external/cglm/aabb2d.h @@ -0,0 +1,270 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_aabb2d_h +#define cglm_aabb2d_h + +#include "common.h" +#include "vec2.h" +#include "util.h" + +/* DEPRECATED! use _diag */ +#define glm_aabb2d_size(aabb) glm_aabb2d_diag(aabb) + +/*! + * @brief make [aabb] zero + * + * @param[in, out] aabb aabb + */ +CGLM_INLINE +void +glm_aabb2d_zero(vec2 aabb[2]) { + glm_vec2_zero(aabb[0]); + glm_vec2_zero(aabb[1]); +} + +/*! + * @brief copy all members of [aabb] to [dest] + * + * @param[in] aabb source + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_aabb2d_copy(vec2 aabb[2], vec2 dest[2]) { + glm_vec2_copy(aabb[0], dest[0]); + glm_vec2_copy(aabb[1], dest[1]); +} + +/*! + * @brief apply transform to Axis-Aligned Bounding aabb + * + * @param[in] aabb bounding aabb + * @param[in] m transform matrix + * @param[out] dest transformed bounding aabb + */ +CGLM_INLINE +void +glm_aabb2d_transform(vec2 aabb[2], mat3 m, vec2 dest[2]) { + vec2 v[2], xa, xb, ya, yb; + + glm_vec2_scale(m[0], aabb[0][0], xa); + glm_vec2_scale(m[0], aabb[1][0], xb); + + glm_vec2_scale(m[1], aabb[0][1], ya); + glm_vec2_scale(m[1], aabb[1][1], yb); + + /* translation + min(xa, xb) + min(ya, yb) */ + glm_vec2(m[2], v[0]); + glm_vec2_minadd(xa, xb, v[0]); + glm_vec2_minadd(ya, yb, v[0]); + + /* translation + max(xa, xb) + max(ya, yb) */ + glm_vec2(m[2], v[1]); + glm_vec2_maxadd(xa, xb, v[1]); + glm_vec2_maxadd(ya, yb, v[1]); + + glm_vec2_copy(v[0], dest[0]); + glm_vec2_copy(v[1], dest[1]); +} + +/*! + * @brief merges two AABB bounding aabb and creates new one + * + * two aabb must be in same space, if one of aabb is in different space then + * you should consider to convert it's space by glm_aabb_space + * + * @param[in] aabb1 bounding aabb 1 + * @param[in] aabb2 bounding aabb 2 + * @param[out] dest merged bounding aabb + */ +CGLM_INLINE +void +glm_aabb2d_merge(vec2 aabb1[2], vec2 aabb2[2], vec2 dest[2]) { + dest[0][0] = glm_min(aabb1[0][0], aabb2[0][0]); + dest[0][1] = glm_min(aabb1[0][1], aabb2[0][1]); + + dest[1][0] = glm_max(aabb1[1][0], aabb2[1][0]); + dest[1][1] = glm_max(aabb1[1][1], aabb2[1][1]); +} + +/*! + * @brief crops a bounding aabb with another one. + * + * this could be useful for getting a baabb which fits with view frustum and + * object bounding aabbes. 
In this case you crop view frustum aabb with objects + * aabb + * + * @param[in] aabb bounding aabb 1 + * @param[in] cropAabb crop aabb + * @param[out] dest cropped bounding aabb + */ +CGLM_INLINE +void +glm_aabb2d_crop(vec2 aabb[2], vec2 cropAabb[2], vec2 dest[2]) { + dest[0][0] = glm_max(aabb[0][0], cropAabb[0][0]); + dest[0][1] = glm_max(aabb[0][1], cropAabb[0][1]); + + dest[1][0] = glm_min(aabb[1][0], cropAabb[1][0]); + dest[1][1] = glm_min(aabb[1][1], cropAabb[1][1]); +} + +/*! + * @brief crops a bounding aabb with another one. + * + * this could be useful for getting a baabb which fits with view frustum and + * object bounding aabbes. In this case you crop view frustum aabb with objects + * aabb + * + * @param[in] aabb bounding aabb + * @param[in] cropAabb crop aabb + * @param[in] clampAabb minimum aabb + * @param[out] dest cropped bounding aabb + */ +CGLM_INLINE +void +glm_aabb2d_crop_until(vec2 aabb[2], + vec2 cropAabb[2], + vec2 clampAabb[2], + vec2 dest[2]) { + glm_aabb2d_crop(aabb, cropAabb, dest); + glm_aabb2d_merge(clampAabb, dest, dest); +} + +/*! + * @brief invalidate AABB min and max values + * + * @param[in, out] aabb bounding aabb + */ +CGLM_INLINE +void +glm_aabb2d_invalidate(vec2 aabb[2]) { + glm_vec2_fill(aabb[0], FLT_MAX); + glm_vec2_fill(aabb[1], -FLT_MAX); +} + +/*! + * @brief check if AABB is valid or not + * + * @param[in] aabb bounding aabb + */ +CGLM_INLINE +bool +glm_aabb2d_isvalid(vec2 aabb[2]) { + return glm_vec2_max(aabb[0]) != FLT_MAX + && glm_vec2_min(aabb[1]) != -FLT_MAX; +} + +/*! + * @brief distance between of min and max + * + * @param[in] aabb bounding aabb + */ +CGLM_INLINE +float +glm_aabb2d_diag(vec2 aabb[2]) { + return glm_vec2_distance(aabb[0], aabb[1]); +} + +/*! + * @brief size of aabb + * + * @param[in] aabb bounding aabb + * @param[out] dest size + */ +CGLM_INLINE +void +glm_aabb2d_sizev(vec2 aabb[2], vec2 dest) { + glm_vec2_sub(aabb[1], aabb[0], dest); +} + +/*! + * @brief radius of sphere which surrounds AABB + * + * @param[in] aabb bounding aabb + */ +CGLM_INLINE +float +glm_aabb2d_radius(vec2 aabb[2]) { + return glm_aabb2d_diag(aabb) * 0.5f; +} + +/*! + * @brief computes center point of AABB + * + * @param[in] aabb bounding aabb + * @param[out] dest center of bounding aabb + */ +CGLM_INLINE +void +glm_aabb2d_center(vec2 aabb[2], vec2 dest) { + glm_vec2_center(aabb[0], aabb[1], dest); +} + +/*! + * @brief check if two AABB intersects + * + * @param[in] aabb bounding aabb + * @param[in] other other bounding aabb + */ +CGLM_INLINE +bool +glm_aabb2d_aabb(vec2 aabb[2], vec2 other[2]) { + return (aabb[0][0] <= other[1][0] && aabb[1][0] >= other[0][0]) + && (aabb[0][1] <= other[1][1] && aabb[1][1] >= other[0][1]); +} + +/*! + * @brief check if AABB intersects with a circle + * + * Circle Representation in cglm: [center.x, center.y, radii] + * + * @param[in] aabb solid bounding aabb + * @param[in] c solid circle + */ +CGLM_INLINE +bool +glm_aabb2d_circle(vec2 aabb[2], vec3 c) { + float dmin; + int a, b; + + a = (c[0] < aabb[0][0]) + (c[0] > aabb[1][0]); + b = (c[1] < aabb[0][1]) + (c[1] > aabb[1][1]); + + dmin = glm_pow2((c[0] - aabb[!(a - 1)][0]) * (a != 0)) + + glm_pow2((c[1] - aabb[!(b - 1)][1]) * (b != 0)); + + return dmin <= glm_pow2(c[2]); +} + +/*! 
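
The circle test above (glm_aabb2d_circle) is a 2D variant of the box–sphere test referenced in box.h: compare the squared distance from the circle centre to the box against the squared radius. As a hedged summary of the idea (not a line-by-line transcription of the index trick):

\[
d_{\min}^2 \;=\; \sum_{i\in\{x,y\}} \max\bigl(\mathit{min}_i - c_i,\; 0,\; c_i - \mathit{max}_i\bigr)^2,
\qquad
\text{intersect} \iff d_{\min}^2 \le r^2 .
\]
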
+ * @brief check if point is inside of AABB + * + * @param[in] aabb bounding aabb + * @param[in] point point + */ +CGLM_INLINE +bool +glm_aabb2d_point(vec2 aabb[2], vec2 point) { + return (point[0] >= aabb[0][0] && point[0] <= aabb[1][0]) + && (point[1] >= aabb[0][1] && point[1] <= aabb[1][1]); +} + +/*! + * @brief check if AABB contains other AABB + * + * @param[in] aabb bounding aabb + * @param[in] other other bounding aabb + */ +CGLM_INLINE +bool +glm_aabb2d_contains(vec2 aabb[2], vec2 other[2]) { + return (aabb[0][0] <= other[0][0] && aabb[1][0] >= other[1][0]) + && (aabb[0][1] <= other[0][1] && aabb[1][1] >= other[1][1]); +} + +#endif /* cglm_aabb2d_h */ diff --git a/external/cglm/affine-mat.h b/external/cglm/affine-mat.h new file mode 100644 index 0000000..c22c0e0 --- /dev/null +++ b/external/cglm/affine-mat.h @@ -0,0 +1,189 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE void glm_mul(mat4 m1, mat4 m2, mat4 dest); + CGLM_INLINE void glm_mul_rot(mat4 m1, mat4 m2, mat4 dest); + CGLM_INLINE void glm_inv_tr(mat4 mat); + */ + +#ifndef cglm_affine_mat_h +#define cglm_affine_mat_h + +#include "common.h" +#include "mat4.h" +#include "mat3.h" + +#ifdef CGLM_SSE_FP +# include "simd/sse2/affine.h" +#endif + +#ifdef CGLM_AVX_FP +# include "simd/avx/affine.h" +#endif + +#ifdef CGLM_NEON_FP +# include "simd/neon/affine.h" +#endif + +#ifdef CGLM_SIMD_WASM +# include "simd/wasm/affine.h" +#endif + +/*! + * @brief this is similar to glm_mat4_mul but specialized to affine transform + * + * Matrix format should be: + * R R R X + * R R R Y + * R R R Z + * 0 0 0 W + * + * this reduces some multiplications. It should be faster than mat4_mul. + * if you are not sure about matrix format then DON'T use this! 
use mat4_mul + * + * @param[in] m1 affine matrix 1 + * @param[in] m2 affine matrix 2 + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_mul(mat4 m1, mat4 m2, mat4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glm_mul_wasm(m1, m2, dest); +#elif defined(__AVX__) + glm_mul_avx(m1, m2, dest); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glm_mul_sse2(m1, m2, dest); +#elif defined(CGLM_NEON_FP) + glm_mul_neon(m1, m2, dest); +#else + float a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2], a03 = m1[0][3], + a10 = m1[1][0], a11 = m1[1][1], a12 = m1[1][2], a13 = m1[1][3], + a20 = m1[2][0], a21 = m1[2][1], a22 = m1[2][2], a23 = m1[2][3], + a30 = m1[3][0], a31 = m1[3][1], a32 = m1[3][2], a33 = m1[3][3], + + b00 = m2[0][0], b01 = m2[0][1], b02 = m2[0][2], + b10 = m2[1][0], b11 = m2[1][1], b12 = m2[1][2], + b20 = m2[2][0], b21 = m2[2][1], b22 = m2[2][2], + b30 = m2[3][0], b31 = m2[3][1], b32 = m2[3][2], b33 = m2[3][3]; + + dest[0][0] = a00 * b00 + a10 * b01 + a20 * b02; + dest[0][1] = a01 * b00 + a11 * b01 + a21 * b02; + dest[0][2] = a02 * b00 + a12 * b01 + a22 * b02; + dest[0][3] = a03 * b00 + a13 * b01 + a23 * b02; + + dest[1][0] = a00 * b10 + a10 * b11 + a20 * b12; + dest[1][1] = a01 * b10 + a11 * b11 + a21 * b12; + dest[1][2] = a02 * b10 + a12 * b11 + a22 * b12; + dest[1][3] = a03 * b10 + a13 * b11 + a23 * b12; + + dest[2][0] = a00 * b20 + a10 * b21 + a20 * b22; + dest[2][1] = a01 * b20 + a11 * b21 + a21 * b22; + dest[2][2] = a02 * b20 + a12 * b21 + a22 * b22; + dest[2][3] = a03 * b20 + a13 * b21 + a23 * b22; + + dest[3][0] = a00 * b30 + a10 * b31 + a20 * b32 + a30 * b33; + dest[3][1] = a01 * b30 + a11 * b31 + a21 * b32 + a31 * b33; + dest[3][2] = a02 * b30 + a12 * b31 + a22 * b32 + a32 * b33; + dest[3][3] = a03 * b30 + a13 * b31 + a23 * b32 + a33 * b33; +#endif +} + +/*! + * @brief this is similar to glm_mat4_mul but specialized to affine transform + * + * Right Matrix format should be: + * R R R 0 + * R R R 0 + * R R R 0 + * 0 0 0 1 + * + * this reduces some multiplications. It should be faster than mat4_mul. + * if you are not sure about matrix format then DON'T use this! 
use mat4_mul + * + * @param[in] m1 affine matrix 1 + * @param[in] m2 affine matrix 2 + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_mul_rot(mat4 m1, mat4 m2, mat4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glm_mul_rot_wasm(m1, m2, dest); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glm_mul_rot_sse2(m1, m2, dest); +#elif defined(CGLM_NEON_FP) + glm_mul_rot_neon(m1, m2, dest); +#else + float a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2], a03 = m1[0][3], + a10 = m1[1][0], a11 = m1[1][1], a12 = m1[1][2], a13 = m1[1][3], + a20 = m1[2][0], a21 = m1[2][1], a22 = m1[2][2], a23 = m1[2][3], + a30 = m1[3][0], a31 = m1[3][1], a32 = m1[3][2], a33 = m1[3][3], + + b00 = m2[0][0], b01 = m2[0][1], b02 = m2[0][2], + b10 = m2[1][0], b11 = m2[1][1], b12 = m2[1][2], + b20 = m2[2][0], b21 = m2[2][1], b22 = m2[2][2]; + + dest[0][0] = a00 * b00 + a10 * b01 + a20 * b02; + dest[0][1] = a01 * b00 + a11 * b01 + a21 * b02; + dest[0][2] = a02 * b00 + a12 * b01 + a22 * b02; + dest[0][3] = a03 * b00 + a13 * b01 + a23 * b02; + + dest[1][0] = a00 * b10 + a10 * b11 + a20 * b12; + dest[1][1] = a01 * b10 + a11 * b11 + a21 * b12; + dest[1][2] = a02 * b10 + a12 * b11 + a22 * b12; + dest[1][3] = a03 * b10 + a13 * b11 + a23 * b12; + + dest[2][0] = a00 * b20 + a10 * b21 + a20 * b22; + dest[2][1] = a01 * b20 + a11 * b21 + a21 * b22; + dest[2][2] = a02 * b20 + a12 * b21 + a22 * b22; + dest[2][3] = a03 * b20 + a13 * b21 + a23 * b22; + + dest[3][0] = a30; + dest[3][1] = a31; + dest[3][2] = a32; + dest[3][3] = a33; +#endif +} + +/*! + * @brief inverse orthonormal rotation + translation matrix (ridig-body) + * + * @code + * X = | R T | X' = | R' -R'T | + * | 0 1 | | 0 1 | + * @endcode + * + * @param[in,out] mat matrix + */ +CGLM_INLINE +void +glm_inv_tr(mat4 mat) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glm_inv_tr_wasm(mat); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glm_inv_tr_sse2(mat); +#elif defined(CGLM_NEON_FP) + glm_inv_tr_neon(mat); +#else + CGLM_ALIGN_MAT mat3 r; + CGLM_ALIGN(8) vec3 t; + + /* rotate */ + glm_mat4_pick3t(mat, r); + glm_mat4_ins3(r, mat); + + /* translate */ + glm_mat3_mulv(r, mat[3], t); + glm_vec3_negate(t); + glm_vec3_copy(t, mat[3]); +#endif +} + +#endif /* cglm_affine_mat_h */ diff --git a/external/cglm/affine-post.h b/external/cglm/affine-post.h new file mode 100644 index 0000000..3e297e6 --- /dev/null +++ b/external/cglm/affine-post.h @@ -0,0 +1,247 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_affine_post_h +#define cglm_affine_post_h + +/* + Functions: + CGLM_INLINE void glm_translated_to(mat4 m, vec3 v, mat4 dest); + CGLM_INLINE void glm_translated(mat4 m, vec3 v); + CGLM_INLINE void glm_translated_x(mat4 m, float to); + CGLM_INLINE void glm_translated_y(mat4 m, float to); + CGLM_INLINE void glm_translated_z(mat4 m, float to); + CGLM_INLINE void glm_rotated_x(mat4 m, float angle, mat4 dest); + CGLM_INLINE void glm_rotated_y(mat4 m, float angle, mat4 dest); + CGLM_INLINE void glm_rotated_z(mat4 m, float angle, mat4 dest); + CGLM_INLINE void glm_rotated(mat4 m, float angle, vec3 axis); + CGLM_INLINE void glm_rotated_at(mat4 m, vec3 pivot, float angle, vec3 axis); + CGLM_INLINE void glm_spinned(mat4 m, float angle, vec3 axis); + */ + +#include "common.h" +#include "util.h" +#include "vec3.h" +#include "vec4.h" +#include "mat4.h" +#include "affine-mat.h" + +/*! 
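
For the affine-specialised multiply and inverse in affine-mat.h, the block form makes the saving explicit. Writing a column-major affine matrix as a linear part plus translation (the standard identities the header's comments describe, shown here for the common case where the bottom-right entry is 1):

\[
M = \begin{bmatrix} R & t \\ 0 & 1 \end{bmatrix},\qquad
M_1 M_2 = \begin{bmatrix} R_1 R_2 & R_1 t_2 + t_1 \\ 0 & 1 \end{bmatrix},\qquad
M^{-1} = \begin{bmatrix} R^{\top} & -R^{\top} t \\ 0 & 1 \end{bmatrix}
\ \ (\text{orthonormal } R,\ \texttt{glm\_inv\_tr}).
\]
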
+ * @brief translate existing transform matrix by v vector + * and stores result in same matrix + * + * this is POST transform, applies to existing transform as last transform + * + * @param[in, out] m affine transform + * @param[in] v translate vector [x, y, z] + */ +CGLM_INLINE +void +glm_translated(mat4 m, vec3 v) { + glm_vec3_add(m[3], v, m[3]); +} + +/*! + * @brief translate existing transform matrix by v vector + * and store result in dest + * + * source matrix will remain same + * + * this is POST transform, applies to existing transform as last transform + * + * @param[in] m affine transform + * @param[in] v translate vector [x, y, z] + * @param[out] dest translated matrix + */ +CGLM_INLINE +void +glm_translated_to(mat4 m, vec3 v, mat4 dest) { + glm_mat4_copy(m, dest); + glm_translated(dest, v); +} + +/*! + * @brief translate existing transform matrix by x factor + * + * this is POST transform, applies to existing transform as last transform + * + * @param[in, out] m affine transform + * @param[in] x x factor + */ +CGLM_INLINE +void +glm_translated_x(mat4 m, float x) { + m[3][0] += x; +} + +/*! + * @brief translate existing transform matrix by y factor + * + * this is POST transform, applies to existing transform as last transform + * + * @param[in, out] m affine transform + * @param[in] y y factor + */ +CGLM_INLINE +void +glm_translated_y(mat4 m, float y) { + m[3][1] += y; +} + +/*! + * @brief translate existing transform matrix by z factor + * + * this is POST transform, applies to existing transform as last transform + * + * @param[in, out] m affine transform + * @param[in] z z factor + */ +CGLM_INLINE +void +glm_translated_z(mat4 m, float z) { + m[3][2] += z; +} + +/*! + * @brief rotate existing transform matrix around X axis by angle + * and store result in dest + * + * this is POST transform, applies to existing transform as last transform + * + * @param[in] m affine transform + * @param[in] angle angle (radians) + * @param[out] dest rotated matrix + */ +CGLM_INLINE +void +glm_rotated_x(mat4 m, float angle, mat4 dest) { + CGLM_ALIGN_MAT mat4 t = GLM_MAT4_IDENTITY_INIT; + float c, s; + + c = cosf(angle); + s = sinf(angle); + + t[1][1] = c; + t[1][2] = s; + t[2][1] = -s; + t[2][2] = c; + + glm_mul_rot(t, m, dest); +} + +/*! + * @brief rotate existing transform matrix around Y axis by angle + * and store result in dest + * + * this is POST transform, applies to existing transform as last transform + * + * @param[in] m affine transform + * @param[in] angle angle (radians) + * @param[out] dest rotated matrix + */ +CGLM_INLINE +void +glm_rotated_y(mat4 m, float angle, mat4 dest) { + CGLM_ALIGN_MAT mat4 t = GLM_MAT4_IDENTITY_INIT; + float c, s; + + c = cosf(angle); + s = sinf(angle); + + t[0][0] = c; + t[0][2] = -s; + t[2][0] = s; + t[2][2] = c; + + glm_mul_rot(t, m, dest); +} + +/*! + * @brief rotate existing transform matrix around Z axis by angle + * and store result in dest + * + * this is POST transform, applies to existing transform as last transform + * + * @param[in] m affine transform + * @param[in] angle angle (radians) + * @param[out] dest rotated matrix + */ +CGLM_INLINE +void +glm_rotated_z(mat4 m, float angle, mat4 dest) { + CGLM_ALIGN_MAT mat4 t = GLM_MAT4_IDENTITY_INIT; + float c, s; + + c = cosf(angle); + s = sinf(angle); + + t[0][0] = c; + t[0][1] = s; + t[1][0] = -s; + t[1][1] = c; + + glm_mul_rot(t, m, dest); +} + +/*! 
+ * @brief rotate existing transform matrix around given axis by angle + * + * this is POST transform, applies to existing transform as last transform + * + * @param[in, out] m affine transform + * @param[in] angle angle (radians) + * @param[in] axis axis + */ +CGLM_INLINE +void +glm_rotated(mat4 m, float angle, vec3 axis) { + CGLM_ALIGN_MAT mat4 rot; + glm_rotate_make(rot, angle, axis); + glm_mul_rot(rot, m, m); +} + +/*! + * @brief rotate existing transform + * around given axis by angle at given pivot point (rotation center) + * + * this is POST transform, applies to existing transform as last transform + * + * @param[in, out] m affine transform + * @param[in] pivot rotation center + * @param[in] angle angle (radians) + * @param[in] axis axis + */ +CGLM_INLINE +void +glm_rotated_at(mat4 m, vec3 pivot, float angle, vec3 axis) { + CGLM_ALIGN(8) vec3 pivotInv; + + glm_vec3_negate_to(pivot, pivotInv); + + glm_translated(m, pivot); + glm_rotated(m, angle, axis); + glm_translated(m, pivotInv); +} + +/*! + * @brief rotate existing transform matrix around given axis by angle around self (doesn't affected by position) + * + * this is POST transform, applies to existing transform as last transform + * + * @param[in, out] m affine transform + * @param[in] angle angle (radians) + * @param[in] axis axis + */ +CGLM_INLINE +void +glm_spinned(mat4 m, float angle, vec3 axis) { + CGLM_ALIGN_MAT mat4 rot; + glm_rotate_atm(rot, m[3], angle, axis); + glm_mat4_mul(rot, m, m); +} + +#endif /* cglm_affine_post_h */ diff --git a/external/cglm/affine-pre.h b/external/cglm/affine-pre.h new file mode 100644 index 0000000..2fa77f7 --- /dev/null +++ b/external/cglm/affine-pre.h @@ -0,0 +1,304 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_affine_pre_h +#define cglm_affine_pre_h + +/* + Functions: + CGLM_INLINE void glm_translate_to(mat4 m, vec3 v, mat4 dest); + CGLM_INLINE void glm_translate(mat4 m, vec3 v); + CGLM_INLINE void glm_translate_x(mat4 m, float to); + CGLM_INLINE void glm_translate_y(mat4 m, float to); + CGLM_INLINE void glm_translate_z(mat4 m, float to); + CGLM_INLINE void glm_rotate_x(mat4 m, float angle, mat4 dest); + CGLM_INLINE void glm_rotate_y(mat4 m, float angle, mat4 dest); + CGLM_INLINE void glm_rotate_z(mat4 m, float angle, mat4 dest); + CGLM_INLINE void glm_rotate(mat4 m, float angle, vec3 axis); + CGLM_INLINE void glm_rotate_at(mat4 m, vec3 pivot, float angle, vec3 axis); + CGLM_INLINE void glm_rotate_atm(mat4 m, vec3 pivot, float angle, vec3 axis); + CGLM_INLINE void glm_spin(mat4 m, float angle, vec3 axis); + */ + +#include "common.h" +#include "util.h" +#include "vec3.h" +#include "vec4.h" +#include "mat4.h" +#include "affine-mat.h" + +/*! + * @brief translate existing transform matrix by v vector + * and stores result in same matrix + * + * @param[in, out] m affine transform + * @param[in] v translate vector [x, y, z] + */ +CGLM_INLINE +void +glm_translate(mat4 m, vec3 v) { +#if defined(CGLM_SIMD) + glmm_128 m0, m1, m2, m3; + + m0 = glmm_load(m[0]); + m1 = glmm_load(m[1]); + m2 = glmm_load(m[2]); + m3 = glmm_load(m[3]); + + glmm_store(m[3], + glmm_fmadd(m0, glmm_set1(v[0]), + glmm_fmadd(m1, glmm_set1(v[1]), + glmm_fmadd(m2, glmm_set1(v[2]), m3)))); +#else + glm_vec4_muladds(m[0], v[0], m[3]); + glm_vec4_muladds(m[1], v[1], m[3]); + glm_vec4_muladds(m[2], v[2], m[3]); +#endif +} + +/*! 
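
The difference between the affine-post.h functions above (past-tense names: glm_translated, glm_rotated, glm_spinned) and the affine-pre.h ones now beginning (glm_translate, glm_rotate, glm_spin) is simply which side the new transform is applied on, as the calls to glm_mul_rot show:

\[
\text{post (affine-post.h):}\quad M \leftarrow T\,M \ \ \text{(applied last, in world axes)}
\qquad
\text{pre (affine-pre.h):}\quad M \leftarrow M\,T \ \ \text{(applied first, in local axes).}
\]
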
+ * @brief translate existing transform matrix by v vector + * and store result in dest + * + * source matrix will remain same + * + * @param[in] m affine transform + * @param[in] v translate vector [x, y, z] + * @param[out] dest translated matrix + */ +CGLM_INLINE +void +glm_translate_to(mat4 m, vec3 v, mat4 dest) { + glm_mat4_copy(m, dest); + glm_translate(dest, v); +} + +/*! + * @brief translate existing transform matrix by x factor + * + * @param[in, out] m affine transform + * @param[in] x x factor + */ +CGLM_INLINE +void +glm_translate_x(mat4 m, float x) { +#if defined(CGLM_SIMD) + glmm_store(m[3], glmm_fmadd(glmm_load(m[0]), glmm_set1(x), glmm_load(m[3]))); +#else + vec4 v1; + glm_vec4_scale(m[0], x, v1); + glm_vec4_add(v1, m[3], m[3]); +#endif +} + +/*! + * @brief translate existing transform matrix by y factor + * + * @param[in, out] m affine transform + * @param[in] y y factor + */ +CGLM_INLINE +void +glm_translate_y(mat4 m, float y) { +#if defined(CGLM_SIMD) + glmm_store(m[3], glmm_fmadd(glmm_load(m[1]), glmm_set1(y), glmm_load(m[3]))); +#else + vec4 v1; + glm_vec4_scale(m[1], y, v1); + glm_vec4_add(v1, m[3], m[3]); +#endif +} + +/*! + * @brief translate existing transform matrix by z factor + * + * @param[in, out] m affine transform + * @param[in] z z factor + */ +CGLM_INLINE +void +glm_translate_z(mat4 m, float z) { +#if defined(CGLM_SIMD) + glmm_store(m[3], glmm_fmadd(glmm_load(m[2]), glmm_set1(z), glmm_load(m[3]))); +#else + vec4 v1; + glm_vec4_scale(m[2], z, v1); + glm_vec4_add(v1, m[3], m[3]); +#endif +} + +/*! + * @brief rotate existing transform matrix around X axis by angle + * and store result in dest + * + * @param[in] m affine transform + * @param[in] angle angle (radians) + * @param[out] dest rotated matrix + */ +CGLM_INLINE +void +glm_rotate_x(mat4 m, float angle, mat4 dest) { + CGLM_ALIGN_MAT mat4 t = GLM_MAT4_IDENTITY_INIT; + float c, s; + + c = cosf(angle); + s = sinf(angle); + + t[1][1] = c; + t[1][2] = s; + t[2][1] = -s; + t[2][2] = c; + + glm_mul_rot(m, t, dest); +} + +/*! + * @brief rotate existing transform matrix around Y axis by angle + * and store result in dest + * + * @param[in] m affine transform + * @param[in] angle angle (radians) + * @param[out] dest rotated matrix + */ +CGLM_INLINE +void +glm_rotate_y(mat4 m, float angle, mat4 dest) { + CGLM_ALIGN_MAT mat4 t = GLM_MAT4_IDENTITY_INIT; + float c, s; + + c = cosf(angle); + s = sinf(angle); + + t[0][0] = c; + t[0][2] = -s; + t[2][0] = s; + t[2][2] = c; + + glm_mul_rot(m, t, dest); +} + +/*! + * @brief rotate existing transform matrix around Z axis by angle + * and store result in dest + * + * @param[in] m affine transform + * @param[in] angle angle (radians) + * @param[out] dest rotated matrix + */ +CGLM_INLINE +void +glm_rotate_z(mat4 m, float angle, mat4 dest) { + CGLM_ALIGN_MAT mat4 t = GLM_MAT4_IDENTITY_INIT; + float c, s; + + c = cosf(angle); + s = sinf(angle); + + t[0][0] = c; + t[0][1] = s; + t[1][0] = -s; + t[1][1] = c; + + glm_mul_rot(m, t, dest); +} + +/*! + * @brief rotate existing transform matrix + * around given axis by angle at ORIGIN (0,0,0) + * + * **❗️IMPORTANT ❗️** + * + * If you need to rotate object around itself e.g. center of object or at + * some point [of object] then `glm_rotate_at()` would be better choice to do so. + * + * Even if object's model transform is identity, rotation may not be around + * center of object if object does not lay out at ORIGIN perfectly. + * + * Using `glm_rotate_at()` with center of bounding shape ( AABB, Sphere ... 
) + * would be an easy option to rotate around object if object is not at origin. + * + * One another option to rotate around itself at any point is `glm_spin()` + * which is perfect if only rotating around model position is desired e.g. not + * specific point on model for instance center of geometry or center of mass, + * again if geometry is not perfectly centered at origin at identity transform, + * rotation may not be around geometry. + * + * @param[in, out] m affine transform + * @param[in] angle angle (radians) + * @param[in] axis axis + */ +CGLM_INLINE +void +glm_rotate(mat4 m, float angle, vec3 axis) { + CGLM_ALIGN_MAT mat4 rot; + glm_rotate_make(rot, angle, axis); + glm_mul_rot(m, rot, m); +} + +/*! + * @brief rotate existing transform + * around given axis by angle at given pivot point (rotation center) + * + * @param[in, out] m affine transform + * @param[in] pivot rotation center + * @param[in] angle angle (radians) + * @param[in] axis axis + */ +CGLM_INLINE +void +glm_rotate_at(mat4 m, vec3 pivot, float angle, vec3 axis) { + CGLM_ALIGN(8) vec3 pivotInv; + + glm_vec3_negate_to(pivot, pivotInv); + + glm_translate(m, pivot); + glm_rotate(m, angle, axis); + glm_translate(m, pivotInv); +} + +/*! + * @brief creates NEW rotation matrix by angle and axis at given point + * + * this creates rotation matrix, it assumes you don't have a matrix + * + * this should work faster than glm_rotate_at because it reduces + * one glm_translate. + * + * @param[out] m affine transform + * @param[in] pivot rotation center + * @param[in] angle angle (radians) + * @param[in] axis axis + */ +CGLM_INLINE +void +glm_rotate_atm(mat4 m, vec3 pivot, float angle, vec3 axis) { + CGLM_ALIGN(8) vec3 pivotInv; + + glm_vec3_negate_to(pivot, pivotInv); + + glm_translate_make(m, pivot); + glm_rotate(m, angle, axis); + glm_translate(m, pivotInv); +} + +/*! + * @brief rotate existing transform matrix + * around given axis by angle around self (doesn't affected by position) + * + * @param[in, out] m affine transform + * @param[in] angle angle (radians) + * @param[in] axis axis + */ +CGLM_INLINE +void +glm_spin(mat4 m, float angle, vec3 axis) { + CGLM_ALIGN_MAT mat4 rot; + glm_rotate_atm(rot, m[3], angle, axis); + glm_mat4_mul(m, rot, m); +} + +#endif /* cglm_affine_pre_h */ diff --git a/external/cglm/affine.h b/external/cglm/affine.h new file mode 100644 index 0000000..2c608f7 --- /dev/null +++ b/external/cglm/affine.h @@ -0,0 +1,238 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE void glm_translate_to(mat4 m, vec3 v, mat4 dest); + CGLM_INLINE void glm_translate(mat4 m, vec3 v); + CGLM_INLINE void glm_translate_x(mat4 m, float to); + CGLM_INLINE void glm_translate_y(mat4 m, float to); + CGLM_INLINE void glm_translate_z(mat4 m, float to); + CGLM_INLINE void glm_translate_make(mat4 m, vec3 v); + CGLM_INLINE void glm_scale_to(mat4 m, vec3 v, mat4 dest); + CGLM_INLINE void glm_scale_make(mat4 m, vec3 v); + CGLM_INLINE void glm_scale(mat4 m, vec3 v); + CGLM_INLINE void glm_scale_uni(mat4 m, float s); + CGLM_INLINE void glm_rotate_x(mat4 m, float angle, mat4 dest); + CGLM_INLINE void glm_rotate_y(mat4 m, float angle, mat4 dest); + CGLM_INLINE void glm_rotate_z(mat4 m, float angle, mat4 dest); + CGLM_INLINE void glm_rotate_make(mat4 m, float angle, vec3 axis); + CGLM_INLINE void glm_rotate(mat4 m, float angle, vec3 axis); + CGLM_INLINE void glm_rotate_at(mat4 m, vec3 pivot, float angle, vec3 axis); + CGLM_INLINE void glm_rotate_atm(mat4 m, vec3 pivot, float angle, vec3 axis); + CGLM_INLINE void glm_spin(mat4 m, float angle, vec3 axis); + CGLM_INLINE void glm_decompose_scalev(mat4 m, vec3 s); + CGLM_INLINE bool glm_uniscaled(mat4 m); + CGLM_INLINE void glm_decompose_rs(mat4 m, mat4 r, vec3 s); + CGLM_INLINE void glm_decompose(mat4 m, vec4 t, mat4 r, vec3 s); + */ + +#ifndef cglm_affine_h +#define cglm_affine_h + +#include "common.h" +#include "util.h" +#include "vec3.h" +#include "vec4.h" +#include "mat4.h" +#include "affine-mat.h" + +/*! + * @brief creates NEW translate transform matrix by v vector + * + * @param[out] m affine transform + * @param[in] v translate vector [x, y, z] + */ +CGLM_INLINE +void +glm_translate_make(mat4 m, vec3 v) { + glm_mat4_identity(m); + glm_vec3_copy(v, m[3]); +} + +/*! + * @brief scale existing transform matrix by v vector + * and store result in dest + * + * @param[in] m affine transform + * @param[in] v scale vector [x, y, z] + * @param[out] dest scaled matrix + */ +CGLM_INLINE +void +glm_scale_to(mat4 m, vec3 v, mat4 dest) { + glm_vec4_scale(m[0], v[0], dest[0]); + glm_vec4_scale(m[1], v[1], dest[1]); + glm_vec4_scale(m[2], v[2], dest[2]); + + glm_vec4_copy(m[3], dest[3]); +} + +/*! + * @brief creates NEW scale matrix by v vector + * + * @param[out] m affine transform + * @param[in] v scale vector [x, y, z] + */ +CGLM_INLINE +void +glm_scale_make(mat4 m, vec3 v) { + glm_mat4_identity(m); + m[0][0] = v[0]; + m[1][1] = v[1]; + m[2][2] = v[2]; +} + +/*! + * @brief scales existing transform matrix by v vector + * and stores result in same matrix + * + * @param[in, out] m affine transform + * @param[in] v scale vector [x, y, z] + */ +CGLM_INLINE +void +glm_scale(mat4 m, vec3 v) { + glm_scale_to(m, v, m); +} + +/*! + * @brief applies uniform scale to existing transform matrix v = [s, s, s] + * and stores result in same matrix + * + * @param[in, out] m affine transform + * @param[in] s scale factor + */ +CGLM_INLINE +void +glm_scale_uni(mat4 m, float s) { + CGLM_ALIGN(8) vec3 v = { s, s, s }; + glm_scale_to(m, v, m); +} + +/*! 
+ * @brief creates NEW rotation matrix by angle and axis + * + * axis will be normalized so you don't need to normalize it + * + * @param[out] m affine transform + * @param[in] angle angle (radians) + * @param[in] axis axis + */ +CGLM_INLINE +void +glm_rotate_make(mat4 m, float angle, vec3 axis) { + CGLM_ALIGN(8) vec3 axisn, v, vs; + float c; + + c = cosf(angle); + + glm_vec3_normalize_to(axis, axisn); + glm_vec3_scale(axisn, 1.0f - c, v); + glm_vec3_scale(axisn, sinf(angle), vs); + + glm_vec3_scale(axisn, v[0], m[0]); + glm_vec3_scale(axisn, v[1], m[1]); + glm_vec3_scale(axisn, v[2], m[2]); + + m[0][0] += c; m[1][0] -= vs[2]; m[2][0] += vs[1]; + m[0][1] += vs[2]; m[1][1] += c; m[2][1] -= vs[0]; + m[0][2] -= vs[1]; m[1][2] += vs[0]; m[2][2] += c; + + m[0][3] = m[1][3] = m[2][3] = m[3][0] = m[3][1] = m[3][2] = 0.0f; + m[3][3] = 1.0f; +} + +/*! + * @brief decompose scale vector + * + * @param[in] m affine transform + * @param[out] s scale vector (Sx, Sy, Sz) + */ +CGLM_INLINE +void +glm_decompose_scalev(mat4 m, vec3 s) { + s[0] = glm_vec3_norm(m[0]); + s[1] = glm_vec3_norm(m[1]); + s[2] = glm_vec3_norm(m[2]); +} + +/*! + * @brief returns true if matrix is uniform scaled. This is helpful for + * creating normal matrix. + * + * @param[in] m m + * + * @return boolean + */ +CGLM_INLINE +bool +glm_uniscaled(mat4 m) { + CGLM_ALIGN(8) vec3 s; + glm_decompose_scalev(m, s); + return glm_vec3_eq_all(s); +} + +/*! + * @brief decompose rotation matrix (mat4) and scale vector [Sx, Sy, Sz] + * DON'T pass projected matrix here + * + * @param[in] m affine transform + * @param[out] r rotation matrix + * @param[out] s scale matrix + */ +CGLM_INLINE +void +glm_decompose_rs(mat4 m, mat4 r, vec3 s) { + CGLM_ALIGN(16) vec4 t = {0.0f, 0.0f, 0.0f, 1.0f}; + CGLM_ALIGN(8) vec3 v; + + glm_vec4_copy(m[0], r[0]); + glm_vec4_copy(m[1], r[1]); + glm_vec4_copy(m[2], r[2]); + glm_vec4_copy(t, r[3]); + + s[0] = glm_vec3_norm(m[0]); + s[1] = glm_vec3_norm(m[1]); + s[2] = glm_vec3_norm(m[2]); + + glm_vec4_scale(r[0], 1.0f/s[0], r[0]); + glm_vec4_scale(r[1], 1.0f/s[1], r[1]); + glm_vec4_scale(r[2], 1.0f/s[2], r[2]); + + /* Note from Apple Open Source (assume that the matrix is orthonormal): + check for a coordinate system flip. If the determinant + is -1, then negate the matrix and the scaling factors. */ + glm_vec3_cross(m[0], m[1], v); + if (glm_vec3_dot(v, m[2]) < 0.0f) { + glm_vec4_negate(r[0]); + glm_vec4_negate(r[1]); + glm_vec4_negate(r[2]); + glm_vec3_negate(s); + } +} + +/*! + * @brief decompose affine transform, TODO: extract shear factors. + * DON'T pass projected matrix here + * + * @param[in] m affine transform + * @param[out] t translation vector + * @param[out] r rotation matrix (mat4) + * @param[out] s scaling vector [X, Y, Z] + */ +CGLM_INLINE +void +glm_decompose(mat4 m, vec4 t, mat4 r, vec3 s) { + glm_vec4_copy(m[3], t); + glm_decompose_rs(m, r, s); +} + +#include "affine-pre.h" +#include "affine-post.h" + +#endif /* cglm_affine_h */ diff --git a/external/cglm/affine2d-post.h b/external/cglm/affine2d-post.h new file mode 100644 index 0000000..c6605a8 --- /dev/null +++ b/external/cglm/affine2d-post.h @@ -0,0 +1,132 @@ +/* + * Copyright (c), Recep Aslantas. 
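
glm_rotate_make above is the axis-angle (Rodrigues) construction: the column-wise scales build the \((1-\cos\theta)\,aa^{\top}\) term, and the \(\pm vs\) entries add the skew-symmetric part, so for unit axis \(a\) and angle \(\theta\):

\[
R(\theta, a) \;=\; \cos\theta\, I \;+\; (1-\cos\theta)\, a a^{\top} \;+\; \sin\theta\,[a]_{\times},
\qquad
[a]_{\times} = \begin{bmatrix} 0 & -a_z & a_y \\ a_z & 0 & -a_x \\ -a_y & a_x & 0 \end{bmatrix}.
\]
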
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_affine2d_post_h +#define cglm_affine2d_post_h + +/* + Functions: + CGLM_INLINE void glm_translated2d(mat3 m, vec2 v); + CGLM_INLINE void glm_translated2d_x(mat3 m, float to); + CGLM_INLINE void glm_translated2d_y(mat3 m, float to); + CGLM_INLINE void glm_rotated2d(mat3 m, float angle); + CGLM_INLINE void glm_scaled2d(mat3 m, vec2 v); + CGLM_INLINE void glm_scaled2d_uni(mat3 m, float s); + */ + +#include "vec2.h" + +/*! + * @brief translate existing transform matrix by v vector + * and store result in same matrix + * + * this is POST transform, applies to existing transform as last transform + * + * @param[in, out] m affine transform + * @param[in] v translate vector [x, y] + */ +CGLM_INLINE +void +glm_translated2d(mat3 m, vec2 v) { + glm_vec2_add(m[2], v, m[2]); +} + +/*! + * @brief translate existing transform matrix by x factor + * + * this is POST transform, applies to existing transform as last transform + * + * @param[in, out] m affine transform + * @param[in] x x factor + */ +CGLM_INLINE +void +glm_translated2d_x(mat3 m, float x) { + m[2][0] += x; +} + +/*! + * @brief translate existing transform matrix by y factor + * + * this is POST transform, applies to existing transform as last transform + * + * @param[in, out] m affine transform + * @param[in] y y factor + */ +CGLM_INLINE +void +glm_translated2d_y(mat3 m, float y) { + m[2][1] += y; +} + +/*! + * @brief rotate existing transform matrix by angle + * + * this is POST transform, applies to existing transform as last transform + * + * @param[in, out] m affine transform + * @param[in] angle angle (radians) + */ +CGLM_INLINE +void +glm_rotated2d(mat3 m, float angle) { + float c = cosf(angle), + s = sinf(angle), + + m00 = m[0][0], m10 = m[1][0], m20 = m[2][0], + m01 = m[0][1], m11 = m[1][1], m21 = m[2][1]; + + m[0][0] = c * m00 - s * m01; + m[1][0] = c * m10 - s * m11; + m[2][0] = c * m20 - s * m21; + + m[0][1] = s * m00 + c * m01; + m[1][1] = s * m10 + c * m11; + m[2][1] = s * m20 + c * m21; +} + +/*! + * @brief scale existing 2d transform matrix by v vector + * + * this is POST transform, applies to existing transform as last transform + * + * @param[in, out] m affine transform + * @param[in] v scale vector [x, y] + */ +CGLM_INLINE +void +glm_scaled2d(mat3 m, vec2 v) { + m[0][0] *= v[0]; + m[1][0] *= v[0]; + m[2][0] *= v[0]; + + m[0][1] *= v[1]; + m[1][1] *= v[1]; + m[2][1] *= v[1]; +} + +/*! + * @brief applies uniform scale to existing 2d transform matrix v = [s, s] + * + * this is POST transform, applies to existing transform as last transform + * + * @param[in, out] m affine transform + * @param[in] s scale factor + */ +CGLM_INLINE +void +glm_scaled2d_uni(mat3 m, float s) { + m[0][0] *= s; + m[1][0] *= s; + m[2][0] *= s; + + m[0][1] *= s; + m[1][1] *= s; + m[2][1] *= s; +} + +#endif /* cglm_affine2d_post_h */ diff --git a/external/cglm/affine2d.h b/external/cglm/affine2d.h new file mode 100644 index 0000000..0dcf50a --- /dev/null +++ b/external/cglm/affine2d.h @@ -0,0 +1,268 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE void glm_translate2d(mat3 m, vec2 v) + CGLM_INLINE void glm_translate2d_to(mat3 m, vec2 v, mat3 dest) + CGLM_INLINE void glm_translate2d_x(mat3 m, float x) + CGLM_INLINE void glm_translate2d_y(mat3 m, float y) + CGLM_INLINE void glm_translate2d_make(mat3 m, vec2 v) + CGLM_INLINE void glm_scale2d_to(mat3 m, vec2 v, mat3 dest) + CGLM_INLINE void glm_scale2d_make(mat3 m, vec2 v) + CGLM_INLINE void glm_scale2d(mat3 m, vec2 v) + CGLM_INLINE void glm_scale2d_uni(mat3 m, float s) + CGLM_INLINE void glm_rotate2d_make(mat3 m, float angle) + CGLM_INLINE void glm_rotate2d(mat3 m, float angle) + CGLM_INLINE void glm_rotate2d_to(mat3 m, float angle, mat3 dest) + */ + +#ifndef cglm_affine2d_h +#define cglm_affine2d_h + +#include "common.h" +#include "util.h" +#include "vec2.h" +#include "mat3.h" + +/*! + * @brief translate existing 2d transform matrix by v vector + * and stores result in same matrix + * + * @param[in, out] m affine transform + * @param[in] v translate vector [x, y] + */ +CGLM_INLINE +void +glm_translate2d(mat3 m, vec2 v) { + m[2][0] = m[0][0] * v[0] + m[1][0] * v[1] + m[2][0]; + m[2][1] = m[0][1] * v[0] + m[1][1] * v[1] + m[2][1]; + m[2][2] = m[0][2] * v[0] + m[1][2] * v[1] + m[2][2]; +} + +/*! + * @brief translate existing 2d transform matrix by v vector + * and store result in dest + * + * source matrix will remain same + * + * @param[in] m affine transform + * @param[in] v translate vector [x, y] + * @param[out] dest translated matrix + */ +CGLM_INLINE +void +glm_translate2d_to(mat3 m, vec2 v, mat3 dest) { + glm_mat3_copy(m, dest); + glm_translate2d(dest, v); +} + +/*! + * @brief translate existing 2d transform matrix by x factor + * + * @param[in, out] m affine transform + * @param[in] x x factor + */ +CGLM_INLINE +void +glm_translate2d_x(mat3 m, float x) { + m[2][0] = m[0][0] * x + m[2][0]; + m[2][1] = m[0][1] * x + m[2][1]; + m[2][2] = m[0][2] * x + m[2][2]; +} + +/*! + * @brief translate existing 2d transform matrix by y factor + * + * @param[in, out] m affine transform + * @param[in] y y factor + */ +CGLM_INLINE +void +glm_translate2d_y(mat3 m, float y) { + m[2][0] = m[1][0] * y + m[2][0]; + m[2][1] = m[1][1] * y + m[2][1]; + m[2][2] = m[1][2] * y + m[2][2]; +} + +/*! + * @brief creates NEW translate 2d transform matrix by v vector + * + * @param[out] m affine transform + * @param[in] v translate vector [x, y] + */ +CGLM_INLINE +void +glm_translate2d_make(mat3 m, vec2 v) { + glm_mat3_identity(m); + m[2][0] = v[0]; + m[2][1] = v[1]; +} + +/*! + * @brief scale existing 2d transform matrix by v vector + * and store result in dest + * + * @param[in] m affine transform + * @param[in] v scale vector [x, y] + * @param[out] dest scaled matrix + */ +CGLM_INLINE +void +glm_scale2d_to(mat3 m, vec2 v, mat3 dest) { + dest[0][0] = m[0][0] * v[0]; + dest[0][1] = m[0][1] * v[0]; + dest[0][2] = m[0][2] * v[0]; + + dest[1][0] = m[1][0] * v[1]; + dest[1][1] = m[1][1] * v[1]; + dest[1][2] = m[1][2] * v[1]; + + dest[2][0] = m[2][0]; + dest[2][1] = m[2][1]; + dest[2][2] = m[2][2]; +} + +/*! + * @brief creates NEW 2d scale matrix by v vector + * + * @param[out] m affine transform + * @param[in] v scale vector [x, y] + */ +CGLM_INLINE +void +glm_scale2d_make(mat3 m, vec2 v) { + glm_mat3_identity(m); + m[0][0] = v[0]; + m[1][1] = v[1]; +} + +/*! 
+ * @brief scales existing 2d transform matrix by v vector + * and stores result in same matrix + * + * @param[in, out] m affine transform + * @param[in] v scale vector [x, y] + */ +CGLM_INLINE +void +glm_scale2d(mat3 m, vec2 v) { + m[0][0] = m[0][0] * v[0]; + m[0][1] = m[0][1] * v[0]; + m[0][2] = m[0][2] * v[0]; + + m[1][0] = m[1][0] * v[1]; + m[1][1] = m[1][1] * v[1]; + m[1][2] = m[1][2] * v[1]; +} + +/*! + * @brief applies uniform scale to existing 2d transform matrix v = [s, s] + * and stores result in same matrix + * + * @param[in, out] m affine transform + * @param[in] s scale factor + */ +CGLM_INLINE +void +glm_scale2d_uni(mat3 m, float s) { + m[0][0] = m[0][0] * s; + m[0][1] = m[0][1] * s; + m[0][2] = m[0][2] * s; + + m[1][0] = m[1][0] * s; + m[1][1] = m[1][1] * s; + m[1][2] = m[1][2] * s; +} + +/*! + * @brief creates NEW rotation matrix by angle around Z axis + * + * @param[out] m affine transform + * @param[in] angle angle (radians) + */ +CGLM_INLINE +void +glm_rotate2d_make(mat3 m, float angle) { + float c, s; + + s = sinf(angle); + c = cosf(angle); + + m[0][0] = c; + m[0][1] = s; + m[0][2] = 0; + + m[1][0] = -s; + m[1][1] = c; + m[1][2] = 0; + + m[2][0] = 0.0f; + m[2][1] = 0.0f; + m[2][2] = 1.0f; +} + +/*! + * @brief rotate existing 2d transform matrix around Z axis by angle + * and store result in same matrix + * + * @param[in, out] m affine transform + * @param[in] angle angle (radians) + */ +CGLM_INLINE +void +glm_rotate2d(mat3 m, float angle) { + float m00 = m[0][0], m10 = m[1][0], + m01 = m[0][1], m11 = m[1][1], + m02 = m[0][2], m12 = m[1][2]; + float c, s; + + s = sinf(angle); + c = cosf(angle); + + m[0][0] = m00 * c + m10 * s; + m[0][1] = m01 * c + m11 * s; + m[0][2] = m02 * c + m12 * s; + + m[1][0] = m00 * -s + m10 * c; + m[1][1] = m01 * -s + m11 * c; + m[1][2] = m02 * -s + m12 * c; +} + +/*! + * @brief rotate existing 2d transform matrix around Z axis by angle + * and store result in dest + * + * @param[in] m affine transform + * @param[in] angle angle (radians) + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_rotate2d_to(mat3 m, float angle, mat3 dest) { + float m00 = m[0][0], m10 = m[1][0], + m01 = m[0][1], m11 = m[1][1], + m02 = m[0][2], m12 = m[1][2]; + float c, s; + + s = sinf(angle); + c = cosf(angle); + + dest[0][0] = m00 * c + m10 * s; + dest[0][1] = m01 * c + m11 * s; + dest[0][2] = m02 * c + m12 * s; + + dest[1][0] = m00 * -s + m10 * c; + dest[1][1] = m01 * -s + m11 * c; + dest[1][2] = m02 * -s + m12 * c; + + dest[2][0] = m[2][0]; + dest[2][1] = m[2][1]; + dest[2][2] = m[2][2]; +} + +#endif /* cglm_affine2d_h */ diff --git a/external/cglm/applesimd.h b/external/cglm/applesimd.h new file mode 100644 index 0000000..479ada6 --- /dev/null +++ b/external/cglm/applesimd.h @@ -0,0 +1,136 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_applesimd_h +#define cglm_applesimd_h +#if defined(__APPLE__) \ + && defined(SIMD_COMPILER_HAS_REQUIRED_FEATURES) \ + && defined(SIMD_BASE) \ + && defined(SIMD_TYPES) \ + && defined(SIMD_VECTOR_TYPES) + +#include "common.h" +#include "struct.h" + +/*! 
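
For the 2D rotations just above (glm_rotate2d_make and friends), the column-major 3x3 layout is the usual homogeneous form, i.e. each m[i] is a column:

\[
R(\theta) \;=\;
\begin{bmatrix} \cos\theta & -\sin\theta & 0 \\ \sin\theta & \cos\theta & 0 \\ 0 & 0 & 1 \end{bmatrix}
\;\Longleftrightarrow\;
m[0]=(c,\ s,\ 0),\;\; m[1]=(-s,\ c,\ 0),\;\; m[2]=(0,\ 0,\ 1).
\]
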
+ * @brief converts mat4 to Apple's simd type simd_float4x4 + * @return simd_float4x4 + */ +CGLM_INLINE +simd_float4x4 +glm_mat4_applesimd(mat4 m) { + simd_float4x4 t; + + t.columns[0][0] = m[0][0]; + t.columns[0][1] = m[0][1]; + t.columns[0][2] = m[0][2]; + t.columns[0][3] = m[0][3]; + + t.columns[1][0] = m[1][0]; + t.columns[1][1] = m[1][1]; + t.columns[1][2] = m[1][2]; + t.columns[1][3] = m[1][3]; + + t.columns[2][0] = m[2][0]; + t.columns[2][1] = m[2][1]; + t.columns[2][2] = m[2][2]; + t.columns[2][3] = m[2][3]; + + t.columns[3][0] = m[3][0]; + t.columns[3][1] = m[3][1]; + t.columns[3][2] = m[3][2]; + t.columns[3][3] = m[3][3]; + + return t; +} + +/*! +* @brief converts mat3 to Apple's simd type simd_float3x3 +* @return simd_float3x3 +*/ +CGLM_INLINE +simd_float3x3 +glm_mat3_applesimd(mat3 m) { + simd_float3x3 t; + + t.columns[0][0] = m[0][0]; + t.columns[0][1] = m[0][1]; + t.columns[0][2] = m[0][2]; + + t.columns[1][0] = m[1][0]; + t.columns[1][1] = m[1][1]; + t.columns[1][2] = m[1][2]; + + t.columns[2][0] = m[2][0]; + t.columns[2][1] = m[2][1]; + t.columns[2][2] = m[2][2]; + + return t; +} + +/*! +* @brief converts vec4 to Apple's simd type simd_float4 +* @return simd_float4 +*/ +CGLM_INLINE +simd_float4 +glm_vec4_applesimd(vec4 v) { + return (simd_float4){v[0], v[1], v[2], v[3]}; +} + +/*! +* @brief converts vec3 to Apple's simd type simd_float3 +* @return simd_float3 +*/ +CGLM_INLINE +simd_float3 +glm_vec3_applesimd(vec3 v) { + return (simd_float3){v[0], v[1], v[2]}; +} + +/*! + * @brief generic function to convert cglm types to Apple's simd types + * + * Example usage: + * simd_float4x4 m = applesimd(mat4_value); + * simd_float3 v = applesimd(vec3_value); + * + * @param x cglm type (mat4, mat3, vec4, vec3) + * @return corresponding Apple simd type + */ +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) +# define applesimd(x) _Generic((x), \ + mat4: glm_mat4_applesimd, \ + mat3: glm_mat3_applesimd, \ + vec4: glm_vec4_applesimd, \ + vec3: glm_vec3_applesimd \ + )((x)) +#endif + +#ifdef cglm_types_struct_h +CGLM_INLINE simd_float4x4 glms_mat4_(applesimd)(mat4s m) { return glm_mat4_applesimd(m.raw); } +CGLM_INLINE simd_float3x3 glms_mat3_(applesimd)(mat3s m) { return glm_mat3_applesimd(m.raw); } +CGLM_INLINE simd_float4 glms_vec4_(applesimd)(vec4s v) { return glm_vec4_applesimd(v.raw); } +CGLM_INLINE simd_float3 glms_vec3_(applesimd)(vec3s v) { return glm_vec3_applesimd(v.raw); } + +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) +# undef applesimd +# define applesimd(x) _Generic((x), \ + mat4: glm_mat4_applesimd, \ + mat3: glm_mat3_applesimd, \ + vec4: glm_vec4_applesimd, \ + vec3: glm_vec3_applesimd, \ + mat4s: glms_mat4_(applesimd), \ + mat3s: glms_mat3_(applesimd), \ + vec4s: glms_vec4_(applesimd), \ + vec3s: glms_vec3_(applesimd) \ + )((x)) +#endif +#endif + +#endif +#endif /* cglm_applesimd_h */ diff --git a/external/cglm/bezier.h b/external/cglm/bezier.h new file mode 100644 index 0000000..a6e5f8a --- /dev/null +++ b/external/cglm/bezier.h @@ -0,0 +1,154 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_bezier_h +#define cglm_bezier_h + +#include "common.h" + +#define GLM_BEZIER_MAT_INIT {{-1.0f, 3.0f, -3.0f, 1.0f}, \ + { 3.0f, -6.0f, 3.0f, 0.0f}, \ + {-3.0f, 3.0f, 0.0f, 0.0f}, \ + { 1.0f, 0.0f, 0.0f, 0.0f}} +#define GLM_HERMITE_MAT_INIT {{ 2.0f, -3.0f, 0.0f, 1.0f}, \ + {-2.0f, 3.0f, 0.0f, 0.0f}, \ + { 1.0f, -2.0f, 1.0f, 0.0f}, \ + { 1.0f, -1.0f, 0.0f, 0.0f}} +/* for C only */ +#define GLM_BEZIER_MAT ((mat4)GLM_BEZIER_MAT_INIT) +#define GLM_HERMITE_MAT ((mat4)GLM_HERMITE_MAT_INIT) + +#define CGLM_DECASTEL_EPS 1e-9f +#define CGLM_DECASTEL_MAX 1000 +#define CGLM_DECASTEL_SMALL 1e-20f + +/*! + * @brief cubic bezier interpolation + * + * Formula: + * B(s) = P0*(1-s)^3 + 3*C0*s*(1-s)^2 + 3*C1*s^2*(1-s) + P1*s^3 + * + * similar result using matrix: + * B(s) = glm_smc(t, GLM_BEZIER_MAT, (vec4){p0, c0, c1, p1}) + * + * glm_eq(glm_smc(...), glm_bezier(...)) should return TRUE + * + * @param[in] s parameter between 0 and 1 + * @param[in] p0 begin point + * @param[in] c0 control point 1 + * @param[in] c1 control point 2 + * @param[in] p1 end point + * + * @return B(s) + */ +CGLM_INLINE +float +glm_bezier(float s, float p0, float c0, float c1, float p1) { + float x, xx, ss, xs3, a; + + x = 1.0f - s; + xx = x * x; + ss = s * s; + xs3 = (s - ss) * 3.0f; + a = p0 * xx + c0 * xs3; + + return a + s * (c1 * xs3 + p1 * ss - a); +} + +/*! + * @brief cubic hermite interpolation + * + * Formula: + * H(s) = P0*(2*s^3 - 3*s^2 + 1) + T0*(s^3 - 2*s^2 + s) + * + P1*(-2*s^3 + 3*s^2) + T1*(s^3 - s^2) + * + * similar result using matrix: + * H(s) = glm_smc(t, GLM_HERMITE_MAT, (vec4){p0, p1, c0, c1}) + * + * glm_eq(glm_smc(...), glm_hermite(...)) should return TRUE + * + * @param[in] s parameter between 0 and 1 + * @param[in] p0 begin point + * @param[in] t0 tangent 1 + * @param[in] t1 tangent 2 + * @param[in] p1 end point + * + * @return H(s) + */ +CGLM_INLINE +float +glm_hermite(float s, float p0, float t0, float t1, float p1) { + float ss, d, a, b, c, e, f; + + ss = s * s; + a = ss + ss; + c = a + ss; + b = a * s; + d = s * ss; + f = d - ss; + e = b - c; + + return p0 * (e + 1.0f) + t0 * (f - ss + s) + t1 * f - p1 * e; +} + +/*! + * @brief iterative way to solve cubic equation + * + * @param[in] prm parameter between 0 and 1 + * @param[in] p0 begin point + * @param[in] c0 control point 1 + * @param[in] c1 control point 2 + * @param[in] p1 end point + * + * @return parameter to use in cubic equation + */ +CGLM_INLINE +float +glm_decasteljau(float prm, float p0, float c0, float c1, float p1) { + float u, v, a, b, c, d, e, f; + int i; + + if (prm - p0 < CGLM_DECASTEL_SMALL) + return 0.0f; + + if (p1 - prm < CGLM_DECASTEL_SMALL) + return 1.0f; + + u = 0.0f; + v = 1.0f; + + for (i = 0; i < CGLM_DECASTEL_MAX; i++) { + /* de Casteljau Subdivision */ + a = (p0 + c0) * 0.5f; + b = (c0 + c1) * 0.5f; + c = (c1 + p1) * 0.5f; + d = (a + b) * 0.5f; + e = (b + c) * 0.5f; + f = (d + e) * 0.5f; /* this one is on the curve! 
*/ + + /* The curve point is close enough to our wanted t */ + if (fabsf(f - prm) < CGLM_DECASTEL_EPS) + return glm_clamp_zo((u + v) * 0.5f); + + /* dichotomy */ + if (f < prm) { + p0 = f; + c0 = e; + c1 = c; + u = (u + v) * 0.5f; + } else { + c0 = a; + c1 = d; + p1 = f; + v = (u + v) * 0.5f; + } + } + + return glm_clamp_zo((u + v) * 0.5f); +} + +#endif /* cglm_bezier_h */ diff --git a/external/cglm/box.h b/external/cglm/box.h new file mode 100644 index 0000000..8bba678 --- /dev/null +++ b/external/cglm/box.h @@ -0,0 +1,281 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_box_h +#define cglm_box_h + +#include "common.h" +#include "vec3.h" +#include "vec4.h" +#include "util.h" + +/*! + * @brief apply transform to Axis-Aligned Bounding Box + * + * @param[in] box bounding box + * @param[in] m transform matrix + * @param[out] dest transformed bounding box + */ +CGLM_INLINE +void +glm_aabb_transform(vec3 box[2], mat4 m, vec3 dest[2]) { + vec3 v[2], xa, xb, ya, yb, za, zb; + + glm_vec3_scale(m[0], box[0][0], xa); + glm_vec3_scale(m[0], box[1][0], xb); + + glm_vec3_scale(m[1], box[0][1], ya); + glm_vec3_scale(m[1], box[1][1], yb); + + glm_vec3_scale(m[2], box[0][2], za); + glm_vec3_scale(m[2], box[1][2], zb); + + /* translation + min(xa, xb) + min(ya, yb) + min(za, zb) */ + glm_vec3(m[3], v[0]); + glm_vec3_minadd(xa, xb, v[0]); + glm_vec3_minadd(ya, yb, v[0]); + glm_vec3_minadd(za, zb, v[0]); + + /* translation + max(xa, xb) + max(ya, yb) + max(za, zb) */ + glm_vec3(m[3], v[1]); + glm_vec3_maxadd(xa, xb, v[1]); + glm_vec3_maxadd(ya, yb, v[1]); + glm_vec3_maxadd(za, zb, v[1]); + + glm_vec3_copy(v[0], dest[0]); + glm_vec3_copy(v[1], dest[1]); +} + +/*! + * @brief merges two AABB bounding box and creates new one + * + * two box must be in same space, if one of box is in different space then + * you should consider to convert it's space by glm_box_space + * + * @param[in] box1 bounding box 1 + * @param[in] box2 bounding box 2 + * @param[out] dest merged bounding box + */ +CGLM_INLINE +void +glm_aabb_merge(vec3 box1[2], vec3 box2[2], vec3 dest[2]) { + dest[0][0] = glm_min(box1[0][0], box2[0][0]); + dest[0][1] = glm_min(box1[0][1], box2[0][1]); + dest[0][2] = glm_min(box1[0][2], box2[0][2]); + + dest[1][0] = glm_max(box1[1][0], box2[1][0]); + dest[1][1] = glm_max(box1[1][1], box2[1][1]); + dest[1][2] = glm_max(box1[1][2], box2[1][2]); +} + +/*! + * @brief crops a bounding box with another one. + * + * this could be useful for getting a bbox which fits with view frustum and + * object bounding boxes. In this case you crop view frustum box with objects + * box + * + * @param[in] box bounding box 1 + * @param[in] cropBox crop box + * @param[out] dest cropped bounding box + */ +CGLM_INLINE +void +glm_aabb_crop(vec3 box[2], vec3 cropBox[2], vec3 dest[2]) { + dest[0][0] = glm_max(box[0][0], cropBox[0][0]); + dest[0][1] = glm_max(box[0][1], cropBox[0][1]); + dest[0][2] = glm_max(box[0][2], cropBox[0][2]); + + dest[1][0] = glm_min(box[1][0], cropBox[1][0]); + dest[1][1] = glm_min(box[1][1], cropBox[1][1]); + dest[1][2] = glm_min(box[1][2], cropBox[1][2]); +} + +/*! + * @brief crops a bounding box with another one. + * + * this could be useful for getting a bbox which fits with view frustum and + * object bounding boxes. 
In this case you crop view frustum box with objects + * box + * + * @param[in] box bounding box + * @param[in] cropBox crop box + * @param[in] clampBox minimum box + * @param[out] dest cropped bounding box + */ +CGLM_INLINE +void +glm_aabb_crop_until(vec3 box[2], + vec3 cropBox[2], + vec3 clampBox[2], + vec3 dest[2]) { + glm_aabb_crop(box, cropBox, dest); + glm_aabb_merge(clampBox, dest, dest); +} + +/*! + * @brief check if AABB intersects with frustum planes + * + * this could be useful for frustum culling using AABB. + * + * OPTIMIZATION HINT: + * if planes order is similar to LEFT, RIGHT, BOTTOM, TOP, NEAR, FAR + * then this method should run even faster because it would only use two + * planes if object is not inside the two planes + * fortunately cglm extracts planes as this order! just pass what you got! + * + * @param[in] box bounding box + * @param[in] planes frustum planes + */ +CGLM_INLINE +bool +glm_aabb_frustum(vec3 box[2], vec4 planes[6]) { + float *p, dp; + int i; + + for (i = 0; i < 6; i++) { + p = planes[i]; + dp = p[0] * box[p[0] > 0.0f][0] + + p[1] * box[p[1] > 0.0f][1] + + p[2] * box[p[2] > 0.0f][2]; + + if (dp < -p[3]) + return false; + } + + return true; +} + +/*! + * @brief invalidate AABB min and max values + * + * @param[in, out] box bounding box + */ +CGLM_INLINE +void +glm_aabb_invalidate(vec3 box[2]) { + glm_vec3_broadcast(FLT_MAX, box[0]); + glm_vec3_broadcast(-FLT_MAX, box[1]); +} + +/*! + * @brief check if AABB is valid or not + * + * @param[in] box bounding box + */ +CGLM_INLINE +bool +glm_aabb_isvalid(vec3 box[2]) { + return glm_vec3_max(box[0]) != FLT_MAX + && glm_vec3_min(box[1]) != -FLT_MAX; +} + +/*! + * @brief distance between of min and max + * + * @param[in] box bounding box + */ +CGLM_INLINE +float +glm_aabb_size(vec3 box[2]) { + return glm_vec3_distance(box[0], box[1]); +} + +/*! + * @brief radius of sphere which surrounds AABB + * + * @param[in] box bounding box + */ +CGLM_INLINE +float +glm_aabb_radius(vec3 box[2]) { + return glm_aabb_size(box) * 0.5f; +} + +/*! + * @brief computes center point of AABB + * + * @param[in] box bounding box + * @param[out] dest center of bounding box + */ +CGLM_INLINE +void +glm_aabb_center(vec3 box[2], vec3 dest) { + glm_vec3_center(box[0], box[1], dest); +} + +/*! + * @brief check if two AABB intersects + * + * @param[in] box bounding box + * @param[in] other other bounding box + */ +CGLM_INLINE +bool +glm_aabb_aabb(vec3 box[2], vec3 other[2]) { + return (box[0][0] <= other[1][0] && box[1][0] >= other[0][0]) + && (box[0][1] <= other[1][1] && box[1][1] >= other[0][1]) + && (box[0][2] <= other[1][2] && box[1][2] >= other[0][2]); +} + +/*! + * @brief check if AABB intersects with sphere + * + * https://github.com/erich666/GraphicsGems/blob/master/gems/BoxSphere.c + * Solid Box - Solid Sphere test. + * + * Sphere Representation in cglm: [center.x, center.y, center.z, radii] + * + * @param[in] box solid bounding box + * @param[in] s solid sphere + */ +CGLM_INLINE +bool +glm_aabb_sphere(vec3 box[2], vec4 s) { + float dmin; + int a, b, c; + + a = (s[0] < box[0][0]) + (s[0] > box[1][0]); + b = (s[1] < box[0][1]) + (s[1] > box[1][1]); + c = (s[2] < box[0][2]) + (s[2] > box[1][2]); + + dmin = glm_pow2((s[0] - box[!(a - 1)][0]) * (a != 0)) + + glm_pow2((s[1] - box[!(b - 1)][1]) * (b != 0)) + + glm_pow2((s[2] - box[!(c - 1)][2]) * (c != 0)); + + return dmin <= glm_pow2(s[3]); +} + +/*! 
+ * @brief check if point is inside of AABB + * + * @param[in] box bounding box + * @param[in] point point + */ +CGLM_INLINE +bool +glm_aabb_point(vec3 box[2], vec3 point) { + return (point[0] >= box[0][0] && point[0] <= box[1][0]) + && (point[1] >= box[0][1] && point[1] <= box[1][1]) + && (point[2] >= box[0][2] && point[2] <= box[1][2]); +} + +/*! + * @brief check if AABB contains other AABB + * + * @param[in] box bounding box + * @param[in] other other bounding box + */ +CGLM_INLINE +bool +glm_aabb_contains(vec3 box[2], vec3 other[2]) { + return (box[0][0] <= other[0][0] && box[1][0] >= other[1][0]) + && (box[0][1] <= other[0][1] && box[1][1] >= other[1][1]) + && (box[0][2] <= other[0][2] && box[1][2] >= other[1][2]); +} + +#endif /* cglm_box_h */ diff --git a/external/cglm/call.h b/external/cglm/call.h new file mode 100644 index 0000000..165f502 --- /dev/null +++ b/external/cglm/call.h @@ -0,0 +1,51 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_call_h +#define cglm_call_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "cglm.h" +#include "call/vec2.h" +#include "call/vec3.h" +#include "call/vec4.h" +#include "call/ivec2.h" +#include "call/ivec3.h" +#include "call/ivec4.h" +#include "call/mat2.h" +#include "call/mat2x3.h" +#include "call/mat2x4.h" +#include "call/mat3.h" +#include "call/mat3x2.h" +#include "call/mat3x4.h" +#include "call/mat4.h" +#include "call/mat4x2.h" +#include "call/mat4x3.h" +#include "call/affine.h" +#include "call/cam.h" +#include "call/quat.h" +#include "call/euler.h" +#include "call/plane.h" +#include "call/noise.h" +#include "call/frustum.h" +#include "call/aabb2d.h" +#include "call/box.h" +#include "call/io.h" +#include "call/project.h" +#include "call/sphere.h" +#include "call/ease.h" +#include "call/curve.h" +#include "call/bezier.h" +#include "call/ray.h" +#include "call/affine2d.h" + +#ifdef __cplusplus +} +#endif +#endif /* cglm_call_h */ diff --git a/external/cglm/call/aabb2d.h b/external/cglm/call/aabb2d.h new file mode 100644 index 0000000..e6f36a0 --- /dev/null +++ b/external/cglm/call/aabb2d.h @@ -0,0 +1,89 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_aabb2d_h +#define cglmc_aabb2d_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +/* DEPRECATED! 
use _diag */ +#define glmc_aabb2d_size(aabb) glmc_aabb2d_diag(aabb) + +CGLM_EXPORT +void +glmc_aabb2d_zero(vec2 aabb[2]); + +CGLM_EXPORT +void +glmc_aabb2d_copy(vec2 aabb[2], vec2 dest[2]); + +CGLM_EXPORT +void +glmc_aabb2d_transform(vec2 aabb[2], mat3 m, vec2 dest[2]); + +CGLM_EXPORT +void +glmc_aabb2d_merge(vec2 aabb1[2], vec2 aabb2[2], vec2 dest[2]); + +CGLM_EXPORT +void +glmc_aabb2d_crop(vec2 aabb[2], vec2 cropAabb[2], vec2 dest[2]); + +CGLM_EXPORT +void +glmc_aabb2d_crop_until(vec2 aabb[2], + vec2 cropAabb[2], + vec2 clampAabb[2], + vec2 dest[2]); + +CGLM_EXPORT +void +glmc_aabb2d_invalidate(vec2 aabb[2]); + +CGLM_EXPORT +bool +glmc_aabb2d_isvalid(vec2 aabb[2]); + +CGLM_EXPORT +float +glmc_aabb2d_diag(vec2 aabb[2]); + +CGLM_EXPORT +void +glmc_aabb2d_sizev(vec2 aabb[2], vec2 dest); + +CGLM_EXPORT +float +glmc_aabb2d_radius(vec2 aabb[2]); + +CGLM_EXPORT +void +glmc_aabb2d_center(vec2 aabb[2], vec2 dest); + +CGLM_EXPORT +bool +glmc_aabb2d_aabb(vec2 aabb[2], vec2 other[2]); + +CGLM_EXPORT +bool +glmc_aabb2d_point(vec2 aabb[2], vec2 point); + +CGLM_EXPORT +bool +glmc_aabb2d_contains(vec2 aabb[2], vec2 other[2]); + +CGLM_EXPORT +bool +glmc_aabb2d_circle(vec2 aabb[2], vec3 s); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_aabb2d_h */ diff --git a/external/cglm/call/affine.h b/external/cglm/call/affine.h new file mode 100644 index 0000000..52b8501 --- /dev/null +++ b/external/cglm/call/affine.h @@ -0,0 +1,167 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_affine_h +#define cglmc_affine_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +CGLM_EXPORT +void +glmc_translate_make(mat4 m, vec3 v); + +CGLM_EXPORT +void +glmc_translate_to(mat4 m, vec3 v, mat4 dest); + +CGLM_EXPORT +void +glmc_translate(mat4 m, vec3 v); + +CGLM_EXPORT +void +glmc_translate_x(mat4 m, float to); + +CGLM_EXPORT +void +glmc_translate_y(mat4 m, float to); + +CGLM_EXPORT +void +glmc_translate_z(mat4 m, float to); + +CGLM_EXPORT +void +glmc_scale_make(mat4 m, vec3 v); + +CGLM_EXPORT +void +glmc_scale_to(mat4 m, vec3 v, mat4 dest); + +CGLM_EXPORT +void +glmc_scale(mat4 m, vec3 v); + +CGLM_EXPORT +void +glmc_scale_uni(mat4 m, float s); + +CGLM_EXPORT +void +glmc_rotate_x(mat4 m, float rad, mat4 dest); + +CGLM_EXPORT +void +glmc_rotate_y(mat4 m, float rad, mat4 dest); + +CGLM_EXPORT +void +glmc_rotate_z(mat4 m, float rad, mat4 dest); + +CGLM_EXPORT +void +glmc_rotate_make(mat4 m, float angle, vec3 axis); + +CGLM_EXPORT +void +glmc_rotate(mat4 m, float angle, vec3 axis); + +CGLM_EXPORT +void +glmc_rotate_at(mat4 m, vec3 pivot, float angle, vec3 axis); + +CGLM_EXPORT +void +glmc_rotate_atm(mat4 m, vec3 pivot, float angle, vec3 axis); + +CGLM_EXPORT +void +glmc_spin(mat4 m, float angle, vec3 axis); + +CGLM_EXPORT +void +glmc_decompose_scalev(mat4 m, vec3 s); + +CGLM_EXPORT +bool +glmc_uniscaled(mat4 m); + +CGLM_EXPORT +void +glmc_decompose_rs(mat4 m, mat4 r, vec3 s); + +CGLM_EXPORT +void +glmc_decompose(mat4 m, vec4 t, mat4 r, vec3 s); + +/* affine-post */ + +CGLM_EXPORT +void +glmc_translated(mat4 m, vec3 v); + +CGLM_EXPORT +void +glmc_translated_to(mat4 m, vec3 v, mat4 dest); + +CGLM_EXPORT +void +glmc_translated_x(mat4 m, float x); + +CGLM_EXPORT +void +glmc_translated_y(mat4 m, float y); + +CGLM_EXPORT +void +glmc_translated_z(mat4 m, float z); + +CGLM_EXPORT +void +glmc_rotated_x(mat4 m, float angle, mat4 dest); + +CGLM_EXPORT +void +glmc_rotated_y(mat4 m, float angle, mat4 
dest); + +CGLM_EXPORT +void +glmc_rotated_z(mat4 m, float angle, mat4 dest); + +CGLM_EXPORT +void +glmc_rotated(mat4 m, float angle, vec3 axis); + +CGLM_EXPORT +void +glmc_rotated_at(mat4 m, vec3 pivot, float angle, vec3 axis); + +CGLM_EXPORT +void +glmc_spinned(mat4 m, float angle, vec3 axis); + +/* affine-mat */ + +CGLM_EXPORT +void +glmc_mul(mat4 m1, mat4 m2, mat4 dest); + +CGLM_EXPORT +void +glmc_mul_rot(mat4 m1, mat4 m2, mat4 dest); + +CGLM_EXPORT +void +glmc_inv_tr(mat4 mat); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_affine_h */ diff --git a/external/cglm/call/affine2d.h b/external/cglm/call/affine2d.h new file mode 100644 index 0000000..e1b9462 --- /dev/null +++ b/external/cglm/call/affine2d.h @@ -0,0 +1,67 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_affine2d_h +#define cglmc_affine2d_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +CGLM_EXPORT +void +glmc_translate2d_make(mat3 m, vec2 v); + +CGLM_EXPORT +void +glmc_translate2d_to(mat3 m, vec2 v, mat3 dest); + +CGLM_EXPORT +void +glmc_translate2d(mat3 m, vec2 v); + +CGLM_EXPORT +void +glmc_translate2d_x(mat3 m, float to); + +CGLM_EXPORT +void +glmc_translate2d_y(mat3 m, float to); + +CGLM_EXPORT +void +glmc_scale2d_to(mat3 m, vec2 v, mat3 dest); + +CGLM_EXPORT +void +glmc_scale2d_make(mat3 m, vec2 v); + +CGLM_EXPORT +void +glmc_scale2d(mat3 m, vec2 v); + +CGLM_EXPORT +void +glmc_scale2d_uni(mat3 m, float s); + +CGLM_EXPORT +void +glmc_rotate2d_make(mat3 m, float angle); + +CGLM_EXPORT +void +glmc_rotate2d(mat3 m, float angle); + +CGLM_EXPORT +void +glmc_rotate2d_to(mat3 m, float angle, mat3 dest); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_affine2d_h */ diff --git a/external/cglm/call/bezier.h b/external/cglm/call/bezier.h new file mode 100644 index 0000000..a6a0eb4 --- /dev/null +++ b/external/cglm/call/bezier.h @@ -0,0 +1,31 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_bezier_h +#define cglmc_bezier_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +CGLM_EXPORT +float +glmc_bezier(float s, float p0, float c0, float c1, float p1); + +CGLM_EXPORT +float +glmc_hermite(float s, float p0, float t0, float t1, float p1); + +CGLM_EXPORT +float +glmc_decasteljau(float prm, float p0, float c0, float c1, float p1); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_bezier_h */ diff --git a/external/cglm/call/box.h b/external/cglm/call/box.h new file mode 100644 index 0000000..3617eed --- /dev/null +++ b/external/cglm/call/box.h @@ -0,0 +1,78 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_box_h +#define cglmc_box_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +CGLM_EXPORT +void +glmc_aabb_transform(vec3 box[2], mat4 m, vec3 dest[2]); + +CGLM_EXPORT +void +glmc_aabb_merge(vec3 box1[2], vec3 box2[2], vec3 dest[2]); + +CGLM_EXPORT +void +glmc_aabb_crop(vec3 box[2], vec3 cropBox[2], vec3 dest[2]); + +CGLM_EXPORT +void +glmc_aabb_crop_until(vec3 box[2], + vec3 cropBox[2], + vec3 clampBox[2], + vec3 dest[2]); + +CGLM_EXPORT +bool +glmc_aabb_frustum(vec3 box[2], vec4 planes[6]); + +CGLM_EXPORT +void +glmc_aabb_invalidate(vec3 box[2]); + +CGLM_EXPORT +bool +glmc_aabb_isvalid(vec3 box[2]); + +CGLM_EXPORT +float +glmc_aabb_size(vec3 box[2]); + +CGLM_EXPORT +float +glmc_aabb_radius(vec3 box[2]); + +CGLM_EXPORT +void +glmc_aabb_center(vec3 box[2], vec3 dest); + +CGLM_EXPORT +bool +glmc_aabb_aabb(vec3 box[2], vec3 other[2]); + +CGLM_EXPORT +bool +glmc_aabb_point(vec3 box[2], vec3 point); + +CGLM_EXPORT +bool +glmc_aabb_contains(vec3 box[2], vec3 other[2]); + +CGLM_EXPORT +bool +glmc_aabb_sphere(vec3 box[2], vec4 s); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_box_h */ diff --git a/external/cglm/call/cam.h b/external/cglm/call/cam.h new file mode 100644 index 0000000..d9567ec --- /dev/null +++ b/external/cglm/call/cam.h @@ -0,0 +1,133 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_cam_h +#define cglmc_cam_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +CGLM_EXPORT +void +glmc_frustum(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest); + +CGLM_EXPORT +void +glmc_ortho(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest); + +CGLM_EXPORT +void +glmc_ortho_aabb(vec3 box[2], mat4 dest); + +CGLM_EXPORT +void +glmc_ortho_aabb_p(vec3 box[2], float padding, mat4 dest); + +CGLM_EXPORT +void +glmc_ortho_aabb_pz(vec3 box[2], float padding, mat4 dest); + +CGLM_EXPORT +void +glmc_ortho_default(float aspect, mat4 dest); + +CGLM_EXPORT +void +glmc_ortho_default_s(float aspect, float size, mat4 dest); + +CGLM_EXPORT +void +glmc_perspective(float fovy, float aspect, float nearZ, float farZ, mat4 dest); + +CGLM_EXPORT +void +glmc_persp_move_far(mat4 proj, float deltaFar); + +CGLM_EXPORT +void +glmc_perspective_default(float aspect, mat4 dest); + +CGLM_EXPORT +void +glmc_perspective_resize(float aspect, mat4 proj); + +CGLM_EXPORT +void +glmc_lookat(vec3 eye, vec3 center, vec3 up, mat4 dest); + +CGLM_EXPORT +void +glmc_look(vec3 eye, vec3 dir, vec3 up, mat4 dest); + +CGLM_EXPORT +void +glmc_look_anyup(vec3 eye, vec3 dir, mat4 dest); + +CGLM_EXPORT +void +glmc_persp_decomp(mat4 proj, + float * __restrict nearZ, + float * __restrict farZ, + float * __restrict top, + float * __restrict bottom, + float * __restrict left, + float * __restrict right); + +CGLM_EXPORT +void +glmc_persp_decompv(mat4 proj, float dest[6]); + +CGLM_EXPORT +void +glmc_persp_decomp_x(mat4 proj, + float * __restrict left, + float * __restrict right); + +CGLM_EXPORT +void +glmc_persp_decomp_y(mat4 proj, + float * __restrict top, + float * __restrict bottom); + +CGLM_EXPORT +void +glmc_persp_decomp_z(mat4 proj, + float * __restrict nearZ, + float * __restrict farZ); + +CGLM_EXPORT +void +glmc_persp_decomp_far(mat4 proj, float * __restrict farZ); + 
+CGLM_EXPORT +void +glmc_persp_decomp_near(mat4 proj, float * __restrict nearZ); + +CGLM_EXPORT +float +glmc_persp_fovy(mat4 proj); + +CGLM_EXPORT +float +glmc_persp_aspect(mat4 proj); + +CGLM_EXPORT +void +glmc_persp_sizes(mat4 proj, float fovy, vec4 dest); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_cam_h */ diff --git a/external/cglm/call/clipspace/ortho_lh_no.h b/external/cglm/call/clipspace/ortho_lh_no.h new file mode 100644 index 0000000..3e26fa9 --- /dev/null +++ b/external/cglm/call/clipspace/ortho_lh_no.h @@ -0,0 +1,46 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_ortho_lh_no_h +#define cglmc_ortho_lh_no_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../../cglm.h" + +CGLM_EXPORT +void +glmc_ortho_lh_no(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest); + +CGLM_EXPORT +void +glmc_ortho_aabb_lh_no(vec3 box[2], mat4 dest); + +CGLM_EXPORT +void +glmc_ortho_aabb_p_lh_no(vec3 box[2], float padding, mat4 dest); + +CGLM_EXPORT +void +glmc_ortho_aabb_pz_lh_no(vec3 box[2], float padding, mat4 dest); + +CGLM_EXPORT +void +glmc_ortho_default_lh_no(float aspect, mat4 dest); + +CGLM_EXPORT +void +glmc_ortho_default_s_lh_no(float aspect, float size, mat4 dest); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_ortho_lh_no_h */ diff --git a/external/cglm/call/clipspace/ortho_lh_zo.h b/external/cglm/call/clipspace/ortho_lh_zo.h new file mode 100644 index 0000000..dc4c610 --- /dev/null +++ b/external/cglm/call/clipspace/ortho_lh_zo.h @@ -0,0 +1,46 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_ortho_lh_zo_h +#define cglmc_ortho_lh_zo_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../../cglm.h" + +CGLM_EXPORT +void +glmc_ortho_lh_zo(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest); + +CGLM_EXPORT +void +glmc_ortho_aabb_lh_zo(vec3 box[2], mat4 dest); + +CGLM_EXPORT +void +glmc_ortho_aabb_p_lh_zo(vec3 box[2], float padding, mat4 dest); + +CGLM_EXPORT +void +glmc_ortho_aabb_pz_lh_zo(vec3 box[2], float padding, mat4 dest); + +CGLM_EXPORT +void +glmc_ortho_default_lh_zo(float aspect, mat4 dest); + +CGLM_EXPORT +void +glmc_ortho_default_s_lh_zo(float aspect, float size, mat4 dest); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_ortho_lh_zo_h */ diff --git a/external/cglm/call/clipspace/ortho_rh_no.h b/external/cglm/call/clipspace/ortho_rh_no.h new file mode 100644 index 0000000..dbba497 --- /dev/null +++ b/external/cglm/call/clipspace/ortho_rh_no.h @@ -0,0 +1,46 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_ortho_rh_no_h +#define cglmc_ortho_rh_no_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../../cglm.h" + +CGLM_EXPORT +void +glmc_ortho_rh_no(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest); + +CGLM_EXPORT +void +glmc_ortho_aabb_rh_no(vec3 box[2], mat4 dest); + +CGLM_EXPORT +void +glmc_ortho_aabb_p_rh_no(vec3 box[2], float padding, mat4 dest); + +CGLM_EXPORT +void +glmc_ortho_aabb_pz_rh_no(vec3 box[2], float padding, mat4 dest); + +CGLM_EXPORT +void +glmc_ortho_default_rh_no(float aspect, mat4 dest); + +CGLM_EXPORT +void +glmc_ortho_default_s_rh_no(float aspect, float size, mat4 dest); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_ortho_rh_no_h */ diff --git a/external/cglm/call/clipspace/ortho_rh_zo.h b/external/cglm/call/clipspace/ortho_rh_zo.h new file mode 100644 index 0000000..e79ae83 --- /dev/null +++ b/external/cglm/call/clipspace/ortho_rh_zo.h @@ -0,0 +1,46 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_ortho_rh_zo_h +#define cglmc_ortho_rh_zo_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../../cglm.h" + +CGLM_EXPORT +void +glmc_ortho_rh_zo(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest); + +CGLM_EXPORT +void +glmc_ortho_aabb_rh_zo(vec3 box[2], mat4 dest); + +CGLM_EXPORT +void +glmc_ortho_aabb_p_rh_zo(vec3 box[2], float padding, mat4 dest); + +CGLM_EXPORT +void +glmc_ortho_aabb_pz_rh_zo(vec3 box[2], float padding, mat4 dest); + +CGLM_EXPORT +void +glmc_ortho_default_rh_zo(float aspect, mat4 dest); + +CGLM_EXPORT +void +glmc_ortho_default_s_rh_zo(float aspect, float size, mat4 dest); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_ortho_rh_zo_h */ diff --git a/external/cglm/call/clipspace/persp_lh_no.h b/external/cglm/call/clipspace/persp_lh_no.h new file mode 100644 index 0000000..4bdbcfe --- /dev/null +++ b/external/cglm/call/clipspace/persp_lh_no.h @@ -0,0 +1,87 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_persp_lh_no_h +#define cglmc_persp_lh_no_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../../cglm.h" + +CGLM_EXPORT +void +glmc_frustum_lh_no(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest); + +CGLM_EXPORT +void +glmc_perspective_lh_no(float fovy, + float aspect, + float nearVal, + float farVal, + mat4 dest); + +CGLM_EXPORT +void +glmc_persp_move_far_lh_no(mat4 proj, float deltaFar); + +CGLM_EXPORT +void +glmc_persp_decomp_lh_no(mat4 proj, + float * __restrict nearZ, float * __restrict farZ, + float * __restrict top, float * __restrict bottom, + float * __restrict left, float * __restrict right); + +CGLM_EXPORT +void +glmc_persp_decompv_lh_no(mat4 proj, float dest[6]); + +CGLM_EXPORT +void +glmc_persp_decomp_x_lh_no(mat4 proj, + float * __restrict left, + float * __restrict right); + +CGLM_EXPORT +void +glmc_persp_decomp_y_lh_no(mat4 proj, + float * __restrict top, + float * __restrict bottom); + +CGLM_EXPORT +void +glmc_persp_decomp_z_lh_no(mat4 proj, + float * __restrict nearZ, + float * __restrict farZ); + +CGLM_EXPORT +void +glmc_persp_decomp_far_lh_no(mat4 proj, float * __restrict farZ); + +CGLM_EXPORT +void +glmc_persp_decomp_near_lh_no(mat4 proj, float * __restrict nearZ); + +CGLM_EXPORT +void +glmc_persp_sizes_lh_no(mat4 proj, float fovy, vec4 dest); + +CGLM_EXPORT +float +glmc_persp_fovy_lh_no(mat4 proj); + +CGLM_EXPORT +float +glmc_persp_aspect_lh_no(mat4 proj); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_persp_lh_no_h */ diff --git a/external/cglm/call/clipspace/persp_lh_zo.h b/external/cglm/call/clipspace/persp_lh_zo.h new file mode 100644 index 0000000..53c2c1c --- /dev/null +++ b/external/cglm/call/clipspace/persp_lh_zo.h @@ -0,0 +1,87 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_persp_lh_zo_h +#define cglmc_persp_lh_zo_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../../cglm.h" + +CGLM_EXPORT +void +glmc_frustum_lh_zo(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest); + +CGLM_EXPORT +void +glmc_perspective_lh_zo(float fovy, + float aspect, + float nearVal, + float farVal, + mat4 dest); + +CGLM_EXPORT +void +glmc_persp_move_far_lh_zo(mat4 proj, float deltaFar); + +CGLM_EXPORT +void +glmc_persp_decomp_lh_zo(mat4 proj, + float * __restrict nearZ, float * __restrict farZ, + float * __restrict top, float * __restrict bottom, + float * __restrict left, float * __restrict right); + +CGLM_EXPORT +void +glmc_persp_decompv_lh_zo(mat4 proj, float dest[6]); + +CGLM_EXPORT +void +glmc_persp_decomp_x_lh_zo(mat4 proj, + float * __restrict left, + float * __restrict right); + +CGLM_EXPORT +void +glmc_persp_decomp_y_lh_zo(mat4 proj, + float * __restrict top, + float * __restrict bottom); + +CGLM_EXPORT +void +glmc_persp_decomp_z_lh_zo(mat4 proj, + float * __restrict nearZ, + float * __restrict farZ); + +CGLM_EXPORT +void +glmc_persp_decomp_far_lh_zo(mat4 proj, float * __restrict farZ); + +CGLM_EXPORT +void +glmc_persp_decomp_near_lh_zo(mat4 proj, float * __restrict nearZ); + +CGLM_EXPORT +void +glmc_persp_sizes_lh_zo(mat4 proj, float fovy, vec4 dest); + +CGLM_EXPORT +float +glmc_persp_fovy_lh_zo(mat4 proj); + +CGLM_EXPORT +float +glmc_persp_aspect_lh_zo(mat4 proj); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_persp_lh_zo_h */ diff --git a/external/cglm/call/clipspace/persp_rh_no.h b/external/cglm/call/clipspace/persp_rh_no.h new file mode 100644 index 0000000..9c0d65d --- /dev/null +++ b/external/cglm/call/clipspace/persp_rh_no.h @@ -0,0 +1,87 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_persp_rh_no_h +#define cglmc_persp_rh_no_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../../cglm.h" + +CGLM_EXPORT +void +glmc_frustum_rh_no(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest); + +CGLM_EXPORT +void +glmc_perspective_rh_no(float fovy, + float aspect, + float nearVal, + float farVal, + mat4 dest); + +CGLM_EXPORT +void +glmc_persp_move_far_rh_no(mat4 proj, float deltaFar); + +CGLM_EXPORT +void +glmc_persp_decomp_rh_no(mat4 proj, + float * __restrict nearZ, float * __restrict farZ, + float * __restrict top, float * __restrict bottom, + float * __restrict left, float * __restrict right); + +CGLM_EXPORT +void +glmc_persp_decompv_rh_no(mat4 proj, float dest[6]); + +CGLM_EXPORT +void +glmc_persp_decomp_x_rh_no(mat4 proj, + float * __restrict left, + float * __restrict right); + +CGLM_EXPORT +void +glmc_persp_decomp_y_rh_no(mat4 proj, + float * __restrict top, + float * __restrict bottom); + +CGLM_EXPORT +void +glmc_persp_decomp_z_rh_no(mat4 proj, + float * __restrict nearZ, + float * __restrict farZ); + +CGLM_EXPORT +void +glmc_persp_decomp_far_rh_no(mat4 proj, float * __restrict farZ); + +CGLM_EXPORT +void +glmc_persp_decomp_near_rh_no(mat4 proj, float * __restrict nearZ); + +CGLM_EXPORT +void +glmc_persp_sizes_rh_no(mat4 proj, float fovy, vec4 dest); + +CGLM_EXPORT +float +glmc_persp_fovy_rh_no(mat4 proj); + +CGLM_EXPORT +float +glmc_persp_aspect_rh_no(mat4 proj); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_persp_rh_no_h */ diff --git a/external/cglm/call/clipspace/persp_rh_zo.h b/external/cglm/call/clipspace/persp_rh_zo.h new file mode 100644 index 0000000..718d4ad --- /dev/null +++ b/external/cglm/call/clipspace/persp_rh_zo.h @@ -0,0 +1,87 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_persp_rh_zo_h +#define cglmc_persp_rh_zo_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../../cglm.h" + +CGLM_EXPORT +void +glmc_frustum_rh_zo(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest); + +CGLM_EXPORT +void +glmc_perspective_rh_zo(float fovy, + float aspect, + float nearVal, + float farVal, + mat4 dest); + +CGLM_EXPORT +void +glmc_persp_move_far_rh_zo(mat4 proj, float deltaFar); + +CGLM_EXPORT +void +glmc_persp_decomp_rh_zo(mat4 proj, + float * __restrict nearZ, float * __restrict farZ, + float * __restrict top, float * __restrict bottom, + float * __restrict left, float * __restrict right); + +CGLM_EXPORT +void +glmc_persp_decompv_rh_zo(mat4 proj, float dest[6]); + +CGLM_EXPORT +void +glmc_persp_decomp_x_rh_zo(mat4 proj, + float * __restrict left, + float * __restrict right); + +CGLM_EXPORT +void +glmc_persp_decomp_y_rh_zo(mat4 proj, + float * __restrict top, + float * __restrict bottom); + +CGLM_EXPORT +void +glmc_persp_decomp_z_rh_zo(mat4 proj, + float * __restrict nearZ, + float * __restrict farZ); + +CGLM_EXPORT +void +glmc_persp_decomp_far_rh_zo(mat4 proj, float * __restrict farZ); + +CGLM_EXPORT +void +glmc_persp_decomp_near_rh_zo(mat4 proj, float * __restrict nearZ); + +CGLM_EXPORT +void +glmc_persp_sizes_rh_zo(mat4 proj, float fovy, vec4 dest); + +CGLM_EXPORT +float +glmc_persp_fovy_rh_zo(mat4 proj); + +CGLM_EXPORT +float +glmc_persp_aspect_rh_zo(mat4 proj); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_persp_rh_zo_h */ diff --git a/external/cglm/call/clipspace/project_no.h b/external/cglm/call/clipspace/project_no.h new file mode 100644 index 0000000..3cba860 --- /dev/null +++ b/external/cglm/call/clipspace/project_no.h @@ -0,0 +1,31 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_project_no_h +#define cglmc_project_no_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../../cglm.h" + +CGLM_EXPORT +void +glmc_unprojecti_no(vec3 pos, mat4 invMat, vec4 vp, vec3 dest); + +CGLM_EXPORT +void +glmc_project_no(vec3 pos, mat4 m, vec4 vp, vec3 dest); + +CGLM_EXPORT +float +glmc_project_z_no(vec3 pos, mat4 m); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_project_no_h */ diff --git a/external/cglm/call/clipspace/project_zo.h b/external/cglm/call/clipspace/project_zo.h new file mode 100644 index 0000000..d2a6c62 --- /dev/null +++ b/external/cglm/call/clipspace/project_zo.h @@ -0,0 +1,31 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_project_zo_h +#define cglmc_project_zo_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../../cglm.h" + +CGLM_EXPORT +void +glmc_unprojecti_zo(vec3 pos, mat4 invMat, vec4 vp, vec3 dest); + +CGLM_EXPORT +void +glmc_project_zo(vec3 pos, mat4 m, vec4 vp, vec3 dest); + +CGLM_EXPORT +float +glmc_project_z_zo(vec3 pos, mat4 m); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_project_zo_h */ diff --git a/external/cglm/call/clipspace/view_lh_no.h b/external/cglm/call/clipspace/view_lh_no.h new file mode 100644 index 0000000..3b58c84 --- /dev/null +++ b/external/cglm/call/clipspace/view_lh_no.h @@ -0,0 +1,31 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_view_lh_no_h +#define cglmc_view_lh_no_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../../cglm.h" + +CGLM_EXPORT +void +glmc_lookat_lh_no(vec3 eye, vec3 center, vec3 up, mat4 dest); + +CGLM_EXPORT +void +glmc_look_lh_no(vec3 eye, vec3 dir, vec3 up, mat4 dest); + +CGLM_EXPORT +void +glmc_look_anyup_lh_no(vec3 eye, vec3 dir, mat4 dest); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_view_lh_no_h */ diff --git a/external/cglm/call/clipspace/view_lh_zo.h b/external/cglm/call/clipspace/view_lh_zo.h new file mode 100644 index 0000000..c877367 --- /dev/null +++ b/external/cglm/call/clipspace/view_lh_zo.h @@ -0,0 +1,31 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_view_lh_zo_h +#define cglmc_view_lh_zo_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../../cglm.h" + +CGLM_EXPORT +void +glmc_lookat_lh_zo(vec3 eye, vec3 center, vec3 up, mat4 dest); + +CGLM_EXPORT +void +glmc_look_lh_zo(vec3 eye, vec3 dir, vec3 up, mat4 dest); + +CGLM_EXPORT +void +glmc_look_anyup_lh_zo(vec3 eye, vec3 dir, mat4 dest); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_view_lh_zo_h */ diff --git a/external/cglm/call/clipspace/view_rh_no.h b/external/cglm/call/clipspace/view_rh_no.h new file mode 100644 index 0000000..6303dbf --- /dev/null +++ b/external/cglm/call/clipspace/view_rh_no.h @@ -0,0 +1,31 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_view_rh_no_h +#define cglmc_view_rh_no_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../../cglm.h" + +CGLM_EXPORT +void +glmc_lookat_rh_no(vec3 eye, vec3 center, vec3 up, mat4 dest); + +CGLM_EXPORT +void +glmc_look_rh_no(vec3 eye, vec3 dir, vec3 up, mat4 dest); + +CGLM_EXPORT +void +glmc_look_anyup_rh_no(vec3 eye, vec3 dir, mat4 dest); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_view_rh_no_h */ diff --git a/external/cglm/call/clipspace/view_rh_zo.h b/external/cglm/call/clipspace/view_rh_zo.h new file mode 100644 index 0000000..00b8707 --- /dev/null +++ b/external/cglm/call/clipspace/view_rh_zo.h @@ -0,0 +1,31 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_view_rh_zo_h +#define cglmc_view_rh_zo_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../../cglm.h" + +CGLM_EXPORT +void +glmc_lookat_rh_zo(vec3 eye, vec3 center, vec3 up, mat4 dest); + +CGLM_EXPORT +void +glmc_look_rh_zo(vec3 eye, vec3 dir, vec3 up, mat4 dest); + +CGLM_EXPORT +void +glmc_look_anyup_rh_zo(vec3 eye, vec3 dir, mat4 dest); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_view_rh_zo_h */ diff --git a/external/cglm/call/curve.h b/external/cglm/call/curve.h new file mode 100644 index 0000000..061fdb9 --- /dev/null +++ b/external/cglm/call/curve.h @@ -0,0 +1,23 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_curve_h +#define cglmc_curve_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +CGLM_EXPORT +float +glmc_smc(float s, mat4 m, vec4 c); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_curve_h */ diff --git a/external/cglm/call/ease.h b/external/cglm/call/ease.h new file mode 100644 index 0000000..87e39ca --- /dev/null +++ b/external/cglm/call/ease.h @@ -0,0 +1,143 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_ease_h +#define cglmc_ease_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +CGLM_EXPORT +float +glmc_ease_linear(float t); + +CGLM_EXPORT +float +glmc_ease_sine_in(float t); + +CGLM_EXPORT +float +glmc_ease_sine_out(float t); + +CGLM_EXPORT +float +glmc_ease_sine_inout(float t); + +CGLM_EXPORT +float +glmc_ease_quad_in(float t); + +CGLM_EXPORT +float +glmc_ease_quad_out(float t); + +CGLM_EXPORT +float +glmc_ease_quad_inout(float t); + +CGLM_EXPORT +float +glmc_ease_cubic_in(float t); + +CGLM_EXPORT +float +glmc_ease_cubic_out(float t); + +CGLM_EXPORT +float +glmc_ease_cubic_inout(float t); + +CGLM_EXPORT +float +glmc_ease_quart_in(float t); + +CGLM_EXPORT +float +glmc_ease_quart_out(float t); + +CGLM_EXPORT +float +glmc_ease_quart_inout(float t); + +CGLM_EXPORT +float +glmc_ease_quint_in(float t); + +CGLM_EXPORT +float +glmc_ease_quint_out(float t); + +CGLM_EXPORT +float +glmc_ease_quint_inout(float t); + +CGLM_EXPORT +float +glmc_ease_exp_in(float t); + +CGLM_EXPORT +float +glmc_ease_exp_out(float t); + +CGLM_EXPORT +float +glmc_ease_exp_inout(float t); + +CGLM_EXPORT +float +glmc_ease_circ_in(float t); + +CGLM_EXPORT +float +glmc_ease_circ_out(float t); + +CGLM_EXPORT +float +glmc_ease_circ_inout(float t); + +CGLM_EXPORT +float +glmc_ease_back_in(float t); + +CGLM_EXPORT +float +glmc_ease_back_out(float t); + +CGLM_EXPORT +float +glmc_ease_back_inout(float t); + +CGLM_EXPORT +float +glmc_ease_elast_in(float t); + +CGLM_EXPORT +float +glmc_ease_elast_out(float t); + +CGLM_EXPORT +float +glmc_ease_elast_inout(float t); + +CGLM_EXPORT +float +glmc_ease_bounce_out(float t); + +CGLM_EXPORT +float +glmc_ease_bounce_in(float t); + +CGLM_EXPORT +float +glmc_ease_bounce_inout(float t); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_ease_h */ diff --git a/external/cglm/call/euler.h b/external/cglm/call/euler.h new file mode 100644 index 0000000..182bcbb --- /dev/null +++ b/external/cglm/call/euler.h @@ -0,0 +1,80 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_euler_h +#define cglmc_euler_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +CGLM_EXPORT +void +glmc_euler_angles(mat4 m, vec3 dest); + +CGLM_EXPORT +void +glmc_euler(vec3 angles, mat4 dest); + +CGLM_EXPORT +void +glmc_euler_xyz(vec3 angles, mat4 dest); + +CGLM_EXPORT +void +glmc_euler_zyx(vec3 angles, mat4 dest); + +CGLM_EXPORT +void +glmc_euler_zxy(vec3 angles, mat4 dest); + +CGLM_EXPORT +void +glmc_euler_xzy(vec3 angles, mat4 dest); + +CGLM_EXPORT +void +glmc_euler_yzx(vec3 angles, mat4 dest); + +CGLM_EXPORT +void +glmc_euler_yxz(vec3 angles, mat4 dest); + +CGLM_EXPORT +void +glmc_euler_by_order(vec3 angles, glm_euler_seq axis, mat4 dest); + +CGLM_EXPORT +void +glmc_euler_xyz_quat(vec3 angles, versor dest); + +CGLM_EXPORT +void +glmc_euler_xzy_quat(vec3 angles, versor dest); + +CGLM_EXPORT +void +glmc_euler_yxz_quat(vec3 angles, versor dest); + +CGLM_EXPORT +void +glmc_euler_yzx_quat(vec3 angles, versor dest); + +CGLM_EXPORT +void +glmc_euler_zxy_quat(vec3 angles, versor dest); + +CGLM_EXPORT +void +glmc_euler_zyx_quat(vec3 angles, versor dest); + + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_euler_h */ diff --git a/external/cglm/call/frustum.h b/external/cglm/call/frustum.h new file mode 100644 index 0000000..6b4facb --- /dev/null +++ b/external/cglm/call/frustum.h @@ -0,0 +1,41 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_frustum_h +#define cglmc_frustum_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +CGLM_EXPORT +void +glmc_frustum_planes(mat4 m, vec4 dest[6]); + +CGLM_EXPORT +void +glmc_frustum_corners(mat4 invMat, vec4 dest[8]); + +CGLM_EXPORT +void +glmc_frustum_center(vec4 corners[8], vec4 dest); + +CGLM_EXPORT +void +glmc_frustum_box(vec4 corners[8], mat4 m, vec3 box[2]); + +CGLM_EXPORT +void +glmc_frustum_corners_at(vec4 corners[8], + float splitDist, + float farDist, + vec4 planeCorners[4]); +#ifdef __cplusplus +} +#endif +#endif /* cglmc_frustum_h */ diff --git a/external/cglm/call/io.h b/external/cglm/call/io.h new file mode 100644 index 0000000..19ea06f --- /dev/null +++ b/external/cglm/call/io.h @@ -0,0 +1,45 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_io_h +#define cglmc_io_h + +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +CGLM_EXPORT +void +glmc_mat4_print(mat4 matrix, + FILE * __restrict ostream); + +CGLM_EXPORT +void +glmc_mat3_print(mat3 matrix, + FILE * __restrict ostream); + +CGLM_EXPORT +void +glmc_vec4_print(vec4 vec, + FILE * __restrict ostream); + +CGLM_EXPORT +void +glmc_vec3_print(vec3 vec, + FILE * __restrict ostream); + +CGLM_EXPORT +void +glmc_versor_print(versor vec, + FILE * __restrict ostream); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_io_h */ diff --git a/external/cglm/call/ivec2.h b/external/cglm/call/ivec2.h new file mode 100644 index 0000000..82f70eb --- /dev/null +++ b/external/cglm/call/ivec2.h @@ -0,0 +1,179 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_ivec2_h +#define cglmc_ivec2_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +CGLM_EXPORT +void +glmc_ivec2(int * __restrict v, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_copy(ivec2 a, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_zero(ivec2 v); + +CGLM_EXPORT +void +glmc_ivec2_one(ivec2 v); + +CGLM_EXPORT +int +glmc_ivec2_dot(ivec2 a, ivec2 b); + +CGLM_EXPORT +int +glmc_ivec2_cross(ivec2 a, ivec2 b); + +CGLM_EXPORT +void +glmc_ivec2_add(ivec2 a, ivec2 b, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_adds(ivec2 v, int s, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_sub(ivec2 a, ivec2 b, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_subs(ivec2 v, int s, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_mul(ivec2 a, ivec2 b, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_scale(ivec2 v, int s, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_div(ivec2 a, ivec2 b, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_divs(ivec2 v, int s, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_mod(ivec2 a, ivec2 b, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_addadd(ivec2 a, ivec2 b, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_addadds(ivec2 a, int s, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_subadd(ivec2 a, ivec2 b, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_subadds(ivec2 a, int s, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_muladd(ivec2 a, ivec2 b, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_muladds(ivec2 a, int s, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_maxadd(ivec2 a, ivec2 b, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_minadd(ivec2 a, ivec2 b, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_subsub(ivec2 a, ivec2 b, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_subsubs(ivec2 a, int s, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_addsub(ivec2 a, ivec2 b, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_addsubs(ivec2 a, int s, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_mulsub(ivec2 a, ivec2 b, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_mulsubs(ivec2 a, int s, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_maxsub(ivec2 a, ivec2 b, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_minsub(ivec2 a, ivec2 b, ivec2 dest); + +CGLM_EXPORT +int +glmc_ivec2_distance2(ivec2 a, ivec2 b); + +CGLM_EXPORT +float +glmc_ivec2_distance(ivec2 a, ivec2 b); + +CGLM_EXPORT +void +glmc_ivec2_fill(ivec2 v, int val); + +CGLM_EXPORT +bool +glmc_ivec2_eq(ivec2 v, int val); + +CGLM_EXPORT +bool +glmc_ivec2_eqv(ivec2 a, ivec2 b); + +CGLM_EXPORT +void +glmc_ivec2_maxv(ivec2 a, ivec2 b, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_minv(ivec2 a, ivec2 b, ivec2 dest); + +CGLM_EXPORT +void +glmc_ivec2_clamp(ivec2 v, int minVal, int maxVal); + +CGLM_EXPORT +void +glmc_ivec2_abs(ivec2 v, ivec2 dest); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_ivec2_h */ diff --git a/external/cglm/call/ivec3.h b/external/cglm/call/ivec3.h new file mode 100644 index 0000000..a6cec53 --- /dev/null +++ b/external/cglm/call/ivec3.h @@ -0,0 +1,183 @@ +/* + * Copyright (c);, Recep Aslantas. 
+ * + * MIT License (MIT);, http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_ivec3_h +#define cglmc_ivec3_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +CGLM_EXPORT +void +glmc_ivec3(ivec4 v4, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_copy(ivec3 a, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_zero(ivec3 v); + +CGLM_EXPORT +void +glmc_ivec3_one(ivec3 v); + +CGLM_EXPORT +int +glmc_ivec3_dot(ivec3 a, ivec3 b); + +CGLM_EXPORT +int +glmc_ivec3_norm2(ivec3 v); + +CGLM_EXPORT +int +glmc_ivec3_norm(ivec3 v); + +CGLM_EXPORT +void +glmc_ivec3_add(ivec3 a, ivec3 b, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_adds(ivec3 v, int s, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_sub(ivec3 a, ivec3 b, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_subs(ivec3 v, int s, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_mul(ivec3 a, ivec3 b, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_scale(ivec3 v, int s, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_div(ivec3 a, ivec3 b, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_divs(ivec3 v, int s, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_mod(ivec3 a, ivec3 b, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_addadd(ivec3 a, ivec3 b, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_addadds(ivec3 a, int s, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_subadd(ivec3 a, ivec3 b, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_subadds(ivec3 a, int s, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_muladd(ivec3 a, ivec3 b, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_muladds(ivec3 a, int s, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_maxadd(ivec3 a, ivec3 b, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_minadd(ivec3 a, ivec3 b, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_subsub(ivec3 a, ivec3 b, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_subsubs(ivec3 a, int s, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_addsub(ivec3 a, ivec3 b, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_addsubs(ivec3 a, int s, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_mulsub(ivec3 a, ivec3 b, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_mulsubs(ivec3 a, int s, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_maxsub(ivec3 a, ivec3 b, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_minsub(ivec3 a, ivec3 b, ivec3 dest); + +CGLM_EXPORT +int +glmc_ivec3_distance2(ivec3 a, ivec3 b); + +CGLM_EXPORT +float +glmc_ivec3_distance(ivec3 a, ivec3 b); + +CGLM_EXPORT +void +glmc_ivec3_fill(ivec3 v, int val); + +CGLM_EXPORT +bool +glmc_ivec3_eq(ivec3 v, int val); + +CGLM_EXPORT +bool +glmc_ivec3_eqv(ivec3 a, ivec3 b); + +CGLM_EXPORT +void +glmc_ivec3_maxv(ivec3 a, ivec3 b, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_minv(ivec3 a, ivec3 b, ivec3 dest); + +CGLM_EXPORT +void +glmc_ivec3_clamp(ivec3 v, int minVal, int maxVal); + +CGLM_EXPORT +void +glmc_ivec3_abs(ivec3 v, ivec3 dest); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_ivec3_h */ diff --git a/external/cglm/call/ivec4.h b/external/cglm/call/ivec4.h new file mode 100644 index 0000000..0e6d721 --- /dev/null +++ b/external/cglm/call/ivec4.h @@ -0,0 +1,147 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_ivec4_h +#define cglmc_ivec4_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +CGLM_EXPORT +void +glmc_ivec4(ivec3 v3, int last, ivec4 dest); + +CGLM_EXPORT +void +glmc_ivec4_copy(ivec4 a, ivec4 dest); + +CGLM_EXPORT +void +glmc_ivec4_zero(ivec4 v); + +CGLM_EXPORT +void +glmc_ivec4_one(ivec4 v); + +CGLM_EXPORT +void +glmc_ivec4_add(ivec4 a, ivec4 b, ivec4 dest); + +CGLM_EXPORT +void +glmc_ivec4_adds(ivec4 v, int s, ivec4 dest); + +CGLM_EXPORT +void +glmc_ivec4_sub(ivec4 a, ivec4 b, ivec4 dest); + +CGLM_EXPORT +void +glmc_ivec4_subs(ivec4 v, int s, ivec4 dest); + +CGLM_EXPORT +void +glmc_ivec4_mul(ivec4 a, ivec4 b, ivec4 dest); + +CGLM_EXPORT +void +glmc_ivec4_scale(ivec4 v, int s, ivec4 dest); + +CGLM_EXPORT +void +glmc_ivec4_addadd(ivec4 a, ivec4 b, ivec4 dest); + +CGLM_EXPORT +void +glmc_ivec4_addadds(ivec4 a, int s, ivec4 dest); + +CGLM_EXPORT +void +glmc_ivec4_subadd(ivec4 a, ivec4 b, ivec4 dest); + +CGLM_EXPORT +void +glmc_ivec4_subadds(ivec4 a, int s, ivec4 dest); + +CGLM_EXPORT +void +glmc_ivec4_muladd(ivec4 a, ivec4 b, ivec4 dest); + +CGLM_EXPORT +void +glmc_ivec4_muladds(ivec4 a, int s, ivec4 dest); + +CGLM_EXPORT +void +glmc_ivec4_maxadd(ivec4 a, ivec4 b, ivec4 dest); + +CGLM_EXPORT +void +glmc_ivec4_minadd(ivec4 a, ivec4 b, ivec4 dest); + +CGLM_EXPORT +void +glmc_ivec4_subsub(ivec4 a, ivec4 b, ivec4 dest); + +CGLM_EXPORT +void +glmc_ivec4_subsubs(ivec4 a, int s, ivec4 dest); + +CGLM_EXPORT +void +glmc_ivec4_addsub(ivec4 a, ivec4 b, ivec4 dest); + +CGLM_EXPORT +void +glmc_ivec4_addsubs(ivec4 a, int s, ivec4 dest); + +CGLM_EXPORT +void +glmc_ivec4_mulsub(ivec4 a, ivec4 b, ivec4 dest); + +CGLM_EXPORT +void +glmc_ivec4_mulsubs(ivec4 a, int s, ivec4 dest); + +CGLM_EXPORT +void +glmc_ivec4_maxsub(ivec4 a, ivec4 b, ivec4 dest); + +CGLM_EXPORT +void +glmc_ivec4_minsub(ivec4 a, ivec4 b, ivec4 dest); + +CGLM_EXPORT +int +glmc_ivec4_distance2(ivec4 a, ivec4 b); + +CGLM_EXPORT +float +glmc_ivec4_distance(ivec4 a, ivec4 b); + +CGLM_EXPORT +void +glmc_ivec4_maxv(ivec4 a, ivec4 b, ivec4 dest); + +CGLM_EXPORT +void +glmc_ivec4_minv(ivec4 a, ivec4 b, ivec4 dest); + +CGLM_EXPORT +void +glmc_ivec4_clamp(ivec4 v, int minVal, int maxVal); + +CGLM_EXPORT +void +glmc_ivec4_abs(ivec4 v, ivec4 dest); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_ivec4_h */ diff --git a/external/cglm/call/mat2.h b/external/cglm/call/mat2.h new file mode 100644 index 0000000..c268938 --- /dev/null +++ b/external/cglm/call/mat2.h @@ -0,0 +1,83 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_mat2_h +#define cglmc_mat2_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +CGLM_EXPORT +void +glmc_mat2_make(const float * __restrict src, mat2 dest); + +CGLM_EXPORT +void +glmc_mat2_copy(mat2 mat, mat2 dest); + +CGLM_EXPORT +void +glmc_mat2_identity(mat2 m); + +CGLM_EXPORT +void +glmc_mat2_identity_array(mat2 * __restrict mats, size_t count); + +CGLM_EXPORT +void +glmc_mat2_zero(mat2 m); + +CGLM_EXPORT +void +glmc_mat2_mul(mat2 m1, mat2 m2, mat2 dest); + +CGLM_EXPORT +void +glmc_mat2_mulv(mat2 m, vec2 v, vec2 dest); + +CGLM_EXPORT +void +glmc_mat2_transpose_to(mat2 mat, mat2 dest); + +CGLM_EXPORT +void +glmc_mat2_transpose(mat2 m); + +CGLM_EXPORT +void +glmc_mat2_scale(mat2 m, float s); + +CGLM_EXPORT +void +glmc_mat2_inv(mat2 mat, mat2 dest); + +CGLM_EXPORT +void +glmc_mat2_swap_col(mat2 mat, int col1, int col2); + +CGLM_EXPORT +void +glmc_mat2_swap_row(mat2 mat, int row1, int row2); + +CGLM_EXPORT +float +glmc_mat2_trace(mat2 m); + +CGLM_EXPORT +float +glmc_mat2_det(mat2 m); + +CGLM_EXPORT +float +glmc_mat2_rmc(vec2 r, mat2 m, vec2 c); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_mat2_h */ diff --git a/external/cglm/call/mat2x3.h b/external/cglm/call/mat2x3.h new file mode 100644 index 0000000..215d9a4 --- /dev/null +++ b/external/cglm/call/mat2x3.h @@ -0,0 +1,47 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_mat2x3_h +#define cglmc_mat2x3_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +CGLM_EXPORT +void +glmc_mat2x3_copy(mat2x3 src, mat2x3 dest); + +CGLM_EXPORT +void +glmc_mat2x3_zero(mat2x3 m); + +CGLM_EXPORT +void +glmc_mat2x3_make(const float * __restrict src, mat2x3 dest); + +CGLM_EXPORT +void +glmc_mat2x3_mul(mat2x3 m1, mat3x2 m2, mat3 dest); + +CGLM_EXPORT +void +glmc_mat2x3_mulv(mat2x3 m, vec2 v, vec3 dest); + +CGLM_EXPORT +void +glmc_mat2x3_transpose(mat2x3 src, mat3x2 dest); + +CGLM_EXPORT +void +glmc_mat2x3_scale(mat2x3 m, float s); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_mat2x3_h */ diff --git a/external/cglm/call/mat2x4.h b/external/cglm/call/mat2x4.h new file mode 100644 index 0000000..e2775a4 --- /dev/null +++ b/external/cglm/call/mat2x4.h @@ -0,0 +1,47 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_mat2x4_h +#define cglmc_mat2x4_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +CGLM_EXPORT +void +glmc_mat2x4_copy(mat2x4 src, mat2x4 dest); + +CGLM_EXPORT +void +glmc_mat2x4_zero(mat2x4 m); + +CGLM_EXPORT +void +glmc_mat2x4_make(const float * __restrict src, mat2x4 dest); + +CGLM_EXPORT +void +glmc_mat2x4_mul(mat2x4 m1, mat4x2 m2, mat4 dest); + +CGLM_EXPORT +void +glmc_mat2x4_mulv(mat2x4 m, vec2 v, vec4 dest); + +CGLM_EXPORT +void +glmc_mat2x4_transpose(mat2x4 src, mat4x2 dest); + +CGLM_EXPORT +void +glmc_mat2x4_scale(mat2x4 m, float s); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_mat2x4_h */ diff --git a/external/cglm/call/mat3.h b/external/cglm/call/mat3.h new file mode 100644 index 0000000..47820f9 --- /dev/null +++ b/external/cglm/call/mat3.h @@ -0,0 +1,94 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_mat3_h +#define cglmc_mat3_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +/* DEPRECATED! use _copy, _ucopy versions */ +#define glmc_mat3_dup(mat, dest) glmc_mat3_copy(mat, dest) + +CGLM_EXPORT +void +glmc_mat3_copy(mat3 mat, mat3 dest); + +CGLM_EXPORT +void +glmc_mat3_identity(mat3 mat); + +CGLM_EXPORT +void +glmc_mat3_zero(mat3 mat); + +CGLM_EXPORT +void +glmc_mat3_identity_array(mat3 * __restrict mat, size_t count); + +CGLM_EXPORT +void +glmc_mat3_mul(mat3 m1, mat3 m2, mat3 dest); + +CGLM_EXPORT +void +glmc_mat3_transpose_to(mat3 m, mat3 dest); + +CGLM_EXPORT +void +glmc_mat3_transpose(mat3 m); + +CGLM_EXPORT +void +glmc_mat3_mulv(mat3 m, vec3 v, vec3 dest); + +CGLM_EXPORT +float +glmc_mat3_trace(mat3 m); + +CGLM_EXPORT +void +glmc_mat3_quat(mat3 m, versor dest); + +CGLM_EXPORT +void +glmc_mat3_scale(mat3 m, float s); + +CGLM_EXPORT +float +glmc_mat3_det(mat3 mat); + +CGLM_EXPORT +void +glmc_mat3_inv(mat3 mat, mat3 dest); + +CGLM_EXPORT +void +glmc_mat3_swap_col(mat3 mat, int col1, int col2); + +CGLM_EXPORT +void +glmc_mat3_swap_row(mat3 mat, int row1, int row2); + +CGLM_EXPORT +float +glmc_mat3_rmc(vec3 r, mat3 m, vec3 c); + +CGLM_EXPORT +void +glmc_mat3_make(const float * __restrict src, mat3 dest); + +CGLM_EXPORT +void +glmc_mat3_textrans(float sx, float sy, float rot, float tx, float ty, mat3 dest); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_mat3_h */ diff --git a/external/cglm/call/mat3x2.h b/external/cglm/call/mat3x2.h new file mode 100644 index 0000000..246a269 --- /dev/null +++ b/external/cglm/call/mat3x2.h @@ -0,0 +1,47 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_mat3x2_h +#define cglmc_mat3x2_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +CGLM_EXPORT +void +glmc_mat3x2_copy(mat3x2 src, mat3x2 dest); + +CGLM_EXPORT +void +glmc_mat3x2_zero(mat3x2 m); + +CGLM_EXPORT +void +glmc_mat3x2_make(const float * __restrict src, mat3x2 dest); + +CGLM_EXPORT +void +glmc_mat3x2_mul(mat3x2 m1, mat2x3 m2, mat2 dest); + +CGLM_EXPORT +void +glmc_mat3x2_mulv(mat3x2 m, vec3 v, vec2 dest); + +CGLM_EXPORT +void +glmc_mat3x2_transpose(mat3x2 src, mat2x3 dest); + +CGLM_EXPORT +void +glmc_mat3x2_scale(mat3x2 m, float s); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_mat3x2_h */ diff --git a/external/cglm/call/mat3x4.h b/external/cglm/call/mat3x4.h new file mode 100644 index 0000000..5ead2f4 --- /dev/null +++ b/external/cglm/call/mat3x4.h @@ -0,0 +1,47 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_mat3x4_h +#define cglmc_mat3x4_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +CGLM_EXPORT +void +glmc_mat3x4_copy(mat3x4 src, mat3x4 dest); + +CGLM_EXPORT +void +glmc_mat3x4_zero(mat3x4 m); + +CGLM_EXPORT +void +glmc_mat3x4_make(const float * __restrict src, mat3x4 dest); + +CGLM_EXPORT +void +glmc_mat3x4_mul(mat3x4 m1, mat4x3 m2, mat4 dest); + +CGLM_EXPORT +void +glmc_mat3x4_mulv(mat3x4 m, vec3 v, vec4 dest); + +CGLM_EXPORT +void +glmc_mat3x4_transpose(mat3x4 src, mat4x3 dest); + +CGLM_EXPORT +void +glmc_mat3x4_scale(mat3x4 m, float s); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_mat3x4_h */ diff --git a/external/cglm/call/mat4.h b/external/cglm/call/mat4.h new file mode 100644 index 0000000..f8cd70a --- /dev/null +++ b/external/cglm/call/mat4.h @@ -0,0 +1,135 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_mat_h +#define cglmc_mat_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +/* DEPRECATED! use _copy, _ucopy versions */ +#define glmc_mat4_udup(mat, dest) glmc_mat4_ucopy(mat, dest) +#define glmc_mat4_dup(mat, dest) glmc_mat4_copy(mat, dest) + +CGLM_EXPORT +void +glmc_mat4_ucopy(mat4 mat, mat4 dest); + +CGLM_EXPORT +void +glmc_mat4_copy(mat4 mat, mat4 dest); + +CGLM_EXPORT +void +glmc_mat4_identity(mat4 mat); + +CGLM_EXPORT +void +glmc_mat4_identity_array(mat4 * __restrict mat, size_t count); + +CGLM_EXPORT +void +glmc_mat4_zero(mat4 mat); + +CGLM_EXPORT +void +glmc_mat4_pick3(mat4 mat, mat3 dest); + +CGLM_EXPORT +void +glmc_mat4_pick3t(mat4 mat, mat3 dest); + +CGLM_EXPORT +void +glmc_mat4_ins3(mat3 mat, mat4 dest); + +CGLM_EXPORT +void +glmc_mat4_mul(mat4 m1, mat4 m2, mat4 dest); + +CGLM_EXPORT +void +glmc_mat4_mulN(mat4 * __restrict matrices[], uint32_t len, mat4 dest); + +CGLM_EXPORT +void +glmc_mat4_mulv(mat4 m, vec4 v, vec4 dest); + +CGLM_EXPORT +void +glmc_mat4_mulv3(mat4 m, vec3 v, float last, vec3 dest); + +CGLM_EXPORT +float +glmc_mat4_trace(mat4 m); + +CGLM_EXPORT +float +glmc_mat4_trace3(mat4 m); + +CGLM_EXPORT +void +glmc_mat4_quat(mat4 m, versor dest); + +CGLM_EXPORT +void +glmc_mat4_transpose_to(mat4 m, mat4 dest); + +CGLM_EXPORT +void +glmc_mat4_transpose(mat4 m); + +CGLM_EXPORT +void +glmc_mat4_scale_p(mat4 m, float s); + +CGLM_EXPORT +void +glmc_mat4_scale(mat4 m, float s); + +CGLM_EXPORT +float +glmc_mat4_det(mat4 mat); + +CGLM_EXPORT +void +glmc_mat4_inv(mat4 mat, mat4 dest); + +CGLM_EXPORT +void +glmc_mat4_inv_precise(mat4 mat, mat4 dest); + +CGLM_EXPORT +void +glmc_mat4_inv_fast(mat4 mat, mat4 dest); + +CGLM_EXPORT +void +glmc_mat4_swap_col(mat4 mat, int col1, int col2); + +CGLM_EXPORT +void +glmc_mat4_swap_row(mat4 mat, int row1, int row2); + +CGLM_EXPORT +float +glmc_mat4_rmc(vec4 r, mat4 m, vec4 c); + +CGLM_EXPORT +void +glmc_mat4_make(const float * __restrict src, mat4 dest); + +CGLM_EXPORT +void +glmc_mat4_textrans(float sx, float sy, float rot, float tx, float ty, mat4 dest); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_mat_h */ diff --git a/external/cglm/call/mat4x2.h b/external/cglm/call/mat4x2.h new file mode 100644 index 0000000..4711d2b --- /dev/null +++ b/external/cglm/call/mat4x2.h @@ -0,0 +1,47 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_mat4x2_h +#define cglmc_mat4x2_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +CGLM_EXPORT +void +glmc_mat4x2_copy(mat4x2 src, mat4x2 dest); + +CGLM_EXPORT +void +glmc_mat4x2_zero(mat4x2 m); + +CGLM_EXPORT +void +glmc_mat4x2_make(const float * __restrict src, mat4x2 dest); + +CGLM_EXPORT +void +glmc_mat4x2_mul(mat4x2 m1, mat2x4 m2, mat2 dest); + +CGLM_EXPORT +void +glmc_mat4x2_mulv(mat4x2 m, vec4 v, vec2 dest); + +CGLM_EXPORT +void +glmc_mat4x2_transpose(mat4x2 src, mat2x4 dest); + +CGLM_EXPORT +void +glmc_mat4x2_scale(mat4x2 m, float s); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_mat4x2_h */ diff --git a/external/cglm/call/mat4x3.h b/external/cglm/call/mat4x3.h new file mode 100644 index 0000000..e06e102 --- /dev/null +++ b/external/cglm/call/mat4x3.h @@ -0,0 +1,47 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_mat4x3_h +#define cglmc_mat4x3_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +CGLM_EXPORT +void +glmc_mat4x3_copy(mat4x3 src, mat4x3 dest); + +CGLM_EXPORT +void +glmc_mat4x3_zero(mat4x3 m); + +CGLM_EXPORT +void +glmc_mat4x3_make(const float * __restrict src, mat4x3 dest); + +CGLM_EXPORT +void +glmc_mat4x3_mul(mat4x3 m1, mat3x4 m2, mat3 dest); + +CGLM_EXPORT +void +glmc_mat4x3_mulv(mat4x3 m, vec4 v, vec3 dest); + +CGLM_EXPORT +void +glmc_mat4x3_transpose(mat4x3 src, mat3x4 dest); + +CGLM_EXPORT +void +glmc_mat4x3_scale(mat4x3 m, float s); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_mat4x3_h */ diff --git a/external/cglm/call/noise.h b/external/cglm/call/noise.h new file mode 100644 index 0000000..6020c89 --- /dev/null +++ b/external/cglm/call/noise.h @@ -0,0 +1,31 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_noise_h +#define cglmc_noise_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +CGLM_EXPORT +float +glmc_perlin_vec4(vec4 point); + +CGLM_EXPORT +float +glmc_perlin_vec3(vec3 point); + +CGLM_EXPORT +float +glmc_perlin_vec2(vec2 point); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_noise_h */ diff --git a/external/cglm/call/plane.h b/external/cglm/call/plane.h new file mode 100644 index 0000000..f991121 --- /dev/null +++ b/external/cglm/call/plane.h @@ -0,0 +1,23 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_plane_h +#define cglmc_plane_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +CGLM_EXPORT +void +glmc_plane_normalize(vec4 plane); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_plane_h */ diff --git a/external/cglm/call/project.h b/external/cglm/call/project.h new file mode 100644 index 0000000..8fa7172 --- /dev/null +++ b/external/cglm/call/project.h @@ -0,0 +1,39 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_project_h +#define cglmc_project_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +CGLM_EXPORT +void +glmc_unprojecti(vec3 pos, mat4 invMat, vec4 vp, vec3 dest); + +CGLM_EXPORT +void +glmc_unproject(vec3 pos, mat4 m, vec4 vp, vec3 dest); + +CGLM_EXPORT +void +glmc_project(vec3 pos, mat4 m, vec4 vp, vec3 dest); + +CGLM_EXPORT +float +glmc_project_z(vec3 pos, mat4 m); + +CGLM_EXPORT +void +glmc_pickmatrix(vec2 center, vec2 size, vec4 vp, mat4 dest); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_project_h */ diff --git a/external/cglm/call/quat.h b/external/cglm/call/quat.h new file mode 100644 index 0000000..4244d36 --- /dev/null +++ b/external/cglm/call/quat.h @@ -0,0 +1,175 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_quat_h +#define cglmc_quat_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +CGLM_EXPORT +void +glmc_quat_identity(versor q); + +CGLM_EXPORT +void +glmc_quat_identity_array(versor * __restrict q, size_t count); + +CGLM_EXPORT +void +glmc_quat_init(versor q, float x, float y, float z, float w); + +CGLM_EXPORT +void +glmc_quat(versor q, float angle, float x, float y, float z); + +CGLM_EXPORT +void +glmc_quatv(versor q, float angle, vec3 axis); + +CGLM_EXPORT +void +glmc_quat_copy(versor q, versor dest); + +CGLM_EXPORT +void +glmc_quat_from_vecs(vec3 a, vec3 b, versor dest); + +CGLM_EXPORT +float +glmc_quat_norm(versor q); + +CGLM_EXPORT +void +glmc_quat_normalize_to(versor q, versor dest); + +CGLM_EXPORT +void +glmc_quat_normalize(versor q); + +CGLM_EXPORT +float +glmc_quat_dot(versor p, versor q); + +CGLM_EXPORT +void +glmc_quat_conjugate(versor q, versor dest); + +CGLM_EXPORT +void +glmc_quat_inv(versor q, versor dest); + +CGLM_EXPORT +void +glmc_quat_add(versor p, versor q, versor dest); + +CGLM_EXPORT +void +glmc_quat_sub(versor p, versor q, versor dest); + +CGLM_EXPORT +float +glmc_quat_real(versor q); + +CGLM_EXPORT +void +glmc_quat_imag(versor q, vec3 dest); + +CGLM_EXPORT +void +glmc_quat_imagn(versor q, vec3 dest); + +CGLM_EXPORT +float +glmc_quat_imaglen(versor q); + +CGLM_EXPORT +float +glmc_quat_angle(versor q); + +CGLM_EXPORT +void +glmc_quat_axis(versor q, vec3 dest); + +CGLM_EXPORT +void +glmc_quat_mul(versor p, versor q, versor dest); + +CGLM_EXPORT +void +glmc_quat_mat4(versor q, mat4 dest); + +CGLM_EXPORT +void +glmc_quat_mat4t(versor q, mat4 dest); + +CGLM_EXPORT +void +glmc_quat_mat3(versor q, mat3 dest); + +CGLM_EXPORT +void +glmc_quat_mat3t(versor q, mat3 dest); + +CGLM_EXPORT +void +glmc_quat_lerp(versor from, versor to, float t, versor dest); + +CGLM_EXPORT +void +glmc_quat_lerpc(versor from, versor to, float t, versor dest); + +CGLM_EXPORT +void +glmc_quat_nlerp(versor q, versor r, float t, versor dest); + +CGLM_EXPORT +void +glmc_quat_slerp(versor q, versor r, float t, versor dest); + +CGLM_EXPORT +void +glmc_quat_slerp_longest(versor q, versor r, float t, versor dest); + +CGLM_EXPORT +void +glmc_quat_look(vec3 eye, versor ori, mat4 dest); + +CGLM_EXPORT +void +glmc_quat_for(vec3 dir, vec3 up, versor dest); + +CGLM_EXPORT +void +glmc_quat_forp(vec3 from, vec3 to, vec3 up, versor dest); + +CGLM_EXPORT +void +glmc_quat_rotatev(versor from, vec3 to, vec3 dest); + +CGLM_EXPORT +void +glmc_quat_rotate(mat4 m, versor q, mat4 dest); + +CGLM_EXPORT +void 
+glmc_quat_rotate_at(mat4 model, versor q, vec3 pivot); + +CGLM_EXPORT +void +glmc_quat_rotate_atm(mat4 m, versor q, vec3 pivot); + +CGLM_EXPORT +void +glmc_quat_make(const float * __restrict src, versor dest); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_quat_h */ diff --git a/external/cglm/call/ray.h b/external/cglm/call/ray.h new file mode 100644 index 0000000..e529fdf --- /dev/null +++ b/external/cglm/call/ray.h @@ -0,0 +1,39 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_ray_h +#define cglmc_ray_h +#ifdef __cplusplus +extern "C" { +#endif +#include "../cglm.h" + +CGLM_EXPORT +bool +glmc_ray_triangle(vec3 origin, + vec3 direction, + vec3 v0, + vec3 v1, + vec3 v2, + float *d); + +CGLM_EXPORT +bool +glmc_ray_sphere(vec3 origin, + vec3 dir, + vec4 s, + float * __restrict t1, + float * __restrict t2); + +CGLM_EXPORT +void +glmc_ray_at(vec3 orig, vec3 dir, float t, vec3 point); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_ray_h */ diff --git a/external/cglm/call/sphere.h b/external/cglm/call/sphere.h new file mode 100644 index 0000000..9b96546 --- /dev/null +++ b/external/cglm/call/sphere.h @@ -0,0 +1,39 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_sphere_h +#define cglmc_sphere_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +CGLM_EXPORT +float +glmc_sphere_radii(vec4 s); + +CGLM_EXPORT +void +glmc_sphere_transform(vec4 s, mat4 m, vec4 dest); + +CGLM_EXPORT +void +glmc_sphere_merge(vec4 s1, vec4 s2, vec4 dest); + +CGLM_EXPORT +bool +glmc_sphere_sphere(vec4 s1, vec4 s2); + +CGLM_EXPORT +bool +glmc_sphere_point(vec4 s, vec3 point); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_sphere_h */ diff --git a/external/cglm/call/vec2.h b/external/cglm/call/vec2.h new file mode 100644 index 0000000..0284a8c --- /dev/null +++ b/external/cglm/call/vec2.h @@ -0,0 +1,251 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_vec2_h +#define cglmc_vec2_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +CGLM_EXPORT +void +glmc_vec2(float * __restrict v, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_fill(vec2 v, float val); + +CGLM_EXPORT +bool +glmc_vec2_eq(vec2 v, float val); + +CGLM_EXPORT +bool +glmc_vec2_eqv(vec2 a, vec2 b); + +CGLM_EXPORT +void +glmc_vec2_copy(vec2 a, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_zero(vec2 v); + +CGLM_EXPORT +void +glmc_vec2_one(vec2 v); + +CGLM_EXPORT +float +glmc_vec2_dot(vec2 a, vec2 b); + +CGLM_EXPORT +float +glmc_vec2_cross(vec2 a, vec2 b); + +CGLM_EXPORT +float +glmc_vec2_norm2(vec2 v); + +CGLM_EXPORT +float +glmc_vec2_norm(vec2 v); + +CGLM_EXPORT +void +glmc_vec2_add(vec2 a, vec2 b, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_adds(vec2 v, float s, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_sub(vec2 a, vec2 b, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_subs(vec2 v, float s, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_mul(vec2 a, vec2 b, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_scale(vec2 v, float s, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_scale_as(vec2 v, float s, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_div(vec2 a, vec2 b, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_divs(vec2 v, float s, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_addadd(vec2 a, vec2 b, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_subadd(vec2 a, vec2 b, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_muladd(vec2 a, vec2 b, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_muladds(vec2 a, float s, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_maxadd(vec2 a, vec2 b, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_minadd(vec2 a, vec2 b, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_subsub(vec2 a, vec2 b, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_addsub(vec2 a, vec2 b, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_mulsub(vec2 a, vec2 b, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_mulsubs(vec2 a, float s, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_maxsub(vec2 a, vec2 b, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_minsub(vec2 a, vec2 b, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_negate_to(vec2 v, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_negate(vec2 v); + +CGLM_EXPORT +void +glmc_vec2_normalize(vec2 v); + +CGLM_EXPORT +void +glmc_vec2_normalize_to(vec2 v, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_rotate(vec2 v, float angle, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_center(vec2 a, vec2 b, vec2 dest); + +CGLM_EXPORT +float +glmc_vec2_distance2(vec2 a, vec2 b); + +CGLM_EXPORT +float +glmc_vec2_distance(vec2 a, vec2 b); + +CGLM_EXPORT +void +glmc_vec2_maxv(vec2 a, vec2 b, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_minv(vec2 a, vec2 b, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_clamp(vec2 v, float minval, float maxval); + +CGLM_EXPORT +void +glmc_vec2_abs(vec2 v, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_fract(vec2 v, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_floor(vec2 v, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_mods(vec2 v, float s, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_swizzle(vec2 v, int mask, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_lerp(vec2 from, vec2 to, float t, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_step(vec2 edge, vec2 x, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_steps(float edge, vec2 x, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_stepr(vec2 edge, float x, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_complex_mul(vec2 a, vec2 b, 
vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_complex_div(vec2 a, vec2 b, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_complex_conjugate(vec2 a, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_make(const float * __restrict src, vec2 dest); + +CGLM_EXPORT +void +glmc_vec2_reflect(vec2 v, vec2 n, vec2 dest); + +CGLM_EXPORT +bool +glmc_vec2_refract(vec2 v, vec2 n, float eta, vec2 dest); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_vec2_h */ diff --git a/external/cglm/call/vec3.h b/external/cglm/call/vec3.h new file mode 100644 index 0000000..640fac7 --- /dev/null +++ b/external/cglm/call/vec3.h @@ -0,0 +1,369 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_vec3_h +#define cglmc_vec3_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +/* DEPRECATED! use _copy, _ucopy versions */ +#define glmc_vec_dup(v, dest) glmc_vec3_copy(v, dest) +#define glmc_vec3_flipsign(v) glmc_vec3_negate(v) +#define glmc_vec3_flipsign_to(v, dest) glmc_vec3_negate_to(v, dest) +#define glmc_vec3_inv(v) glmc_vec3_negate(v) +#define glmc_vec3_inv_to(v, dest) glmc_vec3_negate_to(v, dest) +#define glmc_vec3_step_uni(edge, x, dest) glmc_vec3_steps(edge, x, dest); + +CGLM_EXPORT +void +glmc_vec3(vec4 v4, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_copy(vec3 a, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_zero(vec3 v); + +CGLM_EXPORT +void +glmc_vec3_one(vec3 v); + +CGLM_EXPORT +float +glmc_vec3_dot(vec3 a, vec3 b); + +CGLM_EXPORT +void +glmc_vec3_cross(vec3 a, vec3 b, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_crossn(vec3 a, vec3 b, vec3 dest); + +CGLM_EXPORT +float +glmc_vec3_norm(vec3 v); + +CGLM_EXPORT +float +glmc_vec3_norm2(vec3 v); + +CGLM_EXPORT +float +glmc_vec3_norm_one(vec3 v); + +CGLM_EXPORT +float +glmc_vec3_norm_inf(vec3 v); + +CGLM_EXPORT +void +glmc_vec3_normalize_to(vec3 v, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_normalize(vec3 v); + +CGLM_EXPORT +void +glmc_vec3_add(vec3 a, vec3 b, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_adds(vec3 v, float s, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_sub(vec3 a, vec3 b, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_subs(vec3 v, float s, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_mul(vec3 a, vec3 b, vec3 d); + +CGLM_EXPORT +void +glmc_vec3_scale(vec3 v, float s, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_scale_as(vec3 v, float s, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_div(vec3 a, vec3 b, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_divs(vec3 a, float s, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_addadd(vec3 a, vec3 b, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_subadd(vec3 a, vec3 b, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_muladd(vec3 a, vec3 b, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_muladds(vec3 a, float s, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_maxadd(vec3 a, vec3 b, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_minadd(vec3 a, vec3 b, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_subsub(vec3 a, vec3 b, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_addsub(vec3 a, vec3 b, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_mulsub(vec3 a, vec3 b, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_mulsubs(vec3 a, float s, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_maxsub(vec3 a, vec3 b, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_minsub(vec3 a, vec3 b, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_negate(vec3 v); + +CGLM_EXPORT +void +glmc_vec3_negate_to(vec3 v, vec3 dest); + +CGLM_EXPORT +float +glmc_vec3_angle(vec3 a, vec3 b); 
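
The glmc_vec3_* declarations above are the exported (CGLM_EXPORT) counterparts of the CGLM_INLINE glm_* functions defined elsewhere in these headers. The short sketch below shows how this call-style vec3 API is typically driven; it only uses functions declared in this file. The include path, the main() driver, and the expected results in the comments are illustrative assumptions, not part of this commit, and the glmc_* symbols would additionally need cglm's compiled sources at link time — if only these headers are vendored, the inline glm_* equivalents can be used the same way.

#include <stdio.h>
#include "external/cglm/call/vec3.h"  /* assumed path; adjust to the project's include dirs */

int main(void) {
  /* vec3 is a plain float[3], so brace initialization works directly */
  vec3 x = {1.0f, 0.0f, 0.0f};
  vec3 y = {0.0f, 1.0f, 0.0f};
  vec3 z;

  glmc_vec3_cross(x, y, z);   /* z = x cross y = (0, 0, 1) */
  glmc_vec3_normalize(z);     /* z is already unit length here, so this leaves it unchanged */

  printf("dot(x, y)   = %f\n", glmc_vec3_dot(x, y));      /* expected 0.0 */
  printf("angle(x, y) = %f\n", glmc_vec3_angle(x, y));    /* expected ~1.5708 (pi/2 radians) */
  printf("norm(z)     = %f\n", glmc_vec3_norm(z));        /* expected 1.0 */
  return 0;
}

The call variants trade the inlining of the glm_* versions for ordinary linkable symbols, which is useful when calling cglm through an FFI or keeping call sites small; the declarations are otherwise one-to-one with the inline API.
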
+ +CGLM_EXPORT +void +glmc_vec3_rotate(vec3 v, float angle, vec3 axis); + +CGLM_EXPORT +void +glmc_vec3_rotate_m4(mat4 m, vec3 v, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_rotate_m3(mat3 m, vec3 v, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_proj(vec3 a, vec3 b, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_center(vec3 a, vec3 b, vec3 dest); + +CGLM_EXPORT +float +glmc_vec3_distance2(vec3 a, vec3 b); + +CGLM_EXPORT +float +glmc_vec3_distance(vec3 a, vec3 b); + +CGLM_EXPORT +void +glmc_vec3_maxv(vec3 a, vec3 b, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_minv(vec3 a, vec3 b, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_clamp(vec3 v, float minVal, float maxVal); + +CGLM_EXPORT +void +glmc_vec3_ortho(vec3 v, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_lerp(vec3 from, vec3 to, float t, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_lerpc(vec3 from, vec3 to, float t, vec3 dest); + +CGLM_INLINE +void +glmc_vec3_mix(vec3 from, vec3 to, float t, vec3 dest) { + glmc_vec3_lerp(from, to, t, dest); +} + +CGLM_INLINE +void +glmc_vec3_mixc(vec3 from, vec3 to, float t, vec3 dest) { + glmc_vec3_lerpc(from, to, t, dest); +} + +CGLM_EXPORT +void +glmc_vec3_step(vec3 edge, vec3 x, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_smoothstep_uni(float edge0, float edge1, vec3 x, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_smoothstep(vec3 edge0, vec3 edge1, vec3 x, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_smoothinterp(vec3 from, vec3 to, float t, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_smoothinterpc(vec3 from, vec3 to, float t, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_swizzle(vec3 v, int mask, vec3 dest); + +/* ext */ + +CGLM_EXPORT +void +glmc_vec3_mulv(vec3 a, vec3 b, vec3 d); + +CGLM_EXPORT +void +glmc_vec3_broadcast(float val, vec3 d); + +CGLM_EXPORT +void +glmc_vec3_fill(vec3 v, float val); + +CGLM_EXPORT +bool +glmc_vec3_eq(vec3 v, float val); + +CGLM_EXPORT +bool +glmc_vec3_eq_eps(vec3 v, float val); + +CGLM_EXPORT +bool +glmc_vec3_eq_all(vec3 v); + +CGLM_EXPORT +bool +glmc_vec3_eqv(vec3 a, vec3 b); + +CGLM_EXPORT +bool +glmc_vec3_eqv_eps(vec3 a, vec3 b); + +CGLM_EXPORT +float +glmc_vec3_max(vec3 v); + +CGLM_EXPORT +float +glmc_vec3_min(vec3 v); + +CGLM_EXPORT +bool +glmc_vec3_isnan(vec3 v); + +CGLM_EXPORT +bool +glmc_vec3_isinf(vec3 v); + +CGLM_EXPORT +bool +glmc_vec3_isvalid(vec3 v); + +CGLM_EXPORT +void +glmc_vec3_sign(vec3 v, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_abs(vec3 v, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_fract(vec3 v, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_floor(vec3 v, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_mods(vec3 v, float s, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_steps(float edge, vec3 x, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_stepr(vec3 edge, float x, vec3 dest); + +CGLM_EXPORT +float +glmc_vec3_hadd(vec3 v); + +CGLM_EXPORT +void +glmc_vec3_sqrt(vec3 v, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_make(const float * __restrict src, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_faceforward(vec3 n, vec3 v, vec3 nref, vec3 dest); + +CGLM_EXPORT +void +glmc_vec3_reflect(vec3 v, vec3 n, vec3 dest); + +CGLM_EXPORT +bool +glmc_vec3_refract(vec3 v, vec3 n, float eta, vec3 dest); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_vec3_h */ diff --git a/external/cglm/call/vec4.h b/external/cglm/call/vec4.h new file mode 100644 index 0000000..22eee24 --- /dev/null +++ b/external/cglm/call/vec4.h @@ -0,0 +1,342 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglmc_vec4_h +#define cglmc_vec4_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cglm.h" + +/* DEPRECATED! use _copy, _ucopy versions */ +#define glmc_vec4_dup3(v, dest) glmc_vec4_copy3(v, dest) +#define glmc_vec4_dup(v, dest) glmc_vec4_copy(v, dest) +#define glmc_vec4_flipsign(v) glmc_vec4_negate(v) +#define glmc_vec4_flipsign_to(v, dest) glmc_vec4_negate_to(v, dest) +#define glmc_vec4_inv(v) glmc_vec4_negate(v) +#define glmc_vec4_inv_to(v, dest) glmc_vec4_negate_to(v, dest) +#define glmc_vec4_step_uni(edge, x, dest) glmc_vec4_steps(edge, x, dest) + +CGLM_EXPORT +void +glmc_vec4(vec3 v3, float last, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_zero(vec4 v); + +CGLM_EXPORT +void +glmc_vec4_one(vec4 v); + +CGLM_EXPORT +void +glmc_vec4_copy3(vec4 v, vec3 dest); + +CGLM_EXPORT +void +glmc_vec4_copy(vec4 v, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_ucopy(vec4 v, vec4 dest); + +CGLM_EXPORT +float +glmc_vec4_dot(vec4 a, vec4 b); + +CGLM_EXPORT +float +glmc_vec4_norm(vec4 v); + +CGLM_EXPORT +float +glmc_vec4_norm2(vec4 v); + +CGLM_EXPORT +float +glmc_vec4_norm_one(vec4 v); + +CGLM_EXPORT +float +glmc_vec4_norm_inf(vec4 v); + +CGLM_EXPORT +void +glmc_vec4_normalize_to(vec4 v, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_normalize(vec4 v); + +CGLM_EXPORT +void +glmc_vec4_add(vec4 a, vec4 b, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_adds(vec4 v, float s, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_sub(vec4 a, vec4 b, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_subs(vec4 v, float s, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_mul(vec4 a, vec4 b, vec4 d); + +CGLM_EXPORT +void +glmc_vec4_scale(vec4 v, float s, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_scale_as(vec4 v, float s, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_div(vec4 a, vec4 b, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_divs(vec4 v, float s, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_addadd(vec4 a, vec4 b, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_subadd(vec4 a, vec4 b, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_muladd(vec4 a, vec4 b, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_muladds(vec4 a, float s, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_maxadd(vec4 a, vec4 b, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_minadd(vec4 a, vec4 b, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_subsub(vec4 a, vec4 b, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_addsub(vec4 a, vec4 b, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_mulsub(vec4 a, vec4 b, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_mulsubs(vec4 a, float s, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_maxsub(vec4 a, vec4 b, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_minsub(vec4 a, vec4 b, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_negate(vec4 v); + +CGLM_EXPORT +void +glmc_vec4_negate_to(vec4 v, vec4 dest); + +CGLM_EXPORT +float +glmc_vec4_distance(vec4 a, vec4 b); + +CGLM_EXPORT +float +glmc_vec4_distance2(vec4 a, vec4 b); + +CGLM_EXPORT +void +glmc_vec4_maxv(vec4 a, vec4 b, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_minv(vec4 a, vec4 b, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_clamp(vec4 v, float minVal, float maxVal); + +CGLM_EXPORT +void +glmc_vec4_lerp(vec4 from, vec4 to, float t, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_lerpc(vec4 from, vec4 to, float t, vec4 dest); + +CGLM_INLINE +void +glmc_vec4_mix(vec4 from, vec4 to, float t, vec4 dest) { + glmc_vec4_lerp(from, to, t, dest); +} + +CGLM_INLINE +void +glmc_vec4_mixc(vec4 from, vec4 to, 
float t, vec4 dest) { + glmc_vec4_lerpc(from, to, t, dest); +} + +CGLM_EXPORT +void +glmc_vec4_step(vec4 edge, vec4 x, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_smoothstep_uni(float edge0, float edge1, vec4 x, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_smoothstep(vec4 edge0, vec4 edge1, vec4 x, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_smoothinterp(vec4 from, vec4 to, float t, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_smoothinterpc(vec4 from, vec4 to, float t, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_cubic(float s, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_swizzle(vec4 v, int mask, vec4 dest); + +/* ext */ + +CGLM_EXPORT +void +glmc_vec4_mulv(vec4 a, vec4 b, vec4 d); + +CGLM_EXPORT +void +glmc_vec4_broadcast(float val, vec4 d); + +CGLM_EXPORT +void +glmc_vec4_fill(vec4 v, float val); + +CGLM_EXPORT +bool +glmc_vec4_eq(vec4 v, float val); + +CGLM_EXPORT +bool +glmc_vec4_eq_eps(vec4 v, float val); + +CGLM_EXPORT +bool +glmc_vec4_eq_all(vec4 v); + +CGLM_EXPORT +bool +glmc_vec4_eqv(vec4 a, vec4 b); + +CGLM_EXPORT +bool +glmc_vec4_eqv_eps(vec4 a, vec4 b); + +CGLM_EXPORT +float +glmc_vec4_max(vec4 v); + +CGLM_EXPORT +float +glmc_vec4_min(vec4 v); + +CGLM_EXPORT +bool +glmc_vec4_isnan(vec4 v); + +CGLM_EXPORT +bool +glmc_vec4_isinf(vec4 v); + +CGLM_EXPORT +bool +glmc_vec4_isvalid(vec4 v); + +CGLM_EXPORT +void +glmc_vec4_sign(vec4 v, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_abs(vec4 v, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_fract(vec4 v, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_floor(vec4 v, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_mods(vec4 v, float s, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_steps(float edge, vec4 x, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_stepr(vec4 edge, float x, vec4 dest); + +CGLM_EXPORT +float +glmc_vec4_hadd(vec4 v); + +CGLM_EXPORT +void +glmc_vec4_sqrt(vec4 v, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_make(const float * __restrict src, vec4 dest); + +CGLM_EXPORT +void +glmc_vec4_reflect(vec4 v, vec4 n, vec4 dest); + +CGLM_EXPORT +bool +glmc_vec4_refract(vec4 v, vec4 n, float eta, vec4 dest); + +#ifdef __cplusplus +} +#endif +#endif /* cglmc_vec4_h */ diff --git a/external/cglm/cam.h b/external/cglm/cam.h new file mode 100644 index 0000000..816cb5e --- /dev/null +++ b/external/cglm/cam.h @@ -0,0 +1,582 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE void glm_frustum(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest) + CGLM_INLINE void glm_ortho(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest) + CGLM_INLINE void glm_ortho_aabb(vec3 box[2], mat4 dest) + CGLM_INLINE void glm_ortho_aabb_p(vec3 box[2], float padding, mat4 dest) + CGLM_INLINE void glm_ortho_aabb_pz(vec3 box[2], float padding, mat4 dest) + CGLM_INLINE void glm_ortho_default(float aspect, mat4 dest) + CGLM_INLINE void glm_ortho_default_s(float aspect, float size, mat4 dest) + CGLM_INLINE void glm_perspective(float fovy, + float aspect, + float nearZ, + float farZ, + mat4 dest) + CGLM_INLINE void glm_perspective_default(float aspect, mat4 dest) + CGLM_INLINE void glm_perspective_resize(float aspect, mat4 proj) + CGLM_INLINE void glm_lookat(vec3 eye, vec3 center, vec3 up, mat4 dest) + CGLM_INLINE void glm_look(vec3 eye, vec3 dir, vec3 up, mat4 dest) + CGLM_INLINE void glm_look_anyup(vec3 eye, vec3 dir, mat4 dest) + CGLM_INLINE void glm_persp_decomp(mat4 proj, + float *nearZ, float *farZ, + float *top, float *bottom, + float *left, float *right) + CGLM_INLINE void glm_persp_decompv(mat4 proj, float dest[6]) + CGLM_INLINE void glm_persp_decomp_x(mat4 proj, float *left, float *right) + CGLM_INLINE void glm_persp_decomp_y(mat4 proj, float *top, float *bottom) + CGLM_INLINE void glm_persp_decomp_z(mat4 proj, float *nearv, float *farv) + CGLM_INLINE void glm_persp_decomp_far(mat4 proj, float *farZ) + CGLM_INLINE void glm_persp_decomp_near(mat4 proj, float *nearZ) + CGLM_INLINE float glm_persp_fovy(mat4 proj) + CGLM_INLINE float glm_persp_aspect(mat4 proj) + CGLM_INLINE void glm_persp_sizes(mat4 proj, float fovy, vec4 dest) + */ + +#ifndef cglm_cam_h +#define cglm_cam_h + +#include "common.h" +#include "plane.h" + +#include "clipspace/persp.h" + +#ifndef CGLM_CLIPSPACE_INCLUDE_ALL +# if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO +# include "clipspace/ortho_lh_zo.h" +# include "clipspace/persp_lh_zo.h" +# include "clipspace/view_lh_zo.h" +# elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO +# include "clipspace/ortho_lh_no.h" +# include "clipspace/persp_lh_no.h" +# include "clipspace/view_lh_no.h" +# elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO +# include "clipspace/ortho_rh_zo.h" +# include "clipspace/persp_rh_zo.h" +# include "clipspace/view_rh_zo.h" +# elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO +# include "clipspace/ortho_rh_no.h" +# include "clipspace/persp_rh_no.h" +# include "clipspace/view_rh_no.h" +# endif +#else +# include "clipspace/ortho_lh_zo.h" +# include "clipspace/persp_lh_zo.h" +# include "clipspace/ortho_lh_no.h" +# include "clipspace/persp_lh_no.h" +# include "clipspace/ortho_rh_zo.h" +# include "clipspace/persp_rh_zo.h" +# include "clipspace/ortho_rh_no.h" +# include "clipspace/persp_rh_no.h" +# include "clipspace/view_lh_zo.h" +# include "clipspace/view_lh_no.h" +# include "clipspace/view_rh_zo.h" +# include "clipspace/view_rh_no.h" +#endif + +/*! 
+ * @brief set up perspective peprojection matrix + * + * @param[in] left viewport.left + * @param[in] right viewport.right + * @param[in] bottom viewport.bottom + * @param[in] top viewport.top + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping plane + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_frustum(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + glm_frustum_lh_zo(left, right, bottom, top, nearZ, farZ, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + glm_frustum_lh_no(left, right, bottom, top, nearZ, farZ, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + glm_frustum_rh_zo(left, right, bottom, top, nearZ, farZ, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + glm_frustum_rh_no(left, right, bottom, top, nearZ, farZ, dest); +#endif +} + +/*! + * @brief set up orthographic projection matrix + * + * @param[in] left viewport.left + * @param[in] right viewport.right + * @param[in] bottom viewport.bottom + * @param[in] top viewport.top + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping plane + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + glm_ortho_lh_zo(left, right, bottom, top, nearZ, farZ, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + glm_ortho_lh_no(left, right, bottom, top, nearZ, farZ, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + glm_ortho_rh_zo(left, right, bottom, top, nearZ, farZ, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + glm_ortho_rh_no(left, right, bottom, top, nearZ, farZ, dest); +#endif +} + +/*! + * @brief set up orthographic projection matrix using bounding box + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_aabb(vec3 box[2], mat4 dest) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + glm_ortho_aabb_lh_zo(box, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + glm_ortho_aabb_lh_no(box, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + glm_ortho_aabb_rh_zo(box, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + glm_ortho_aabb_rh_no(box, dest); +#endif +} + +/*! + * @brief set up orthographic projection matrix using bounding box + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @param[in] padding padding + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_aabb_p(vec3 box[2], float padding, mat4 dest) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + glm_ortho_aabb_p_lh_zo(box, padding, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + glm_ortho_aabb_p_lh_no(box, padding, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + glm_ortho_aabb_p_rh_zo(box, padding, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + glm_ortho_aabb_p_rh_no(box, padding, dest); +#endif +} + +/*! 
+ * @brief set up orthographic projection matrix using bounding box + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @param[in] padding padding for near and far + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_aabb_pz(vec3 box[2], float padding, mat4 dest) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + glm_ortho_aabb_pz_lh_zo(box, padding, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + glm_ortho_aabb_pz_lh_no(box, padding, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + glm_ortho_aabb_pz_rh_zo(box, padding, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + glm_ortho_aabb_pz_rh_no(box, padding, dest); +#endif +} + +/*! + * @brief set up unit orthographic projection matrix + * + * @param[in] aspect aspect ration ( width / height ) + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_default(float aspect, mat4 dest) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + glm_ortho_default_lh_zo(aspect, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + glm_ortho_default_lh_no(aspect, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + glm_ortho_default_rh_zo(aspect, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + glm_ortho_default_rh_no(aspect, dest); +#endif +} + +/*! + * @brief set up orthographic projection matrix with given CUBE size + * + * @param[in] aspect aspect ratio ( width / height ) + * @param[in] size cube size + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_default_s(float aspect, float size, mat4 dest) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + glm_ortho_default_s_lh_zo(aspect, size, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + glm_ortho_default_s_lh_no(aspect, size, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + glm_ortho_default_s_rh_zo(aspect, size, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + glm_ortho_default_s_rh_no(aspect, size, dest); +#endif +} + +/*! + * @brief set up perspective projection matrix + * + * @param[in] fovy field of view angle + * @param[in] aspect aspect ratio ( width / height ) + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping planes + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_perspective(float fovy, float aspect, float nearZ, float farZ, mat4 dest) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + glm_perspective_lh_zo(fovy, aspect, nearZ, farZ, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + glm_perspective_lh_no(fovy, aspect, nearZ, farZ, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + glm_perspective_rh_zo(fovy, aspect, nearZ, farZ, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + glm_perspective_rh_no(fovy, aspect, nearZ, farZ, dest); +#endif +} + +/*! + * @brief extend perspective projection matrix's far distance + * + * this function does not guarantee far >= near, be aware of that! 
+ * + * @param[in, out] proj projection matrix to extend + * @param[in] deltaFar distance from existing far (negative to shink) + */ +CGLM_INLINE +void +glm_persp_move_far(mat4 proj, float deltaFar) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + glm_persp_move_far_lh_zo(proj, deltaFar); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + glm_persp_move_far_lh_no(proj, deltaFar); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + glm_persp_move_far_rh_zo(proj, deltaFar); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + glm_persp_move_far_rh_no(proj, deltaFar); +#endif +} + +/*! + * @brief set up perspective projection matrix with default near/far + * and angle values + * + * @param[in] aspect aspect ratio ( width / height ) + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_perspective_default(float aspect, mat4 dest) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + glm_perspective_default_lh_zo(aspect, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + glm_perspective_default_lh_no(aspect, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + glm_perspective_default_rh_zo(aspect, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + glm_perspective_default_rh_no(aspect, dest); +#endif +} + +/*! + * @brief resize perspective matrix by aspect ratio ( width / height ) + * this makes very easy to resize proj matrix when window /viewport + * reized + * + * @param[in] aspect aspect ratio ( width / height ) + * @param[in, out] proj perspective projection matrix + */ +CGLM_INLINE +void +glm_perspective_resize(float aspect, mat4 proj) { + if (proj[0][0] == 0.0f) + return; + + proj[0][0] = proj[1][1] / aspect; +} + +/*! + * @brief set up view matrix + * + * NOTE: The UP vector must not be parallel to the line of sight from + * the eye point to the reference point + * + * @param[in] eye eye vector + * @param[in] center center vector + * @param[in] up up vector + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_lookat(vec3 eye, vec3 center, vec3 up, mat4 dest) { +#if CGLM_CONFIG_CLIP_CONTROL & CGLM_CLIP_CONTROL_LH_BIT + glm_lookat_lh(eye, center, up, dest); +#elif CGLM_CONFIG_CLIP_CONTROL & CGLM_CLIP_CONTROL_RH_BIT + glm_lookat_rh(eye, center, up, dest); +#endif +} + +/*! + * @brief set up view matrix + * + * convenient wrapper for lookat: if you only have direction not target self + * then this might be useful. Because you need to get target from direction. + * + * NOTE: The UP vector must not be parallel to the line of sight from + * the eye point to the reference point + * + * @param[in] eye eye vector + * @param[in] dir direction vector + * @param[in] up up vector + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_look(vec3 eye, vec3 dir, vec3 up, mat4 dest) { +#if CGLM_CONFIG_CLIP_CONTROL & CGLM_CLIP_CONTROL_LH_BIT + glm_look_lh(eye, dir, up, dest); +#elif CGLM_CONFIG_CLIP_CONTROL & CGLM_CLIP_CONTROL_RH_BIT + glm_look_rh(eye, dir, up, dest); +#endif +} + +/*! 
+ * @brief set up view matrix + * + * convenient wrapper for look: if you only have direction and if you don't + * care what UP vector is then this might be useful to create view matrix + * + * @param[in] eye eye vector + * @param[in] dir direction vector + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_look_anyup(vec3 eye, vec3 dir, mat4 dest) { +#if CGLM_CONFIG_CLIP_CONTROL & CGLM_CLIP_CONTROL_LH_BIT + glm_look_anyup_lh(eye, dir, dest); +#elif CGLM_CONFIG_CLIP_CONTROL & CGLM_CLIP_CONTROL_RH_BIT + glm_look_anyup_rh(eye, dir, dest); +#endif +} + +/*! + * @brief decomposes frustum values of perspective projection. + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + * @param[out] farZ far + * @param[out] top top + * @param[out] bottom bottom + * @param[out] left left + * @param[out] right right + */ +CGLM_INLINE +void +glm_persp_decomp(mat4 proj, + float * __restrict nearZ, float * __restrict farZ, + float * __restrict top, float * __restrict bottom, + float * __restrict left, float * __restrict right) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + glm_persp_decomp_lh_zo(proj, nearZ, farZ, top, bottom, left, right); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + glm_persp_decomp_lh_no(proj, nearZ, farZ, top, bottom, left, right); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + glm_persp_decomp_rh_zo(proj, nearZ, farZ, top, bottom, left, right); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + glm_persp_decomp_rh_no(proj, nearZ, farZ, top, bottom, left, right); +#endif +} + +/*! + * @brief decomposes frustum values of perspective projection. + * this makes easy to get all values at once + * + * @param[in] proj perspective projection matrix + * @param[out] dest array + */ +CGLM_INLINE +void +glm_persp_decompv(mat4 proj, float dest[6]) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + glm_persp_decompv_lh_zo(proj, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + glm_persp_decompv_lh_no(proj, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + glm_persp_decompv_rh_zo(proj, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + glm_persp_decompv_rh_no(proj, dest); +#endif +} + +/*! + * @brief decomposes left and right values of perspective projection. + * x stands for x axis (left / right axis) + * + * @param[in] proj perspective projection matrix + * @param[out] left left + * @param[out] right right + */ +CGLM_INLINE +void +glm_persp_decomp_x(mat4 proj, + float * __restrict left, + float * __restrict right) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + glm_persp_decomp_x_lh_zo(proj, left, right); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + glm_persp_decomp_x_lh_no(proj, left, right); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + glm_persp_decomp_x_rh_zo(proj, left, right); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + glm_persp_decomp_x_rh_no(proj, left, right); +#endif +} + +/*! + * @brief decomposes top and bottom values of perspective projection. 
+ * y stands for y axis (top / bottom axis) + * + * @param[in] proj perspective projection matrix + * @param[out] top top + * @param[out] bottom bottom + */ +CGLM_INLINE +void +glm_persp_decomp_y(mat4 proj, + float * __restrict top, + float * __restrict bottom) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + glm_persp_decomp_y_lh_zo(proj, top, bottom); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + glm_persp_decomp_y_lh_no(proj, top, bottom); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + glm_persp_decomp_y_rh_zo(proj, top, bottom); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + glm_persp_decomp_y_rh_no(proj, top, bottom); +#endif +} + +/*! + * @brief decomposes near and far values of perspective projection. + * z stands for z axis (near / far axis) + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + * @param[out] farZ far + */ +CGLM_INLINE +void +glm_persp_decomp_z(mat4 proj, float * __restrict nearZ, float * __restrict farZ) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + glm_persp_decomp_z_lh_zo(proj, nearZ, farZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + glm_persp_decomp_z_lh_no(proj, nearZ, farZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + glm_persp_decomp_z_rh_zo(proj, nearZ, farZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + glm_persp_decomp_z_rh_no(proj, nearZ, farZ); +#endif +} + +/*! + * @brief decomposes far value of perspective projection. + * + * @param[in] proj perspective projection matrix + * @param[out] farZ far + */ +CGLM_INLINE +void +glm_persp_decomp_far(mat4 proj, float * __restrict farZ) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + glm_persp_decomp_far_lh_zo(proj, farZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + glm_persp_decomp_far_lh_no(proj, farZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + glm_persp_decomp_far_rh_zo(proj, farZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + glm_persp_decomp_far_rh_no(proj, farZ); +#endif +} + +/*! + * @brief decomposes near value of perspective projection. + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + */ +CGLM_INLINE +void +glm_persp_decomp_near(mat4 proj, float * __restrict nearZ) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + glm_persp_decomp_near_lh_zo(proj, nearZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + glm_persp_decomp_near_lh_no(proj, nearZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + glm_persp_decomp_near_rh_zo(proj, nearZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + glm_persp_decomp_near_rh_no(proj, nearZ); +#endif +} + +/*! 
+ * @brief returns sizes of near and far planes of perspective projection + * + * @param[in] proj perspective projection matrix + * @param[in] fovy fovy (see brief) + * @param[out] dest sizes order: [Wnear, Hnear, Wfar, Hfar] + */ +CGLM_INLINE +void +glm_persp_sizes(mat4 proj, float fovy, vec4 dest) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + glm_persp_sizes_lh_zo(proj, fovy, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + glm_persp_sizes_lh_no(proj, fovy, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + glm_persp_sizes_rh_zo(proj, fovy, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + glm_persp_sizes_rh_no(proj, fovy, dest); +#endif +} + +#endif /* cglm_cam_h */ diff --git a/external/cglm/cglm.c b/external/cglm/cglm.c new file mode 100644 index 0000000..6fe88a7 --- /dev/null +++ b/external/cglm/cglm.c @@ -0,0 +1,3 @@ +#define CGLM_FORCE_DEPTH_ZERO_TO_ONE + +#include "cglm.h" diff --git a/external/cglm/cglm.h b/external/cglm/cglm.h new file mode 100644 index 0000000..3532830 --- /dev/null +++ b/external/cglm/cglm.h @@ -0,0 +1,48 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_h +#define cglm_h + +#include "common.h" +#include "vec2.h" +#include "vec3.h" +#include "vec4.h" +#include "ivec2.h" +#include "ivec3.h" +#include "ivec4.h" +#include "mat4.h" +#include "mat4x2.h" +#include "mat4x3.h" +#include "mat3.h" +#include "mat3x2.h" +#include "mat3x4.h" +#include "mat2.h" +#include "mat2x3.h" +#include "mat2x4.h" +#include "affine.h" +#include "cam.h" +#include "frustum.h" +#include "quat.h" +#include "euler.h" +#include "plane.h" +#include "noise.h" +#include "aabb2d.h" +#include "box.h" +#include "color.h" +#include "util.h" +#include "io.h" +#include "project.h" +#include "sphere.h" +#include "ease.h" +#include "curve.h" +#include "bezier.h" +#include "ray.h" +#include "affine2d.h" +#include "affine2d-post.h" + +#endif /* cglm_h */ diff --git a/external/cglm/clipspace/ortho_lh_no.h b/external/cglm/clipspace/ortho_lh_no.h new file mode 100644 index 0000000..76c7a94 --- /dev/null +++ b/external/cglm/clipspace/ortho_lh_no.h @@ -0,0 +1,183 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE void glm_ortho_lh_no(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest) + CGLM_INLINE void glm_ortho_aabb_lh_no(vec3 box[2], mat4 dest) + CGLM_INLINE void glm_ortho_aabb_p_lh_no(vec3 box[2], + float padding, + mat4 dest) + CGLM_INLINE void glm_ortho_aabb_pz_lh_no(vec3 box[2], + float padding, + mat4 dest) + CGLM_INLINE void glm_ortho_default_lh_no(float aspect, + mat4 dest) + CGLM_INLINE void glm_ortho_default_s_lh_no(float aspect, + float size, + mat4 dest) + */ + +#ifndef cglm_ortho_lh_no_h +#define cglm_ortho_lh_no_h + +#include "../common.h" +#include "../plane.h" +#include "../mat4.h" + +/*! + * @brief set up orthographic projection matrix + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. 
+ * + * @param[in] left viewport.left + * @param[in] right viewport.right + * @param[in] bottom viewport.bottom + * @param[in] top viewport.top + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping plane + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_lh_no(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest) { + float rl, tb, fn; + + glm_mat4_zero(dest); + + rl = 1.0f / (right - left); + tb = 1.0f / (top - bottom); + fn =-1.0f / (farZ - nearZ); + + dest[0][0] = 2.0f * rl; + dest[1][1] = 2.0f * tb; + dest[2][2] =-2.0f * fn; + dest[3][0] =-(right + left) * rl; + dest[3][1] =-(top + bottom) * tb; + dest[3][2] = (farZ + nearZ) * fn; + dest[3][3] = 1.0f; +} + +/*! + * @brief set up orthographic projection matrix using bounding box + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_aabb_lh_no(vec3 box[2], mat4 dest) { + glm_ortho_lh_no(box[0][0], box[1][0], + box[0][1], box[1][1], + -box[1][2], -box[0][2], + dest); +} + +/*! + * @brief set up orthographic projection matrix using bounding box + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @param[in] padding padding + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_aabb_p_lh_no(vec3 box[2], float padding, mat4 dest) { + glm_ortho_lh_no(box[0][0] - padding, box[1][0] + padding, + box[0][1] - padding, box[1][1] + padding, + -(box[1][2] + padding), -(box[0][2] - padding), + dest); +} + +/*! + * @brief set up orthographic projection matrix using bounding box + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @param[in] padding padding for near and far + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_aabb_pz_lh_no(vec3 box[2], float padding, mat4 dest) { + glm_ortho_lh_no(box[0][0], box[1][0], + box[0][1], box[1][1], + -(box[1][2] + padding), -(box[0][2] - padding), + dest); +} + +/*! + * @brief set up unit orthographic projection matrix + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] aspect aspect ration ( width / height ) + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_default_lh_no(float aspect, mat4 dest) { + if (aspect >= 1.0f) { + glm_ortho_lh_no(-aspect, aspect, -1.0f, 1.0f, -100.0f, 100.0f, dest); + return; + } + + aspect = 1.0f / aspect; + + glm_ortho_lh_no(-1.0f, 1.0f, -aspect, aspect, -100.0f, 100.0f, dest); +} + +/*! + * @brief set up orthographic projection matrix with given CUBE size + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. 
+ * + * @param[in] aspect aspect ratio ( width / height ) + * @param[in] size cube size + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_default_s_lh_no(float aspect, float size, mat4 dest) { + if (aspect >= 1.0f) { + glm_ortho_lh_no(-size * aspect, + size * aspect, + -size, + size, + -size - 100.0f, + size + 100.0f, + dest); + return; + } + + glm_ortho_lh_no(-size, + size, + -size / aspect, + size / aspect, + -size - 100.0f, + size + 100.0f, + dest); +} + +#endif /*cglm_ortho_lh_no_h*/ diff --git a/external/cglm/clipspace/ortho_lh_zo.h b/external/cglm/clipspace/ortho_lh_zo.h new file mode 100644 index 0000000..e45530d --- /dev/null +++ b/external/cglm/clipspace/ortho_lh_zo.h @@ -0,0 +1,177 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE void glm_ortho_lh_zo(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest) + CGLM_INLINE void glm_ortho_aabb_lh_zo(vec3 box[2], mat4 dest) + CGLM_INLINE void glm_ortho_aabb_p_lh_zo(vec3 box[2], + float padding, + mat4 dest) + CGLM_INLINE void glm_ortho_aabb_pz_lh_zo(vec3 box[2], + float padding, + mat4 dest) + CGLM_INLINE void glm_ortho_default_lh_zo(float aspect, + mat4 dest) + CGLM_INLINE void glm_ortho_default_s_lh_zo(float aspect, + float size, + mat4 dest) + */ + +#ifndef cglm_ortho_lh_zo_h +#define cglm_ortho_lh_zo_h + +#include "../common.h" +#include "../plane.h" +#include "../mat4.h" + +/*! + * @brief set up orthographic projection matrix with a left-hand coordinate + * system and a clip-space of [0, 1]. + * + * @param[in] left viewport.left + * @param[in] right viewport.right + * @param[in] bottom viewport.bottom + * @param[in] top viewport.top + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping plane + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_lh_zo(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest) { + float rl, tb, fn; + + glm_mat4_zero(dest); + + rl = 1.0f / (right - left); + tb = 1.0f / (top - bottom); + fn =-1.0f / (farZ - nearZ); + + dest[0][0] = 2.0f * rl; + dest[1][1] = 2.0f * tb; + dest[2][2] =-fn; + dest[3][0] =-(right + left) * rl; + dest[3][1] =-(top + bottom) * tb; + dest[3][2] = nearZ * fn; + dest[3][3] = 1.0f; +} + +/*! + * @brief set up orthographic projection matrix using bounding box + * with a left-hand coordinate system and a clip-space of [0, 1]. + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_aabb_lh_zo(vec3 box[2], mat4 dest) { + glm_ortho_lh_zo(box[0][0], box[1][0], + box[0][1], box[1][1], + -box[1][2], -box[0][2], + dest); +} + +/*! + * @brief set up orthographic projection matrix using bounding box + * with a left-hand coordinate system and a clip-space of [0, 1]. + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @param[in] padding padding + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_aabb_p_lh_zo(vec3 box[2], float padding, mat4 dest) { + glm_ortho_lh_zo(box[0][0] - padding, box[1][0] + padding, + box[0][1] - padding, box[1][1] + padding, + -(box[1][2] + padding), -(box[0][2] - padding), + dest); +} + +/*! + * @brief set up orthographic projection matrix using bounding box + * with a left-hand coordinate system and a clip-space of [0, 1]. 
+ * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @param[in] padding padding for near and far + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_aabb_pz_lh_zo(vec3 box[2], float padding, mat4 dest) { + glm_ortho_lh_zo(box[0][0], box[1][0], + box[0][1], box[1][1], + -(box[1][2] + padding), -(box[0][2] - padding), + dest); +} + +/*! + * @brief set up unit orthographic projection matrix + * with a left-hand coordinate system and a clip-space of [0, 1]. + * + * @param[in] aspect aspect ration ( width / height ) + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_default_lh_zo(float aspect, mat4 dest) { + if (aspect >= 1.0f) { + glm_ortho_lh_zo(-aspect, aspect, -1.0f, 1.0f, -100.0f, 100.0f, dest); + return; + } + + aspect = 1.0f / aspect; + + glm_ortho_lh_zo(-1.0f, 1.0f, -aspect, aspect, -100.0f, 100.0f, dest); +} + +/*! + * @brief set up orthographic projection matrix with given CUBE size + * with a left-hand coordinate system and a clip-space of [0, 1]. + * + * @param[in] aspect aspect ratio ( width / height ) + * @param[in] size cube size + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_default_s_lh_zo(float aspect, float size, mat4 dest) { + if (aspect >= 1.0f) { + glm_ortho_lh_zo(-size * aspect, + size * aspect, + -size, + size, + -size - 100.0f, + size + 100.0f, + dest); + return; + } + + glm_ortho_lh_zo(-size, + size, + -size / aspect, + size / aspect, + -size - 100.0f, + size + 100.0f, + dest); +} + +#endif /*cglm_ortho_lh_zo_h*/ diff --git a/external/cglm/clipspace/ortho_rh_no.h b/external/cglm/clipspace/ortho_rh_no.h new file mode 100644 index 0000000..aa7a906 --- /dev/null +++ b/external/cglm/clipspace/ortho_rh_no.h @@ -0,0 +1,183 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE void glm_ortho_rh_no(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest) + CGLM_INLINE void glm_ortho_aabb_rh_no(vec3 box[2], mat4 dest) + CGLM_INLINE void glm_ortho_aabb_p_rh_no(vec3 box[2], + float padding, + mat4 dest) + CGLM_INLINE void glm_ortho_aabb_pz_rh_no(vec3 box[2], + float padding, + mat4 dest) + CGLM_INLINE void glm_ortho_default_rh_no(float aspect, + mat4 dest) + CGLM_INLINE void glm_ortho_default_s_rh_no(float aspect, + float size, + mat4 dest) + */ + +#ifndef cglm_ortho_rh_no_h +#define cglm_ortho_rh_no_h + +#include "../common.h" +#include "../plane.h" +#include "../mat4.h" + +/*! + * @brief set up orthographic projection matrix + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] left viewport.left + * @param[in] right viewport.right + * @param[in] bottom viewport.bottom + * @param[in] top viewport.top + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping plane + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_rh_no(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest) { + float rl, tb, fn; + + glm_mat4_zero(dest); + + rl = 1.0f / (right - left); + tb = 1.0f / (top - bottom); + fn =-1.0f / (farZ - nearZ); + + dest[0][0] = 2.0f * rl; + dest[1][1] = 2.0f * tb; + dest[2][2] = 2.0f * fn; + dest[3][0] =-(right + left) * rl; + dest[3][1] =-(top + bottom) * tb; + dest[3][2] = (farZ + nearZ) * fn; + dest[3][3] = 1.0f; +} + +/*! 
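/* Sketch contrasting the [-1, 1] and [0, 1] left-handed ortho variants above:
   the same near-plane point lands at clip z = -1 with glm_ortho_lh_no and at
   clip z = 0 with glm_ortho_lh_zo. Include paths and numbers are assumptions. */
#include <stdio.h>
#include "cglm/clipspace/ortho_lh_no.h"
#include "cglm/clipspace/ortho_lh_zo.h"

int main(void) {
  mat4 no, zo;
  vec4 p = {0.0f, 0.0f, 0.1f, 1.0f};   /* on the near plane (LH view space) */
  vec4 a, b;

  glm_ortho_lh_no(-1.0f, 1.0f, -1.0f, 1.0f, 0.1f, 100.0f, no);
  glm_ortho_lh_zo(-1.0f, 1.0f, -1.0f, 1.0f, 0.1f, 100.0f, zo);

  glm_mat4_mulv(no, p, a);
  glm_mat4_mulv(zo, p, b);

  /* expected: -1 for the NO matrix, 0 for the ZO matrix (w stays 1 for ortho) */
  printf("NO near z = %f, ZO near z = %f\n", a[2], b[2]);
  return 0;
}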
+ * @brief set up orthographic projection matrix using bounding box + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_aabb_rh_no(vec3 box[2], mat4 dest) { + glm_ortho_rh_no(box[0][0], box[1][0], + box[0][1], box[1][1], + -box[1][2], -box[0][2], + dest); +} + +/*! + * @brief set up orthographic projection matrix using bounding box + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @param[in] padding padding + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_aabb_p_rh_no(vec3 box[2], float padding, mat4 dest) { + glm_ortho_rh_no(box[0][0] - padding, box[1][0] + padding, + box[0][1] - padding, box[1][1] + padding, + -(box[1][2] + padding), -(box[0][2] - padding), + dest); +} + +/*! + * @brief set up orthographic projection matrix using bounding box + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @param[in] padding padding for near and far + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_aabb_pz_rh_no(vec3 box[2], float padding, mat4 dest) { + glm_ortho_rh_no(box[0][0], box[1][0], + box[0][1], box[1][1], + -(box[1][2] + padding), -(box[0][2] - padding), + dest); +} + +/*! + * @brief set up unit orthographic projection matrix + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] aspect aspect ration ( width / height ) + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_default_rh_no(float aspect, mat4 dest) { + if (aspect >= 1.0f) { + glm_ortho_rh_no(-aspect, aspect, -1.0f, 1.0f, -100.0f, 100.0f, dest); + return; + } + + aspect = 1.0f / aspect; + + glm_ortho_rh_no(-1.0f, 1.0f, -aspect, aspect, -100.0f, 100.0f, dest); +} + +/*! + * @brief set up orthographic projection matrix with given CUBE size + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] aspect aspect ratio ( width / height ) + * @param[in] size cube size + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_default_s_rh_no(float aspect, float size, mat4 dest) { + if (aspect >= 1.0f) { + glm_ortho_rh_no(-size * aspect, + size * aspect, + -size, + size, + -size - 100.0f, + size + 100.0f, + dest); + return; + } + + glm_ortho_rh_no(-size, + size, + -size / aspect, + size / aspect, + -size - 100.0f, + size + 100.0f, + dest); +} + +#endif /*cglm_ortho_rh_no_h*/ diff --git a/external/cglm/clipspace/ortho_rh_zo.h b/external/cglm/clipspace/ortho_rh_zo.h new file mode 100644 index 0000000..7a0876c --- /dev/null +++ b/external/cglm/clipspace/ortho_rh_zo.h @@ -0,0 +1,181 @@ +/* + * Copyright (c), Recep Aslantas. 
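/* A hedged sketch of the AABB helpers above; fitting an orthographic
   projection to a view-space bounding box is the typical shadow-map use.
   The box extents, the padding and the include path are illustrative
   assumptions. */
#include <stdio.h>
#include "cglm/clipspace/ortho_rh_no.h"

int main(void) {
  /* caster bounds, already transformed into right-handed view space
     (visible geometry sits at negative z) */
  vec3 box[2] = { {-4.0f, -2.0f, -30.0f},    /* min corner */
                  { 4.0f,  2.0f,  -1.0f} };  /* max corner */
  mat4 proj;

  /* tight fit around the box ... */
  glm_ortho_aabb_rh_no(box, proj);

  /* ... or the same fit with a 0.5 unit margin on every axis */
  glm_ortho_aabb_p_rh_no(box, 0.5f, proj);

  printf("m00 = %f, m11 = %f, m22 = %f\n", proj[0][0], proj[1][1], proj[2][2]);
  return 0;
}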
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE void glm_ortho_rh_zo(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest) + CGLM_INLINE void glm_ortho_aabb_rh_zo(vec3 box[2], mat4 dest) + CGLM_INLINE void glm_ortho_aabb_p_rh_zo(vec3 box[2], + float padding, + mat4 dest) + CGLM_INLINE void glm_ortho_aabb_pz_rh_zo(vec3 box[2], + float padding, + mat4 dest) + CGLM_INLINE void glm_ortho_default_rh_zo(float aspect, + mat4 dest) + CGLM_INLINE void glm_ortho_default_s_rh_zo(float aspect, + float size, + mat4 dest) + */ + +#ifndef cglm_ortho_rh_zo_h +#define cglm_ortho_rh_zo_h + +#include "../common.h" +#include "../plane.h" +#include "../mat4.h" + +/*! + * @brief set up orthographic projection matrix with a right-hand coordinate + * system and a clip-space of [0, 1]. + * + * @param[in] left viewport.left + * @param[in] right viewport.right + * @param[in] bottom viewport.bottom + * @param[in] top viewport.top + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping plane + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_rh_zo(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest) { + float rl, tb, fn; + + glm_mat4_zero(dest); + + rl = 1.0f / (right - left); + tb = 1.0f / (top - bottom); + fn =-1.0f / (farZ - nearZ); + + dest[0][0] = 2.0f * rl; + dest[1][1] = 2.0f * tb; + dest[2][2] = fn; + dest[3][0] =-(right + left) * rl; + dest[3][1] =-(top + bottom) * tb; + dest[3][2] = nearZ * fn; + dest[3][3] = 1.0f; +} + +/*! + * @brief set up orthographic projection matrix using bounding box + * with a right-hand coordinate system and a clip-space with depth + * values from zero to one. + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_aabb_rh_zo(vec3 box[2], mat4 dest) { + glm_ortho_rh_zo(box[0][0], box[1][0], + box[0][1], box[1][1], + -box[1][2], -box[0][2], + dest); +} + +/*! + * @brief set up orthographic projection matrix using bounding box + * with a right-hand coordinate system and a clip-space with depth + * values from zero to one. + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @param[in] padding padding + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_aabb_p_rh_zo(vec3 box[2], float padding, mat4 dest) { + glm_ortho_rh_zo(box[0][0] - padding, box[1][0] + padding, + box[0][1] - padding, box[1][1] + padding, + -(box[1][2] + padding), -(box[0][2] - padding), + dest); +} + +/*! + * @brief set up orthographic projection matrix using bounding box + * with a right-hand coordinate system and a clip-space with depth + * values from zero to one. + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @param[in] padding padding for near and far + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_aabb_pz_rh_zo(vec3 box[2], float padding, mat4 dest) { + glm_ortho_rh_zo(box[0][0], box[1][0], + box[0][1], box[1][1], + -(box[1][2] + padding), -(box[0][2] - padding), + dest); +} + +/*! + * @brief set up unit orthographic projection matrix with a right-hand + * coordinate system and a clip-space of [0, 1]. 
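/* Same idea for the right-handed [0, 1] variant defined above: with this
   convention the camera looks down -Z and the near/far planes map to 0/1.
   The numeric planes and the include path are assumptions. */
#include <stdio.h>
#include "cglm/clipspace/ortho_rh_zo.h"

int main(void) {
  mat4 proj;
  vec4 near_pt = {0.0f, 0.0f,   -0.1f, 1.0f};
  vec4 far_pt  = {0.0f, 0.0f, -100.0f, 1.0f};
  vec4 a, b;

  glm_ortho_rh_zo(-1.0f, 1.0f, -1.0f, 1.0f, 0.1f, 100.0f, proj);

  glm_mat4_mulv(proj, near_pt, a);
  glm_mat4_mulv(proj, far_pt,  b);

  /* expected: near plane -> 0, far plane -> 1 */
  printf("near z = %f, far z = %f\n", a[2], b[2]);
  return 0;
}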
+ * + * @param[in] aspect aspect ration ( width / height ) + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_default_rh_zo(float aspect, mat4 dest) { + if (aspect >= 1.0f) { + glm_ortho_rh_zo(-aspect, aspect, -1.0f, 1.0f, -100.0f, 100.0f, dest); + return; + } + + aspect = 1.0f / aspect; + + glm_ortho_rh_zo(-1.0f, 1.0f, -aspect, aspect, -100.0f, 100.0f, dest); +} + +/*! + * @brief set up orthographic projection matrix with given CUBE size + * with a right-hand coordinate system and a clip-space with depth + * values from zero to one. + * + * @param[in] aspect aspect ratio ( width / height ) + * @param[in] size cube size + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_ortho_default_s_rh_zo(float aspect, float size, mat4 dest) { + if (aspect >= 1.0f) { + glm_ortho_rh_zo(-size * aspect, + size * aspect, + -size, + size, + -size - 100.0f, + size + 100.0f, + dest); + return; + } + + glm_ortho_rh_zo(-size, + size, + -size / aspect, + size / aspect, + -size - 100.0f, + size + 100.0f, + dest); +} + +#endif /*cglm_ortho_rh_zo_h*/ diff --git a/external/cglm/clipspace/persp.h b/external/cglm/clipspace/persp.h new file mode 100644 index 0000000..15aa715 --- /dev/null +++ b/external/cglm/clipspace/persp.h @@ -0,0 +1,48 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE void glm_persp_decomp_far(mat4 proj, float *farZ) + CGLM_INLINE float glm_persp_fovy(mat4 proj) + CGLM_INLINE float glm_persp_aspect(mat4 proj) + CGLM_INLINE void glm_persp_sizes(mat4 proj, float fovy, vec4 dest) + */ + +#ifndef cglm_persp_h +#define cglm_persp_h + +#include "../common.h" +#include "../plane.h" +#include "../mat4.h" + +/*! + * @brief returns field of view angle along the Y-axis (in radians) + * + * if you need to degrees, use glm_deg to convert it or use this: + * fovy_deg = glm_deg(glm_persp_fovy(projMatrix)) + * + * @param[in] proj perspective projection matrix + */ +CGLM_INLINE +float +glm_persp_fovy(mat4 proj) { + return 2.0f * atanf(1.0f / proj[1][1]); +} + +/*! + * @brief returns aspect ratio of perspective projection + * + * @param[in] proj perspective projection matrix + */ +CGLM_INLINE +float +glm_persp_aspect(mat4 proj) { + return proj[1][1] / proj[0][0]; +} + +#endif /* cglm_persp_h */ diff --git a/external/cglm/clipspace/persp_lh_no.h b/external/cglm/clipspace/persp_lh_no.h new file mode 100644 index 0000000..d28923a --- /dev/null +++ b/external/cglm/clipspace/persp_lh_no.h @@ -0,0 +1,395 @@ +/* + * Copyright (c), Recep Aslantas. 
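/* The two helpers in persp.h recover fovy and aspect from any of the
   perspective matrices built later in this header set. A small round-trip
   sketch; the choice of builder and the include paths are assumptions. */
#include <stdio.h>
#include "cglm/clipspace/persp.h"
#include "cglm/clipspace/persp_rh_no.h"

int main(void) {
  mat4  proj;
  float fovy   = GLM_PI_4f;        /* 45 degrees */
  float aspect = 16.0f / 9.0f;

  glm_perspective_rh_no(fovy, aspect, 0.1f, 100.0f, proj);

  /* glm_persp_fovy inverts m11, glm_persp_aspect uses m11 / m00 */
  printf("fovy   = %f (expected %f)\n", glm_persp_fovy(proj),   fovy);
  printf("aspect = %f (expected %f)\n", glm_persp_aspect(proj), aspect);
  return 0;
}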
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE void glm_frustum_lh_no(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest) + CGLM_INLINE void glm_perspective_lh_no(float fovy, + float aspect, + float nearZ, + float farZ, + mat4 dest) + CGLM_INLINE void glm_perspective_default_lh_no(float aspect, mat4 dest) + CGLM_INLINE void glm_perspective_resize_lh_no(float aspect, mat4 proj) + CGLM_INLINE void glm_persp_move_far_lh_no(mat4 proj, + float deltaFar) + CGLM_INLINE void glm_persp_decomp_lh_no(mat4 proj, + float * __restrict nearZ, + float * __restrict farZ, + float * __restrict top, + float * __restrict bottom, + float * __restrict left, + float * __restrict right) + CGLM_INLINE void glm_persp_decompv_lh_no(mat4 proj, + float dest[6]) + CGLM_INLINE void glm_persp_decomp_x_lh_no(mat4 proj, + float * __restrict left, + float * __restrict right) + CGLM_INLINE void glm_persp_decomp_y_lh_no(mat4 proj, + float * __restrict top, + float * __restrict bottom) + CGLM_INLINE void glm_persp_decomp_z_lh_no(mat4 proj, + float * __restrict nearZ, + float * __restrict farZ) + CGLM_INLINE void glm_persp_decomp_far_lh_no(mat4 proj, float * __restrict farZ) + CGLM_INLINE void glm_persp_decomp_near_lh_no(mat4 proj, float * __restrict nearZ) + CGLM_INLINE void glm_persp_sizes_lh_no(mat4 proj, float fovy, vec4 dest) + */ + +#ifndef cglm_persp_lh_no_h +#define cglm_persp_lh_no_h + +#include "../common.h" +#include "persp.h" + +/*! + * @brief set up perspective peprojection matrix + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] left viewport.left + * @param[in] right viewport.right + * @param[in] bottom viewport.bottom + * @param[in] top viewport.top + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping plane + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_frustum_lh_no(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest) { + float rl, tb, fn, nv; + + glm_mat4_zero(dest); + + rl = 1.0f / (right - left); + tb = 1.0f / (top - bottom); + fn =-1.0f / (farZ - nearZ); + nv = 2.0f * nearZ; + + dest[0][0] = nv * rl; + dest[1][1] = nv * tb; + dest[2][0] = (right + left) * rl; + dest[2][1] = (top + bottom) * tb; + dest[2][2] =-(farZ + nearZ) * fn; + dest[2][3] = 1.0f; + dest[3][2] = farZ * nv * fn; +} + +/*! + * @brief set up perspective projection matrix + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] fovy field of view angle + * @param[in] aspect aspect ratio ( width / height ) + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping planes + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_perspective_lh_no(float fovy, + float aspect, + float nearZ, + float farZ, + mat4 dest) { + float f, fn; + + glm_mat4_zero(dest); + + f = 1.0f / tanf(fovy * 0.5f); + fn = 1.0f / (nearZ - farZ); + + dest[0][0] = f / aspect; + dest[1][1] = f; + dest[2][2] =-(nearZ + farZ) * fn; + dest[2][3] = 1.0f; + dest[3][2] = 2.0f * nearZ * farZ * fn; + +} + +/*! + * @brief set up perspective projection matrix with default near/far + * and angle values with a left-hand coordinate system and a + * clip-space of [-1, 1]. 
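/* Sketch relating the two builders above: glm_perspective_lh_no is the
   symmetric case of glm_frustum_lh_no with top = nearZ * tan(fovy/2) and
   right = top * aspect, so both calls below should produce the same matrix.
   The numbers and include path are assumptions. */
#include <math.h>
#include <stdio.h>
#include "cglm/clipspace/persp_lh_no.h"

int main(void) {
  mat4  a, b;
  float fovy   = GLM_PI_4f;
  float aspect = 16.0f / 9.0f;
  float nearZ  = 0.1f, farZ = 100.0f;
  float top, right;

  glm_perspective_lh_no(fovy, aspect, nearZ, farZ, a);

  /* the same symmetric frustum, spelled out with explicit plane positions */
  top   = nearZ * tanf(fovy * 0.5f);
  right = top * aspect;
  glm_frustum_lh_no(-right, right, -top, top, nearZ, farZ, b);

  printf("m00: %f vs %f\n", a[0][0], b[0][0]);
  printf("m11: %f vs %f\n", a[1][1], b[1][1]);
  printf("m22: %f vs %f\n", a[2][2], b[2][2]);
  return 0;
}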
+ * + * @param[in] aspect aspect ratio ( width / height ) + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_perspective_default_lh_no(float aspect, mat4 dest) { + glm_perspective_lh_no(GLM_PI_4f, aspect, 0.01f, 100.0f, dest); +} + +/*! + * @brief resize perspective matrix by aspect ratio ( width / height ) + * this makes very easy to resize proj matrix when window /viewport + * resized with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] aspect aspect ratio ( width / height ) + * @param[in, out] proj perspective projection matrix + */ +CGLM_INLINE +void +glm_perspective_resize_lh_no(float aspect, mat4 proj) { + if (proj[0][0] == 0.0f) + return; + + proj[0][0] = proj[1][1] / aspect; +} + +/*! + * @brief extend perspective projection matrix's far distance + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * this function does not guarantee far >= near, be aware of that! + * + * @param[in, out] proj projection matrix to extend + * @param[in] deltaFar distance from existing far (negative to shink) + */ +CGLM_INLINE +void +glm_persp_move_far_lh_no(mat4 proj, float deltaFar) { + float fn, farZ, nearZ, p22, p32; + + p22 = -proj[2][2]; + p32 = proj[3][2]; + + nearZ = p32 / (p22 - 1.0f); + farZ = p32 / (p22 + 1.0f) + deltaFar; + fn = 1.0f / (nearZ - farZ); + + proj[2][2] = -(farZ + nearZ) * fn; + proj[3][2] = 2.0f * nearZ * farZ * fn; +} + +/*! + * @brief decomposes frustum values of perspective projection + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + * @param[out] farZ far + * @param[out] top top + * @param[out] bottom bottom + * @param[out] left left + * @param[out] right right + */ +CGLM_INLINE +void +glm_persp_decomp_lh_no(mat4 proj, + float * __restrict nearZ, float * __restrict farZ, + float * __restrict top, float * __restrict bottom, + float * __restrict left, float * __restrict right) { + float m00, m11, m20, m21, m22, m32, n, f; + float n_m11, n_m00; + + m00 = proj[0][0]; + m11 = proj[1][1]; + m20 = proj[2][0]; + m21 = proj[2][1]; + m22 =-proj[2][2]; + m32 = proj[3][2]; + + n = m32 / (m22 - 1.0f); + f = m32 / (m22 + 1.0f); + + n_m11 = n / m11; + n_m00 = n / m00; + + *nearZ = n; + *farZ = f; + *bottom = n_m11 * (m21 - 1.0f); + *top = n_m11 * (m21 + 1.0f); + *left = n_m00 * (m20 - 1.0f); + *right = n_m00 * (m20 + 1.0f); +} + +/*! + * @brief decomposes frustum values of perspective projection + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * this makes easy to get all values at once + * + * @param[in] proj perspective projection matrix + * @param[out] dest array + */ +CGLM_INLINE +void +glm_persp_decompv_lh_no(mat4 proj, float dest[6]) { + glm_persp_decomp_lh_no(proj, &dest[0], &dest[1], &dest[2], + &dest[3], &dest[4], &dest[5]); +} + +/*! + * @brief decomposes left and right values of perspective projection + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * x stands for x axis (left / right axis) + * + * @param[in] proj perspective projection matrix + * @param[out] left left + * @param[out] right right + */ +CGLM_INLINE +void +glm_persp_decomp_x_lh_no(mat4 proj, + float * __restrict left, + float * __restrict right) { + float nearZ, m20, m00, m22; + + m00 = proj[0][0]; + m20 = proj[2][0]; + m22 =-proj[2][2]; + + nearZ = proj[3][2] / (m22 - 1.0f); + *left = nearZ * (m20 - 1.0f) / m00; + *right = nearZ * (m20 + 1.0f) / m00; +} + +/*! 
+ * @brief decomposes top and bottom values of perspective projection + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * y stands for y axis (top / bottom axis) + * + * @param[in] proj perspective projection matrix + * @param[out] top top + * @param[out] bottom bottom + */ +CGLM_INLINE +void +glm_persp_decomp_y_lh_no(mat4 proj, + float * __restrict top, + float * __restrict bottom) { + float nearZ, m21, m11, m22; + + m21 = proj[2][1]; + m11 = proj[1][1]; + m22 =-proj[2][2]; + + nearZ = proj[3][2] / (m22 - 1.0f); + *bottom = nearZ * (m21 - 1.0f) / m11; + *top = nearZ * (m21 + 1.0f) / m11; +} + +/*! + * @brief decomposes near and far values of perspective projection + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * z stands for z axis (near / far axis) + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + * @param[out] farZ far + */ +CGLM_INLINE +void +glm_persp_decomp_z_lh_no(mat4 proj, + float * __restrict nearZ, + float * __restrict farZ) { + float m32, m22; + + m32 = proj[3][2]; + m22 =-proj[2][2]; + + *nearZ = m32 / (m22 - 1.0f); + *farZ = m32 / (m22 + 1.0f); +} + +/*! + * @brief decomposes far value of perspective projection + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] farZ far + */ +CGLM_INLINE +void +glm_persp_decomp_far_lh_no(mat4 proj, float * __restrict farZ) { + *farZ = proj[3][2] / (-proj[2][2] + 1.0f); +} + +/*! + * @brief decomposes near value of perspective projection + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + */ +CGLM_INLINE +void +glm_persp_decomp_near_lh_no(mat4 proj, float * __restrict nearZ) { + *nearZ = proj[3][2] / (-proj[2][2] - 1.0f); +} + +/*! + * @brief returns sizes of near and far planes of perspective projection + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] proj perspective projection matrix + * @param[in] fovy fovy (see brief) + * @param[out] dest sizes order: [Wnear, Hnear, Wfar, Hfar] + */ +CGLM_INLINE +void +glm_persp_sizes_lh_no(mat4 proj, float fovy, vec4 dest) { + float t, a, nearZ, farZ; + + t = 2.0f * tanf(fovy * 0.5f); + a = glm_persp_aspect(proj); + + glm_persp_decomp_z_lh_no(proj, &nearZ, &farZ); + + dest[1] = t * nearZ; + dest[3] = t * farZ; + dest[0] = a * dest[1]; + dest[2] = a * dest[3]; +} + +/*! + * @brief returns field of view angle along the Y-axis (in radians) + * with a left-hand coordinate system and a clip-space of [-1, 1]. + * + * if you need to degrees, use glm_deg to convert it or use this: + * fovy_deg = glm_deg(glm_persp_fovy(projMatrix)) + * + * @param[in] proj perspective projection matrix + */ +CGLM_INLINE +float +glm_persp_fovy_lh_no(mat4 proj) { + return glm_persp_fovy(proj); +} + +/*! + * @brief returns aspect ratio of perspective projection + * with a left-hand coordinate system and a clip-space of [-1, 1]. + * + * @param[in] proj perspective projection matrix + */ +CGLM_INLINE +float +glm_persp_aspect_lh_no(mat4 proj) { + return glm_persp_aspect(proj); +} + +#endif /*cglm_cam_lh_no_h*/ diff --git a/external/cglm/clipspace/persp_lh_zo.h b/external/cglm/clipspace/persp_lh_zo.h new file mode 100644 index 0000000..de89643 --- /dev/null +++ b/external/cglm/clipspace/persp_lh_zo.h @@ -0,0 +1,387 @@ +/* + * Copyright (c), Recep Aslantas. 
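/* Round-trip sketch for the decomposition helpers above: build a projection,
   pull the frustum planes back out, and feed them into glm_frustum_lh_no to
   reproduce the matrix. Only the numbers and the include path are assumptions. */
#include <stdio.h>
#include "cglm/clipspace/persp_lh_no.h"

int main(void) {
  mat4  proj;
  float nearZ, farZ, top, bottom, left, right;

  glm_perspective_lh_no(GLM_PI_4f, 16.0f / 9.0f, 0.1f, 100.0f, proj);

  /* note the argument order: near, far, top, bottom, left, right */
  glm_persp_decomp_lh_no(proj, &nearZ, &farZ, &top, &bottom, &left, &right);
  printf("near = %f, far = %f\n", nearZ, farZ);
  printf("l = %f, r = %f, b = %f, t = %f\n", left, right, bottom, top);

  /* rebuilding from the decomposed planes gives the original matrix back */
  glm_frustum_lh_no(left, right, bottom, top, nearZ, farZ, proj);
  return 0;
}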
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE void glm_frustum_lh_zo(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest) + CGLM_INLINE void glm_perspective_lh_zo(float fovy, + float aspect, + float nearZ, + float farZ, + mat4 dest) + CGLM_INLINE void glm_perspective_default_lh_zo(float aspect, mat4 dest) + CGLM_INLINE void glm_perspective_resize_lh_zo(float aspect, mat4 proj) + CGLM_INLINE void glm_persp_move_far_lh_zo(mat4 proj, + float deltaFar) + CGLM_INLINE void glm_persp_decomp_lh_zo(mat4 proj, + float * __restrict nearZ, + float * __restrict farZ, + float * __restrict top, + float * __restrict bottom, + float * __restrict left, + float * __restrict right) + CGLM_INLINE void glm_persp_decompv_lh_zo(mat4 proj, + float dest[6]) + CGLM_INLINE void glm_persp_decomp_x_lh_zo(mat4 proj, + float * __restrict left, + float * __restrict right) + CGLM_INLINE void glm_persp_decomp_y_lh_zo(mat4 proj, + float * __restrict top, + float * __restrict bottom) + CGLM_INLINE void glm_persp_decomp_z_lh_zo(mat4 proj, + float * __restrict nearZ, + float * __restrict farZ) + CGLM_INLINE void glm_persp_decomp_far_lh_zo(mat4 proj, float * __restrict farZ) + CGLM_INLINE void glm_persp_decomp_near_lh_zo(mat4 proj, float * __restrict nearZ) + CGLM_INLINE void glm_persp_sizes_lh_zo(mat4 proj, float fovy, vec4 dest) + */ + +#ifndef cglm_persp_lh_zo_h +#define cglm_persp_lh_zo_h + +#include "../common.h" +#include "persp.h" + +/*! + * @brief set up perspective peprojection matrix with a left-hand coordinate + * system and a clip-space of [0, 1]. + * + * @param[in] left viewport.left + * @param[in] right viewport.right + * @param[in] bottom viewport.bottom + * @param[in] top viewport.top + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping plane + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_frustum_lh_zo(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest) { + float rl, tb, fn, nv; + + glm_mat4_zero(dest); + + rl = 1.0f / (right - left); + tb = 1.0f / (top - bottom); + fn =-1.0f / (farZ - nearZ); + nv = 2.0f * nearZ; + + dest[0][0] = nv * rl; + dest[1][1] = nv * tb; + dest[2][0] = (right + left) * rl; + dest[2][1] = (top + bottom) * tb; + dest[2][2] =-farZ * fn; + dest[2][3] = 1.0f; + dest[3][2] = farZ * nearZ * fn; +} + +/*! + * @brief set up perspective projection matrix with a left-hand coordinate + * system and a clip-space of [0, 1]. + * + * @param[in] fovy field of view angle + * @param[in] aspect aspect ratio ( width / height ) + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping planes + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_perspective_lh_zo(float fovy, + float aspect, + float nearZ, + float farZ, + mat4 dest) { + float f, fn; + + glm_mat4_zero(dest); + + f = 1.0f / tanf(fovy * 0.5f); + fn = 1.0f / (nearZ - farZ); + + dest[0][0] = f / aspect; + dest[1][1] = f; + dest[2][2] =-farZ * fn; + dest[2][3] = 1.0f; + dest[3][2] = nearZ * farZ * fn; +} + +/*! + * @brief extend perspective projection matrix's far distance with a + * left-hand coordinate system and a clip-space with depth values + * from zero to one. + * + * this function does not guarantee far >= near, be aware of that! 
+ * + * @param[in, out] proj projection matrix to extend + * @param[in] deltaFar distance from existing far (negative to shink) + */ +CGLM_INLINE +void +glm_persp_move_far_lh_zo(mat4 proj, float deltaFar) { + float fn, farZ, nearZ, p22, p32; + + p22 = -proj[2][2]; + p32 = proj[3][2]; + + nearZ = p32 / p22; + farZ = p32 / (p22 + 1.0f) + deltaFar; + fn = 1.0f / (nearZ - farZ); + + proj[2][2] = -farZ * fn; + proj[3][2] = nearZ * farZ * fn; +} + +/*! + * @brief set up perspective projection matrix with default near/far + * and angle values with a left-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] aspect aspect ratio ( width / height ) + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_perspective_default_lh_zo(float aspect, mat4 dest) { + glm_perspective_lh_zo(GLM_PI_4f, aspect, 0.01f, 100.0f, dest); +} + +/*! + * @brief resize perspective matrix by aspect ratio ( width / height ) + * this makes very easy to resize proj matrix when window /viewport + * reized + * + * @param[in] aspect aspect ratio ( width / height ) + * @param[in, out] proj perspective projection matrix + */ +CGLM_INLINE +void +glm_perspective_resize_lh_zo(float aspect, mat4 proj) { + if (proj[0][0] == 0.0f) + return; + + proj[0][0] = proj[1][1] / aspect; +} + +/*! + * @brief decomposes frustum values of perspective projection + * with angle values with a left-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + * @param[out] farZ far + * @param[out] top top + * @param[out] bottom bottom + * @param[out] left left + * @param[out] right right + */ +CGLM_INLINE +void +glm_persp_decomp_lh_zo(mat4 proj, + float * __restrict nearZ, float * __restrict farZ, + float * __restrict top, float * __restrict bottom, + float * __restrict left, float * __restrict right) { + float m00, m11, m20, m21, m22, m32, n, f; + float n_m11, n_m00; + + m00 = proj[0][0]; + m11 = proj[1][1]; + m20 = proj[2][0]; + m21 = proj[2][1]; + m22 =-proj[2][2]; + m32 = proj[3][2]; + + n = m32 / m22; + f = m32 / (m22 + 1.0f); + + n_m11 = n / m11; + n_m00 = n / m00; + + *nearZ = n; + *farZ = f; + *bottom = n_m11 * (m21 - 1.0f); + *top = n_m11 * (m21 + 1.0f); + *left = n_m00 * (m20 - 1.0f); + *right = n_m00 * (m20 + 1.0f); +} + +/*! + * @brief decomposes frustum values of perspective projection + * with angle values with a left-hand coordinate system and a + * clip-space of [0, 1]. + * this makes easy to get all values at once + * + * @param[in] proj perspective projection matrix + * @param[out] dest array + */ +CGLM_INLINE +void +glm_persp_decompv_lh_zo(mat4 proj, float dest[6]) { + glm_persp_decomp_lh_zo(proj, &dest[0], &dest[1], &dest[2], + &dest[3], &dest[4], &dest[5]); +} + +/*! + * @brief decomposes left and right values of perspective projection (ZO). + * x stands for x axis (left / right axis) + * + * @param[in] proj perspective projection matrix + * @param[out] left left + * @param[out] right right + */ +CGLM_INLINE +void +glm_persp_decomp_x_lh_zo(mat4 proj, + float * __restrict left, + float * __restrict right) { + float nearZ, m20, m00; + + m00 = proj[0][0]; + m20 = proj[2][0]; + + nearZ = proj[3][2] / (proj[3][3]); + *left = nearZ * (m20 - 1.0f) / m00; + *right = nearZ * (m20 + 1.0f) / m00; +} + +/*! + * @brief decomposes top and bottom values of perspective projection + * with angle values with a left-hand coordinate system and a + * clip-space of [0, 1]. 
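/* A sketch of glm_persp_move_far_lh_zo above: it rewrites only the two depth
   terms, so the far plane can be pushed out without rebuilding the matrix.
   The numbers (0.1 near, 100 far, +400 delta) and include path are assumptions. */
#include <stdio.h>
#include "cglm/clipspace/persp_lh_zo.h"

int main(void) {
  mat4  proj;
  float nearZ, farZ;

  glm_perspective_lh_zo(GLM_PI_4f, 16.0f / 9.0f, 0.1f, 100.0f, proj);

  /* extend the far plane by 400 units in place */
  glm_persp_move_far_lh_zo(proj, 400.0f);

  glm_persp_decomp_z_lh_zo(proj, &nearZ, &farZ);
  printf("near = %f, far = %f\n", nearZ, farZ);   /* expected: 0.1 and 500 */
  return 0;
}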
+ * y stands for y axis (top / bottom axis) + * + * @param[in] proj perspective projection matrix + * @param[out] top top + * @param[out] bottom bottom + */ +CGLM_INLINE +void +glm_persp_decomp_y_lh_zo(mat4 proj, + float * __restrict top, + float * __restrict bottom) { + float nearZ, m21, m11; + + m21 = proj[2][1]; + m11 = proj[1][1]; + + nearZ = proj[3][2] / (proj[3][3]); + *bottom = nearZ * (m21 - 1) / m11; + *top = nearZ * (m21 + 1) / m11; +} + +/*! + * @brief decomposes near and far values of perspective projection + * with angle values with a left-hand coordinate system and a + * clip-space of [0, 1]. + * z stands for z axis (near / far axis) + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + * @param[out] farZ far + */ +CGLM_INLINE +void +glm_persp_decomp_z_lh_zo(mat4 proj, + float * __restrict nearZ, + float * __restrict farZ) { + float m32, m22; + + m32 = proj[3][2]; + m22 = -proj[2][2]; + + *nearZ = m32 / m22; + *farZ = m32 / (m22 + 1.0f); +} + +/*! + * @brief decomposes far value of perspective projection + * with angle values with a left-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] farZ far + */ +CGLM_INLINE +void +glm_persp_decomp_far_lh_zo(mat4 proj, float * __restrict farZ) { + *farZ = proj[3][2] / (-proj[2][2] + 1.0f); +} + +/*! + * @brief decomposes near value of perspective projection + * with angle values with a left-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + */ +CGLM_INLINE +void +glm_persp_decomp_near_lh_zo(mat4 proj, float * __restrict nearZ) { + *nearZ = proj[3][2] / -proj[2][2]; +} + +/*! + * @brief returns sizes of near and far planes of perspective projection + * with a left-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] proj perspective projection matrix + * @param[in] fovy fovy (see brief) + * @param[out] dest sizes order: [Wnear, Hnear, Wfar, Hfar] + */ +CGLM_INLINE +void +glm_persp_sizes_lh_zo(mat4 proj, float fovy, vec4 dest) { + float t, a, nearZ, farZ; + + t = 2.0f * tanf(fovy * 0.5f); + a = glm_persp_aspect(proj); + + glm_persp_decomp_z_lh_zo(proj, &nearZ, &farZ); + + dest[1] = t * nearZ; + dest[3] = t * farZ; + dest[0] = a * dest[1]; + dest[2] = a * dest[3]; +} + +/*! + * @brief returns field of view angle along the Y-axis (in radians) + * with a left-hand coordinate system and a clip-space of [0, 1]. + * + * if you need to degrees, use glm_deg to convert it or use this: + * fovy_deg = glm_deg(glm_persp_fovy(projMatrix)) + * + * @param[in] proj perspective projection matrix + */ +CGLM_INLINE +float +glm_persp_fovy_lh_zo(mat4 proj) { + return glm_persp_fovy(proj); +} + +/*! + * @brief returns aspect ratio of perspective projection + * with a left-hand coordinate system and a clip-space of [0, 1]. + * + * @param[in] proj perspective projection matrix + */ +CGLM_INLINE +float +glm_persp_aspect_lh_zo(mat4 proj) { + return glm_persp_aspect(proj); +} + +#endif /*cglm_persp_lh_zo_h*/ diff --git a/external/cglm/clipspace/persp_rh_no.h b/external/cglm/clipspace/persp_rh_no.h new file mode 100644 index 0000000..9252332 --- /dev/null +++ b/external/cglm/clipspace/persp_rh_no.h @@ -0,0 +1,395 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE void glm_frustum_rh_no(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest) + CGLM_INLINE void glm_perspective_rh_no(float fovy, + float aspect, + float nearZ, + float farZ, + mat4 dest) + CGLM_INLINE void glm_perspective_default_rh_no(float aspect, mat4 dest) + CGLM_INLINE void glm_perspective_resize_rh_no(float aspect, mat4 proj) + CGLM_INLINE void glm_persp_move_far_rh_no(mat4 proj, + float deltaFar) + CGLM_INLINE void glm_persp_decomp_rh_no(mat4 proj, + float * __restrict nearZ, + float * __restrict farZ, + float * __restrict top, + float * __restrict bottom, + float * __restrict left, + float * __restrict right) + CGLM_INLINE void glm_persp_decompv_rh_no(mat4 proj, + float dest[6]) + CGLM_INLINE void glm_persp_decomp_x_rh_no(mat4 proj, + float * __restrict left, + float * __restrict right) + CGLM_INLINE void glm_persp_decomp_y_rh_no(mat4 proj, + float * __restrict top, + float * __restrict bottom) + CGLM_INLINE void glm_persp_decomp_z_rh_no(mat4 proj, + float * __restrict nearZ, + float * __restrict farZ) + CGLM_INLINE void glm_persp_decomp_far_rh_no(mat4 proj, float * __restrict farZ) + CGLM_INLINE void glm_persp_decomp_near_rh_no(mat4 proj, float * __restrict nearZ) + CGLM_INLINE void glm_persp_sizes_rh_no(mat4 proj, float fovy, vec4 dest) + */ + +#ifndef cglm_persp_rh_no_h +#define cglm_persp_rh_no_h + +#include "../common.h" +#include "persp.h" + +/*! + * @brief set up perspective peprojection matrix + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] left viewport.left + * @param[in] right viewport.right + * @param[in] bottom viewport.bottom + * @param[in] top viewport.top + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping plane + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_frustum_rh_no(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest) { + float rl, tb, fn, nv; + + glm_mat4_zero(dest); + + rl = 1.0f / (right - left); + tb = 1.0f / (top - bottom); + fn =-1.0f / (farZ - nearZ); + nv = 2.0f * nearZ; + + dest[0][0] = nv * rl; + dest[1][1] = nv * tb; + dest[2][0] = (right + left) * rl; + dest[2][1] = (top + bottom) * tb; + dest[2][2] = (farZ + nearZ) * fn; + dest[2][3] =-1.0f; + dest[3][2] = farZ * nv * fn; +} + +/*! + * @brief set up perspective projection matrix + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] fovy field of view angle + * @param[in] aspect aspect ratio ( width / height ) + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping planes + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_perspective_rh_no(float fovy, + float aspect, + float nearZ, + float farZ, + mat4 dest) { + float f, fn; + + glm_mat4_zero(dest); + + f = 1.0f / tanf(fovy * 0.5f); + fn = 1.0f / (nearZ - farZ); + + dest[0][0] = f / aspect; + dest[1][1] = f; + dest[2][2] = (nearZ + farZ) * fn; + dest[2][3] =-1.0f; + dest[3][2] = 2.0f * nearZ * farZ * fn; + +} + +/*! + * @brief set up perspective projection matrix with default near/far + * and angle values with a right-hand coordinate system and a + * clip-space of [-1, 1]. 
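/* glm_perspective_rh_no above is the classic OpenGL-style projection
   (right-handed, depth in [-1, 1]). A minimal setup sketch; the window size
   is a hypothetical value and the include path is an assumption. */
#include <stdio.h>
#include "cglm/clipspace/persp_rh_no.h"

#define WIN_W 1280   /* hypothetical window size, for illustration only */
#define WIN_H 720

int main(void) {
  mat4 proj;

  glm_perspective_rh_no(GLM_PI_4f, (float)WIN_W / (float)WIN_H, 0.1f, 100.0f, proj);

  /* the -1 written into dest[2][3] is what makes clip w equal to -z_view */
  printf("m23 = %f\n", proj[2][3]);
  return 0;
}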
+ * + * @param[in] aspect aspect ratio ( width / height ) + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_perspective_default_rh_no(float aspect, mat4 dest) { + glm_perspective_rh_no(GLM_PI_4f, aspect, 0.01f, 100.0f, dest); +} + +/*! + * @brief resize perspective matrix by aspect ratio ( width / height ) + * this makes very easy to resize proj matrix when window /viewport + * resized with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] aspect aspect ratio ( width / height ) + * @param[in, out] proj perspective projection matrix + */ +CGLM_INLINE +void +glm_perspective_resize_rh_no(float aspect, mat4 proj) { + if (proj[0][0] == 0.0f) + return; + + proj[0][0] = proj[1][1] / aspect; +} + +/*! + * @brief extend perspective projection matrix's far distance + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * this function does not guarantee far >= near, be aware of that! + * + * @param[in, out] proj projection matrix to extend + * @param[in] deltaFar distance from existing far (negative to shink) + */ +CGLM_INLINE +void +glm_persp_move_far_rh_no(mat4 proj, float deltaFar) { + float fn, farZ, nearZ, p22, p32; + + p22 = proj[2][2]; + p32 = proj[3][2]; + + nearZ = p32 / (p22 - 1.0f); + farZ = p32 / (p22 + 1.0f) + deltaFar; + fn = 1.0f / (nearZ - farZ); + + proj[2][2] = (farZ + nearZ) * fn; + proj[3][2] = 2.0f * nearZ * farZ * fn; +} + +/*! + * @brief decomposes frustum values of perspective projection + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + * @param[out] farZ far + * @param[out] top top + * @param[out] bottom bottom + * @param[out] left left + * @param[out] right right + */ +CGLM_INLINE +void +glm_persp_decomp_rh_no(mat4 proj, + float * __restrict nearZ, float * __restrict farZ, + float * __restrict top, float * __restrict bottom, + float * __restrict left, float * __restrict right) { + float m00, m11, m20, m21, m22, m32, n, f; + float n_m11, n_m00; + + m00 = proj[0][0]; + m11 = proj[1][1]; + m20 = proj[2][0]; + m21 = proj[2][1]; + m22 = proj[2][2]; + m32 = proj[3][2]; + + n = m32 / (m22 - 1.0f); + f = m32 / (m22 + 1.0f); + + n_m11 = n / m11; + n_m00 = n / m00; + + *nearZ = n; + *farZ = f; + *bottom = n_m11 * (m21 - 1.0f); + *top = n_m11 * (m21 + 1.0f); + *left = n_m00 * (m20 - 1.0f); + *right = n_m00 * (m20 + 1.0f); +} + +/*! + * @brief decomposes frustum values of perspective projection + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * this makes easy to get all values at once + * + * @param[in] proj perspective projection matrix + * @param[out] dest array + */ +CGLM_INLINE +void +glm_persp_decompv_rh_no(mat4 proj, float dest[6]) { + glm_persp_decomp_rh_no(proj, &dest[0], &dest[1], &dest[2], + &dest[3], &dest[4], &dest[5]); +} + +/*! + * @brief decomposes left and right values of perspective projection + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * x stands for x axis (left / right axis) + * + * @param[in] proj perspective projection matrix + * @param[out] left left + * @param[out] right right + */ +CGLM_INLINE +void +glm_persp_decomp_x_rh_no(mat4 proj, + float * __restrict left, + float * __restrict right) { + float nearZ, m20, m00, m22; + + m00 = proj[0][0]; + m20 = proj[2][0]; + m22 = proj[2][2]; + + nearZ = proj[3][2] / (m22 - 1.0f); + *left = nearZ * (m20 - 1.0f) / m00; + *right = nearZ * (m20 + 1.0f) / m00; +} + +/*! 
+ * @brief decomposes top and bottom values of perspective projection + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * y stands for y axis (top / bottom axis) + * + * @param[in] proj perspective projection matrix + * @param[out] top top + * @param[out] bottom bottom + */ +CGLM_INLINE +void +glm_persp_decomp_y_rh_no(mat4 proj, + float * __restrict top, + float * __restrict bottom) { + float nearZ, m21, m11, m22; + + m21 = proj[2][1]; + m11 = proj[1][1]; + m22 = proj[2][2]; + + nearZ = proj[3][2] / (m22 - 1.0f); + *bottom = nearZ * (m21 - 1.0f) / m11; + *top = nearZ * (m21 + 1.0f) / m11; +} + +/*! + * @brief decomposes near and far values of perspective projection + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * z stands for z axis (near / far axis) + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + * @param[out] farZ far + */ +CGLM_INLINE +void +glm_persp_decomp_z_rh_no(mat4 proj, + float * __restrict nearZ, + float * __restrict farZ) { + float m32, m22; + + m32 = proj[3][2]; + m22 = proj[2][2]; + + *nearZ = m32 / (m22 - 1.0f); + *farZ = m32 / (m22 + 1.0f); +} + +/*! + * @brief decomposes far value of perspective projection + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] farZ far + */ +CGLM_INLINE +void +glm_persp_decomp_far_rh_no(mat4 proj, float * __restrict farZ) { + *farZ = proj[3][2] / (proj[2][2] + 1.0f); +} + +/*! + * @brief decomposes near value of perspective projection + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + */ +CGLM_INLINE +void +glm_persp_decomp_near_rh_no(mat4 proj, float * __restrict nearZ) { + *nearZ = proj[3][2] / (proj[2][2] - 1.0f); +} + +/*! + * @brief returns sizes of near and far planes of perspective projection + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] proj perspective projection matrix + * @param[in] fovy fovy (see brief) + * @param[out] dest sizes order: [Wnear, Hnear, Wfar, Hfar] + */ +CGLM_INLINE +void +glm_persp_sizes_rh_no(mat4 proj, float fovy, vec4 dest) { + float t, a, nearZ, farZ; + + t = 2.0f * tanf(fovy * 0.5f); + a = glm_persp_aspect(proj); + + glm_persp_decomp_z_rh_no(proj, &nearZ, &farZ); + + dest[1] = t * nearZ; + dest[3] = t * farZ; + dest[0] = a * dest[1]; + dest[2] = a * dest[3]; +} + +/*! + * @brief returns field of view angle along the Y-axis (in radians) + * with a right-hand coordinate system and a clip-space of [-1, 1]. + * + * if you need to degrees, use glm_deg to convert it or use this: + * fovy_deg = glm_deg(glm_persp_fovy(projMatrix)) + * + * @param[in] proj perspective projection matrix + */ +CGLM_INLINE +float +glm_persp_fovy_rh_no(mat4 proj) { + return glm_persp_fovy(proj); +} + +/*! + * @brief returns aspect ratio of perspective projection + * with a right-hand coordinate system and a clip-space of [-1, 1]. + * + * @param[in] proj perspective projection matrix + */ +CGLM_INLINE +float +glm_persp_aspect_rh_no(mat4 proj) { + return glm_persp_aspect(proj); +} + +#endif /*cglm_cam_rh_no_h*/ diff --git a/external/cglm/clipspace/persp_rh_zo.h b/external/cglm/clipspace/persp_rh_zo.h new file mode 100644 index 0000000..ce632b3 --- /dev/null +++ b/external/cglm/clipspace/persp_rh_zo.h @@ -0,0 +1,389 @@ +/* + * Copyright (c), Recep Aslantas. 
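/* glm_persp_sizes_rh_no above reports the extents of the near and far planes,
   which is handy when splitting a frustum (for example for shadow-map
   cascades). A small sketch; the numbers and include path are assumptions. */
#include <stdio.h>
#include "cglm/clipspace/persp_rh_no.h"

int main(void) {
  mat4  proj;
  vec4  sizes;
  float fovy = GLM_PI_4f;

  glm_perspective_rh_no(fovy, 16.0f / 9.0f, 0.1f, 100.0f, proj);

  /* results come back in the order [Wnear, Hnear, Wfar, Hfar] */
  glm_persp_sizes_rh_no(proj, fovy, sizes);
  printf("near plane: %f x %f\n", sizes[0], sizes[1]);
  printf("far  plane: %f x %f\n", sizes[2], sizes[3]);
  return 0;
}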
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE void glm_frustum_rh_zo(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest) + CGLM_INLINE void glm_perspective_rh_zo(float fovy, + float aspect, + float nearZ, + float farZ, + mat4 dest) + CGLM_INLINE void glm_perspective_default_rh_zo(float aspect, mat4 dest) + CGLM_INLINE void glm_perspective_resize_rh_zo(float aspect, mat4 proj) + CGLM_INLINE void glm_persp_move_far_rh_zo(mat4 proj, + float deltaFar) + CGLM_INLINE void glm_persp_decomp_rh_zo(mat4 proj, + float * __restrict nearZ, + float * __restrict farZ, + float * __restrict top, + float * __restrict bottom, + float * __restrict left, + float * __restrict right) + CGLM_INLINE void glm_persp_decompv_rh_zo(mat4 proj, + float dest[6]) + CGLM_INLINE void glm_persp_decomp_x_rh_zo(mat4 proj, + float * __restrict left, + float * __restrict right) + CGLM_INLINE void glm_persp_decomp_y_rh_zo(mat4 proj, + float * __restrict top, + float * __restrict bottom) + CGLM_INLINE void glm_persp_decomp_z_rh_zo(mat4 proj, + float * __restrict nearZ, + float * __restrict farZ) + CGLM_INLINE void glm_persp_decomp_far_rh_zo(mat4 proj, float * __restrict farZ) + CGLM_INLINE void glm_persp_decomp_near_rh_zo(mat4 proj, float * __restrict nearZ) + CGLM_INLINE void glm_persp_sizes_rh_zo(mat4 proj, float fovy, vec4 dest) + */ + +#ifndef cglm_persp_rh_zo_h +#define cglm_persp_rh_zo_h + +#include "../common.h" +#include "persp.h" + +/*! + * @brief set up perspective peprojection matrix with a right-hand coordinate + * system and a clip-space of [0, 1]. + * + * @param[in] left viewport.left + * @param[in] right viewport.right + * @param[in] bottom viewport.bottom + * @param[in] top viewport.top + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping plane + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_frustum_rh_zo(float left, float right, + float bottom, float top, + float nearZ, float farZ, + mat4 dest) { + float rl, tb, fn, nv; + + glm_mat4_zero(dest); + + rl = 1.0f / (right - left); + tb = 1.0f / (top - bottom); + fn =-1.0f / (farZ - nearZ); + nv = 2.0f * nearZ; + + dest[0][0] = nv * rl; + dest[1][1] = nv * tb; + dest[2][0] = (right + left) * rl; + dest[2][1] = (top + bottom) * tb; + dest[2][2] = farZ * fn; + dest[2][3] =-1.0f; + dest[3][2] = farZ * nearZ * fn; +} + +/*! + * @brief set up perspective projection matrix with a right-hand coordinate + * system and a clip-space of [0, 1]. + * + * @param[in] fovy field of view angle + * @param[in] aspect aspect ratio ( width / height ) + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping planes + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_perspective_rh_zo(float fovy, + float aspect, + float nearZ, + float farZ, + mat4 dest) { + float f, fn; + + glm_mat4_zero(dest); + + f = 1.0f / tanf(fovy * 0.5f); + fn = 1.0f / (nearZ - farZ); + + dest[0][0] = f / aspect; + dest[1][1] = f; + dest[2][2] = farZ * fn; + dest[2][3] =-1.0f; + dest[3][2] = nearZ * farZ * fn; +} + +/*! + * @brief set up perspective projection matrix with default near/far + * and angle values with a right-hand coordinate system and a + * clip-space of [0, 1]. 
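/* glm_perspective_rh_zo above matches the depth convention Vulkan and
   Direct3D expect (right-handed, depth in [0, 1]). Sketch only; the Y flip
   shown for Vulkan's downward-pointing clip-space Y is one common approach,
   not part of this header, and the include path is an assumption. */
#include <stdio.h>
#include "cglm/clipspace/persp_rh_zo.h"

int main(void) {
  mat4 proj;

  glm_perspective_rh_zo(GLM_PI_4f, 16.0f / 9.0f, 0.1f, 100.0f, proj);

  /* when targeting Vulkan, clip-space Y points down; negating m11 is a
     frequently used fix (a negative viewport height is another option) */
  proj[1][1] *= -1.0f;

  printf("m11 = %f, m22 = %f\n", proj[1][1], proj[2][2]);
  return 0;
}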
+ * + * @param[in] aspect aspect ratio ( width / height ) + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_perspective_default_rh_zo(float aspect, mat4 dest) { + glm_perspective_rh_zo(GLM_PI_4f, aspect, 0.01f, 100.0f, dest); +} + +/*! + * @brief resize perspective matrix by aspect ratio ( width / height ) + * this makes very easy to resize proj matrix when window /viewport + * resized with a right-hand coordinate system and a clip-space of + * [0, 1]. + * + * @param[in] aspect aspect ratio ( width / height ) + * @param[in, out] proj perspective projection matrix + */ +CGLM_INLINE +void +glm_perspective_resize_rh_zo(float aspect, mat4 proj) { + if (proj[0][0] == 0.0f) + return; + + proj[0][0] = proj[1][1] / aspect; +} + +/*! + * @brief extend perspective projection matrix's far distance with a + * right-hand coordinate system and a clip-space of [0, 1]. + * + * this function does not guarantee far >= near, be aware of that! + * + * @param[in, out] proj projection matrix to extend + * @param[in] deltaFar distance from existing far (negative to shink) + */ +CGLM_INLINE +void +glm_persp_move_far_rh_zo(mat4 proj, float deltaFar) { + float fn, farZ, nearZ, p22, p32; + + p22 = proj[2][2]; + p32 = proj[3][2]; + + nearZ = p32 / p22; + farZ = p32 / (p22 + 1.0f) + deltaFar; + fn = 1.0f / (nearZ - farZ); + + proj[2][2] = farZ * fn; + proj[3][2] = nearZ * farZ * fn; +} + +/*! + * @brief decomposes frustum values of perspective projection + * with angle values with a right-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + * @param[out] farZ far + * @param[out] top top + * @param[out] bottom bottom + * @param[out] left left + * @param[out] right right + */ +CGLM_INLINE +void +glm_persp_decomp_rh_zo(mat4 proj, + float * __restrict nearZ, float * __restrict farZ, + float * __restrict top, float * __restrict bottom, + float * __restrict left, float * __restrict right) { + float m00, m11, m20, m21, m22, m32, n, f; + float n_m11, n_m00; + + m00 = proj[0][0]; + m11 = proj[1][1]; + m20 = proj[2][0]; + m21 = proj[2][1]; + m22 = proj[2][2]; + m32 = proj[3][2]; + + n = m32 / m22; + f = m32 / (m22 + 1.0f); + + n_m11 = n / m11; + n_m00 = n / m00; + + *nearZ = n; + *farZ = f; + *bottom = n_m11 * (m21 - 1.0f); + *top = n_m11 * (m21 + 1.0f); + *left = n_m00 * (m20 - 1.0f); + *right = n_m00 * (m20 + 1.0f); +} + +/*! + * @brief decomposes frustum values of perspective projection + * with angle values with a right-hand coordinate system and a + * clip-space of [0, 1]. + * this makes easy to get all values at once + * + * @param[in] proj perspective projection matrix + * @param[out] dest array + */ +CGLM_INLINE +void +glm_persp_decompv_rh_zo(mat4 proj, float dest[6]) { + glm_persp_decomp_rh_zo(proj, &dest[0], &dest[1], &dest[2], + &dest[3], &dest[4], &dest[5]); +} + +/*! + * @brief decomposes left and right values of perspective projection (ZO). + * x stands for x axis (left / right axis) + * + * @param[in] proj perspective projection matrix + * @param[out] left left + * @param[out] right right + */ +CGLM_INLINE +void +glm_persp_decomp_x_rh_zo(mat4 proj, + float * __restrict left, + float * __restrict right) { + float nearZ, m20, m00, m22; + + m00 = proj[0][0]; + m20 = proj[2][0]; + m22 = proj[2][2]; + + nearZ = proj[3][2] / m22; + *left = nearZ * (m20 - 1.0f) / m00; + *right = nearZ * (m20 + 1.0f) / m00; +} + +/*! 
+ * @brief decomposes top and bottom values of perspective projection + * with angle values with a right-hand coordinate system and a + * clip-space of [0, 1]. + * y stands for y axis (top / bottom axis) + * + * @param[in] proj perspective projection matrix + * @param[out] top top + * @param[out] bottom bottom + */ +CGLM_INLINE +void +glm_persp_decomp_y_rh_zo(mat4 proj, + float * __restrict top, + float * __restrict bottom) { + float nearZ, m21, m11, m22; + + m21 = proj[2][1]; + m11 = proj[1][1]; + m22 = proj[2][2]; + + nearZ = proj[3][2] / m22; + *bottom = nearZ * (m21 - 1) / m11; + *top = nearZ * (m21 + 1) / m11; +} + +/*! + * @brief decomposes near and far values of perspective projection + * with angle values with a right-hand coordinate system and a + * clip-space of [0, 1]. + * z stands for z axis (near / far axis) + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + * @param[out] farZ far + */ +CGLM_INLINE +void +glm_persp_decomp_z_rh_zo(mat4 proj, + float * __restrict nearZ, + float * __restrict farZ) { + float m32, m22; + + m32 = proj[3][2]; + m22 = proj[2][2]; + + *nearZ = m32 / m22; + *farZ = m32 / (m22 + 1.0f); +} + +/*! + * @brief decomposes far value of perspective projection + * with angle values with a right-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] farZ far + */ +CGLM_INLINE +void +glm_persp_decomp_far_rh_zo(mat4 proj, float * __restrict farZ) { + *farZ = proj[3][2] / (proj[2][2] + 1.0f); +} + +/*! + * @brief decomposes near value of perspective projection + * with angle values with a right-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + */ +CGLM_INLINE +void +glm_persp_decomp_near_rh_zo(mat4 proj, float * __restrict nearZ) { + *nearZ = proj[3][2] / proj[2][2]; +} + +/*! + * @brief returns sizes of near and far planes of perspective projection + * with a right-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] proj perspective projection matrix + * @param[in] fovy fovy (see brief) + * @param[out] dest sizes order: [Wnear, Hnear, Wfar, Hfar] + */ +CGLM_INLINE +void +glm_persp_sizes_rh_zo(mat4 proj, float fovy, vec4 dest) { + float t, a, nearZ, farZ; + + t = 2.0f * tanf(fovy * 0.5f); + a = glm_persp_aspect(proj); + + glm_persp_decomp_z_rh_zo(proj, &nearZ, &farZ); + + dest[1] = t * nearZ; + dest[3] = t * farZ; + dest[0] = a * dest[1]; + dest[2] = a * dest[3]; +} + +/*! + * @brief returns field of view angle along the Y-axis (in radians) + * with a right-hand coordinate system and a clip-space of [0, 1]. + * + * if you need to degrees, use glm_deg to convert it or use this: + * fovy_deg = glm_deg(glm_persp_fovy(projMatrix)) + * + * @param[in] proj perspective projection matrix + */ +CGLM_INLINE +float +glm_persp_fovy_rh_zo(mat4 proj) { + return glm_persp_fovy(proj); +} + +/*! + * @brief returns aspect ratio of perspective projection + * with a right-hand coordinate system and a clip-space of [0, 1]. + * + * @param[in] proj perspective projection matrix + */ +CGLM_INLINE +float +glm_persp_aspect_rh_zo(mat4 proj) { + return glm_persp_aspect(proj); +} + +#endif /*cglm_persp_rh_zo_h*/ diff --git a/external/cglm/clipspace/project_no.h b/external/cglm/clipspace/project_no.h new file mode 100644 index 0000000..71fbc52 --- /dev/null +++ b/external/cglm/clipspace/project_no.h @@ -0,0 +1,109 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_project_no_h +#define cglm_project_no_h + +#include "../common.h" +#include "../vec3.h" +#include "../vec4.h" +#include "../mat4.h" + +/*! + * @brief maps the specified viewport coordinates into specified space [1] + * the matrix should contain projection matrix. + * + * if you don't have ( and don't want to have ) an inverse matrix then use + * glm_unproject version. You may use existing inverse of matrix in somewhere + * else, this is why glm_unprojecti exists to save save inversion cost + * + * [1] space: + * 1- if m = invProj: View Space + * 2- if m = invViewProj: World Space + * 3- if m = invMVP: Object Space + * + * You probably want to map the coordinates into object space + * so use invMVP as m + * + * Computing viewProj: + * glm_mat4_mul(proj, view, viewProj); + * glm_mat4_mul(viewProj, model, MVP); + * glm_mat4_inv(viewProj, invMVP); + * + * @param[in] pos point/position in viewport coordinates + * @param[in] invMat matrix (see brief) + * @param[in] vp viewport as [x, y, width, height] + * @param[out] dest unprojected coordinates + */ +CGLM_INLINE +void +glm_unprojecti_no(vec3 pos, mat4 invMat, vec4 vp, vec3 dest) { + vec4 v; + + v[0] = 2.0f * (pos[0] - vp[0]) / vp[2] - 1.0f; + v[1] = 2.0f * (pos[1] - vp[1]) / vp[3] - 1.0f; + v[2] = 2.0f * pos[2] - 1.0f; + v[3] = 1.0f; + + glm_mat4_mulv(invMat, v, v); + glm_vec4_scale(v, 1.0f / v[3], v); + glm_vec3(v, dest); +} + +/*! + * @brief map object coordinates to window coordinates + * + * Computing MVP: + * glm_mat4_mul(proj, view, viewProj); + * glm_mat4_mul(viewProj, model, MVP); + * + * @param[in] pos object coordinates + * @param[in] m MVP matrix + * @param[in] vp viewport as [x, y, width, height] + * @param[out] dest projected coordinates + */ +CGLM_INLINE +void +glm_project_no(vec3 pos, mat4 m, vec4 vp, vec3 dest) { + CGLM_ALIGN(16) vec4 pos4; + + glm_vec4(pos, 1.0f, pos4); + + glm_mat4_mulv(m, pos4, pos4); + glm_vec4_scale(pos4, 1.0f / pos4[3], pos4); /* pos = pos / pos.w */ + glm_vec4_scale(pos4, 0.5f, pos4); + glm_vec4_adds(pos4, 0.5f, pos4); + + dest[0] = pos4[0] * vp[2] + vp[0]; + dest[1] = pos4[1] * vp[3] + vp[1]; + dest[2] = pos4[2]; +} + +/*! + * @brief map object's z coordinate to window coordinates + * + * Computing MVP: + * glm_mat4_mul(proj, view, viewProj); + * glm_mat4_mul(viewProj, model, MVP); + * + * @param[in] v object coordinates + * @param[in] m MVP matrix + * + * @returns projected z coordinate + */ +CGLM_INLINE +float +glm_project_z_no(vec3 v, mat4 m) { + float z, w; + + z = m[0][2] * v[0] + m[1][2] * v[1] + m[2][2] * v[2] + m[3][2]; + w = m[0][3] * v[0] + m[1][3] * v[1] + m[2][3] * v[2] + m[3][3]; + + return 0.5f * (z / w) + 0.5f; +} + +#endif /* cglm_project_no_h */ diff --git a/external/cglm/clipspace/project_zo.h b/external/cglm/clipspace/project_zo.h new file mode 100644 index 0000000..dc32078 --- /dev/null +++ b/external/cglm/clipspace/project_zo.h @@ -0,0 +1,111 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_project_zo_h +#define cglm_project_zo_h + +#include "../common.h" +#include "../vec3.h" +#include "../vec4.h" +#include "../mat4.h" + +/*! + * @brief maps the specified viewport coordinates into specified space [1] + * the matrix should contain projection matrix. 
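/* Round-trip sketch for the [-1, 1] project/unproject pair above: project a
   view-space point to window coordinates with the projection matrix, then
   recover it with the inverse. The viewport and the point are illustrative. */
#include <stdio.h>
#include "cglm/clipspace/persp_rh_no.h"
#include "cglm/clipspace/project_no.h"

int main(void) {
  mat4 proj, inv;
  vec4 vp = {0.0f, 0.0f, 1280.0f, 720.0f};   /* viewport: x, y, width, height */
  vec3 p  = {0.5f, -0.25f, -5.0f};           /* view-space point in front of the camera */
  vec3 win, back;

  glm_perspective_rh_no(GLM_PI_4f, 16.0f / 9.0f, 0.1f, 100.0f, proj);

  /* view space -> window coordinates (m = proj, so unprojecting with
     inv = inverse(proj) lands back in view space) */
  glm_project_no(p, proj, vp, win);

  glm_mat4_inv(proj, inv);
  glm_unprojecti_no(win, inv, vp, back);

  printf("win  = %f %f %f\n", win[0],  win[1],  win[2]);
  printf("back = %f %f %f\n", back[0], back[1], back[2]);   /* ~= p */
  return 0;
}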
+ * + * if you don't have ( and don't want to have ) an inverse matrix then use + * glm_unproject version. You may use existing inverse of matrix in somewhere + * else, this is why glm_unprojecti exists to save save inversion cost + * + * [1] space: + * 1- if m = invProj: View Space + * 2- if m = invViewProj: World Space + * 3- if m = invMVP: Object Space + * + * You probably want to map the coordinates into object space + * so use invMVP as m + * + * Computing viewProj: + * glm_mat4_mul(proj, view, viewProj); + * glm_mat4_mul(viewProj, model, MVP); + * glm_mat4_inv(viewProj, invMVP); + * + * @param[in] pos point/position in viewport coordinates + * @param[in] invMat matrix (see brief) + * @param[in] vp viewport as [x, y, width, height] + * @param[out] dest unprojected coordinates + */ +CGLM_INLINE +void +glm_unprojecti_zo(vec3 pos, mat4 invMat, vec4 vp, vec3 dest) { + vec4 v; + + v[0] = 2.0f * (pos[0] - vp[0]) / vp[2] - 1.0f; + v[1] = 2.0f * (pos[1] - vp[1]) / vp[3] - 1.0f; + v[2] = pos[2]; + v[3] = 1.0f; + + glm_mat4_mulv(invMat, v, v); + glm_vec4_scale(v, 1.0f / v[3], v); + glm_vec3(v, dest); +} + +/*! + * @brief map object coordinates to window coordinates + * + * Computing MVP: + * glm_mat4_mul(proj, view, viewProj); + * glm_mat4_mul(viewProj, model, MVP); + * + * @param[in] pos object coordinates + * @param[in] m MVP matrix + * @param[in] vp viewport as [x, y, width, height] + * @param[out] dest projected coordinates + */ +CGLM_INLINE +void +glm_project_zo(vec3 pos, mat4 m, vec4 vp, vec3 dest) { + CGLM_ALIGN(16) vec4 pos4; + + glm_vec4(pos, 1.0f, pos4); + + glm_mat4_mulv(m, pos4, pos4); + glm_vec4_scale(pos4, 1.0f / pos4[3], pos4); /* pos = pos / pos.w */ + + dest[2] = pos4[2]; + + glm_vec4_scale(pos4, 0.5f, pos4); + glm_vec4_adds(pos4, 0.5f, pos4); + + dest[0] = pos4[0] * vp[2] + vp[0]; + dest[1] = pos4[1] * vp[3] + vp[1]; +} + +/*! + * @brief map object's z coordinate to window coordinates + * + * Computing MVP: + * glm_mat4_mul(proj, view, viewProj); + * glm_mat4_mul(viewProj, model, MVP); + * + * @param[in] v object coordinates + * @param[in] m MVP matrix + * + * @returns projected z coordinate + */ +CGLM_INLINE +float +glm_project_z_zo(vec3 v, mat4 m) { + float z, w; + + z = m[0][2] * v[0] + m[1][2] * v[1] + m[2][2] * v[2] + m[3][2]; + w = m[0][3] * v[0] + m[1][3] * v[1] + m[2][3] * v[2] + m[3][3]; + + return z / w; +} + +#endif /* cglm_project_zo_h */ diff --git a/external/cglm/clipspace/view_lh.h b/external/cglm/clipspace/view_lh.h new file mode 100644 index 0000000..5667694 --- /dev/null +++ b/external/cglm/clipspace/view_lh.h @@ -0,0 +1,99 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE void glm_lookat_lh(vec3 eye, vec3 center, vec3 up, mat4 dest) + CGLM_INLINE void glm_look_lh(vec3 eye, vec3 dir, vec3 up, mat4 dest) + CGLM_INLINE void glm_look_anyup_lh(vec3 eye, vec3 dir, mat4 dest) + */ + +#ifndef cglm_view_lh_h +#define cglm_view_lh_h + +#include "../common.h" +#include "../plane.h" + +/*! 
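+ * Illustrative usage sketch (not from the upstream cglm docs; the
+ * eye/center/up values are made up) for the left-handed look-at helpers
+ * defined below:
+ * @code
+ *   mat4 view;
+ *   vec3 eye    = {0.0f, 2.0f, -5.0f};
+ *   vec3 center = {0.0f, 0.0f,  0.0f};
+ *   vec3 up     = {0.0f, 1.0f,  0.0f};
+ *   glm_lookat_lh(eye, center, up, view);
+ * @endcode
+ */
+
+/*!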
+ * @brief set up view matrix (LH) + * + * NOTE: The UP vector must not be parallel to the line of sight from + * the eye point to the reference point + * + * @param[in] eye eye vector + * @param[in] center center vector + * @param[in] up up vector + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_lookat_lh(vec3 eye, vec3 center, vec3 up, mat4 dest) { + CGLM_ALIGN(8) vec3 f, u, s; + + glm_vec3_sub(center, eye, f); + glm_vec3_normalize(f); + + glm_vec3_crossn(up, f, s); + glm_vec3_cross(f, s, u); + + dest[0][0] = s[0]; + dest[0][1] = u[0]; + dest[0][2] = f[0]; + dest[1][0] = s[1]; + dest[1][1] = u[1]; + dest[1][2] = f[1]; + dest[2][0] = s[2]; + dest[2][1] = u[2]; + dest[2][2] = f[2]; + dest[3][0] =-glm_vec3_dot(s, eye); + dest[3][1] =-glm_vec3_dot(u, eye); + dest[3][2] =-glm_vec3_dot(f, eye); + dest[0][3] = dest[1][3] = dest[2][3] = 0.0f; + dest[3][3] = 1.0f; +} + +/*! + * @brief set up view matrix with left handed coordinate system + * + * convenient wrapper for lookat: if you only have direction not target self + * then this might be useful. Because you need to get target from direction. + * + * NOTE: The UP vector must not be parallel to the line of sight from + * the eye point to the reference point + * + * @param[in] eye eye vector + * @param[in] dir direction vector + * @param[in] up up vector + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_look_lh(vec3 eye, vec3 dir, vec3 up, mat4 dest) { + CGLM_ALIGN(8) vec3 target; + glm_vec3_add(eye, dir, target); + glm_lookat_lh(eye, target, up, dest); +} + +/*! + * @brief set up view matrix with left handed coordinate system + * + * convenient wrapper for look: if you only have direction and if you don't + * care what UP vector is then this might be useful to create view matrix + * + * @param[in] eye eye vector + * @param[in] dir direction vector + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_look_anyup_lh(vec3 eye, vec3 dir, mat4 dest) { + CGLM_ALIGN(8) vec3 up; + glm_vec3_ortho(dir, up); + glm_look_lh(eye, dir, up, dest); +} + +#endif /*cglm_view_lh_h*/ diff --git a/external/cglm/clipspace/view_lh_no.h b/external/cglm/clipspace/view_lh_no.h new file mode 100644 index 0000000..454d903 --- /dev/null +++ b/external/cglm/clipspace/view_lh_no.h @@ -0,0 +1,74 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE void glm_lookat_lh_no(vec3 eye, vec3 center, vec3 up, mat4 dest) + CGLM_INLINE void glm_look_lh_no(vec3 eye, vec3 dir, vec3 up, mat4 dest) + CGLM_INLINE void glm_look_anyup_lh_no(vec3 eye, vec3 dir, mat4 dest) + */ + +#ifndef cglm_view_lh_no_h +#define cglm_view_lh_no_h + +#include "../common.h" +#include "view_lh.h" + +/*! + * @brief set up view matrix with left handed coordinate system. + * + * NOTE: The UP vector must not be parallel to the line of sight from + * the eye point to the reference point + * + * @param[in] eye eye vector + * @param[in] center center vector + * @param[in] up up vector + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_lookat_lh_no(vec3 eye, vec3 center, vec3 up, mat4 dest) { + glm_lookat_lh(eye, center, up, dest); +} + +/*! + * @brief set up view matrix with left handed coordinate system. + * + * convenient wrapper for lookat: if you only have direction not target self + * then this might be useful. Because you need to get target from direction. 
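+ *
+ * An illustrative call with a forward direction instead of a target point
+ * (not from the upstream docs; eye, up and view are assumed to be declared
+ * as vec3, vec3 and mat4 elsewhere):
+ * @code
+ *   vec3 dir = {0.0f, 0.0f, 1.0f};
+ *   glm_look_lh_no(eye, dir, up, view);
+ * @endcode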
+ * + * NOTE: The UP vector must not be parallel to the line of sight from + * the eye point to the reference point + * + * @param[in] eye eye vector + * @param[in] dir direction vector + * @param[in] up up vector + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_look_lh_no(vec3 eye, vec3 dir, vec3 up, mat4 dest) { + glm_look_lh(eye, dir, up, dest); +} + +/*! + * @brief set up view matrix with left handed coordinate system. + * + * convenient wrapper for look: if you only have direction and if you don't + * care what UP vector is then this might be useful to create view matrix + * + * @param[in] eye eye vector + * @param[in] dir direction vector + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_look_anyup_lh_no(vec3 eye, vec3 dir, mat4 dest) { + glm_look_anyup_lh(eye, dir, dest); +} + +#endif /*cglm_view_lh_no_h*/ diff --git a/external/cglm/clipspace/view_lh_zo.h b/external/cglm/clipspace/view_lh_zo.h new file mode 100644 index 0000000..6b0c4d1 --- /dev/null +++ b/external/cglm/clipspace/view_lh_zo.h @@ -0,0 +1,74 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE void glm_lookat_lh_zo(vec3 eye, vec3 center, vec3 up, mat4 dest) + CGLM_INLINE void glm_look_lh_zo(vec3 eye, vec3 dir, vec3 up, mat4 dest) + CGLM_INLINE void glm_look_anyup_lh_zo(vec3 eye, vec3 dir, mat4 dest) + */ + +#ifndef cglm_view_lh_zo_h +#define cglm_view_lh_zo_h + +#include "../common.h" +#include "view_lh.h" + +/*! + * @brief set up view matrix with left handed coordinate system. + * + * NOTE: The UP vector must not be parallel to the line of sight from + * the eye point to the reference point + * + * @param[in] eye eye vector + * @param[in] center center vector + * @param[in] up up vector + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_lookat_lh_zo(vec3 eye, vec3 center, vec3 up, mat4 dest) { + glm_lookat_lh(eye, center, up, dest); +} + +/*! + * @brief set up view matrix with left handed coordinate system. + * + * convenient wrapper for lookat: if you only have direction not target self + * then this might be useful. Because you need to get target from direction. + * + * NOTE: The UP vector must not be parallel to the line of sight from + * the eye point to the reference point + * + * @param[in] eye eye vector + * @param[in] dir direction vector + * @param[in] up up vector + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_look_lh_zo(vec3 eye, vec3 dir, vec3 up, mat4 dest) { + glm_look_lh(eye, dir, up, dest); +} + +/*! + * @brief set up view matrix with left handed coordinate system. + * + * convenient wrapper for look: if you only have direction and if you don't + * care what UP vector is then this might be useful to create view matrix + * + * @param[in] eye eye vector + * @param[in] dir direction vector + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_look_anyup_lh_zo(vec3 eye, vec3 dir, mat4 dest) { + glm_look_anyup_lh(eye, dir, dest); +} + +#endif /*cglm_view_lh_zo_h*/ diff --git a/external/cglm/clipspace/view_rh.h b/external/cglm/clipspace/view_rh.h new file mode 100644 index 0000000..51ec916 --- /dev/null +++ b/external/cglm/clipspace/view_rh.h @@ -0,0 +1,99 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE void glm_lookat_rh(vec3 eye, vec3 center, vec3 up, mat4 dest) + CGLM_INLINE void glm_look_rh(vec3 eye, vec3 dir, vec3 up, mat4 dest) + CGLM_INLINE void glm_look_anyup_rh(vec3 eye, vec3 dir, mat4 dest) + */ + +#ifndef cglm_view_rh_h +#define cglm_view_rh_h + +#include "../common.h" +#include "../plane.h" + +/*! + * @brief set up view matrix with right handed coordinate system. + * + * NOTE: The UP vector must not be parallel to the line of sight from + * the eye point to the reference point + * + * @param[in] eye eye vector + * @param[in] center center vector + * @param[in] up up vector + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_lookat_rh(vec3 eye, vec3 center, vec3 up, mat4 dest) { + CGLM_ALIGN(8) vec3 f, u, s; + + glm_vec3_sub(center, eye, f); + glm_vec3_normalize(f); + + glm_vec3_crossn(f, up, s); + glm_vec3_cross(s, f, u); + + dest[0][0] = s[0]; + dest[0][1] = u[0]; + dest[0][2] =-f[0]; + dest[1][0] = s[1]; + dest[1][1] = u[1]; + dest[1][2] =-f[1]; + dest[2][0] = s[2]; + dest[2][1] = u[2]; + dest[2][2] =-f[2]; + dest[3][0] =-glm_vec3_dot(s, eye); + dest[3][1] =-glm_vec3_dot(u, eye); + dest[3][2] = glm_vec3_dot(f, eye); + dest[0][3] = dest[1][3] = dest[2][3] = 0.0f; + dest[3][3] = 1.0f; +} + +/*! + * @brief set up view matrix with right handed coordinate system. + * + * convenient wrapper for lookat: if you only have direction not target self + * then this might be useful. Because you need to get target from direction. + * + * NOTE: The UP vector must not be parallel to the line of sight from + * the eye point to the reference point + * + * @param[in] eye eye vector + * @param[in] dir direction vector + * @param[in] up up vector + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_look_rh(vec3 eye, vec3 dir, vec3 up, mat4 dest) { + CGLM_ALIGN(8) vec3 target; + glm_vec3_add(eye, dir, target); + glm_lookat_rh(eye, target, up, dest); +} + +/*! + * @brief set up view matrix with right handed coordinate system. + * + * convenient wrapper for look: if you only have direction and if you don't + * care what UP vector is then this might be useful to create view matrix + * + * @param[in] eye eye vector + * @param[in] dir direction vector + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_look_anyup_rh(vec3 eye, vec3 dir, mat4 dest) { + CGLM_ALIGN(8) vec3 up; + glm_vec3_ortho(dir, up); + glm_look_rh(eye, dir, up, dest); +} + +#endif /*cglm_view_rh_h*/ diff --git a/external/cglm/clipspace/view_rh_no.h b/external/cglm/clipspace/view_rh_no.h new file mode 100644 index 0000000..ca36d30 --- /dev/null +++ b/external/cglm/clipspace/view_rh_no.h @@ -0,0 +1,74 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE void glm_lookat_rh_no(vec3 eye, vec3 center, vec3 up, mat4 dest) + CGLM_INLINE void glm_look_rh_no(vec3 eye, vec3 dir, vec3 up, mat4 dest) + CGLM_INLINE void glm_look_anyup_rh_no(vec3 eye, vec3 dir, mat4 dest) + */ + +#ifndef cglm_view_rh_no_h +#define cglm_view_rh_no_h + +#include "../common.h" +#include "view_rh.h" + +/*! + * @brief set up view matrix with right handed coordinate system. 
+ * + * NOTE: The UP vector must not be parallel to the line of sight from + * the eye point to the reference point + * + * @param[in] eye eye vector + * @param[in] center center vector + * @param[in] up up vector + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_lookat_rh_no(vec3 eye, vec3 center, vec3 up, mat4 dest) { + glm_lookat_rh(eye, center, up, dest); +} + +/*! + * @brief set up view matrix with right handed coordinate system. + * + * convenient wrapper for lookat: if you only have direction not target self + * then this might be useful. Because you need to get target from direction. + * + * NOTE: The UP vector must not be parallel to the line of sight from + * the eye point to the reference point + * + * @param[in] eye eye vector + * @param[in] dir direction vector + * @param[in] up up vector + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_look_rh_no(vec3 eye, vec3 dir, vec3 up, mat4 dest) { + glm_look_rh(eye, dir, up, dest); +} + +/*! + * @brief set up view matrix with right handed coordinate system. + * + * convenient wrapper for look: if you only have direction and if you don't + * care what UP vector is then this might be useful to create view matrix + * + * @param[in] eye eye vector + * @param[in] dir direction vector + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_look_anyup_rh_no(vec3 eye, vec3 dir, mat4 dest) { + glm_look_anyup_rh(eye, dir, dest); +} + +#endif /*cglm_view_rh_no_h*/ diff --git a/external/cglm/clipspace/view_rh_zo.h b/external/cglm/clipspace/view_rh_zo.h new file mode 100644 index 0000000..1ad5c91 --- /dev/null +++ b/external/cglm/clipspace/view_rh_zo.h @@ -0,0 +1,74 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE void glm_lookat_rh_zo(vec3 eye, vec3 center, vec3 up, mat4 dest) + CGLM_INLINE void glm_look_rh_zo(vec3 eye, vec3 dir, vec3 up, mat4 dest) + CGLM_INLINE void glm_look_anyup_rh_zo(vec3 eye, vec3 dir, mat4 dest) + */ + +#ifndef cglm_view_rh_zo_h +#define cglm_view_rh_zo_h + +#include "../common.h" +#include "view_rh.h" + +/*! + * @brief set up view matrix with right handed coordinate system. + * + * NOTE: The UP vector must not be parallel to the line of sight from + * the eye point to the reference point + * + * @param[in] eye eye vector + * @param[in] center center vector + * @param[in] up up vector + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_lookat_rh_zo(vec3 eye, vec3 center, vec3 up, mat4 dest) { + glm_lookat_rh(eye, center, up, dest); +} + +/*! + * @brief set up view matrix with right handed coordinate system. + * + * convenient wrapper for lookat: if you only have direction not target self + * then this might be useful. Because you need to get target from direction. + * + * NOTE: The UP vector must not be parallel to the line of sight from + * the eye point to the reference point + * + * @param[in] eye eye vector + * @param[in] dir direction vector + * @param[in] up up vector + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_look_rh_zo(vec3 eye, vec3 dir, vec3 up, mat4 dest) { + glm_look_rh(eye, dir, up, dest); +} + +/*! + * @brief set up view matrix with right handed coordinate system. 
+ * + * convenient wrapper for look: if you only have direction and if you don't + * care what UP vector is then this might be useful to create view matrix + * + * @param[in] eye eye vector + * @param[in] dir direction vector + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_look_anyup_rh_zo(vec3 eye, vec3 dir, mat4 dest) { + glm_look_anyup_rh(eye, dir, dest); +} + +#endif /*cglm_view_rh_zo_h*/ diff --git a/external/cglm/color.h b/external/cglm/color.h new file mode 100644 index 0000000..69566ad --- /dev/null +++ b/external/cglm/color.h @@ -0,0 +1,26 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_color_h +#define cglm_color_h + +#include "common.h" +#include "vec3.h" + +/*! + * @brief averages the color channels into one value + * + * @param[in] rgb RGB color + */ +CGLM_INLINE +float +glm_luminance(vec3 rgb) { + vec3 l = {0.212671f, 0.715160f, 0.072169f}; + return glm_dot(rgb, l); +} + +#endif /* cglm_color_h */ diff --git a/external/cglm/common.h b/external/cglm/common.h new file mode 100644 index 0000000..1af754d --- /dev/null +++ b/external/cglm/common.h @@ -0,0 +1,130 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_common_h +#define cglm_common_h + +#define __cglm__ 1 + +#ifndef _USE_MATH_DEFINES +# define _USE_MATH_DEFINES /* for windows */ +#endif + +#ifndef _CRT_SECURE_NO_WARNINGS +# define _CRT_SECURE_NO_WARNINGS /* for windows */ +#endif + +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +# ifdef CGLM_STATIC +# define CGLM_EXPORT +# elif defined(CGLM_EXPORTS) +# define CGLM_EXPORT __declspec(dllexport) +# else +# define CGLM_EXPORT __declspec(dllimport) +# endif +# define CGLM_INLINE __forceinline +#else +# define CGLM_EXPORT __attribute__((visibility("default"))) +# define CGLM_INLINE static inline __attribute((always_inline)) +#endif + +#if defined(__GNUC__) || defined(__clang__) +# define CGLM_UNLIKELY(expr) __builtin_expect(!!(expr), 0) +# define CGLM_LIKELY(expr) __builtin_expect(!!(expr), 1) +#else +# define CGLM_UNLIKELY(expr) (expr) +# define CGLM_LIKELY(expr) (expr) +#endif + +#if defined(_M_FP_FAST) || defined(__FAST_MATH__) +# define CGLM_FAST_MATH +#endif + +#define GLM_SHUFFLE4(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) +#define GLM_SHUFFLE3(z, y, x) (((z) << 4) | ((y) << 2) | (x)) +#define GLM_SHUFFLE2(y, x) (((y) << 2) | (x)) + +#include "types.h" +#include "simd/intrin.h" + +#ifndef CGLM_USE_DEFAULT_EPSILON +# ifndef GLM_FLT_EPSILON +# define GLM_FLT_EPSILON 1e-5f +# endif +#else +# define GLM_FLT_EPSILON FLT_EPSILON +#endif + +/* + * Clip control: define CGLM_FORCE_DEPTH_ZERO_TO_ONE before including + * CGLM to use a clip space between 0 to 1. + * Coordinate system: define CGLM_FORCE_LEFT_HANDED before including + * CGLM to use the left handed coordinate system by default. 
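+ *
+ * For example (illustrative, not from the upstream header): a
+ * Direct3D-style target (left handed, depth in [0, 1]) would typically
+ * define both macros before any cglm header is included:
+ *
+ *   #define CGLM_FORCE_DEPTH_ZERO_TO_ONE
+ *   #define CGLM_FORCE_LEFT_HANDED
+ *
+ * which resolves CGLM_CONFIG_CLIP_CONTROL to CGLM_CLIP_CONTROL_LH_ZO below.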
+ */ + +#define CGLM_CLIP_CONTROL_ZO_BIT (1 << 0) /* ZERO_TO_ONE */ +#define CGLM_CLIP_CONTROL_NO_BIT (1 << 1) /* NEGATIVE_ONE_TO_ONE */ +#define CGLM_CLIP_CONTROL_LH_BIT (1 << 2) /* LEFT_HANDED, For DirectX, Metal, Vulkan */ +#define CGLM_CLIP_CONTROL_RH_BIT (1 << 3) /* RIGHT_HANDED, For OpenGL, default in GLM */ + +#define CGLM_CLIP_CONTROL_LH_ZO (CGLM_CLIP_CONTROL_LH_BIT | CGLM_CLIP_CONTROL_ZO_BIT) +#define CGLM_CLIP_CONTROL_LH_NO (CGLM_CLIP_CONTROL_LH_BIT | CGLM_CLIP_CONTROL_NO_BIT) +#define CGLM_CLIP_CONTROL_RH_ZO (CGLM_CLIP_CONTROL_RH_BIT | CGLM_CLIP_CONTROL_ZO_BIT) +#define CGLM_CLIP_CONTROL_RH_NO (CGLM_CLIP_CONTROL_RH_BIT | CGLM_CLIP_CONTROL_NO_BIT) + +#ifdef CGLM_FORCE_DEPTH_ZERO_TO_ONE +# ifdef CGLM_FORCE_LEFT_HANDED +# define CGLM_CONFIG_CLIP_CONTROL CGLM_CLIP_CONTROL_LH_ZO +# else +# define CGLM_CONFIG_CLIP_CONTROL CGLM_CLIP_CONTROL_RH_ZO +# endif +#else +# ifdef CGLM_FORCE_LEFT_HANDED +# define CGLM_CONFIG_CLIP_CONTROL CGLM_CLIP_CONTROL_LH_NO +# else +# define CGLM_CONFIG_CLIP_CONTROL CGLM_CLIP_CONTROL_RH_NO +# endif +#endif + +/* struct API configurator */ +/* TODO: move struct/common.h? */ +/* WARN: dont use concant helpers outside cglm headers, because they may be changed */ + +#define CGLM_MACRO_CONCAT_HELPER(A, B, C, D, E, ...) A ## B ## C ## D ## E ## __VA_ARGS__ +#define CGLM_MACRO_CONCAT(A, B, C, D, E, ...) CGLM_MACRO_CONCAT_HELPER(A, B, C, D, E,__VA_ARGS__) + +#ifndef CGLM_OMIT_NS_FROM_STRUCT_API +# ifndef CGLM_STRUCT_API_NS +# define CGLM_STRUCT_API_NS glms +# endif +# ifndef CGLM_STRUCT_API_NS_SEPERATOR +# define CGLM_STRUCT_API_NS_SEPERATOR _ +# endif +#else +# define CGLM_STRUCT_API_NS +# define CGLM_STRUCT_API_NS_SEPERATOR +#endif + +#ifndef CGLM_STRUCT_API_NAME_SUFFIX +# define CGLM_STRUCT_API_NAME_SUFFIX +#endif + +#define CGLM_STRUCTAPI(A, ...) CGLM_MACRO_CONCAT(CGLM_STRUCT_API_NS, \ + CGLM_STRUCT_API_NS_SEPERATOR, \ + A, \ + CGLM_STRUCT_API_NAME_SUFFIX, \ + _, \ + __VA_ARGS__) + +#endif /* cglm_common_h */ diff --git a/external/cglm/curve.h b/external/cglm/curve.h new file mode 100644 index 0000000..5033be5 --- /dev/null +++ b/external/cglm/curve.h @@ -0,0 +1,40 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_curve_h +#define cglm_curve_h + +#include "common.h" +#include "vec4.h" +#include "mat4.h" + +/*! + * @brief helper function to calculate S*M*C multiplication for curves + * + * This function does not encourage you to use SMC, + * instead it is a helper if you use SMC. + * + * if you want to specify S as vector then use more generic glm_mat4_rmc() func. + * + * Example usage: + * B(s) = glm_smc(s, GLM_BEZIER_MAT, (vec4){p0, c0, c1, p1}) + * + * @param[in] s parameter between 0 and 1 (this will be [s3, s2, s, 1]) + * @param[in] m basis matrix + * @param[in] c position/control vector + * + * @return B(s) + */ +CGLM_INLINE +float +glm_smc(float s, mat4 m, vec4 c) { + vec4 vs; + glm_vec4_cubic(s, vs); + return glm_mat4_rmc(vs, m, c); +} + +#endif /* cglm_curve_h */ diff --git a/external/cglm/ease.h b/external/cglm/ease.h new file mode 100644 index 0000000..e26b48c --- /dev/null +++ b/external/cglm/ease.h @@ -0,0 +1,317 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_ease_h +#define cglm_ease_h + +#include "common.h" + +CGLM_INLINE +float +glm_ease_linear(float t) { + return t; +} + +CGLM_INLINE +float +glm_ease_sine_in(float t) { + return sinf((t - 1.0f) * GLM_PI_2f) + 1.0f; +} + +CGLM_INLINE +float +glm_ease_sine_out(float t) { + return sinf(t * GLM_PI_2f); +} + +CGLM_INLINE +float +glm_ease_sine_inout(float t) { + return 0.5f * (1.0f - cosf(t * GLM_PIf)); +} + +CGLM_INLINE +float +glm_ease_quad_in(float t) { + return t * t; +} + +CGLM_INLINE +float +glm_ease_quad_out(float t) { + return -(t * (t - 2.0f)); +} + +CGLM_INLINE +float +glm_ease_quad_inout(float t) { + float tt; + + tt = t * t; + if (t < 0.5f) + return 2.0f * tt; + + return (-2.0f * tt) + (4.0f * t) - 1.0f; +} + +CGLM_INLINE +float +glm_ease_cubic_in(float t) { + return t * t * t; +} + +CGLM_INLINE +float +glm_ease_cubic_out(float t) { + float f; + f = t - 1.0f; + return f * f * f + 1.0f; +} + +CGLM_INLINE +float +glm_ease_cubic_inout(float t) { + float f; + + if (t < 0.5f) + return 4.0f * t * t * t; + + f = 2.0f * t - 2.0f; + + return 0.5f * f * f * f + 1.0f; +} + +CGLM_INLINE +float +glm_ease_quart_in(float t) { + float f; + f = t * t; + return f * f; +} + +CGLM_INLINE +float +glm_ease_quart_out(float t) { + float f; + + f = t - 1.0f; + + return f * f * f * (1.0f - t) + 1.0f; +} + +CGLM_INLINE +float +glm_ease_quart_inout(float t) { + float f, g; + + if (t < 0.5f) { + f = t * t; + return 8.0f * f * f; + } + + f = t - 1.0f; + g = f * f; + + return -8.0f * g * g + 1.0f; +} + +CGLM_INLINE +float +glm_ease_quint_in(float t) { + float f; + f = t * t; + return f * f * t; +} + +CGLM_INLINE +float +glm_ease_quint_out(float t) { + float f, g; + + f = t - 1.0f; + g = f * f; + + return g * g * f + 1.0f; +} + +CGLM_INLINE +float +glm_ease_quint_inout(float t) { + float f, g; + + if (t < 0.5f) { + f = t * t; + return 16.0f * f * f * t; + } + + f = 2.0f * t - 2.0f; + g = f * f; + + return 0.5f * g * g * f + 1.0f; +} + +CGLM_INLINE +float +glm_ease_exp_in(float t) { + if (t == 0.0f) + return t; + + return powf(2.0f, 10.0f * (t - 1.0f)); +} + +CGLM_INLINE +float +glm_ease_exp_out(float t) { + if (t == 1.0f) + return t; + + return 1.0f - powf(2.0f, -10.0f * t); +} + +CGLM_INLINE +float +glm_ease_exp_inout(float t) { + if (t == 0.0f || t == 1.0f) + return t; + + if (t < 0.5f) + return 0.5f * powf(2.0f, (20.0f * t) - 10.0f); + + return -0.5f * powf(2.0f, (-20.0f * t) + 10.0f) + 1.0f; +} + +CGLM_INLINE +float +glm_ease_circ_in(float t) { + return 1.0f - sqrtf(1.0f - (t * t)); +} + +CGLM_INLINE +float +glm_ease_circ_out(float t) { + return sqrtf((2.0f - t) * t); +} + +CGLM_INLINE +float +glm_ease_circ_inout(float t) { + if (t < 0.5f) + return 0.5f * (1.0f - sqrtf(1.0f - 4.0f * (t * t))); + + return 0.5f * (sqrtf(-((2.0f * t) - 3.0f) * ((2.0f * t) - 1.0f)) + 1.0f); +} + +CGLM_INLINE +float +glm_ease_back_in(float t) { + float o, z; + + o = 1.70158f; + z = ((o + 1.0f) * t) - o; + + return t * t * z; +} + +CGLM_INLINE +float +glm_ease_back_out(float t) { + float o, z, n; + + o = 1.70158f; + n = t - 1.0f; + z = (o + 1.0f) * n + o; + + return n * n * z + 1.0f; +} + +CGLM_INLINE +float +glm_ease_back_inout(float t) { + float o, z, n, m, s, x; + + o = 1.70158f; + s = o * 1.525f; + x = 0.5f; + n = t / 0.5f; + + if (n < 1.0f) { + z = (s + 1) * n - s; + m = n * n * z; + return x * m; + } + + n -= 2.0f; + z = (s + 1.0f) * n + s; + m = (n * n * z) + 2; + + return x * m; +} + +CGLM_INLINE 
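+/*
+ * Illustrative note (not part of upstream cglm): each easing helper in this
+ * header maps t in [0, 1] to an eased weight, typically fed into a lerp.
+ * A sketch, where elapsed/duration/from/to/position are hypothetical
+ * variables and glm_vec3_lerp is cglm's vec3 lerp from vec3.h:
+ *
+ *   float t = elapsed / duration;            clamp to [0, 1] first
+ *   float k = glm_ease_quad_inout(t);
+ *   glm_vec3_lerp(from, to, k, position);
+ */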
+float +glm_ease_elast_in(float t) { + return sinf(13.0f * GLM_PI_2f * t) * powf(2.0f, 10.0f * (t - 1.0f)); +} + +CGLM_INLINE +float +glm_ease_elast_out(float t) { + return sinf(-13.0f * GLM_PI_2f * (t + 1.0f)) * powf(2.0f, -10.0f * t) + 1.0f; +} + +CGLM_INLINE +float +glm_ease_elast_inout(float t) { + float a; + + a = 2.0f * t; + + if (t < 0.5f) + return 0.5f * sinf(13.0f * GLM_PI_2f * a) + * powf(2.0f, 10.0f * (a - 1.0f)); + + return 0.5f * (sinf(-13.0f * GLM_PI_2f * a) + * powf(2.0f, -10.0f * (a - 1.0f)) + 2.0f); +} + +CGLM_INLINE +float +glm_ease_bounce_out(float t) { + float tt; + + tt = t * t; + + if (t < (4.0f / 11.0f)) + return (121.0f * tt) / 16.0f; + + if (t < 8.0f / 11.0f) + return ((363.0f / 40.0f) * tt) - ((99.0f / 10.0f) * t) + (17.0f / 5.0f); + + if (t < (9.0f / 10.0f)) + return (4356.0f / 361.0f) * tt + - (35442.0f / 1805.0f) * t + + (16061.0f / 1805.0f); + + return ((54.0f / 5.0f) * tt) - ((513.0f / 25.0f) * t) + (268.0f / 25.0f); +} + +CGLM_INLINE +float +glm_ease_bounce_in(float t) { + return 1.0f - glm_ease_bounce_out(1.0f - t); +} + +CGLM_INLINE +float +glm_ease_bounce_inout(float t) { + if (t < 0.5f) + return 0.5f * (1.0f - glm_ease_bounce_out(t * 2.0f)); + + return 0.5f * glm_ease_bounce_out(t * 2.0f - 1.0f) + 0.5f; +} + +#endif /* cglm_ease_h */ diff --git a/external/cglm/euler.h b/external/cglm/euler.h new file mode 100644 index 0000000..8fae039 --- /dev/null +++ b/external/cglm/euler.h @@ -0,0 +1,601 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + NOTE: + angles must be passed as [X-Angle, Y-Angle, Z-angle] order + For instance you don't pass angles as [Z-Angle, X-Angle, Y-angle] to + glm_euler_zxy function, All RELATED functions accept angles same order + which is [X, Y, Z]. 
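+
+   Illustrative example (not from upstream): to rotate 90 degrees about the
+   Y axis only, every function in this family receives the same input:
+
+     mat4 m;
+     vec3 angles = {0.0f, GLM_PI_2f, 0.0f};
+     glm_euler_zxy(angles, m);
+
+   only the order in which the three axis rotations are applied differs
+   between glm_euler_xyz, glm_euler_zxy, glm_euler_zyx, ...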
+ */ + +/* + Types: + enum glm_euler_seq + + Functions: + CGLM_INLINE glm_euler_seq glm_euler_order(int newOrder[3]); + CGLM_INLINE void glm_euler_angles(mat4 m, vec3 dest); + CGLM_INLINE void glm_euler(vec3 angles, mat4 dest); + CGLM_INLINE void glm_euler_xyz(vec3 angles, mat4 dest); + CGLM_INLINE void glm_euler_zyx(vec3 angles, mat4 dest); + CGLM_INLINE void glm_euler_zxy(vec3 angles, mat4 dest); + CGLM_INLINE void glm_euler_xzy(vec3 angles, mat4 dest); + CGLM_INLINE void glm_euler_yzx(vec3 angles, mat4 dest); + CGLM_INLINE void glm_euler_yxz(vec3 angles, mat4 dest); + CGLM_INLINE void glm_euler_by_order(vec3 angles, + glm_euler_seq ord, + mat4 dest); + CGLM_INLINE void glm_euler_xyz_quat(vec3 angles, versor dest); + CGLM_INLINE void glm_euler_xzy_quat(vec3 angles, versor dest); + CGLM_INLINE void glm_euler_yxz_quat(vec3 angles, versor dest); + CGLM_INLINE void glm_euler_yzx_quat(vec3 angles, versor dest); + CGLM_INLINE void glm_euler_zxy_quat(vec3 angles, versor dest); + CGLM_INLINE void glm_euler_zyx_quat(vec3 angles, versor dest); + */ + +#ifndef cglm_euler_h +#define cglm_euler_h + +#include "common.h" + +#ifdef CGLM_FORCE_LEFT_HANDED +# include "handed/euler_to_quat_lh.h" +#else +# include "handed/euler_to_quat_rh.h" +#endif + + +#ifndef CGLM_CLIPSPACE_INCLUDE_ALL +# if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO +# include "clipspace/ortho_lh_zo.h" +# include "clipspace/persp_lh_zo.h" +# include "clipspace/view_lh_zo.h" +# elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO +# include "clipspace/ortho_lh_no.h" +# include "clipspace/persp_lh_no.h" +# include "clipspace/view_lh_no.h" +# elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO +# include "clipspace/ortho_rh_zo.h" +# include "clipspace/persp_rh_zo.h" +# include "clipspace/view_rh_zo.h" +# elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO +# include "clipspace/ortho_rh_no.h" +# include "clipspace/persp_rh_no.h" +# include "clipspace/view_rh_no.h" +# endif +#else +# include "clipspace/ortho_lh_zo.h" +# include "clipspace/persp_lh_zo.h" +# include "clipspace/ortho_lh_no.h" +# include "clipspace/persp_lh_no.h" +# include "clipspace/ortho_rh_zo.h" +# include "clipspace/persp_rh_zo.h" +# include "clipspace/ortho_rh_no.h" +# include "clipspace/persp_rh_no.h" +# include "clipspace/view_lh_zo.h" +# include "clipspace/view_lh_no.h" +# include "clipspace/view_rh_zo.h" +# include "clipspace/view_rh_no.h" +#endif + + +/*! + * if you have axis order like vec3 orderVec = [0, 1, 2] or [0, 2, 1]... + * vector then you can convert it to this enum by doing this: + * @code + * glm_euler_seq order; + * order = orderVec[0] | orderVec[1] << 2 | orderVec[2] << 4; + * @endcode + * you may need to explicit cast if required + */ +typedef enum glm_euler_seq { + GLM_EULER_XYZ = 0 << 0 | 1 << 2 | 2 << 4, + GLM_EULER_XZY = 0 << 0 | 2 << 2 | 1 << 4, + GLM_EULER_YZX = 1 << 0 | 2 << 2 | 0 << 4, + GLM_EULER_YXZ = 1 << 0 | 0 << 2 | 2 << 4, + GLM_EULER_ZXY = 2 << 0 | 0 << 2 | 1 << 4, + GLM_EULER_ZYX = 2 << 0 | 1 << 2 | 0 << 4 +} glm_euler_seq; + +CGLM_INLINE +glm_euler_seq +glm_euler_order(int ord[3]) { + return (glm_euler_seq)(ord[0] << 0 | ord[1] << 2 | ord[2] << 4); +} + +/*! 
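+ * Illustrative example (not from upstream): glm_euler_order() above packs a
+ * plain int[3] axis order into the enum value, e.g.
+ * @code
+ *   int ord[3] = {2, 0, 1};
+ *   glm_euler_seq s = glm_euler_order(ord);
+ *   assert(s == GLM_EULER_ZXY);
+ * @endcode
+ * since (2 | 0 << 2 | 1 << 4) == 18 == GLM_EULER_ZXY, and the result can be
+ * passed to glm_euler_by_order() further below.
+ */
+
+/*!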
+ * @brief extract euler angles (in radians) using xyz order + * + * @param[in] m affine transform + * @param[out] dest angles vector [x, y, z] + */ +CGLM_INLINE +void +glm_euler_angles(mat4 m, vec3 dest) { + float m00, m01, m10, m11, m20, m21, m22; + float thetaX, thetaY, thetaZ; + + m00 = m[0][0]; m10 = m[1][0]; m20 = m[2][0]; + m01 = m[0][1]; m11 = m[1][1]; m21 = m[2][1]; + m22 = m[2][2]; + + if (m20 < 1.0f) { + if (m20 > -1.0f) { + thetaY = asinf(m20); + thetaX = atan2f(-m21, m22); + thetaZ = atan2f(-m10, m00); + } else { /* m20 == -1 */ + /* Not a unique solution */ + thetaY = -GLM_PI_2f; + thetaX = -atan2f(m01, m11); + thetaZ = 0.0f; + } + } else { /* m20 == +1 */ + thetaY = GLM_PI_2f; + thetaX = atan2f(m01, m11); + thetaZ = 0.0f; + } + + dest[0] = thetaX; + dest[1] = thetaY; + dest[2] = thetaZ; +} + +/*! + * @brief build rotation matrix from euler angles + * + * @param[in] angles angles as vector [Xangle, Yangle, Zangle] + * @param[out] dest rotation matrix + */ +CGLM_INLINE +void +glm_euler_xyz(vec3 angles, mat4 dest) { + float cx, cy, cz, + sx, sy, sz, czsx, cxcz, sysz; + + sx = sinf(angles[0]); cx = cosf(angles[0]); + sy = sinf(angles[1]); cy = cosf(angles[1]); + sz = sinf(angles[2]); cz = cosf(angles[2]); + + czsx = cz * sx; + cxcz = cx * cz; + sysz = sy * sz; + + dest[0][0] = cy * cz; + dest[0][1] = czsx * sy + cx * sz; + dest[0][2] = -cxcz * sy + sx * sz; + dest[1][0] = -cy * sz; + dest[1][1] = cxcz - sx * sysz; + dest[1][2] = czsx + cx * sysz; + dest[2][0] = sy; + dest[2][1] = -cy * sx; + dest[2][2] = cx * cy; + dest[0][3] = 0.0f; + dest[1][3] = 0.0f; + dest[2][3] = 0.0f; + dest[3][0] = 0.0f; + dest[3][1] = 0.0f; + dest[3][2] = 0.0f; + dest[3][3] = 1.0f; +} + +/*! + * @brief build rotation matrix from euler angles + * + * @param[in] angles angles as vector [Xangle, Yangle, Zangle] + * @param[out] dest rotation matrix + */ +CGLM_INLINE +void +glm_euler(vec3 angles, mat4 dest) { + glm_euler_xyz(angles, dest); +} + +/*! + * @brief build rotation matrix from euler angles + * + * @param[in] angles angles as vector [Xangle, Yangle, Zangle] + * @param[out] dest rotation matrix + */ +CGLM_INLINE +void +glm_euler_xzy(vec3 angles, mat4 dest) { + float cx, cy, cz, + sx, sy, sz, sxsy, cysx, cxsy, cxcy; + + sx = sinf(angles[0]); cx = cosf(angles[0]); + sy = sinf(angles[1]); cy = cosf(angles[1]); + sz = sinf(angles[2]); cz = cosf(angles[2]); + + sxsy = sx * sy; + cysx = cy * sx; + cxsy = cx * sy; + cxcy = cx * cy; + + dest[0][0] = cy * cz; + dest[0][1] = sxsy + cxcy * sz; + dest[0][2] = -cxsy + cysx * sz; + dest[1][0] = -sz; + dest[1][1] = cx * cz; + dest[1][2] = cz * sx; + dest[2][0] = cz * sy; + dest[2][1] = -cysx + cxsy * sz; + dest[2][2] = cxcy + sxsy * sz; + dest[0][3] = 0.0f; + dest[1][3] = 0.0f; + dest[2][3] = 0.0f; + dest[3][0] = 0.0f; + dest[3][1] = 0.0f; + dest[3][2] = 0.0f; + dest[3][3] = 1.0f; +} + +/*! 
+ * @brief build rotation matrix from euler angles + * + * @param[in] angles angles as vector [Xangle, Yangle, Zangle] + * @param[out] dest rotation matrix + */ +CGLM_INLINE +void +glm_euler_yxz(vec3 angles, mat4 dest) { + float cx, cy, cz, + sx, sy, sz, cycz, sysz, czsy, cysz; + + sx = sinf(angles[0]); cx = cosf(angles[0]); + sy = sinf(angles[1]); cy = cosf(angles[1]); + sz = sinf(angles[2]); cz = cosf(angles[2]); + + cycz = cy * cz; + sysz = sy * sz; + czsy = cz * sy; + cysz = cy * sz; + + dest[0][0] = cycz + sx * sysz; + dest[0][1] = cx * sz; + dest[0][2] = -czsy + cysz * sx; + dest[1][0] = -cysz + czsy * sx; + dest[1][1] = cx * cz; + dest[1][2] = cycz * sx + sysz; + dest[2][0] = cx * sy; + dest[2][1] = -sx; + dest[2][2] = cx * cy; + dest[0][3] = 0.0f; + dest[1][3] = 0.0f; + dest[2][3] = 0.0f; + dest[3][0] = 0.0f; + dest[3][1] = 0.0f; + dest[3][2] = 0.0f; + dest[3][3] = 1.0f; +} + +/*! + * @brief build rotation matrix from euler angles + * + * @param[in] angles angles as vector [Xangle, Yangle, Zangle] + * @param[out] dest rotation matrix + */ +CGLM_INLINE +void +glm_euler_yzx(vec3 angles, mat4 dest) { + float cx, cy, cz, + sx, sy, sz, sxsy, cxcy, cysx, cxsy; + + sx = sinf(angles[0]); cx = cosf(angles[0]); + sy = sinf(angles[1]); cy = cosf(angles[1]); + sz = sinf(angles[2]); cz = cosf(angles[2]); + + sxsy = sx * sy; + cxcy = cx * cy; + cysx = cy * sx; + cxsy = cx * sy; + + dest[0][0] = cy * cz; + dest[0][1] = sz; + dest[0][2] = -cz * sy; + dest[1][0] = sxsy - cxcy * sz; + dest[1][1] = cx * cz; + dest[1][2] = cysx + cxsy * sz; + dest[2][0] = cxsy + cysx * sz; + dest[2][1] = -cz * sx; + dest[2][2] = cxcy - sxsy * sz; + dest[0][3] = 0.0f; + dest[1][3] = 0.0f; + dest[2][3] = 0.0f; + dest[3][0] = 0.0f; + dest[3][1] = 0.0f; + dest[3][2] = 0.0f; + dest[3][3] = 1.0f; +} + +/*! + * @brief build rotation matrix from euler angles + * + * @param[in] angles angles as vector [Xangle, Yangle, Zangle] + * @param[out] dest rotation matrix + */ +CGLM_INLINE +void +glm_euler_zxy(vec3 angles, mat4 dest) { + float cx, cy, cz, + sx, sy, sz, cycz, sxsy, cysz; + + sx = sinf(angles[0]); cx = cosf(angles[0]); + sy = sinf(angles[1]); cy = cosf(angles[1]); + sz = sinf(angles[2]); cz = cosf(angles[2]); + + cycz = cy * cz; + sxsy = sx * sy; + cysz = cy * sz; + + dest[0][0] = cycz - sxsy * sz; + dest[0][1] = cz * sxsy + cysz; + dest[0][2] = -cx * sy; + dest[1][0] = -cx * sz; + dest[1][1] = cx * cz; + dest[1][2] = sx; + dest[2][0] = cz * sy + cysz * sx; + dest[2][1] = -cycz * sx + sy * sz; + dest[2][2] = cx * cy; + dest[0][3] = 0.0f; + dest[1][3] = 0.0f; + dest[2][3] = 0.0f; + dest[3][0] = 0.0f; + dest[3][1] = 0.0f; + dest[3][2] = 0.0f; + dest[3][3] = 1.0f; +} + +/*! 
+ * @brief build rotation matrix from euler angles + * + * @param[in] angles angles as vector [Xangle, Yangle, Zangle] + * @param[out] dest rotation matrix + */ +CGLM_INLINE +void +glm_euler_zyx(vec3 angles, mat4 dest) { + float cx, cy, cz, + sx, sy, sz, czsx, cxcz, sysz; + + sx = sinf(angles[0]); cx = cosf(angles[0]); + sy = sinf(angles[1]); cy = cosf(angles[1]); + sz = sinf(angles[2]); cz = cosf(angles[2]); + + czsx = cz * sx; + cxcz = cx * cz; + sysz = sy * sz; + + dest[0][0] = cy * cz; + dest[0][1] = cy * sz; + dest[0][2] = -sy; + dest[1][0] = czsx * sy - cx * sz; + dest[1][1] = cxcz + sx * sysz; + dest[1][2] = cy * sx; + dest[2][0] = cxcz * sy + sx * sz; + dest[2][1] = -czsx + cx * sysz; + dest[2][2] = cx * cy; + dest[0][3] = 0.0f; + dest[1][3] = 0.0f; + dest[2][3] = 0.0f; + dest[3][0] = 0.0f; + dest[3][1] = 0.0f; + dest[3][2] = 0.0f; + dest[3][3] = 1.0f; +} + +/*! + * @brief build rotation matrix from euler angles + * + * @param[in] angles angles as vector [Xangle, Yangle, Zangle] + * @param[in] ord euler order + * @param[out] dest rotation matrix + */ +CGLM_INLINE +void +glm_euler_by_order(vec3 angles, glm_euler_seq ord, mat4 dest) { + float cx, cy, cz, + sx, sy, sz; + + float cycz, cysz, cysx, cxcy, + czsy, cxcz, czsx, cxsz, + sysz; + + sx = sinf(angles[0]); cx = cosf(angles[0]); + sy = sinf(angles[1]); cy = cosf(angles[1]); + sz = sinf(angles[2]); cz = cosf(angles[2]); + + cycz = cy * cz; cysz = cy * sz; + cysx = cy * sx; cxcy = cx * cy; + czsy = cz * sy; cxcz = cx * cz; + czsx = cz * sx; cxsz = cx * sz; + sysz = sy * sz; + + switch (ord) { + case GLM_EULER_XZY: + dest[0][0] = cycz; + dest[0][1] = sx * sy + cx * cysz; + dest[0][2] = -cx * sy + cysx * sz; + dest[1][0] = -sz; + dest[1][1] = cxcz; + dest[1][2] = czsx; + dest[2][0] = czsy; + dest[2][1] = -cysx + cx * sysz; + dest[2][2] = cxcy + sx * sysz; + break; + case GLM_EULER_XYZ: + dest[0][0] = cycz; + dest[0][1] = czsx * sy + cxsz; + dest[0][2] = -cx * czsy + sx * sz; + dest[1][0] = -cysz; + dest[1][1] = cxcz - sx * sysz; + dest[1][2] = czsx + cx * sysz; + dest[2][0] = sy; + dest[2][1] = -cysx; + dest[2][2] = cxcy; + break; + case GLM_EULER_YXZ: + dest[0][0] = cycz + sx * sysz; + dest[0][1] = cxsz; + dest[0][2] = -czsy + cysx * sz; + dest[1][0] = czsx * sy - cysz; + dest[1][1] = cxcz; + dest[1][2] = cycz * sx + sysz; + dest[2][0] = cx * sy; + dest[2][1] = -sx; + dest[2][2] = cxcy; + break; + case GLM_EULER_YZX: + dest[0][0] = cycz; + dest[0][1] = sz; + dest[0][2] = -czsy; + dest[1][0] = sx * sy - cx * cysz; + dest[1][1] = cxcz; + dest[1][2] = cysx + cx * sysz; + dest[2][0] = cx * sy + cysx * sz; + dest[2][1] = -czsx; + dest[2][2] = cxcy - sx * sysz; + break; + case GLM_EULER_ZXY: + dest[0][0] = cycz - sx * sysz; + dest[0][1] = czsx * sy + cysz; + dest[0][2] = -cx * sy; + dest[1][0] = -cxsz; + dest[1][1] = cxcz; + dest[1][2] = sx; + dest[2][0] = czsy + cysx * sz; + dest[2][1] = -cycz * sx + sysz; + dest[2][2] = cxcy; + break; + case GLM_EULER_ZYX: + dest[0][0] = cycz; + dest[0][1] = cysz; + dest[0][2] = -sy; + dest[1][0] = czsx * sy - cxsz; + dest[1][1] = cxcz + sx * sysz; + dest[1][2] = cysx; + dest[2][0] = cx * czsy + sx * sz; + dest[2][1] = -czsx + cx * sysz; + dest[2][2] = cxcy; + break; + } + + dest[0][3] = 0.0f; + dest[1][3] = 0.0f; + dest[2][3] = 0.0f; + dest[3][0] = 0.0f; + dest[3][1] = 0.0f; + dest[3][2] = 0.0f; + dest[3][3] = 1.0f; +} + + +/*! 
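+ * Illustrative example (not from upstream): the *_quat variants below build
+ * a versor (quaternion) instead of a matrix; angles are radians in
+ * [X, Y, Z] order and the values here are made up:
+ * @code
+ *   versor q;
+ *   vec3   angles = {0.3f, 0.0f, 1.2f};
+ *   glm_euler_xyz_quat(angles, q);
+ * @endcode
+ * Handedness is selected at compile time via CGLM_FORCE_LEFT_HANDED, as the
+ * thin wrappers below show.
+ */
+
+/*!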
+ * @brief creates NEW quaternion using rotation angles and does + * rotations in x y z order (roll pitch yaw) + * + * @param[in] angles angles x y z (radians) + * @param[out] dest quaternion + */ +CGLM_INLINE +void +glm_euler_xyz_quat(vec3 angles, versor dest) { +#ifdef CGLM_FORCE_LEFT_HANDED + glm_euler_xyz_quat_lh(angles, dest); +#else + glm_euler_xyz_quat_rh(angles, dest); +#endif +} + +/*! + * @brief creates NEW quaternion using rotation angles and does + * rotations in x z y order (roll yaw pitch) + * + * @param[in] angles angles x y z (radians) + * @param[out] dest quaternion + */ +CGLM_INLINE +void +glm_euler_xzy_quat(vec3 angles, versor dest) { +#ifdef CGLM_FORCE_LEFT_HANDED + glm_euler_xzy_quat_lh(angles, dest); +#else + glm_euler_xzy_quat_rh(angles, dest); +#endif +} + +/*! + * @brief creates NEW quaternion using rotation angles and does + * rotations in y x z order (pitch roll yaw) + * + * @param[in] angles angles x y z (radians) + * @param[out] dest quaternion + */ +CGLM_INLINE +void +glm_euler_yxz_quat(vec3 angles, versor dest) { +#ifdef CGLM_FORCE_LEFT_HANDED + glm_euler_yxz_quat_lh(angles, dest); +#else + glm_euler_yxz_quat_rh(angles, dest); +#endif +} + +/*! + * @brief creates NEW quaternion using rotation angles and does + * rotations in y z x order (pitch yaw roll) + * + * @param[in] angles angles x y z (radians) + * @param[out] dest quaternion + */ +CGLM_INLINE +void +glm_euler_yzx_quat(vec3 angles, versor dest) { +#ifdef CGLM_FORCE_LEFT_HANDED + glm_euler_yzx_quat_lh(angles, dest); +#else + glm_euler_yzx_quat_rh(angles, dest); +#endif +} + +/*! + * @brief creates NEW quaternion using rotation angles and does + * rotations in z x y order (yaw roll pitch) + * + * @param[in] angles angles x y z (radians) + * @param[out] dest quaternion + */ +CGLM_INLINE +void +glm_euler_zxy_quat(vec3 angles, versor dest) { +#ifdef CGLM_FORCE_LEFT_HANDED + glm_euler_zxy_quat_lh(angles, dest); +#else + glm_euler_zxy_quat_rh(angles, dest); +#endif +} + +/*! + * @brief creates NEW quaternion using rotation angles and does + * rotations in z y x order (yaw pitch roll) + * + * @param[in] angles angles x y z (radians) + * @param[out] dest quaternion + */ +CGLM_INLINE +void +glm_euler_zyx_quat(vec3 angles, versor dest) { +#ifdef CGLM_FORCE_LEFT_HANDED + glm_euler_zyx_quat_lh(angles, dest); +#else + glm_euler_zyx_quat_rh(angles, dest); +#endif +} + + +#endif /* cglm_euler_h */ diff --git a/external/cglm/frustum.h b/external/cglm/frustum.h new file mode 100644 index 0000000..5aa3c17 --- /dev/null +++ b/external/cglm/frustum.h @@ -0,0 +1,255 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_frustum_h +#define cglm_frustum_h + +#include "common.h" +#include "plane.h" +#include "vec3.h" +#include "vec4.h" +#include "mat4.h" + +#define GLM_LBN 0 /* left bottom near */ +#define GLM_LTN 1 /* left top near */ +#define GLM_RTN 2 /* right top near */ +#define GLM_RBN 3 /* right bottom near */ + +#define GLM_LBF 4 /* left bottom far */ +#define GLM_LTF 5 /* left top far */ +#define GLM_RTF 6 /* right top far */ +#define GLM_RBF 7 /* right bottom far */ + +#define GLM_LEFT 0 +#define GLM_RIGHT 1 +#define GLM_BOTTOM 2 +#define GLM_TOP 3 +#define GLM_NEAR 4 +#define GLM_FAR 5 + +/* you can override clip space coords + but you have to provide all with same name + e.g.: define GLM_CSCOORD_LBN {0.0f, 0.0f, 1.0f, 1.0f} */ +#ifndef GLM_CUSTOM_CLIPSPACE + +/* near */ +#define GLM_CSCOORD_LBN {-1.0f, -1.0f, -1.0f, 1.0f} +#define GLM_CSCOORD_LTN {-1.0f, 1.0f, -1.0f, 1.0f} +#define GLM_CSCOORD_RTN { 1.0f, 1.0f, -1.0f, 1.0f} +#define GLM_CSCOORD_RBN { 1.0f, -1.0f, -1.0f, 1.0f} + +/* far */ +#define GLM_CSCOORD_LBF {-1.0f, -1.0f, 1.0f, 1.0f} +#define GLM_CSCOORD_LTF {-1.0f, 1.0f, 1.0f, 1.0f} +#define GLM_CSCOORD_RTF { 1.0f, 1.0f, 1.0f, 1.0f} +#define GLM_CSCOORD_RBF { 1.0f, -1.0f, 1.0f, 1.0f} + +#endif + +/*! + * @brief extracts view frustum planes + * + * planes' space: + * 1- if m = proj: View Space + * 2- if m = viewProj: World Space + * 3- if m = MVP: Object Space + * + * You probably want to extract planes in world space so use viewProj as m + * Computing viewProj: + * glm_mat4_mul(proj, view, viewProj); + * + * Exracted planes order: [left, right, bottom, top, near, far] + * + * @param[in] m matrix (see brief) + * @param[out] dest extracted view frustum planes (see brief) + */ +CGLM_INLINE +void +glm_frustum_planes(mat4 m, vec4 dest[6]) { + mat4 t; + + glm_mat4_transpose_to(m, t); + + glm_vec4_add(t[3], t[0], dest[0]); /* left */ + glm_vec4_sub(t[3], t[0], dest[1]); /* right */ + glm_vec4_add(t[3], t[1], dest[2]); /* bottom */ + glm_vec4_sub(t[3], t[1], dest[3]); /* top */ + glm_vec4_add(t[3], t[2], dest[4]); /* near */ + glm_vec4_sub(t[3], t[2], dest[5]); /* far */ + + glm_plane_normalize(dest[0]); + glm_plane_normalize(dest[1]); + glm_plane_normalize(dest[2]); + glm_plane_normalize(dest[3]); + glm_plane_normalize(dest[4]); + glm_plane_normalize(dest[5]); +} + +/*! + * @brief extracts view frustum corners using clip-space coordinates + * + * corners' space: + * 1- if m = invViewProj: World Space + * 2- if m = invMVP: Object Space + * + * You probably want to extract corners in world space so use invViewProj + * Computing invViewProj: + * glm_mat4_mul(proj, view, viewProj); + * ... 
+ * glm_mat4_inv(viewProj, invViewProj); + * + * if you have a near coord at i index, you can get it's far coord by i + 4 + * + * Find center coordinates: + * for (j = 0; j < 4; j++) { + * glm_vec3_center(corners[i], corners[i + 4], centerCorners[i]); + * } + * + * @param[in] invMat matrix (see brief) + * @param[out] dest exracted view frustum corners (see brief) + */ +CGLM_INLINE +void +glm_frustum_corners(mat4 invMat, vec4 dest[8]) { + vec4 c[8]; + + /* indexOf(nearCoord) = indexOf(farCoord) + 4 */ + vec4 csCoords[8] = { + GLM_CSCOORD_LBN, + GLM_CSCOORD_LTN, + GLM_CSCOORD_RTN, + GLM_CSCOORD_RBN, + + GLM_CSCOORD_LBF, + GLM_CSCOORD_LTF, + GLM_CSCOORD_RTF, + GLM_CSCOORD_RBF + }; + + glm_mat4_mulv(invMat, csCoords[0], c[0]); + glm_mat4_mulv(invMat, csCoords[1], c[1]); + glm_mat4_mulv(invMat, csCoords[2], c[2]); + glm_mat4_mulv(invMat, csCoords[3], c[3]); + glm_mat4_mulv(invMat, csCoords[4], c[4]); + glm_mat4_mulv(invMat, csCoords[5], c[5]); + glm_mat4_mulv(invMat, csCoords[6], c[6]); + glm_mat4_mulv(invMat, csCoords[7], c[7]); + + glm_vec4_scale(c[0], 1.0f / c[0][3], dest[0]); + glm_vec4_scale(c[1], 1.0f / c[1][3], dest[1]); + glm_vec4_scale(c[2], 1.0f / c[2][3], dest[2]); + glm_vec4_scale(c[3], 1.0f / c[3][3], dest[3]); + glm_vec4_scale(c[4], 1.0f / c[4][3], dest[4]); + glm_vec4_scale(c[5], 1.0f / c[5][3], dest[5]); + glm_vec4_scale(c[6], 1.0f / c[6][3], dest[6]); + glm_vec4_scale(c[7], 1.0f / c[7][3], dest[7]); +} + +/*! + * @brief finds center of view frustum + * + * @param[in] corners view frustum corners + * @param[out] dest view frustum center + */ +CGLM_INLINE +void +glm_frustum_center(vec4 corners[8], vec4 dest) { + vec4 center; + + glm_vec4_copy(corners[0], center); + + glm_vec4_add(corners[1], center, center); + glm_vec4_add(corners[2], center, center); + glm_vec4_add(corners[3], center, center); + glm_vec4_add(corners[4], center, center); + glm_vec4_add(corners[5], center, center); + glm_vec4_add(corners[6], center, center); + glm_vec4_add(corners[7], center, center); + + glm_vec4_scale(center, 0.125f, dest); +} + +/*! + * @brief finds bounding box of frustum relative to given matrix e.g. view mat + * + * @param[in] corners view frustum corners + * @param[in] m matrix to convert existing conners + * @param[out] box bounding box as array [min, max] + */ +CGLM_INLINE +void +glm_frustum_box(vec4 corners[8], mat4 m, vec3 box[2]) { + vec4 v; + vec3 min, max; + int i; + + glm_vec3_broadcast(FLT_MAX, min); + glm_vec3_broadcast(-FLT_MAX, max); + + for (i = 0; i < 8; i++) { + glm_mat4_mulv(m, corners[i], v); + + min[0] = glm_min(min[0], v[0]); + min[1] = glm_min(min[1], v[1]); + min[2] = glm_min(min[2], v[2]); + + max[0] = glm_max(max[0], v[0]); + max[1] = glm_max(max[1], v[1]); + max[2] = glm_max(max[2], v[2]); + } + + glm_vec3_copy(min, box[0]); + glm_vec3_copy(max, box[1]); +} + +/*! + * @brief finds planes corners which is between near and far planes (parallel) + * + * this will be helpful if you want to split a frustum e.g. CSM/PSSM. This will + * find planes' corners but you will need to one more plane. 
+ * Actually you have it, it is near, far or created previously with this func ;) + * + * @param[in] corners view frustum corners + * @param[in] splitDist split distance + * @param[in] farDist far distance (zFar) + * @param[out] planeCorners plane corners [LB, LT, RT, RB] + */ +CGLM_INLINE +void +glm_frustum_corners_at(vec4 corners[8], + float splitDist, + float farDist, + vec4 planeCorners[4]) { + vec4 corner; + float dist, sc; + + /* because distance and scale is same for all */ + dist = glm_vec3_distance(corners[GLM_RTF], corners[GLM_RTN]); + sc = dist * (splitDist / farDist); + + /* left bottom */ + glm_vec4_sub(corners[GLM_LBF], corners[GLM_LBN], corner); + glm_vec4_scale_as(corner, sc, corner); + glm_vec4_add(corners[GLM_LBN], corner, planeCorners[0]); + + /* left top */ + glm_vec4_sub(corners[GLM_LTF], corners[GLM_LTN], corner); + glm_vec4_scale_as(corner, sc, corner); + glm_vec4_add(corners[GLM_LTN], corner, planeCorners[1]); + + /* right top */ + glm_vec4_sub(corners[GLM_RTF], corners[GLM_RTN], corner); + glm_vec4_scale_as(corner, sc, corner); + glm_vec4_add(corners[GLM_RTN], corner, planeCorners[2]); + + /* right bottom */ + glm_vec4_sub(corners[GLM_RBF], corners[GLM_RBN], corner); + glm_vec4_scale_as(corner, sc, corner); + glm_vec4_add(corners[GLM_RBN], corner, planeCorners[3]); +} + +#endif /* cglm_frustum_h */ diff --git a/external/cglm/handed/euler_to_quat_lh.h b/external/cglm/handed/euler_to_quat_lh.h new file mode 100644 index 0000000..1bb350b --- /dev/null +++ b/external/cglm/handed/euler_to_quat_lh.h @@ -0,0 +1,167 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE void glm_euler_xyz_quat_lh(vec3 angles, versor dest); + CGLM_INLINE void glm_euler_xzy_quat_lh(vec3 angles, versor dest); + CGLM_INLINE void glm_euler_yxz_quat_lh(vec3 angles, versor dest); + CGLM_INLINE void glm_euler_yzx_quat_lh(vec3 angles, versor dest); + CGLM_INLINE void glm_euler_zxy_quat_lh(vec3 angles, versor dest); + CGLM_INLINE void glm_euler_zyx_quat_lh(vec3 angles, versor dest); + */ + +/* + Things to note: + The only difference between euler to quat rh vs lh is that the zsin part is negative + */ + +#ifndef cglm_euler_to_quat_lh_h +#define cglm_euler_to_quat_lh_h + +#include "../common.h" + + +/*! + * @brief creates NEW quaternion using rotation angles and does + * rotations in x y z order in left hand (roll pitch yaw) + * + * @param[in] angles angles x y z (radians) + * @param[out] dest quaternion + */ +CGLM_INLINE +void +glm_euler_xyz_quat_lh(vec3 angles, versor dest) { + float xc, yc, zc, + xs, ys, zs; + + xs = sinf(angles[0] * 0.5f); xc = cosf(angles[0] * 0.5f); + ys = sinf(angles[1] * 0.5f); yc = cosf(angles[1] * 0.5f); + zs = -sinf(angles[2] * 0.5f); zc = cosf(angles[2] * 0.5f); + + dest[0] = xc * ys * zs + xs * yc * zc; + dest[1] = xc * ys * zc - xs * yc * zs; + dest[2] = xc * yc * zs + xs * ys * zc; + dest[3] = xc * yc * zc - xs * ys * zs; +} + +/*! 
+ * @brief creates NEW quaternion using rotation angles and does + * rotations in x z y order in left hand (roll yaw pitch) + * + * @param[in] angles angles x y z (radians) + * @param[out] dest quaternion + */ +CGLM_INLINE +void +glm_euler_xzy_quat_lh(vec3 angles, versor dest) { + float xc, yc, zc, + xs, ys, zs; + + xs = sinf(angles[0] * 0.5f); xc = cosf(angles[0] * 0.5f); + ys = sinf(angles[1] * 0.5f); yc = cosf(angles[1] * 0.5f); + zs = -sinf(angles[2] * 0.5f); zc = cosf(angles[2] * 0.5f); + + dest[0] = -xc * zs * ys + xs * zc * yc; + dest[1] = xc * zc * ys - xs * zs * yc; + dest[2] = xc * zs * yc + xs * zc * ys; + dest[3] = xc * zc * yc + xs * zs * ys; +} + +/*! + * @brief creates NEW quaternion using rotation angles and does + * rotations in y x z order in left hand (pitch roll yaw) + * + * @param[in] angles angles x y z (radians) + * @param[out] dest quaternion + */ +CGLM_INLINE +void +glm_euler_yxz_quat_lh(vec3 angles, versor dest) { + float xc, yc, zc, + xs, ys, zs; + + xs = sinf(angles[0] * 0.5f); xc = cosf(angles[0] * 0.5f); + ys = sinf(angles[1] * 0.5f); yc = cosf(angles[1] * 0.5f); + zs = -sinf(angles[2] * 0.5f); zc = cosf(angles[2] * 0.5f); + + dest[0] = yc * xs * zc + ys * xc * zs; + dest[1] = -yc * xs * zs + ys * xc * zc; + dest[2] = yc * xc * zs - ys * xs * zc; + dest[3] = yc * xc * zc + ys * xs * zs; +} + +/*! + * @brief creates NEW quaternion using rotation angles and does + * rotations in y z x order in left hand (pitch yaw roll) + * + * @param[in] angles angles x y z (radians) + * @param[out] dest quaternion + */ +CGLM_INLINE +void +glm_euler_yzx_quat_lh(vec3 angles, versor dest) { + float xc, yc, zc, + xs, ys, zs; + + xs = sinf(angles[0] * 0.5f); xc = cosf(angles[0] * 0.5f); + ys = sinf(angles[1] * 0.5f); yc = cosf(angles[1] * 0.5f); + zs = -sinf(angles[2] * 0.5f); zc = cosf(angles[2] * 0.5f); + + dest[0] = yc * zc * xs + ys * zs * xc; + dest[1] = yc * zs * xs + ys * zc * xc; + dest[2] = yc * zs * xc - ys * zc * xs; + dest[3] = yc * zc * xc - ys * zs * xs; +} + +/*! + * @brief creates NEW quaternion using rotation angles and does + * rotations in z x y order in left hand (yaw roll pitch) + * + * @param[in] angles angles x y z (radians) + * @param[out] dest quaternion + */ +CGLM_INLINE +void +glm_euler_zxy_quat_lh(vec3 angles, versor dest) { + float xc, yc, zc, + xs, ys, zs; + + xs = sinf(angles[0] * 0.5f); xc = cosf(angles[0] * 0.5f); + ys = sinf(angles[1] * 0.5f); yc = cosf(angles[1] * 0.5f); + zs = -sinf(angles[2] * 0.5f); zc = cosf(angles[2] * 0.5f); + + dest[0] = zc * xs * yc - zs * xc * ys; + dest[1] = zc * xc * ys + zs * xs * yc; + dest[2] = zc * xs * ys + zs * xc * yc; + dest[3] = zc * xc * yc - zs * xs * ys; +} + +/*! 
+ * @brief creates NEW quaternion using rotation angles and does + * rotations in z y x order in left hand (yaw pitch roll) + * + * @param[in] angles angles x y z (radians) + * @param[out] dest quaternion + */ +CGLM_INLINE +void +glm_euler_zyx_quat_lh(vec3 angles, versor dest) { + float xc, yc, zc, + xs, ys, zs; + + xs = sinf(angles[0] * 0.5f); xc = cosf(angles[0] * 0.5f); + ys = sinf(angles[1] * 0.5f); yc = cosf(angles[1] * 0.5f); + zs = -sinf(angles[2] * 0.5f); zc = cosf(angles[2] * 0.5f); + + dest[0] = zc * yc * xs - zs * ys * xc; + dest[1] = zc * ys * xc + zs * yc * xs; + dest[2] = -zc * ys * xs + zs * yc * xc; + dest[3] = zc * yc * xc + zs * ys * xs; +} + +#endif /*cglm_euler_to_quat_lh_h*/ diff --git a/external/cglm/handed/euler_to_quat_rh.h b/external/cglm/handed/euler_to_quat_rh.h new file mode 100644 index 0000000..aeb6f81 --- /dev/null +++ b/external/cglm/handed/euler_to_quat_rh.h @@ -0,0 +1,170 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE void glm_euler_xyz_quat_rh(vec3 angles, versor dest); + CGLM_INLINE void glm_euler_xzy_quat_rh(vec3 angles, versor dest); + CGLM_INLINE void glm_euler_yxz_quat_rh(vec3 angles, versor dest); + CGLM_INLINE void glm_euler_yzx_quat_rh(vec3 angles, versor dest); + CGLM_INLINE void glm_euler_zxy_quat_rh(vec3 angles, versor dest); + CGLM_INLINE void glm_euler_zyx_quat_rh(vec3 angles, versor dest); + */ + +/* + Things to note: + The only difference between euler to quat rh vs lh is that the zsin part is negative + */ + +#ifndef cglm_euler_to_quat_rh_h +#define cglm_euler_to_quat_rh_h + +#include "../common.h" + +/*! + * @brief creates NEW quaternion using rotation angles and does + * rotations in x y z order in right hand (roll pitch yaw) + * + * @param[in] angles angles x y z (radians) + * @param[out] dest quaternion + */ +CGLM_INLINE +void +glm_euler_xyz_quat_rh(vec3 angles, versor dest) { + float xc, yc, zc, + xs, ys, zs; + + xs = sinf(angles[0] * 0.5f); xc = cosf(angles[0] * 0.5f); + ys = sinf(angles[1] * 0.5f); yc = cosf(angles[1] * 0.5f); + zs = sinf(angles[2] * 0.5f); zc = cosf(angles[2] * 0.5f); + + dest[0] = xc * ys * zs + xs * yc * zc; + dest[1] = xc * ys * zc - xs * yc * zs; + dest[2] = xc * yc * zs + xs * ys * zc; + dest[3] = xc * yc * zc - xs * ys * zs; + +} + +/*! + * @brief creates NEW quaternion using rotation angles and does + * rotations in x z y order in right hand (roll yaw pitch) + * + * @param[in] angles angles x y z (radians) + * @param[out] dest quaternion + */ +CGLM_INLINE +void +glm_euler_xzy_quat_rh(vec3 angles, versor dest) { + float xc, yc, zc, + xs, ys, zs; + + xs = sinf(angles[0] * 0.5f); xc = cosf(angles[0] * 0.5f); + ys = sinf(angles[1] * 0.5f); yc = cosf(angles[1] * 0.5f); + zs = sinf(angles[2] * 0.5f); zc = cosf(angles[2] * 0.5f); + + dest[0] = -xc * zs * ys + xs * zc * yc; + dest[1] = xc * zc * ys - xs * zs * yc; + dest[2] = xc * zs * yc + xs * zc * ys; + dest[3] = xc * zc * yc + xs * zs * ys; + +} + +/*! 
+ * @brief creates NEW quaternion using rotation angles and does + * rotations in y x z order in right hand (pitch roll yaw) + * + * @param[in] angles angles x y z (radians) + * @param[out] dest quaternion + */ +CGLM_INLINE +void +glm_euler_yxz_quat_rh(vec3 angles, versor dest) { + float xc, yc, zc, + xs, ys, zs; + + xs = sinf(angles[0] * 0.5f); xc = cosf(angles[0] * 0.5f); + ys = sinf(angles[1] * 0.5f); yc = cosf(angles[1] * 0.5f); + zs = sinf(angles[2] * 0.5f); zc = cosf(angles[2] * 0.5f); + + dest[0] = yc * xs * zc + ys * xc * zs; + dest[1] = -yc * xs * zs + ys * xc * zc; + dest[2] = yc * xc * zs - ys * xs * zc; + dest[3] = yc * xc * zc + ys * xs * zs; +} + +/*! + * @brief creates NEW quaternion using rotation angles and does + * rotations in y z x order in right hand (pitch yaw roll) + * + * @param[in] angles angles x y z (radians) + * @param[out] dest quaternion + */ +CGLM_INLINE +void +glm_euler_yzx_quat_rh(vec3 angles, versor dest) { + float xc, yc, zc, + xs, ys, zs; + + xs = sinf(angles[0] * 0.5f); xc = cosf(angles[0] * 0.5f); + ys = sinf(angles[1] * 0.5f); yc = cosf(angles[1] * 0.5f); + zs = sinf(angles[2] * 0.5f); zc = cosf(angles[2] * 0.5f); + + dest[0] = yc * zc * xs + ys * zs * xc; + dest[1] = yc * zs * xs + ys * zc * xc; + dest[2] = yc * zs * xc - ys * zc * xs; + dest[3] = yc * zc * xc - ys * zs * xs; + +} + +/*! + * @brief creates NEW quaternion using rotation angles and does + * rotations in z x y order in right hand (yaw roll pitch) + * + * @param[in] angles angles x y z (radians) + * @param[out] dest quaternion + */ +CGLM_INLINE +void +glm_euler_zxy_quat_rh(vec3 angles, versor dest) { + float xc, yc, zc, + xs, ys, zs; + + xs = sinf(angles[0] * 0.5f); xc = cosf(angles[0] * 0.5f); + ys = sinf(angles[1] * 0.5f); yc = cosf(angles[1] * 0.5f); + zs = sinf(angles[2] * 0.5f); zc = cosf(angles[2] * 0.5f); + + dest[0] = zc * xs * yc - zs * xc * ys; + dest[1] = zc * xc * ys + zs * xs * yc; + dest[2] = zc * xs * ys + zs * xc * yc; + dest[3] = zc * xc * yc - zs * xs * ys; +} + +/*! + * @brief creates NEW quaternion using rotation angles and does + * rotations in z y x order in right hand (yaw pitch roll) + * + * @param[in] angles angles x y z (radians) + * @param[out] dest quaternion + */ +CGLM_INLINE +void +glm_euler_zyx_quat_rh(vec3 angles, versor dest) { + float xc, yc, zc, + xs, ys, zs; + + xs = sinf(angles[0] * 0.5f); xc = cosf(angles[0] * 0.5f); + ys = sinf(angles[1] * 0.5f); yc = cosf(angles[1] * 0.5f); + zs = sinf(angles[2] * 0.5f); zc = cosf(angles[2] * 0.5f); + + dest[0] = zc * yc * xs - zs * ys * xc; + dest[1] = zc * ys * xc + zs * yc * xs; + dest[2] = -zc * ys * xs + zs * yc * xc; + dest[3] = zc * yc * xc + zs * ys * xs; +} + + +#endif /*cglm_euler_to_quat_rh_h*/ diff --git a/external/cglm/io.h b/external/cglm/io.h new file mode 100644 index 0000000..baa80f1 --- /dev/null +++ b/external/cglm/io.h @@ -0,0 +1,440 @@ +/* + * Copyright (c), Recep Aslantas. 
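Illustrative usage, not part of the commit: as the note in euler_to_quat_rh.h says, the left- and right-handed converters differ only in the sign applied to the z half-angle sine, so a call site switches handedness by swapping the _lh/_rh suffix. A minimal sketch, assuming these headers are reachable through the usual cglm umbrella include (the exact include path inside this project is an assumption):

#include <stdio.h>
#include <cglm/cglm.h>   /* or include handed/euler_to_quat_rh.h directly */

int main(void) {
  vec3   angles = {0.0f, 0.0f, 1.5707963f};  /* x, y, z in radians: 90 deg about z */
  versor q;                                  /* quaternion as (x, y, z, w) */

  glm_euler_xyz_quat_rh(angles, q);
  /* expected roughly (0, 0, 0.7071, 0.7071) for a 90 deg yaw */
  printf("q = (%f, %f, %f, %f)\n", q[0], q[1], q[2], q[3]);
  return 0;
}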
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE void glm_mat4_print(mat4 matrix, FILE *ostream); + CGLM_INLINE void glm_mat3_print(mat3 matrix, FILE *ostream); + CGLM_INLINE void glm_vec4_print(vec4 vec, FILE *ostream); + CGLM_INLINE void glm_ivec4_print(ivec4 vec, FILE *ostream); + CGLM_INLINE void glm_vec3_print(vec3 vec, FILE *ostream); + CGLM_INLINE void glm_ivec3_print(ivec3 vec, FILE *ostream); + CGLM_INLINE void glm_vec2_print(vec2 vec, FILE *ostream); + CGLM_INLINE void glm_ivec2_print(ivec2 vec, FILE *ostream); + CGLM_INLINE void glm_versor_print(versor vec, FILE *ostream); + CGLM_INLINE void glm_arch_print(FILE *ostream); + */ + +/* + cglm tried to enable print functions in debug mode and disable them in + release/production mode to eliminate printing costs. + + if you need to force enable then define CGLM_DEFINE_PRINTS macro not DEBUG one + + Print functions are enabled if: + + - DEBUG or _DEBUG macro is defined (mostly defined automatically in debugging) + - CGLM_DEFINE_PRINTS macro is defined including release/production + which makes enabled printing always + - glmc_ calls for io are always prints + + */ + +/* DEPRECATED: CGLM_NO_PRINTS_NOOP (use CGLM_DEFINE_PRINTS) */ + +#ifndef cglm_io_h +#define cglm_io_h +#if !defined(NDEBUG) \ + || defined(CGLM_DEFINE_PRINTS) || defined(CGLM_LIB_SRC) \ + || defined(CGLM_NO_PRINTS_NOOP) + +#include "common.h" +#include "util.h" + +#include +#include + +#ifndef CGLM_PRINT_PRECISION +# define CGLM_PRINT_PRECISION 5 +#endif + +#ifndef CGLM_PRINT_MAX_TO_SHORT +# define CGLM_PRINT_MAX_TO_SHORT 1e5f +#endif + +#ifndef GLM_TESTS_NO_COLORFUL_OUTPUT +# ifndef CGLM_PRINT_COLOR +# define CGLM_PRINT_COLOR "\033[36m" +# endif +# ifndef CGLM_PRINT_COLOR_RESET +# define CGLM_PRINT_COLOR_RESET "\033[0m" +# endif +#else +# ifndef CGLM_PRINT_COLOR +# define CGLM_PRINT_COLOR +# endif +# ifndef CGLM_PRINT_COLOR_RESET +# define CGLM_PRINT_COLOR_RESET +# endif +#endif + +/*! + * @brief prints current SIMD path in general + * + * @param[in] ostream stream to print e.g. stdout, stderr, FILE ... + */ +CGLM_INLINE +void +glm_arch_print(FILE* __restrict ostream) { + fprintf(ostream, CGLM_PRINT_COLOR "arch: " +#if defined(CGLM_SIMD_WASM) + "wasm SIMD128" +#elif defined(CGLM_SIMD_x86) + "x86 SSE* " +# ifdef __AVX__ + " AVX" +# endif +#elif defined(CGLM_SIMD_ARM) + "arm" +# ifndef __ARM_NEON_FP + " NEON_FP" +# endif +# ifdef CGLM_ARM64 + " ARM64" +# endif +#else + "uncommon" +#endif + CGLM_PRINT_COLOR_RESET); +} + +/*! + * @brief prints current SIMD path in general + * + * @param[in] ostream stream to print e.g. stdout, stderr, FILE ... 
+ */ +CGLM_INLINE +void +glm_arch_print_name(FILE* __restrict ostream) { + fprintf(ostream, CGLM_PRINT_COLOR "\ncglm "); + glm_arch_print(ostream); + fprintf(ostream, "\n\n" CGLM_PRINT_COLOR_RESET); +} + +CGLM_INLINE +void +glm_mat4_print(mat4 matrix, + FILE * __restrict ostream) { + char buff[16]; + int i, j, cw[4], cwi; + +#define m 4 +#define n 4 + + fprintf(ostream, "Matrix (float%dx%d): " CGLM_PRINT_COLOR "\n" , m, n); + + cw[0] = cw[1] = cw[2] = cw[3] = 0; + + for (i = 0; i < m; i++) { + for (j = 0; j < n; j++) { + if (matrix[i][j] < CGLM_PRINT_MAX_TO_SHORT) + cwi = snprintf(buff, sizeof(buff), "% .*f", CGLM_PRINT_PRECISION, (double)matrix[i][j]); + else + cwi = snprintf(buff, sizeof(buff), "% g", (double)matrix[i][j]); + cw[i] = GLM_MAX(cw[i], cwi); + } + } + + for (i = 0; i < m; i++) { + fprintf(ostream, " |"); + + for (j = 0; j < n; j++) + if (matrix[i][j] < CGLM_PRINT_MAX_TO_SHORT) + fprintf(ostream, " % *.*f", cw[j], CGLM_PRINT_PRECISION, (double)matrix[j][i]); + else + fprintf(ostream, " % *g", cw[j], (double)matrix[j][i]); + + fprintf(ostream, " |\n"); + } + + fprintf(ostream, CGLM_PRINT_COLOR_RESET "\n"); + +#undef m +#undef n +} + + +CGLM_INLINE +void +glm_mat3_print(mat3 matrix, + FILE * __restrict ostream) { + char buff[16]; + int i, j, cw[4], cwi; + +#define m 3 +#define n 3 + + fprintf(ostream, "Matrix (float%dx%d): " CGLM_PRINT_COLOR "\n", m, n); + + cw[0] = cw[1] = cw[2] = 0; + + for (i = 0; i < m; i++) { + for (j = 0; j < n; j++) { + if (matrix[i][j] < CGLM_PRINT_MAX_TO_SHORT) + cwi = snprintf(buff, sizeof(buff), "% .*f", CGLM_PRINT_PRECISION, (double)matrix[i][j]); + else + cwi = snprintf(buff, sizeof(buff), "% g", (double)matrix[i][j]); + cw[i] = GLM_MAX(cw[i], cwi); + } + } + + for (i = 0; i < m; i++) { + fprintf(ostream, " |"); + + for (j = 0; j < n; j++) + if (matrix[i][j] < CGLM_PRINT_MAX_TO_SHORT) + fprintf(ostream, " % *.*f", cw[j], CGLM_PRINT_PRECISION, (double)matrix[j][i]); + else + fprintf(ostream, " % *g", cw[j], (double)matrix[j][i]); + + fprintf(ostream, " |\n"); + } + + fprintf(ostream, CGLM_PRINT_COLOR_RESET "\n"); + +#undef m +#undef n +} + +CGLM_INLINE +void +glm_mat2_print(mat2 matrix, + FILE * __restrict ostream) { + char buff[16]; + int i, j, cw[4], cwi; + +#define m 2 +#define n 2 + + fprintf(ostream, "Matrix (float%dx%d): " CGLM_PRINT_COLOR "\n", m, n); + + cw[0] = cw[1] = 0; + + for (i = 0; i < m; i++) { + for (j = 0; j < n; j++) { + if (matrix[i][j] < CGLM_PRINT_MAX_TO_SHORT) + cwi = snprintf(buff, sizeof(buff), "% .*f", CGLM_PRINT_PRECISION, (double)matrix[i][j]); + else + cwi = snprintf(buff, sizeof(buff), "% g", (double)matrix[i][j]); + cw[i] = GLM_MAX(cw[i], cwi); + } + } + + for (i = 0; i < m; i++) { + fprintf(ostream, " |"); + + for (j = 0; j < n; j++) + if (matrix[i][j] < CGLM_PRINT_MAX_TO_SHORT) + fprintf(ostream, " % *.*f", cw[j], CGLM_PRINT_PRECISION, (double)matrix[j][i]); + else + fprintf(ostream, " % *g", cw[j], (double)matrix[j][i]); + + fprintf(ostream, " |\n"); + } + + fprintf(ostream, CGLM_PRINT_COLOR_RESET "\n"); + +#undef m +#undef n +} + +CGLM_INLINE +void +glm_vec4_print(vec4 vec, + FILE * __restrict ostream) { + int i; + +#define m 4 + + fprintf(ostream, "Vector (float%d): " CGLM_PRINT_COLOR "\n (", m); + + for (i = 0; i < m; i++) { + if (vec[i] < CGLM_PRINT_MAX_TO_SHORT) + fprintf(ostream, " % .*f", CGLM_PRINT_PRECISION, (double)vec[i]); + else + fprintf(ostream, " % g", (double)vec[i]); + } + + fprintf(ostream, " )" CGLM_PRINT_COLOR_RESET "\n\n"); + +#undef m +} + +CGLM_INLINE +void +glm_ivec4_print(ivec4 vec, + FILE * 
__restrict ostream) { + int i; + +#define m 4 + + fprintf(ostream, "Vector (int%d): " CGLM_PRINT_COLOR "\n (", m); + + for (i = 0; i < m; i++) + fprintf(ostream, " % d", vec[i]); + + fprintf(ostream, " )" CGLM_PRINT_COLOR_RESET "\n\n"); + +#undef m +} + +CGLM_INLINE +void +glm_vec3_print(vec3 vec, + FILE * __restrict ostream) { + int i; + +#define m 3 + + fprintf(ostream, "Vector (float%d): " CGLM_PRINT_COLOR "\n (", m); + + for (i = 0; i < m; i++) { + if (vec[i] < CGLM_PRINT_MAX_TO_SHORT) + fprintf(ostream, " % .*f", CGLM_PRINT_PRECISION, (double)vec[i]); + else + fprintf(ostream, " % g", (double)vec[i]); + } + + fprintf(ostream, " )" CGLM_PRINT_COLOR_RESET "\n\n"); + +#undef m +} + +CGLM_INLINE +void +glm_ivec3_print(ivec3 vec, + FILE * __restrict ostream) { + int i; + +#define m 3 + + fprintf(ostream, "Vector (int%d): " CGLM_PRINT_COLOR "\n (", m); + + for (i = 0; i < m; i++) + fprintf(ostream, " % d", vec[i]); + + fprintf(ostream, " )" CGLM_PRINT_COLOR_RESET "\n\n"); + +#undef m +} + +CGLM_INLINE +void +glm_vec2_print(vec2 vec, + FILE * __restrict ostream) { + int i; + +#define m 2 + + fprintf(ostream, "Vector (float%d): " CGLM_PRINT_COLOR "\n (", m); + + for (i = 0; i < m; i++) { + if (vec[i] < CGLM_PRINT_MAX_TO_SHORT) + fprintf(ostream, " % .*f", CGLM_PRINT_PRECISION, (double)vec[i]); + else + fprintf(ostream, " % g", (double)vec[i]); + } + + fprintf(ostream, " )" CGLM_PRINT_COLOR_RESET "\n\n"); + +#undef m +} + +CGLM_INLINE +void +glm_ivec2_print(ivec2 vec, + FILE * __restrict ostream) { + int i; + +#define m 2 + + fprintf(ostream, "Vector (int%d): " CGLM_PRINT_COLOR "\n (", m); + + for (i = 0; i < m; i++) + fprintf(ostream, " % d", vec[i]); + + fprintf(ostream, " )" CGLM_PRINT_COLOR_RESET "\n\n"); + +#undef m +} + +CGLM_INLINE +void +glm_versor_print(versor vec, + FILE * __restrict ostream) { + int i; + +#define m 4 + + fprintf(ostream, "Quaternion (float%d): " CGLM_PRINT_COLOR "\n (", m); + + for (i = 0; i < m; i++) { + if (vec[i] < CGLM_PRINT_MAX_TO_SHORT) + fprintf(ostream, " % .*f", CGLM_PRINT_PRECISION, (double)vec[i]); + else + fprintf(ostream, " % g", (double)vec[i]); + } + + + fprintf(ostream, " )" CGLM_PRINT_COLOR_RESET "\n\n"); + +#undef m +} + +CGLM_INLINE +void +glm_aabb_print(vec3 bbox[2], + const char * __restrict tag, + FILE * __restrict ostream) { + int i, j; + +#define m 3 + + fprintf(ostream, "AABB (%s): " CGLM_PRINT_COLOR "\n", tag ? 
tag: "float"); + + for (i = 0; i < 2; i++) { + fprintf(ostream, " ("); + + for (j = 0; j < m; j++) { + if (bbox[i][j] < CGLM_PRINT_MAX_TO_SHORT) + fprintf(ostream, " % .*f", CGLM_PRINT_PRECISION, (double)bbox[i][j]); + else + fprintf(ostream, " % g", (double)bbox[i][j]); + } + + fprintf(ostream, " )\n"); + } + + fprintf(ostream, CGLM_PRINT_COLOR_RESET "\n"); + +#undef m +} + +#else + +#include "common.h" + +#include +#include + +/* NOOP: Remove print from DEBUG */ +#define glm_mat4_print(v, s) (void)v; (void)s; +#define glm_mat3_print(v, s) (void)v; (void)s; +#define glm_mat2_print(v, s) (void)v; (void)s; +#define glm_vec4_print(v, s) (void)v; (void)s; +#define glm_ivec4_print(v, s) (void)v; (void)s; +#define glm_vec3_print(v, s) (void)v; (void)s; +#define glm_ivec3_print(v, s) (void)v; (void)s; +#define glm_vec2_print(v, s) (void)v; (void)s; +#define glm_ivec2_print(v, s) (void)v; (void)s; +#define glm_versor_print(v, s) (void)v; (void)s; +#define glm_aabb_print(v, t, s) (void)v; (void)t; (void)s; +#define glm_arch_print(s) (void)s; +#define glm_arch_print_name(s) (void)s; + +#endif +#endif /* cglm_io_h */ diff --git a/external/cglm/ivec2.h b/external/cglm/ivec2.h new file mode 100644 index 0000000..8d5ad88 --- /dev/null +++ b/external/cglm/ivec2.h @@ -0,0 +1,659 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLM_IVEC2_ONE_INIT + GLM_IVEC2_ZERO_INIT + GLM_IVEC2_ONE + GLM_IVEC2_ZERO + + Functions: + CGLM_INLINE void glm_ivec2(int * __restrict v, ivec2 dest) + CGLM_INLINE void glm_ivec2_copy(ivec2 a, ivec2 dest) + CGLM_INLINE void glm_ivec2_zero(ivec2 v) + CGLM_INLINE void glm_ivec2_one(ivec2 v) + CGLM_INLINE int glm_ivec2_dot(ivec2 a, ivec2 b) + CGLM_INLINE int glm_ivec2_cross(ivec2 a, ivec2 b) + CGLM_INLINE void glm_ivec2_add(ivec2 a, ivec2 b, ivec2 dest) + CGLM_INLINE void glm_ivec2_adds(ivec2 v, int s, ivec2 dest) + CGLM_INLINE void glm_ivec2_sub(ivec2 a, ivec2 b, ivec2 dest) + CGLM_INLINE void glm_ivec2_subs(ivec2 v, int s, ivec2 dest) + CGLM_INLINE void glm_ivec2_mul(ivec2 a, ivec2 b, ivec2 dest) + CGLM_INLINE void glm_ivec2_scale(ivec2 v, int s, ivec2 dest) + CGLM_INLINE void glm_ivec2_div(ivec2 a, ivec2 b, ivec2 dest) + CGLM_INLINE void glm_ivec2_divs(ivec2 v, int s, ivec2 dest) + CGLM_INLINE void glm_ivec2_mod(ivec2 a, ivec2 b, ivec2 dest) + CGLM_INLINE void glm_ivec2_addadd(ivec2 a, ivec2 b, ivec2 dest) + CGLM_INLINE void glm_ivec2_addadds(ivec2 a, int s, ivec2 dest) + CGLM_INLINE void glm_ivec2_subadd(ivec2 a, ivec2 b, ivec2 dest) + CGLM_INLINE void glm_ivec2_subadds(ivec2 a, int s, ivec2 dest) + CGLM_INLINE void glm_ivec2_muladd(ivec2 a, ivec2 b, ivec2 dest) + CGLM_INLINE void glm_ivec2_muladds(ivec2 a, int s, ivec2 dest) + CGLM_INLINE void glm_ivec2_maxadd(ivec2 a, ivec2 b, ivec2 dest) + CGLM_INLINE void glm_ivec2_minadd(ivec2 a, ivec2 b, ivec2 dest) + CGLM_INLINE void glm_ivec2_subsub(ivec2 a, ivec2 b, ivec2 dest) + CGLM_INLINE void glm_ivec2_subsubs(ivec2 a, int s, ivec2 dest) + CGLM_INLINE void glm_ivec2_addsub(ivec2 a, ivec2 b, ivec2 dest) + CGLM_INLINE void glm_ivec2_addsubs(ivec2 a, int s, ivec2 dest) + CGLM_INLINE void glm_ivec2_mulsub(ivec2 a, ivec2 b, ivec2 dest) + CGLM_INLINE void glm_ivec2_mulsubs(ivec2 a, int s, ivec2 dest) + CGLM_INLINE void glm_ivec2_maxsub(ivec2 a, ivec2 b, ivec2 dest) + CGLM_INLINE void glm_ivec2_minsub(ivec2 a, ivec2 b, ivec2 dest) + CGLM_INLINE int glm_ivec2_distance2(ivec2 a, ivec2 b) + CGLM_INLINE float 
glm_ivec2_distance(ivec2 a, ivec2 b) + CGLM_INLINE void glm_ivec2_fill(ivec2 v, int val); + CGLM_INLINE bool glm_ivec2_eq(ivec2 v, int val); + CGLM_INLINE bool glm_ivec2_eqv(ivec2 a, ivec2 b); + CGLM_INLINE void glm_ivec2_maxv(ivec2 a, ivec2 b, ivec2 dest) + CGLM_INLINE void glm_ivec2_minv(ivec2 a, ivec2 b, ivec2 dest) + CGLM_INLINE void glm_ivec2_clamp(ivec2 v, int minVal, int maxVal) + CGLM_INLINE void glm_ivec2_abs(ivec2 v, ivec2 dest) + */ + +#ifndef cglm_ivec2_h +#define cglm_ivec2_h + +#include "common.h" +#include "util.h" + +#define GLM_IVEC2_ONE_INIT {1, 1} +#define GLM_IVEC2_ZERO_INIT {0, 0} + +#define GLM_IVEC2_ONE ((ivec2)GLM_IVEC2_ONE_INIT) +#define GLM_IVEC2_ZERO ((ivec2)GLM_IVEC2_ZERO_INIT) + +/*! + * @brief init ivec2 using vec3 or vec4 + * + * @param[in] v vector + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec2(int * __restrict v, ivec2 dest) { + dest[0] = v[0]; + dest[1] = v[1]; +} + +/*! + * @brief copy all members of [a] to [dest] + * + * @param[in] a source vector + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec2_copy(ivec2 a, ivec2 dest) { + dest[0] = a[0]; + dest[1] = a[1]; +} + +/*! + * @brief set all members of [v] to zero + * + * @param[out] v vector + */ +CGLM_INLINE +void +glm_ivec2_zero(ivec2 v) { + v[0] = v[1] = 0; +} + +/*! + * @brief set all members of [v] to one + * + * @param[out] v vector + */ +CGLM_INLINE +void +glm_ivec2_one(ivec2 v) { + v[0] = v[1] = 1; +} + +/*! + * @brief ivec2 dot product + * + * @param[in] a vector1 + * @param[in] b vector2 + * + * @return dot product + */ +CGLM_INLINE +int +glm_ivec2_dot(ivec2 a, ivec2 b) { + return a[0] * b[0] + a[1] * b[1]; +} + +/*! + * @brief ivec2 cross product + * + * REF: http://allenchou.net/2013/07/cross-product-of-2d-vectors/ + * + * @param[in] a vector1 + * @param[in] b vector2 + * + * @return Z component of cross product + */ +CGLM_INLINE +int +glm_ivec2_cross(ivec2 a, ivec2 b) { + return a[0] * b[1] - a[1] * b[0]; +} + +/*! + * @brief add vector [a] to vector [b] and store result in [dest] + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec2_add(ivec2 a, ivec2 b, ivec2 dest) { + dest[0] = a[0] + b[0]; + dest[1] = a[1] + b[1]; +} + +/*! + * @brief add scalar s to vector [v] and store result in [dest] + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec2_adds(ivec2 v, int s, ivec2 dest) { + dest[0] = v[0] + s; + dest[1] = v[1] + s; +} + +/*! + * @brief subtract vector [b] from vector [a] and store result in [dest] + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec2_sub(ivec2 a, ivec2 b, ivec2 dest) { + dest[0] = a[0] - b[0]; + dest[1] = a[1] - b[1]; +} + +/*! + * @brief subtract scalar s from vector [v] and store result in [dest] + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec2_subs(ivec2 v, int s, ivec2 dest) { + dest[0] = v[0] - s; + dest[1] = v[1] - s; +} + +/*! + * @brief multiply vector [a] with vector [b] and store result in [dest] + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec2_mul(ivec2 a, ivec2 b, ivec2 dest) { + dest[0] = a[0] * b[0]; + dest[1] = a[1] * b[1]; +} + +/*! 
+ * @brief multiply vector [a] with scalar s and store result in [dest] + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec2_scale(ivec2 v, int s, ivec2 dest) { + dest[0] = v[0] * s; + dest[1] = v[1] * s; +} + +/*! + * @brief div vector with another component-wise division: d = a / b + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest result = (a[0]/b[0], a[1]/b[1]) + */ +CGLM_INLINE +void +glm_ivec2_div(ivec2 a, ivec2 b, ivec2 dest) { + dest[0] = a[0] / b[0]; + dest[1] = a[1] / b[1]; +} + +/*! + * @brief div vector with scalar: d = v / s + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest result = (a[0]/s, a[1]/s) + */ +CGLM_INLINE +void +glm_ivec2_divs(ivec2 v, int s, ivec2 dest) { + dest[0] = v[0] / s; + dest[1] = v[1] / s; +} + +/*! + * @brief mod vector with another component-wise modulo: d = a % b + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest result = (a[0]%b[0], a[1]%b[1]) + */ +CGLM_INLINE +void +glm_ivec2_mod(ivec2 a, ivec2 b, ivec2 dest) { + dest[0] = a[0] % b[0]; + dest[1] = a[1] % b[1]; +} + +/*! + * @brief add vector [a] with vector [b] and add result to vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest dest += (a + b) + */ +CGLM_INLINE +void +glm_ivec2_addadd(ivec2 a, ivec2 b, ivec2 dest) { + dest[0] += a[0] + b[0]; + dest[1] += a[1] + b[1]; +} + +/*! + * @brief add scalar [s] onto vector [a] and add result to vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[out] dest dest += (a + s) + */ +CGLM_INLINE +void +glm_ivec2_addadds(ivec2 a, int s, ivec2 dest) { + dest[0] += a[0] + s; + dest[1] += a[1] + s; +} + +/*! + * @brief subtract vector [a] from vector [b] and add result to [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest dest += (a - b) + */ +CGLM_INLINE +void +glm_ivec2_subadd(ivec2 a, ivec2 b, ivec2 dest) { + dest[0] += a[0] - b[0]; + dest[1] += a[1] - b[1]; +} + +/*! + * @brief subtract scalar [s] from vector [a] and add result to [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first + * @param[in] s scalar + * @param[out] dest dest += (a - s) + */ +CGLM_INLINE +void +glm_ivec2_subadds(ivec2 a, int s, ivec2 dest) { + dest[0] += a[0] - s; + dest[1] += a[1] - s; +} + +/*! + * @brief multiply vector [a] with vector [b] and add result to [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest dest += (a * b) + */ +CGLM_INLINE +void +glm_ivec2_muladd(ivec2 a, ivec2 b, ivec2 dest) { + dest[0] += a[0] * b[0]; + dest[1] += a[1] * b[1]; +} + +/*! + * @brief multiply vector [a] with scalar [s] and add result to [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[out] dest dest += (a * s) + */ +CGLM_INLINE +void +glm_ivec2_muladds(ivec2 a, int s, ivec2 dest) { + dest[0] += a[0] * s; + dest[1] += a[1] * s; +} + +/*! 
+ * @brief add maximum of vector [a] and vector [b] to vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest dest += max(a, b) + */ +CGLM_INLINE +void +glm_ivec2_maxadd(ivec2 a, ivec2 b, ivec2 dest) { + dest[0] += glm_imax(a[0], b[0]); + dest[1] += glm_imax(a[1], b[1]); +} + +/*! + * @brief add minimum of vector [a] and vector [b] to vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest dest += min(a, b) + */ +CGLM_INLINE +void +glm_ivec2_minadd(ivec2 a, ivec2 b, ivec2 dest) { + dest[0] += glm_imin(a[0], b[0]); + dest[1] += glm_imin(a[1], b[1]); +} + +/*! + * @brief subtract vector [a] from vector [b] and subtract result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest dest -= (a - b) + */ +CGLM_INLINE +void +glm_ivec2_subsub(ivec2 a, ivec2 b, ivec2 dest) { + dest[0] -= a[0] - b[0]; + dest[1] -= a[1] - b[1]; +} + +/*! + * @brief subtract scalar [s] from vector [a] and subtract result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[out] dest dest -= (a - s) + */ +CGLM_INLINE +void +glm_ivec2_subsubs(ivec2 a, int s, ivec2 dest) { + dest[0] -= a[0] - s; + dest[1] -= a[1] - s; +} + +/*! + * @brief add vector [a] to vector [b] and subtract the result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] b scalar + * @param[out] dest dest -= (a + b) + */ +CGLM_INLINE +void +glm_ivec2_addsub(ivec2 a, ivec2 b, ivec2 dest) { + dest[0] -= a[0] + b[0]; + dest[1] -= a[1] + b[1]; +} + +/*! + * @brief add scalar [s] to vector [a] and subtract the result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[out] dest dest -= (a + b) + */ +CGLM_INLINE +void +glm_ivec2_addsubs(ivec2 a, int s, ivec2 dest) { + dest[0] -= a[0] + s; + dest[1] -= a[1] + s; +} + +/*! + * @brief multiply vector [a] and vector [b] and subtract the result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] b scalar + * @param[out] dest dest -= (a * b) + */ +CGLM_INLINE +void +glm_ivec2_mulsub(ivec2 a, ivec2 b, ivec2 dest) { + dest[0] -= a[0] * b[0]; + dest[1] -= a[1] * b[1]; +} + +/*! + * @brief multiply vector [a] with scalar [s] and subtract the result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[out] dest dest -= (a * s) + */ +CGLM_INLINE +void +glm_ivec2_mulsubs(ivec2 a, int s, ivec2 dest) { + dest[0] -= a[0] * s; + dest[1] -= a[1] * s; +} + +/*! + * @brief subtract maximum of vector [a] and vector [b] from vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest dest -= max(a, b) + */ +CGLM_INLINE +void +glm_ivec2_maxsub(ivec2 a, ivec2 b, ivec2 dest) { + dest[0] -= glm_imax(a[0], b[0]); + dest[1] -= glm_imax(a[1], b[1]); +} + +/*! 
+ * @brief subtract minimum of vector [a] and vector [b] from vector [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest dest -= min(a, b) + */ +CGLM_INLINE +void +glm_ivec2_minsub(ivec2 a, ivec2 b, ivec2 dest) { + dest[0] -= glm_imin(a[0], b[0]); + dest[1] -= glm_imin(a[1], b[1]); +} + +/*! + * @brief squared distance between two vectors + * + * @param[in] a first vector + * @param[in] b second vector + * @return returns squared distance (distance * distance) + */ +CGLM_INLINE +int +glm_ivec2_distance2(ivec2 a, ivec2 b) { + int xd, yd; + xd = a[0] - b[0]; + yd = a[1] - b[1]; + return xd * xd + yd * yd; +} + +/*! + * @brief distance between two vectors + * + * @param[in] a first vector + * @param[in] b second vector + * @return returns distance + */ +CGLM_INLINE +float +glm_ivec2_distance(ivec2 a, ivec2 b) { + return sqrtf((float)glm_ivec2_distance2(a, b)); +} + + +/*! + * @brief fill a vector with specified value + * + * @param[out] v dest + * @param[in] val value + */ +CGLM_INLINE +void +glm_ivec2_fill(ivec2 v, int val) { + v[0] = v[1] = val; +} + +/*! + * @brief check if vector is equal to value + * + * @param[in] v vector + * @param[in] val value + */ +CGLM_INLINE +bool +glm_ivec2_eq(ivec2 v, int val) { + return v[0] == val && v[0] == v[1]; +} + +/*! + * @brief check if vector is equal to another + * + * @param[in] a vector + * @param[in] b vector + */ +CGLM_INLINE +bool +glm_ivec2_eqv(ivec2 a, ivec2 b) { + return a[0] == b[0] + && a[1] == b[1]; +} + +/*! + * @brief set each member of dest to greater of vector a and b + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec2_maxv(ivec2 a, ivec2 b, ivec2 dest) { + dest[0] = a[0] > b[0] ? a[0] : b[0]; + dest[1] = a[1] > b[1] ? a[1] : b[1]; +} + +/*! + * @brief set each member of dest to lesser of vector a and b + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec2_minv(ivec2 a, ivec2 b, ivec2 dest) { + dest[0] = a[0] < b[0] ? a[0] : b[0]; + dest[1] = a[1] < b[1] ? a[1] : b[1]; +} + +/*! + * @brief clamp each member of [v] between minVal and maxVal (inclusive) + * + * @param[in, out] v vector + * @param[in] minVal minimum value + * @param[in] maxVal maximum value + */ +CGLM_INLINE +void +glm_ivec2_clamp(ivec2 v, int minVal, int maxVal) { + if (v[0] < minVal) + v[0] = minVal; + else if(v[0] > maxVal) + v[0] = maxVal; + + if (v[1] < minVal) + v[1] = minVal; + else if(v[1] > maxVal) + v[1] = maxVal; +} + +/*! + * @brief absolute value of v + * + * @param[in] v vector + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec2_abs(ivec2 v, ivec2 dest) { + dest[0] = abs(v[0]); + dest[1] = abs(v[1]); +} + +#endif /* cglm_ivec2_h */ diff --git a/external/cglm/ivec3.h b/external/cglm/ivec3.h new file mode 100644 index 0000000..67eaa22 --- /dev/null +++ b/external/cglm/ivec3.h @@ -0,0 +1,713 @@ +/* + * Copyright (c), Recep Aslantas. 
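Illustrative usage, not part of the commit: ivec2 is a plain two-int array, so the helpers in ivec2.h read and write components directly; clamp modifies its argument in place. A minimal sketch (include path assumed):

#include <stdio.h>
#include <cglm/cglm.h>

int main(void) {
  ivec2 a = {3, -7}, b = {1, 4}, d;

  glm_ivec2_add(a, b, d);              /* d = (4, -3) */
  glm_ivec2_clamp(d, 0, 10);           /* in place: d = (4, 0) */
  printf("dot=%d dist2=%d d=(%d,%d)\n",
         glm_ivec2_dot(a, b),          /* 3*1 + (-7)*4 = -25 */
         glm_ivec2_distance2(a, b),    /* 2*2 + (-11)*(-11) = 125 */
         d[0], d[1]);
  return 0;
}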
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLM_IVEC3_ONE_INIT + GLM_IVEC3_ZERO_INIT + GLM_IVEC3_ONE + GLM_IVEC3_ZERO + + Functions: + CGLM_INLINE void glm_ivec3(ivec4 v4, ivec3 dest) + CGLM_INLINE void glm_ivec3_copy(ivec3 a, ivec3 dest) + CGLM_INLINE void glm_ivec3_zero(ivec3 v) + CGLM_INLINE void glm_ivec3_one(ivec3 v) + CGLM_INLINE int glm_ivec3_dot(ivec3 a, ivec3 b) + CGLM_INLINE int glm_ivec3_norm2(ivec3 v) + CGLM_INLINE int glm_ivec3_norm(ivec3 v) + CGLM_INLINE void glm_ivec3_add(ivec3 a, ivec3 b, ivec3 dest) + CGLM_INLINE void glm_ivec3_adds(ivec3 v, int s, ivec3 dest) + CGLM_INLINE void glm_ivec3_sub(ivec3 a, ivec3 b, ivec3 dest) + CGLM_INLINE void glm_ivec3_subs(ivec3 v, int s, ivec3 dest) + CGLM_INLINE void glm_ivec3_mul(ivec3 a, ivec3 b, ivec3 dest) + CGLM_INLINE void glm_ivec3_scale(ivec3 v, int s, ivec3 dest) + CGLM_INLINE void glm_ivec3_div(ivec3 a, ivec3 b, ivec3 dest) + CGLM_INLINE void glm_ivec3_divs(ivec3 v, int s, ivec3 dest) + CGLM_INLINE void glm_ivec3_mod(ivec3 a, ivec3 b, ivec3 dest) + CGLM_INLINE void glm_ivec3_addadd(ivec3 a, ivec3 b, ivec3 dest) + CGLM_INLINE void glm_ivec3_addadds(ivec3 a, int s, ivec3 dest) + CGLM_INLINE void glm_ivec3_subadd(ivec3 a, ivec3 b, ivec3 dest) + CGLM_INLINE void glm_ivec3_subadds(ivec3 a, int s, ivec3 dest) + CGLM_INLINE void glm_ivec3_muladd(ivec3 a, ivec3 b, ivec3 dest) + CGLM_INLINE void glm_ivec3_muladds(ivec3 a, int s, ivec3 dest) + CGLM_INLINE void glm_ivec3_maxadd(ivec3 a, ivec3 b, ivec3 dest) + CGLM_INLINE void glm_ivec3_minadd(ivec3 a, ivec3 b, ivec3 dest) + CGLM_INLINE void glm_ivec3_subsub(ivec3 a, ivec3 b, ivec3 dest) + CGLM_INLINE void glm_ivec3_subsubs(ivec3 a, int s, ivec3 dest) + CGLM_INLINE void glm_ivec3_addsub(ivec3 a, ivec3 b, ivec3 dest) + CGLM_INLINE void glm_ivec3_addsubs(ivec3 a, int s, ivec3 dest) + CGLM_INLINE void glm_ivec3_mulsub(ivec3 a, ivec3 b, ivec3 dest) + CGLM_INLINE void glm_ivec3_mulsubs(ivec3 a, int s, ivec3 dest) + CGLM_INLINE void glm_ivec3_maxsub(ivec3 a, ivec3 b, ivec3 dest) + CGLM_INLINE void glm_ivec3_minsub(ivec3 a, ivec3 b, ivec3 dest) + CGLM_INLINE int glm_ivec3_distance2(ivec3 a, ivec3 b) + CGLM_INLINE float glm_ivec3_distance(ivec3 a, ivec3 b) + CGLM_INLINE void glm_ivec3_fill(ivec3 v, int val); + CGLM_INLINE bool glm_ivec3_eq(ivec3 v, int val); + CGLM_INLINE bool glm_ivec3_eqv(ivec3 a, ivec3 b); + CGLM_INLINE void glm_ivec3_maxv(ivec3 a, ivec3 b, ivec3 dest) + CGLM_INLINE void glm_ivec3_minv(ivec3 a, ivec3 b, ivec3 dest) + CGLM_INLINE void glm_ivec3_clamp(ivec3 v, int minVal, int maxVal) + CGLM_INLINE void glm_ivec3_abs(ivec3 v, ivec3 dest) + */ + +#ifndef cglm_ivec3_h +#define cglm_ivec3_h + +#include "common.h" +#include "util.h" + +#define GLM_IVEC3_ONE_INIT {1, 1, 1} +#define GLM_IVEC3_ZERO_INIT {0, 0, 0} + +#define GLM_IVEC3_ONE ((ivec3)GLM_IVEC3_ONE_INIT) +#define GLM_IVEC3_ZERO ((ivec3)GLM_IVEC3_ZERO_INIT) + +/*! + * @brief init ivec3 using ivec4 + * + * @param[in] v4 vector4 + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec3(ivec4 v4, ivec3 dest) { + dest[0] = v4[0]; + dest[1] = v4[1]; + dest[2] = v4[2]; +} + +/*! + * @brief copy all members of [a] to [dest] + * + * @param[in] a source vector + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec3_copy(ivec3 a, ivec3 dest) { + dest[0] = a[0]; + dest[1] = a[1]; + dest[2] = a[2]; +} + +/*! 
+ * @brief set all members of [v] to zero + * + * @param[out] v vector + */ +CGLM_INLINE +void +glm_ivec3_zero(ivec3 v) { + v[0] = v[1] = v[2] = 0; +} + +/*! + * @brief set all members of [v] to one + * + * @param[out] v vector + */ +CGLM_INLINE +void +glm_ivec3_one(ivec3 v) { + v[0] = v[1] = v[2] = 1; +} + +/*! + * @brief ivec3 dot product + * + * @param[in] a vector1 + * @param[in] b vector2 + * + * @return dot product + */ +CGLM_INLINE +int +glm_ivec3_dot(ivec3 a, ivec3 b) { + return a[0] * b[0] + a[1] * b[1] + a[2] * b[2]; +} + +/*! + * @brief norm * norm (magnitude) of vec + * + * we can use this func instead of calling norm * norm, because it would call + * sqrtf function twice but with this func we can avoid func call, maybe this is + * not good name for this func + * + * @param[in] v vector + * + * @return norm * norm + */ +CGLM_INLINE +int +glm_ivec3_norm2(ivec3 v) { + return glm_ivec3_dot(v, v); +} + +/*! + * @brief euclidean norm (magnitude), also called L2 norm + * this will give magnitude of vector in euclidean space + * + * @param[in] v vector + * + * @return norm + */ +CGLM_INLINE +int +glm_ivec3_norm(ivec3 v) { + return (int)sqrtf((float)glm_ivec3_norm2(v)); +} + +/*! + * @brief add vector [a] to vector [b] and store result in [dest] + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec3_add(ivec3 a, ivec3 b, ivec3 dest) { + dest[0] = a[0] + b[0]; + dest[1] = a[1] + b[1]; + dest[2] = a[2] + b[2]; +} + +/*! + * @brief add scalar s to vector [v] and store result in [dest] + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec3_adds(ivec3 v, int s, ivec3 dest) { + dest[0] = v[0] + s; + dest[1] = v[1] + s; + dest[2] = v[2] + s; +} + +/*! + * @brief subtract vector [b] from vector [a] and store result in [dest] + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec3_sub(ivec3 a, ivec3 b, ivec3 dest) { + dest[0] = a[0] - b[0]; + dest[1] = a[1] - b[1]; + dest[2] = a[2] - b[2]; +} + +/*! + * @brief subtract scalar s from vector [v] and store result in [dest] + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec3_subs(ivec3 v, int s, ivec3 dest) { + dest[0] = v[0] - s; + dest[1] = v[1] - s; + dest[2] = v[2] - s; +} + +/*! + * @brief multiply vector [a] with vector [b] and store result in [dest] + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec3_mul(ivec3 a, ivec3 b, ivec3 dest) { + dest[0] = a[0] * b[0]; + dest[1] = a[1] * b[1]; + dest[2] = a[2] * b[2]; +} + +/*! + * @brief multiply vector [a] with scalar s and store result in [dest] + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec3_scale(ivec3 v, int s, ivec3 dest) { + dest[0] = v[0] * s; + dest[1] = v[1] * s; + dest[2] = v[2] * s; +} + +/*! + * @brief div vector with another component-wise division: d = a / b + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest result = (a[0]/b[0], a[1]/b[1], a[2]/b[2]) + */ +CGLM_INLINE +void +glm_ivec3_div(ivec3 a, ivec3 b, ivec3 dest) { + dest[0] = a[0] / b[0]; + dest[1] = a[1] / b[1]; + dest[2] = a[2] / b[2]; +} + +/*! 
+ * @brief div vector with scalar: d = v / s + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest result = (a[0]/s, a[1]/s, a[2]/s) + */ +CGLM_INLINE +void +glm_ivec3_divs(ivec3 v, int s, ivec3 dest) { + dest[0] = v[0] / s; + dest[1] = v[1] / s; + dest[2] = v[2] / s; +} + +/*! + * @brief Element-wise modulo operation on ivec3 vectors: dest = a % b + * + * Performs element-wise modulo on each component of vectors `a` and `b`. + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest result = (a[0]%b[0], a[1]%b[1], a[2]%b[2]) + */ +CGLM_INLINE +void +glm_ivec3_mod(ivec3 a, ivec3 b, ivec3 dest) { + dest[0] = a[0] % b[0]; + dest[1] = a[1] % b[1]; + dest[2] = a[2] % b[2]; +} + +/*! + * @brief add vector [a] with vector [b] and add result to vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest dest += (a + b) + */ +CGLM_INLINE +void +glm_ivec3_addadd(ivec3 a, ivec3 b, ivec3 dest) { + dest[0] += a[0] + b[0]; + dest[1] += a[1] + b[1]; + dest[2] += a[2] + b[2]; +} + +/*! + * @brief add scalar [s] onto vector [a] and add result to vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[out] dest dest += (a + s) + */ +CGLM_INLINE +void +glm_ivec3_addadds(ivec3 a, int s, ivec3 dest) { + dest[0] += a[0] + s; + dest[1] += a[1] + s; + dest[2] += a[2] + s; +} + +/*! + * @brief subtract vector [a] from vector [b] and add result to [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest dest += (a - b) + */ +CGLM_INLINE +void +glm_ivec3_subadd(ivec3 a, ivec3 b, ivec3 dest) { + dest[0] += a[0] - b[0]; + dest[1] += a[1] - b[1]; + dest[2] += a[2] - b[2]; +} + +/*! + * @brief subtract scalar [s] from vector [a] and add result to [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first + * @param[in] s scalar + * @param[out] dest dest += (a - s) + */ +CGLM_INLINE +void +glm_ivec3_subadds(ivec3 a, int s, ivec3 dest) { + dest[0] += a[0] - s; + dest[1] += a[1] - s; + dest[2] += a[2] - s; +} + +/*! + * @brief multiply vector [a] with vector [b] and add result to [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest dest += (a * b) + */ +CGLM_INLINE +void +glm_ivec3_muladd(ivec3 a, ivec3 b, ivec3 dest) { + dest[0] += a[0] * b[0]; + dest[1] += a[1] * b[1]; + dest[2] += a[2] * b[2]; +} + +/*! + * @brief multiply vector [a] with scalar [s] and add result to [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[out] dest dest += (a * s) + */ +CGLM_INLINE +void +glm_ivec3_muladds(ivec3 a, int s, ivec3 dest) { + dest[0] += a[0] * s; + dest[1] += a[1] * s; + dest[2] += a[2] * s; +} + +/*! + * @brief add maximum of vector [a] and vector [b] to vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest dest += max(a, b) + */ +CGLM_INLINE +void +glm_ivec3_maxadd(ivec3 a, ivec3 b, ivec3 dest) { + dest[0] += glm_imax(a[0], b[0]); + dest[1] += glm_imax(a[1], b[1]); + dest[2] += glm_imax(a[2], b[2]); +} + +/*! 
+ * @brief add minimum of vector [a] and vector [b] to vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest dest += min(a, b) + */ +CGLM_INLINE +void +glm_ivec3_minadd(ivec3 a, ivec3 b, ivec3 dest) { + dest[0] += glm_imin(a[0], b[0]); + dest[1] += glm_imin(a[1], b[1]); + dest[2] += glm_imin(a[2], b[2]); +} + +/*! + * @brief subtract vector [a] from vector [b] and subtract result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest dest -= (a - b) + */ +CGLM_INLINE +void +glm_ivec3_subsub(ivec3 a, ivec3 b, ivec3 dest) { + dest[0] -= a[0] - b[0]; + dest[1] -= a[1] - b[1]; + dest[2] -= a[2] - b[2]; +} + +/*! + * @brief subtract scalar [s] from vector [a] and subtract result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[out] dest dest -= (a - s) + */ +CGLM_INLINE +void +glm_ivec3_subsubs(ivec3 a, int s, ivec3 dest) { + dest[0] -= a[0] - s; + dest[1] -= a[1] - s; + dest[2] -= a[2] - s; +} + +/*! + * @brief add vector [a] to vector [b] and subtract the result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] b scalar + * @param[out] dest dest -= (a + b) + */ +CGLM_INLINE +void +glm_ivec3_addsub(ivec3 a, ivec3 b, ivec3 dest) { + dest[0] -= a[0] + b[0]; + dest[1] -= a[1] + b[1]; + dest[2] -= a[2] + b[2]; +} + +/*! + * @brief add scalar [s] to vector [a] and subtract the result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[out] dest dest -= (a + b) + */ +CGLM_INLINE +void +glm_ivec3_addsubs(ivec3 a, int s, ivec3 dest) { + dest[0] -= a[0] + s; + dest[1] -= a[1] + s; + dest[2] -= a[2] + s; +} + +/*! + * @brief multiply vector [a] and vector [b] and subtract the result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] b scalar + * @param[out] dest dest -= (a * b) + */ +CGLM_INLINE +void +glm_ivec3_mulsub(ivec3 a, ivec3 b, ivec3 dest) { + dest[0] -= a[0] * b[0]; + dest[1] -= a[1] * b[1]; + dest[2] -= a[2] * b[2]; +} + +/*! + * @brief multiply vector [a] with scalar [s] and subtract the result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[out] dest dest -= (a * s) + */ +CGLM_INLINE +void +glm_ivec3_mulsubs(ivec3 a, int s, ivec3 dest) { + dest[0] -= a[0] * s; + dest[1] -= a[1] * s; + dest[2] -= a[2] * s; +} + +/*! + * @brief subtract maximum of vector [a] and vector [b] from vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest dest -= max(a, b) + */ +CGLM_INLINE +void +glm_ivec3_maxsub(ivec3 a, ivec3 b, ivec3 dest) { + dest[0] -= glm_imax(a[0], b[0]); + dest[1] -= glm_imax(a[1], b[1]); + dest[2] -= glm_imax(a[2], b[2]); +} + +/*! 
+ * @brief subtract minimum of vector [a] and vector [b] from vector [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest dest -= min(a, b) + */ +CGLM_INLINE +void +glm_ivec3_minsub(ivec3 a, ivec3 b, ivec3 dest) { + dest[0] -= glm_imin(a[0], b[0]); + dest[1] -= glm_imin(a[1], b[1]); + dest[2] -= glm_imin(a[2], b[2]); +} + +/*! + * @brief squared distance between two vectors + * + * @param[in] a first vector + * @param[in] b second vector + * @return returns squared distance (distance * distance) + */ +CGLM_INLINE +int +glm_ivec3_distance2(ivec3 a, ivec3 b) { + int xd, yd, zd; + xd = a[0] - b[0]; + yd = a[1] - b[1]; + zd = a[2] - b[2]; + return xd * xd + yd * yd + zd * zd; +} + +/*! + * @brief distance between two vectors + * + * @param[in] a first vector + * @param[in] b second vector + * @return returns distance + */ +CGLM_INLINE +float +glm_ivec3_distance(ivec3 a, ivec3 b) { + return sqrtf((float)glm_ivec3_distance2(a, b)); +} + +/*! + * @brief fill a vector with specified value + * + * @param[out] v dest + * @param[in] val value + */ +CGLM_INLINE +void +glm_ivec3_fill(ivec3 v, int val) { + v[0] = v[1] = v[2] = val; +} + +/*! + * @brief check if vector is equal to value + * + * @param[in] v vector + * @param[in] val value + */ +CGLM_INLINE +bool +glm_ivec3_eq(ivec3 v, int val) { + return v[0] == val && v[0] == v[1] && v[0] == v[2]; +} + +/*! + * @brief check if vector is equal to another + * + * @param[in] a vector + * @param[in] b vector + */ +CGLM_INLINE +bool +glm_ivec3_eqv(ivec3 a, ivec3 b) { + return a[0] == b[0] + && a[1] == b[1] + && a[2] == b[2]; +} + +/*! + * @brief set each member of dest to greater of vector a and b + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec3_maxv(ivec3 a, ivec3 b, ivec3 dest) { + dest[0] = a[0] > b[0] ? a[0] : b[0]; + dest[1] = a[1] > b[1] ? a[1] : b[1]; + dest[2] = a[2] > b[2] ? a[2] : b[2]; +} + +/*! + * @brief set each member of dest to lesser of vector a and b + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec3_minv(ivec3 a, ivec3 b, ivec3 dest) { + dest[0] = a[0] < b[0] ? a[0] : b[0]; + dest[1] = a[1] < b[1] ? a[1] : b[1]; + dest[2] = a[2] < b[2] ? a[2] : b[2]; +} + +/*! + * @brief clamp each member of [v] between minVal and maxVal (inclusive) + * + * @param[in, out] v vector + * @param[in] minVal minimum value + * @param[in] maxVal maximum value + */ +CGLM_INLINE +void +glm_ivec3_clamp(ivec3 v, int minVal, int maxVal) { + if (v[0] < minVal) + v[0] = minVal; + else if(v[0] > maxVal) + v[0] = maxVal; + + if (v[1] < minVal) + v[1] = minVal; + else if(v[1] > maxVal) + v[1] = maxVal; + + if (v[2] < minVal) + v[2] = minVal; + else if(v[2] > maxVal) + v[2] = maxVal; +} + +/*! + * @brief absolute value of v + * + * @param[in] v vector + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec3_abs(ivec3 v, ivec3 dest) { + dest[0] = abs(v[0]); + dest[1] = abs(v[1]); + dest[2] = abs(v[2]); +} + +#endif /* cglm_ivec3_h */ diff --git a/external/cglm/ivec4.h b/external/cglm/ivec4.h new file mode 100644 index 0000000..6357599 --- /dev/null +++ b/external/cglm/ivec4.h @@ -0,0 +1,608 @@ +/* + * Copyright (c), Recep Aslantas. 
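Illustrative usage, not part of the commit: the addadd/muladd/subsub family above accumulates into dest with += or -=, so dest must hold a defined value before the call. A minimal sketch (include path assumed):

#include <stdio.h>
#include <cglm/cglm.h>

int main(void) {
  ivec3 a = {1, 2, 3}, b = {4, 5, 6};
  ivec3 acc = {0, 0, 0};           /* must be initialized before accumulating */

  glm_ivec3_muladd(a, b, acc);     /* acc += a * b  -> (4, 10, 18) */
  glm_ivec3_addadd(a, b, acc);     /* acc += a + b  -> (9, 17, 27) */
  printf("%d %d %d\n", acc[0], acc[1], acc[2]);
  return 0;
}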
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLM_IVEC4_ONE_INIT + GLM_IVEC4_ZERO_INIT + GLM_IVEC4_ONE + GLM_IVEC4_ZERO + + Functions: + CGLM_INLINE void glm_ivec4(ivec3 v3, int last, ivec4 dest) + CGLM_INLINE void glm_ivec4_copy(ivec4 a, ivec4 dest) + CGLM_INLINE void glm_ivec4_zero(ivec4 v) + CGLM_INLINE void glm_ivec4_one(ivec4 v) + CGLM_INLINE void glm_ivec4_add(ivec4 a, ivec4 b, ivec4 dest) + CGLM_INLINE void glm_ivec4_adds(ivec4 v, int s, ivec4 dest) + CGLM_INLINE void glm_ivec4_sub(ivec4 a, ivec4 b, ivec4 dest) + CGLM_INLINE void glm_ivec4_subs(ivec4 v, int s, ivec4 dest) + CGLM_INLINE void glm_ivec4_mul(ivec4 a, ivec4 b, ivec4 dest) + CGLM_INLINE void glm_ivec4_scale(ivec4 v, int s, ivec4 dest) + CGLM_INLINE void glm_ivec4_addadd(ivec4 a, ivec4 b, ivec4 dest) + CGLM_INLINE void glm_ivec4_addadds(ivec4 a, int s, ivec4 dest) + CGLM_INLINE void glm_ivec4_subadd(ivec4 a, ivec4 b, ivec4 dest) + CGLM_INLINE void glm_ivec4_subadds(ivec4 a, int s, ivec4 dest) + CGLM_INLINE void glm_ivec4_muladd(ivec4 a, ivec4 b, ivec4 dest) + CGLM_INLINE void glm_ivec4_muladds(ivec4 a, int s, ivec4 dest) + CGLM_INLINE void glm_ivec4_maxadd(ivec4 a, ivec4 b, ivec4 dest) + CGLM_INLINE void glm_ivec4_minadd(ivec4 a, ivec4 b, ivec4 dest) + CGLM_INLINE void glm_ivec4_subsub(ivec4 a, ivec4 b, ivec4 dest) + CGLM_INLINE void glm_ivec4_subsubs(ivec4 a, int s, ivec4 dest) + CGLM_INLINE void glm_ivec4_addsub(ivec4 a, ivec4 b, ivec4 dest) + CGLM_INLINE void glm_ivec4_addsubs(ivec4 a, int s, ivec4 dest) + CGLM_INLINE void glm_ivec4_mulsub(ivec4 a, ivec4 b, ivec4 dest) + CGLM_INLINE void glm_ivec4_mulsubs(ivec4 a, int s, ivec4 dest) + CGLM_INLINE void glm_ivec4_maxsub(ivec4 a, ivec4 b, ivec4 dest) + CGLM_INLINE void glm_ivec4_minsub(ivec4 a, ivec4 b, ivec4 dest) + CGLM_INLINE int glm_ivec4_distance2(ivec4 a, ivec4 b) + CGLM_INLINE float glm_ivec4_distance(ivec4 a, ivec4 b) + CGLM_INLINE void glm_ivec4_maxv(ivec4 a, ivec4 b, ivec4 dest) + CGLM_INLINE void glm_ivec4_minv(ivec4 a, ivec4 b, ivec4 dest) + CGLM_INLINE void glm_ivec4_clamp(ivec4 v, int minVal, int maxVal) + CGLM_INLINE void glm_ivec4_abs(ivec4 v, ivec4 dest) + */ + +#ifndef cglm_ivec4_h +#define cglm_ivec4_h + +#include "common.h" +#include "util.h" + +#define GLM_IVEC4_ONE_INIT {1, 1, 1, 1} +#define GLM_IVEC4_ZERO_INIT {0, 0, 0, 0} + +#define GLM_IVEC4_ONE ((ivec4)GLM_IVEC4_ONE_INIT) +#define GLM_IVEC4_ZERO ((ivec4)GLM_IVEC4_ZERO_INIT) + +/*! + * @brief init ivec4 using ivec3 + * + * @param[in] v3 vector3 + * @param[in] last last item + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec4(ivec3 v3, int last, ivec4 dest) { + dest[0] = v3[0]; + dest[1] = v3[1]; + dest[2] = v3[2]; + dest[3] = last; +} + +/*! + * @brief copy all members of [a] to [dest] + * + * @param[in] a source vector + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec4_copy(ivec4 a, ivec4 dest) { + dest[0] = a[0]; + dest[1] = a[1]; + dest[2] = a[2]; + dest[3] = a[3]; +} + +/*! + * @brief set all members of [v] to zero + * + * @param[out] v vector + */ +CGLM_INLINE +void +glm_ivec4_zero(ivec4 v) { + v[0] = v[1] = v[2] = v[3] = 0; +} + +/*! + * @brief set all members of [v] to one + * + * @param[out] v vector + */ +CGLM_INLINE +void +glm_ivec4_one(ivec4 v) { + v[0] = v[1] = v[2] = v[3] = 1; +} + +/*! 
+ * @brief add vector [a] to vector [b] and store result in [dest] + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec4_add(ivec4 a, ivec4 b, ivec4 dest) { + dest[0] = a[0] + b[0]; + dest[1] = a[1] + b[1]; + dest[2] = a[2] + b[2]; + dest[3] = a[3] + b[3]; +} + +/*! + * @brief add scalar s to vector [v] and store result in [dest] + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec4_adds(ivec4 v, int s, ivec4 dest) { + dest[0] = v[0] + s; + dest[1] = v[1] + s; + dest[2] = v[2] + s; + dest[3] = v[3] + s; +} + +/*! + * @brief subtract vector [b] from vector [a] and store result in [dest] + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec4_sub(ivec4 a, ivec4 b, ivec4 dest) { + dest[0] = a[0] - b[0]; + dest[1] = a[1] - b[1]; + dest[2] = a[2] - b[2]; + dest[3] = a[3] - b[3]; +} + +/*! + * @brief subtract scalar s from vector [v] and store result in [dest] + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec4_subs(ivec4 v, int s, ivec4 dest) { + dest[0] = v[0] - s; + dest[1] = v[1] - s; + dest[2] = v[2] - s; + dest[3] = v[3] - s; +} + +/*! + * @brief multiply vector [a] with vector [b] and store result in [dest] + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec4_mul(ivec4 a, ivec4 b, ivec4 dest) { + dest[0] = a[0] * b[0]; + dest[1] = a[1] * b[1]; + dest[2] = a[2] * b[2]; + dest[3] = a[3] * b[3]; +} + +/*! + * @brief multiply vector [a] with scalar s and store result in [dest] + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec4_scale(ivec4 v, int s, ivec4 dest) { + dest[0] = v[0] * s; + dest[1] = v[1] * s; + dest[2] = v[2] * s; + dest[3] = v[3] * s; +} + +/*! + * @brief add vector [a] with vector [b] and add result to vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest dest += (a + b) + */ +CGLM_INLINE +void +glm_ivec4_addadd(ivec4 a, ivec4 b, ivec4 dest) { + dest[0] += a[0] + b[0]; + dest[1] += a[1] + b[1]; + dest[2] += a[2] + b[2]; + dest[3] += a[3] + b[3]; +} + +/*! + * @brief add scalar [s] onto vector [a] and add result to vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[out] dest dest += (a + s) + */ +CGLM_INLINE +void +glm_ivec4_addadds(ivec4 a, int s, ivec4 dest) { + dest[0] += a[0] + s; + dest[1] += a[1] + s; + dest[2] += a[2] + s; + dest[3] += a[3] + s; +} + +/*! + * @brief subtract vector [a] from vector [b] and add result to [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest dest += (a - b) + */ +CGLM_INLINE +void +glm_ivec4_subadd(ivec4 a, ivec4 b, ivec4 dest) { + dest[0] += a[0] - b[0]; + dest[1] += a[1] - b[1]; + dest[2] += a[2] - b[2]; + dest[3] += a[3] - b[3]; +} + +/*! 
+ * @brief subtract scalar [s] from vector [a] and add result to [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first + * @param[in] s scalar + * @param[out] dest dest += (a - s) + */ +CGLM_INLINE +void +glm_ivec4_subadds(ivec4 a, int s, ivec4 dest) { + dest[0] += a[0] - s; + dest[1] += a[1] - s; + dest[2] += a[2] - s; + dest[3] += a[3] - s; +} + +/*! + * @brief multiply vector [a] with vector [b] and add result to [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest dest += (a * b) + */ +CGLM_INLINE +void +glm_ivec4_muladd(ivec4 a, ivec4 b, ivec4 dest) { + dest[0] += a[0] * b[0]; + dest[1] += a[1] * b[1]; + dest[2] += a[2] * b[2]; + dest[3] += a[3] * b[3]; +} + +/*! + * @brief multiply vector [a] with scalar [s] and add result to [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[out] dest dest += (a * s) + */ +CGLM_INLINE +void +glm_ivec4_muladds(ivec4 a, int s, ivec4 dest) { + dest[0] += a[0] * s; + dest[1] += a[1] * s; + dest[2] += a[2] * s; + dest[3] += a[3] * s; +} + +/*! + * @brief add maximum of vector [a] and vector [b] to vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest dest += max(a, b) + */ +CGLM_INLINE +void +glm_ivec4_maxadd(ivec4 a, ivec4 b, ivec4 dest) { + dest[0] += glm_imax(a[0], b[0]); + dest[1] += glm_imax(a[1], b[1]); + dest[2] += glm_imax(a[2], b[2]); + dest[3] += glm_imax(a[3], b[3]); +} + +/*! + * @brief add minimum of vector [a] and vector [b] to vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest dest += min(a, b) + */ +CGLM_INLINE +void +glm_ivec4_minadd(ivec4 a, ivec4 b, ivec4 dest) { + dest[0] += glm_imin(a[0], b[0]); + dest[1] += glm_imin(a[1], b[1]); + dest[2] += glm_imin(a[2], b[2]); + dest[3] += glm_imin(a[3], b[3]); +} + +/*! + * @brief subtract vector [a] from vector [b] and subtract result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest dest -= (a - b) + */ +CGLM_INLINE +void +glm_ivec4_subsub(ivec4 a, ivec4 b, ivec4 dest) { + dest[0] -= a[0] - b[0]; + dest[1] -= a[1] - b[1]; + dest[2] -= a[2] - b[2]; + dest[3] -= a[3] - b[3]; +} + +/*! + * @brief subtract scalar [s] from vector [a] and subtract result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[out] dest dest -= (a - s) + */ +CGLM_INLINE +void +glm_ivec4_subsubs(ivec4 a, int s, ivec4 dest) { + dest[0] -= a[0] - s; + dest[1] -= a[1] - s; + dest[2] -= a[2] - s; + dest[3] -= a[3] - s; +} + +/*! + * @brief add vector [a] to vector [b] and subtract the result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] b scalar + * @param[out] dest dest -= (a + b) + */ +CGLM_INLINE +void +glm_ivec4_addsub(ivec4 a, ivec4 b, ivec4 dest) { + dest[0] -= a[0] + b[0]; + dest[1] -= a[1] + b[1]; + dest[2] -= a[2] + b[2]; + dest[3] -= a[3] + b[3]; +} + +/*! 
+ * @brief add scalar [s] to vector [a] and subtract the result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[out] dest dest -= (a + b) + */ +CGLM_INLINE +void +glm_ivec4_addsubs(ivec4 a, int s, ivec4 dest) { + dest[0] -= a[0] + s; + dest[1] -= a[1] + s; + dest[2] -= a[2] + s; + dest[3] -= a[3] + s; +} + +/*! + * @brief multiply vector [a] and vector [b] and subtract the result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] b scalar + * @param[out] dest dest -= (a * b) + */ +CGLM_INLINE +void +glm_ivec4_mulsub(ivec4 a, ivec4 b, ivec4 dest) { + dest[0] -= a[0] * b[0]; + dest[1] -= a[1] * b[1]; + dest[2] -= a[2] * b[2]; + dest[3] -= a[3] * b[3]; +} + +/*! + * @brief multiply vector [a] with scalar [s] and subtract the result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[out] dest dest -= (a * s) + */ +CGLM_INLINE +void +glm_ivec4_mulsubs(ivec4 a, int s, ivec4 dest) { + dest[0] -= a[0] * s; + dest[1] -= a[1] * s; + dest[2] -= a[2] * s; + dest[3] -= a[3] * s; +} + +/*! + * @brief subtract maximum of vector [a] and vector [b] from vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest dest -= max(a, b) + */ +CGLM_INLINE +void +glm_ivec4_maxsub(ivec4 a, ivec4 b, ivec4 dest) { + dest[0] -= glm_imax(a[0], b[0]); + dest[1] -= glm_imax(a[1], b[1]); + dest[2] -= glm_imax(a[2], b[2]); + dest[3] -= glm_imax(a[3], b[3]); +} + +/*! + * @brief subtract minimum of vector [a] and vector [b] from vector [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest dest -= min(a, b) + */ +CGLM_INLINE +void +glm_ivec4_minsub(ivec4 a, ivec4 b, ivec4 dest) { + dest[0] -= glm_imin(a[0], b[0]); + dest[1] -= glm_imin(a[1], b[1]); + dest[2] -= glm_imin(a[2], b[2]); + dest[3] -= glm_imin(a[3], b[3]); +} + +/*! + * @brief squared distance between two vectors + * + * @param[in] a first vector + * @param[in] b second vector + * @return returns squared distance (distance * distance) + */ +CGLM_INLINE +int +glm_ivec4_distance2(ivec4 a, ivec4 b) { + int xd, yd, zd, wd; + xd = a[0] - b[0]; + yd = a[1] - b[1]; + zd = a[2] - b[2]; + wd = a[3] - b[3]; + return xd * xd + yd * yd + zd * zd + wd * wd; +} + +/*! + * @brief distance between two vectors + * + * @param[in] a first vector + * @param[in] b second vector + * @return returns distance + */ +CGLM_INLINE +float +glm_ivec4_distance(ivec4 a, ivec4 b) { + return sqrtf((float)glm_ivec4_distance2(a, b)); +} + +/*! + * @brief set each member of dest to greater of vector a and b + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec4_maxv(ivec4 a, ivec4 b, ivec4 dest) { + dest[0] = a[0] > b[0] ? a[0] : b[0]; + dest[1] = a[1] > b[1] ? a[1] : b[1]; + dest[2] = a[2] > b[2] ? a[2] : b[2]; + dest[3] = a[3] > b[3] ? a[3] : b[3]; +} + +/*! + * @brief set each member of dest to lesser of vector a and b + * + * @param[in] a first vector + * @param[in] b second vector + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec4_minv(ivec4 a, ivec4 b, ivec4 dest) { + dest[0] = a[0] < b[0] ? a[0] : b[0]; + dest[1] = a[1] < b[1] ? a[1] : b[1]; + dest[2] = a[2] < b[2] ? 
a[2] : b[2]; + dest[3] = a[3] < b[3] ? a[3] : b[3]; +} + +/*! + * @brief clamp each member of [v] between minVal and maxVal (inclusive) + * + * @param[in, out] v vector + * @param[in] minVal minimum value + * @param[in] maxVal maximum value + */ +CGLM_INLINE +void +glm_ivec4_clamp(ivec4 v, int minVal, int maxVal) { + if (v[0] < minVal) + v[0] = minVal; + else if(v[0] > maxVal) + v[0] = maxVal; + + if (v[1] < minVal) + v[1] = minVal; + else if(v[1] > maxVal) + v[1] = maxVal; + + if (v[2] < minVal) + v[2] = minVal; + else if(v[2] > maxVal) + v[2] = maxVal; + + if (v[3] < minVal) + v[3] = minVal; + else if(v[3] > maxVal) + v[3] = maxVal; +} + +/*! + * @brief absolute value of v + * + * @param[in] v vector + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_ivec4_abs(ivec4 v, ivec4 dest) { + dest[0] = abs(v[0]); + dest[1] = abs(v[1]); + dest[2] = abs(v[2]); + dest[3] = abs(v[3]); +} + +#endif /* cglm_ivec4_h */ diff --git a/external/cglm/mat2.h b/external/cglm/mat2.h new file mode 100644 index 0000000..9248460 --- /dev/null +++ b/external/cglm/mat2.h @@ -0,0 +1,363 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLM_MAT2_IDENTITY_INIT + GLM_MAT2_ZERO_INIT + GLM_MAT2_IDENTITY + GLM_MAT2_ZERO + + Functions: + CGLM_INLINE void glm_mat2_make(float * restrict src, mat2 dest) + CGLM_INLINE void glm_mat2_copy(mat2 mat, mat2 dest) + CGLM_INLINE void glm_mat2_identity(mat2 m) + CGLM_INLINE void glm_mat2_identity_array(mat2 * restrict mats, size_t count) + CGLM_INLINE void glm_mat2_zero(mat2 m) + CGLM_INLINE void glm_mat2_mul(mat2 m1, mat2 m2, mat2 dest) + CGLM_INLINE void glm_mat2_mulv(mat2 m, vec2 v, vec2 dest) + CGLM_INLINE void glm_mat2_transpose_to(mat2 mat, mat2 dest) + CGLM_INLINE void glm_mat2_transpose(mat2 m) + CGLM_INLINE void glm_mat2_scale(mat2 m, float s) + CGLM_INLINE void glm_mat2_inv(mat2 mat, mat2 dest) + CGLM_INLINE void glm_mat2_swap_col(mat2 mat, int col1, int col2) + CGLM_INLINE void glm_mat2_swap_row(mat2 mat, int row1, int row2) + CGLM_INLINE float glm_mat2_det(mat2 m) + CGLM_INLINE float glm_mat2_trace(mat2 m) + CGLM_INLINE float glm_mat2_rmc(vec2 r, mat2 m, vec2 c) + */ + +#ifndef cglm_mat2_h +#define cglm_mat2_h + +#include "common.h" +#include "vec2.h" + +#ifdef CGLM_SSE_FP +# include "simd/sse2/mat2.h" +#endif + +#ifdef CGLM_NEON_FP +# include "simd/neon/mat2.h" +#endif + +#ifdef CGLM_SIMD_WASM +# include "simd/wasm/mat2.h" +#endif + +#define GLM_MAT2_IDENTITY_INIT {{1.0f, 0.0f}, {0.0f, 1.0f}} +#define GLM_MAT2_ZERO_INIT {{0.0f, 0.0f}, {0.0f, 0.0f}} + +/* for C only */ +#define GLM_MAT2_IDENTITY ((mat2)GLM_MAT2_IDENTITY_INIT) +#define GLM_MAT2_ZERO ((mat2)GLM_MAT2_ZERO_INIT) + +/*! + * @brief Create mat2 (dest) from pointer (src). + * + * @param[in] src pointer to an array of floats (left) + * @param[out] dest destination (result, mat2) + */ +CGLM_INLINE +void +glm_mat2_make(const float * __restrict src, mat2 dest) { + dest[0][0] = src[0]; + dest[0][1] = src[1]; + dest[1][0] = src[2]; + dest[1][1] = src[3]; +} + +/*! + * @brief Copy mat2 (mat) to mat2 (dest). + * + * @param[in] mat mat2 (left, src) + * @param[out] dest destination (result, mat2) + */ +CGLM_INLINE +void +glm_mat2_copy(mat2 mat, mat2 dest) { + glm_vec4_ucopy(mat[0], dest[0]); +} + +/*! + * @brief Copy a mat2 identity to mat2 (m), or makes mat2 (m) an identity. 
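As the loops in glm_mat2_make and glm_mat2_mulv suggest, these matrices are indexed m[column][row], so glm_mat2_make consumes the float array one column at a time. A short sketch; the include path and function name are the editor's assumptions:

#include <cglm/cglm.h>   /* assumed include path */

void mat2_make_demo(void) {
  /* columns laid out contiguously: col0 = {1, 2}, col1 = {3, 4} */
  float src[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  mat2  m, copy;

  glm_mat2_make(src, m);   /* m[0] = {1, 2}, m[1] = {3, 4} */
  glm_mat2_copy(m, copy);  /* duplicate before destructive operations */
}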
+ * + * The same thing may be achieved with either of bellow methods, + * but it is more easy to do that with this func especially for members + * e.g. glm_mat2_identity(aStruct->aMatrix); + * + * @code + * glm_mat2_copy(GLM_MAT2_IDENTITY, mat); // C only + * + * // or + * mat2 mat = GLM_MAT2_IDENTITY_INIT; + * @endcode + * + * @param[in, out] m mat2 (src, dest) + */ +CGLM_INLINE +void +glm_mat2_identity(mat2 m) { + CGLM_ALIGN_MAT mat2 t = GLM_MAT2_IDENTITY_INIT; + glm_mat2_copy(t, m); +} + +/*! + * @brief Given an array of mat2’s (mats) make each matrix an identity matrix. + * + * @param[in, out] mats Array of mat2’s (must be aligned (16/32) if alignment is not disabled) + * @param[in] count Array size of mats or number of matrices + */ +CGLM_INLINE +void +glm_mat2_identity_array(mat2 * __restrict mats, size_t count) { + CGLM_ALIGN_MAT mat2 t = GLM_MAT2_IDENTITY_INIT; + size_t i; + + for (i = 0; i < count; i++) { + glm_mat2_copy(t, mats[i]); + } +} + +/*! + * @brief Zero out the mat2 (m). + * + * @param[in, out] m mat2 (src, dest) + */ +CGLM_INLINE +void +glm_mat2_zero(mat2 m) { + CGLM_ALIGN_MAT mat2 t = GLM_MAT2_ZERO_INIT; + glm_mat2_copy(t, m); +} + +/*! + * @brief Multiply mat2 (m1) by mat2 (m2) and store in mat2 (dest). + * + * m1, m2 and dest matrices can be same matrix, it is possible to write this: + * + * @code + * mat2 m = GLM_MAT2_IDENTITY_INIT; + * glm_mat2_mul(m, m, m); + * @endcode + * + * @param[in] m1 mat2 (left) + * @param[in] m2 mat2 (right) + * @param[out] dest destination (result, mat2) + */ +CGLM_INLINE +void +glm_mat2_mul(mat2 m1, mat2 m2, mat2 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glm_mat2_mul_wasm(m1, m2, dest); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glm_mat2_mul_sse2(m1, m2, dest); +#elif defined(CGLM_NEON_FP) + glm_mat2_mul_neon(m1, m2, dest); +#else + float a00 = m1[0][0], a01 = m1[0][1], + a10 = m1[1][0], a11 = m1[1][1], + b00 = m2[0][0], b01 = m2[0][1], + b10 = m2[1][0], b11 = m2[1][1]; + + dest[0][0] = a00 * b00 + a10 * b01; + dest[0][1] = a01 * b00 + a11 * b01; + dest[1][0] = a00 * b10 + a10 * b11; + dest[1][1] = a01 * b10 + a11 * b11; +#endif +} + +/*! + * @brief Multiply mat2 (m) by vec2 (v) and store in vec2 (dest). + * + * @param[in] m mat2 (left) + * @param[in] v vec2 (right, column vector) + * @param[out] dest destination (result, column vector) + */ +CGLM_INLINE +void +glm_mat2_mulv(mat2 m, vec2 v, vec2 dest) { + dest[0] = m[0][0] * v[0] + m[1][0] * v[1]; + dest[1] = m[0][1] * v[0] + m[1][1] * v[1]; +} + +/*! + * @brief Transpose mat2 (mat) and store in mat2 (dest). + * + * @param[in] mat mat2 (left, src) + * @param[out] dest destination (result, mat2) + */ +CGLM_INLINE +void +glm_mat2_transpose_to(mat2 mat, mat2 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glm_mat2_transp_wasm(mat, dest); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glm_mat2_transp_sse2(mat, dest); +#else + dest[0][0] = mat[0][0]; + dest[0][1] = mat[1][0]; + dest[1][0] = mat[0][1]; + dest[1][1] = mat[1][1]; +#endif +} + +/*! + * @brief Transpose mat2 (m) and store result in the same matrix. + * + * @param[in, out] m mat2 (src, dest) + */ +CGLM_INLINE +void +glm_mat2_transpose(mat2 m) { + float tmp; + tmp = m[0][1]; + m[0][1] = m[1][0]; + m[1][0] = tmp; +} + +/*! + * @brief Multiply mat2 (m) by scalar constant (s). 
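A brief sketch of composing 2x2 transforms with glm_mat2_mul and applying them to a column vector with glm_mat2_mulv; the rotation angle and names are illustrative only:

#include <math.h>
#include <cglm/cglm.h>   /* assumed include path */

void mat2_rotate_demo(void) {
  float c = cosf(0.5f), s = sinf(0.5f);

  /* column-major: column 0 = {c, s}, column 1 = {-s, c} */
  mat2 rot = {{c, s}, {-s, c}};
  mat2 twice;
  vec2 p = {1.0f, 0.0f}, q;

  glm_mat2_mul(rot, rot, twice);  /* rotation by 1.0 rad in total */
  glm_mat2_mulv(twice, p, q);     /* q = twice * p */
}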
+ * + * @param[in, out] m mat2 (src, dest) + * @param[in] s float (scalar) + */ +CGLM_INLINE +void +glm_mat2_scale(mat2 m, float s) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_store(m[0], wasm_f32x4_mul(wasm_v128_load(m[0]), + wasm_f32x4_splat(s))); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glmm_store(m[0], _mm_mul_ps(_mm_loadu_ps(m[0]), glmm_set1(s))); +#elif defined(CGLM_NEON_FP) + vst1q_f32(m[0], vmulq_f32(vld1q_f32(m[0]), vdupq_n_f32(s))); +#else + m[0][0] = m[0][0] * s; + m[0][1] = m[0][1] * s; + m[1][0] = m[1][0] * s; + m[1][1] = m[1][1] * s; +#endif +} + +/*! + * @brief Inverse mat2 (mat) and store in mat2 (dest). + * + * @param[in] mat mat2 (left, src) + * @param[out] dest destination (result, inverse mat2) + */ +CGLM_INLINE +void +glm_mat2_inv(mat2 mat, mat2 dest) { + float det; + float a = mat[0][0], b = mat[0][1], + c = mat[1][0], d = mat[1][1]; + + det = 1.0f / (a * d - b * c); + + dest[0][0] = d * det; + dest[0][1] = -b * det; + dest[1][0] = -c * det; + dest[1][1] = a * det; +} + +/*! + * @brief Swap two columns in mat2 (mat) and store in same matrix. + * + * @param[in, out] mat mat2 (src, dest) + * @param[in] col1 Column 1 array index + * @param[in] col2 Column 2 array index + */ +CGLM_INLINE +void +glm_mat2_swap_col(mat2 mat, int col1, int col2) { + float a, b; + + a = mat[col1][0]; + b = mat[col1][1]; + + mat[col1][0] = mat[col2][0]; + mat[col1][1] = mat[col2][1]; + + mat[col2][0] = a; + mat[col2][1] = b; +} + +/*! + * @brief Swap two rows in mat2 (mat) and store in same matrix. + * + * @param[in, out] mat mat2 (src, dest) + * @param[in] row1 Row 1 array index + * @param[in] row2 Row 2 array index + */ +CGLM_INLINE +void +glm_mat2_swap_row(mat2 mat, int row1, int row2) { + float a, b; + + a = mat[0][row1]; + b = mat[1][row1]; + + mat[0][row1] = mat[0][row2]; + mat[1][row1] = mat[1][row2]; + + mat[0][row2] = a; + mat[1][row2] = b; +} + +/*! + * @brief Returns mat2 determinant. + * + * @param[in] m mat2 (src) + * + * @return[out] mat2 determinant (float) + */ +CGLM_INLINE +float +glm_mat2_det(mat2 m) { + return m[0][0] * m[1][1] - m[1][0] * m[0][1]; +} + +/*! + * @brief Returns trace of matrix. Which is: + * + * The sum of the elements on the main diagonal from + * upper left corner to the bottom right corner. + * + * @param[in] m mat2 (src) + * + * @return[out] mat2 trace (float) + */ +CGLM_INLINE +float +glm_mat2_trace(mat2 m) { + return m[0][0] + m[1][1]; +} + +/*! + * @brief Helper for R (row vector) * M (matrix) * C (column vector) + * + * rmc stands for Row * Matrix * Column + * + * the result is scalar because M * C = ResC (1x2, column vector), + * then if you take the dot_product(R (2x1), ResC (1x2)) = scalar value. + * + * @param[in] r vec2 (2x1, row vector) + * @param[in] m mat2 (2x2, matrix) + * @param[in] c vec2 (1x2, column vector) + * + * @return[out] Scalar value (float, 1x1) + */ +CGLM_INLINE +float +glm_mat2_rmc(vec2 r, mat2 m, vec2 c) { + vec2 tmp; + glm_mat2_mulv(m, c, tmp); + return glm_vec2_dot(r, tmp); +} + +#endif /* cglm_mat2_h */ diff --git a/external/cglm/mat2x3.h b/external/cglm/mat2x3.h new file mode 100644 index 0000000..0bb8d70 --- /dev/null +++ b/external/cglm/mat2x3.h @@ -0,0 +1,154 @@ +/* + * Copyright (c), Recep Aslantas. 
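glm_mat2_inv above divides by the determinant without a guard, so a singular matrix produces inf/nan. A hedged helper that checks glm_mat2_det first; the wrapper name and tolerance are the editor's, not cglm's:

#include <math.h>
#include <stdbool.h>
#include <cglm/cglm.h>   /* assumed include path */

/* returns false (and leaves dest untouched) when m is not invertible */
bool mat2_try_inv(mat2 m, mat2 dest) {
  if (fabsf(glm_mat2_det(m)) < 1e-6f)  /* tolerance is illustrative */
    return false;

  glm_mat2_inv(m, dest);
  return true;
}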
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLM_MAT2X3_ZERO_INIT + GLM_MAT2X3_ZERO + + Functions: + CGLM_INLINE void glm_mat2x3_copy(mat2x3 src, mat2x3 dest); + CGLM_INLINE void glm_mat2x3_zero(mat2x3 m); + CGLM_INLINE void glm_mat2x3_make(const float * __restrict src, mat2x3 dest); + CGLM_INLINE void glm_mat2x3_mul(mat2x3 m1, mat3x2 m2, mat3 dest); + CGLM_INLINE void glm_mat2x3_mulv(mat2x3 m, vec2 v, vec3 dest); + CGLM_INLINE void glm_mat2x3_transpose(mat2x3 src, mat3x2 dest); + CGLM_INLINE void glm_mat2x3_scale(mat2x3 m, float s); + */ + +#ifndef cglm_mat2x3_h +#define cglm_mat2x3_h + +#include "common.h" + +#define GLM_MAT2X3_ZERO_INIT {{0.0f, 0.0f, 0.0f}, {0.0f, 0.0f, 0.0f}} + +/* for C only */ +#define GLM_MAT2X3_ZERO GLM_MAT2X3_ZERO_INIT + +/*! + * @brief Copy mat2x3 (src) to mat2x3 (dest). + * + * @param[in] src mat2x3 (left) + * @param[out] dest destination (result, mat2x3) + */ +CGLM_INLINE +void +glm_mat2x3_copy(mat2x3 src, mat2x3 dest) { + glm_vec3_copy(src[0], dest[0]); + glm_vec3_copy(src[1], dest[1]); +} + +/*! + * @brief Zero out the mat2x3 (m). + * + * @param[in, out] mat2x3 (src, dest) + */ +CGLM_INLINE +void +glm_mat2x3_zero(mat2x3 m) { + CGLM_ALIGN_MAT mat2x3 t = GLM_MAT2X3_ZERO_INIT; + glm_mat2x3_copy(t, m); +} + +/*! + * @brief Create mat2x3 (dest) from pointer (src). + * + * @param[in] src pointer to an array of floats (left) + * @param[out] dest destination (result, mat2x3) + */ +CGLM_INLINE +void +glm_mat2x3_make(const float * __restrict src, mat2x3 dest) { + dest[0][0] = src[0]; + dest[0][1] = src[1]; + dest[0][2] = src[2]; + + dest[1][0] = src[3]; + dest[1][1] = src[4]; + dest[1][2] = src[5]; +} + +/*! + * @brief Multiply mat2x3 (m1) by mat3x2 (m2) and store in mat3 (dest). + * + * @code + * glm_mat2x3_mul(mat2x3, mat3x2, mat3); + * @endcode + * + * @param[in] m1 mat2x3 (left) + * @param[in] m2 mat3x2 (right) + * @param[out] dest destination (result, mat3) + */ +CGLM_INLINE +void +glm_mat2x3_mul(mat2x3 m1, mat3x2 m2, mat3 dest) { + float a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2], + a10 = m1[1][0], a11 = m1[1][1], a12 = m1[1][2], + + b00 = m2[0][0], b01 = m2[0][1], + b10 = m2[1][0], b11 = m2[1][1], + b20 = m2[2][0], b21 = m2[2][1]; + + dest[0][0] = a00 * b00 + a10 * b01; + dest[0][1] = a01 * b00 + a11 * b01; + dest[0][2] = a02 * b00 + a12 * b01; + + dest[1][0] = a00 * b10 + a10 * b11; + dest[1][1] = a01 * b10 + a11 * b11; + dest[1][2] = a02 * b10 + a12 * b11; + + dest[2][0] = a00 * b20 + a10 * b21; + dest[2][1] = a01 * b20 + a11 * b21; + dest[2][2] = a02 * b20 + a12 * b21; +} + +/*! + * @brief Multiply mat2x3 (m) by vec2 (v) and store in vec3 (dest). + * + * @param[in] m mat2x3 (left) + * @param[in] v vec2 (right, column vector) + * @param[out] dest destination (result, column vector) + */ +CGLM_INLINE +void +glm_mat2x3_mulv(mat2x3 m, vec2 v, vec3 dest) { + float v0 = v[0], v1 = v[1]; + + dest[0] = m[0][0] * v0 + m[1][0] * v1; + dest[1] = m[0][1] * v0 + m[1][1] * v1; + dest[2] = m[0][2] * v0 + m[1][2] * v1; +} + +/*! + * @brief Transpose mat2x3 (src) and store in mat3x2 (dest). + * + * @param[in] src mat2x3 (left) + * @param[out] dest destination (result, mat3x2) + */ +CGLM_INLINE +void +glm_mat2x3_transpose(mat2x3 src, mat3x2 dest) { + dest[0][0] = src[0][0]; dest[0][1] = src[1][0]; + dest[1][0] = src[0][1]; dest[1][1] = src[1][1]; + dest[2][0] = src[0][2]; dest[2][1] = src[1][2]; +} + +/*! + * @brief Multiply mat2x3 (m) by scalar constant (s). 
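As the initializers above suggest, matAxB in cglm means A columns of B rows, so glm_mat2x3_mulv maps a vec2 into a vec3 and glm_mat2x3_transpose yields a mat3x2. A small sketch under those assumptions; the values and function name are placeholders:

#include <cglm/cglm.h>   /* assumed include path */

void mat2x3_shapes_demo(void) {
  /* two columns of three rows each */
  float  a[6] = {1.0f, 0.0f, 0.0f,   0.0f, 1.0f, 0.0f};
  mat2x3 m;
  mat3x2 mt;
  vec2   v = {2.0f, 3.0f};
  vec3   out;

  glm_mat2x3_make(a, m);
  glm_mat2x3_mulv(m, v, out);      /* out = {2, 3, 0} with this m */
  glm_mat2x3_transpose(m, mt);     /* 2x3 -> 3x2 */
}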
+ * + * @param[in, out] m (src, dest) + * @param[in] float (scalar) + */ +CGLM_INLINE +void +glm_mat2x3_scale(mat2x3 m, float s) { + m[0][0] *= s; m[0][1] *= s; m[0][2] *= s; + m[1][0] *= s; m[1][1] *= s; m[1][2] *= s; +} + +#endif diff --git a/external/cglm/mat2x4.h b/external/cglm/mat2x4.h new file mode 100644 index 0000000..fa9adf3 --- /dev/null +++ b/external/cglm/mat2x4.h @@ -0,0 +1,168 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLM_MAT2X4_ZERO_INIT + GLM_MAT2X4_ZERO + + Functions: + CGLM_INLINE void glm_mat2x4_copy(mat2x4 src, mat2x4 dest); + CGLM_INLINE void glm_mat2x4_zero(mat2x4 m); + CGLM_INLINE void glm_mat2x4_make(const float * __restrict src, mat2x4 dest); + CGLM_INLINE void glm_mat2x4_mul(mat2x4 m1, mat4x2 m2, mat4 dest); + CGLM_INLINE void glm_mat2x4_mulv(mat2x4 m, vec2 v, vec4 dest); + CGLM_INLINE void glm_mat2x4_transpose(mat2x4 src, mat4x2 dest); + CGLM_INLINE void glm_mat2x4_scale(mat2x4 m, float s); + */ + +#ifndef cglm_mat2x4_h +#define cglm_mat2x4_h + +#include "common.h" +#include "vec4.h" + +#define GLM_MAT2X4_ZERO_INIT {{0.0f, 0.0f, 0.0f, 0.0f}, {0.0f, 0.0f, 0.0f, 0.0f}} + +/* for C only */ +#define GLM_MAT2X4_ZERO GLM_MAT2X4_ZERO_INIT + +/*! + * @brief Copy mat2x4 (src) to mat2x4 (dest). + * + * @param[in] src mat2x4 (left) + * @param[out] dest destination (result, mat2x4) + */ +CGLM_INLINE +void +glm_mat2x4_copy(mat2x4 src, mat2x4 dest) { + glm_vec4_ucopy(src[0], dest[0]); + glm_vec4_ucopy(src[1], dest[1]); +} + +/*! + * @brief Zero out the mat2x4 (m). + * + * @param[in, out] mat2x4 (src, dest) + */ +CGLM_INLINE +void +glm_mat2x4_zero(mat2x4 m) { + CGLM_ALIGN_MAT mat2x4 t = GLM_MAT2X4_ZERO_INIT; + glm_mat2x4_copy(t, m); +} + +/*! + * @brief Create mat2x4 (dest) from pointer (src). + * + * @param[in] src pointer to an array of floats (left) + * @param[out] dest destination (result, mat2x4) + */ +CGLM_INLINE +void +glm_mat2x4_make(const float * __restrict src, mat2x4 dest) { + dest[0][0] = src[0]; + dest[0][1] = src[1]; + dest[0][2] = src[2]; + dest[0][3] = src[3]; + + dest[1][0] = src[4]; + dest[1][1] = src[5]; + dest[1][2] = src[6]; + dest[1][3] = src[7]; +} + +/*! + * @brief Multiply mat2x4 (m1) by mat4x2 (m2) and store in mat4 (dest). + * + * @code + * glm_mat2x4_mul(mat2x4, mat4x2, mat4); + * @endcode + * + * @param[in] m1 mat2x4 (left) + * @param[in] m2 mat4x2 (right) + * @param[out] dest destination (result, mat4) + */ +CGLM_INLINE +void +glm_mat2x4_mul(mat2x4 m1, mat4x2 m2, mat4 dest) { + float a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2], a03 = m1[0][3], + a10 = m1[1][0], a11 = m1[1][1], a12 = m1[1][2], a13 = m1[1][3], + + b00 = m2[0][0], b01 = m2[0][1], + b10 = m2[1][0], b11 = m2[1][1], + b20 = m2[2][0], b21 = m2[2][1], + b30 = m2[3][0], b31 = m2[3][1]; + + dest[0][0] = a00 * b00 + a10 * b01; + dest[0][1] = a01 * b00 + a11 * b01; + dest[0][2] = a02 * b00 + a12 * b01; + dest[0][3] = a03 * b00 + a13 * b01; + + dest[1][0] = a00 * b10 + a10 * b11; + dest[1][1] = a01 * b10 + a11 * b11; + dest[1][2] = a02 * b10 + a12 * b11; + dest[1][3] = a03 * b10 + a13 * b11; + + dest[2][0] = a00 * b20 + a10 * b21; + dest[2][1] = a01 * b20 + a11 * b21; + dest[2][2] = a02 * b20 + a12 * b21; + dest[2][3] = a03 * b20 + a13 * b21; + + dest[3][0] = a00 * b30 + a10 * b31; + dest[3][1] = a01 * b30 + a11 * b31; + dest[3][2] = a02 * b30 + a12 * b31; + dest[3][3] = a03 * b30 + a13 * b31; +} + +/*! 
+ * @brief Multiply mat2x4 (m) by vec2 (v) and store in vec4 (dest). + * + * @param[in] m mat2x4 (left) + * @param[in] v vec2 (right, column vector) + * @param[out] dest destination (result, column vector) + */ +CGLM_INLINE +void +glm_mat2x4_mulv(mat2x4 m, vec2 v, vec4 dest) { + float v0 = v[0], v1 = v[1]; + + dest[0] = m[0][0] * v0 + m[1][0] * v1; + dest[1] = m[0][1] * v0 + m[1][1] * v1; + dest[2] = m[0][2] * v0 + m[1][2] * v1; + dest[3] = m[0][3] * v0 + m[1][3] * v1; +} + +/*! + * @brief Transpose mat2x4 (src) and store in mat4x2 (dest). + * + * @param[in] src mat2x4 (left) + * @param[out] dest destination (result, mat4x2) + */ +CGLM_INLINE +void +glm_mat2x4_transpose(mat2x4 src, mat4x2 dest) { + dest[0][0] = src[0][0]; dest[0][1] = src[1][0]; + dest[1][0] = src[0][1]; dest[1][1] = src[1][1]; + dest[2][0] = src[0][2]; dest[2][1] = src[1][2]; + dest[3][0] = src[0][3]; dest[3][1] = src[1][3]; +} + +/*! + * @brief Multiply mat2x4 (m) by scalar constant (s). + * + * @param[in, out] m (src, dest) + * @param[in] s float (scalar) + */ +CGLM_INLINE +void +glm_mat2x4_scale(mat2x4 m, float s) { + m[0][0] *= s; m[0][1] *= s; m[0][2] *= s; m[0][3] *= s; + m[1][0] *= s; m[1][1] *= s; m[1][2] *= s; m[1][3] *= s; +} + +#endif diff --git a/external/cglm/mat3.h b/external/cglm/mat3.h new file mode 100644 index 0000000..10b373e --- /dev/null +++ b/external/cglm/mat3.h @@ -0,0 +1,480 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLM_MAT3_IDENTITY_INIT + GLM_MAT3_ZERO_INIT + GLM_MAT3_IDENTITY + GLM_MAT3_ZERO + glm_mat3_dup(mat, dest) + + Functions: + CGLM_INLINE void glm_mat3_copy(mat3 mat, mat3 dest); + CGLM_INLINE void glm_mat3_identity(mat3 mat); + CGLM_INLINE void glm_mat3_identity_array(mat3 * restrict mat, size_t count); + CGLM_INLINE void glm_mat3_zero(mat3 mat); + CGLM_INLINE void glm_mat3_mul(mat3 m1, mat3 m2, mat3 dest); + CGLM_INLINE void glm_mat3_transpose_to(mat3 m, mat3 dest); + CGLM_INLINE void glm_mat3_transpose(mat3 m); + CGLM_INLINE void glm_mat3_mulv(mat3 m, vec3 v, vec3 dest); + CGLM_INLINE float glm_mat3_trace(mat3 m); + CGLM_INLINE void glm_mat3_quat(mat3 m, versor dest); + CGLM_INLINE void glm_mat3_scale(mat3 m, float s); + CGLM_INLINE float glm_mat3_det(mat3 mat); + CGLM_INLINE void glm_mat3_inv(mat3 mat, mat3 dest); + CGLM_INLINE void glm_mat3_swap_col(mat3 mat, int col1, int col2); + CGLM_INLINE void glm_mat3_swap_row(mat3 mat, int row1, int row2); + CGLM_INLINE float glm_mat3_rmc(vec3 r, mat3 m, vec3 c); + CGLM_INLINE void glm_mat3_make(float * restrict src, mat3 dest); + CGLM_INLINE void glm_mat3_textrans(float sx, float sy, float rot, float tx, float ty, mat3 dest); + */ + +#ifndef cglm_mat3_h +#define cglm_mat3_h + +#include "common.h" +#include "vec3.h" + +#ifdef CGLM_SSE_FP +# include "simd/sse2/mat3.h" +#endif + +#ifdef CGLM_SIMD_WASM +# include "simd/wasm/mat3.h" +#endif + +#define GLM_MAT3_IDENTITY_INIT {{1.0f, 0.0f, 0.0f}, \ + {0.0f, 1.0f, 0.0f}, \ + {0.0f, 0.0f, 1.0f}} +#define GLM_MAT3_ZERO_INIT {{0.0f, 0.0f, 0.0f}, \ + {0.0f, 0.0f, 0.0f}, \ + {0.0f, 0.0f, 0.0f}} + + +/* for C only */ +#define GLM_MAT3_IDENTITY ((mat3)GLM_MAT3_IDENTITY_INIT) +#define GLM_MAT3_ZERO ((mat3)GLM_MAT3_ZERO_INIT) + +/* DEPRECATED! use _copy, _ucopy versions */ +#define glm_mat3_dup(mat, dest) glm_mat3_copy(mat, dest) + +/*! 
+ * @brief copy all members of [mat] to [dest] + * + * @param[in] mat source + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_mat3_copy(mat3 mat, mat3 dest) { + dest[0][0] = mat[0][0]; + dest[0][1] = mat[0][1]; + dest[0][2] = mat[0][2]; + + dest[1][0] = mat[1][0]; + dest[1][1] = mat[1][1]; + dest[1][2] = mat[1][2]; + + dest[2][0] = mat[2][0]; + dest[2][1] = mat[2][1]; + dest[2][2] = mat[2][2]; +} + +/*! + * @brief make given matrix identity. It is identical with below, + * but it is more easy to do that with this func especially for members + * e.g. glm_mat3_identity(aStruct->aMatrix); + * + * @code + * glm_mat3_copy(GLM_MAT3_IDENTITY, mat); // C only + * + * // or + * mat3 mat = GLM_MAT3_IDENTITY_INIT; + * @endcode + * + * @param[in, out] mat destination + */ +CGLM_INLINE +void +glm_mat3_identity(mat3 mat) { + CGLM_ALIGN_MAT mat3 t = GLM_MAT3_IDENTITY_INIT; + glm_mat3_copy(t, mat); +} + +/*! + * @brief make given matrix array's each element identity matrix + * + * @param[in, out] mat matrix array (must be aligned (16/32) + * if alignment is not disabled) + * + * @param[in] count count of matrices + */ +CGLM_INLINE +void +glm_mat3_identity_array(mat3 * __restrict mat, size_t count) { + CGLM_ALIGN_MAT mat3 t = GLM_MAT3_IDENTITY_INIT; + size_t i; + + for (i = 0; i < count; i++) { + glm_mat3_copy(t, mat[i]); + } +} + +/*! + * @brief make given matrix zero. + * + * @param[in, out] mat matrix + */ +CGLM_INLINE +void +glm_mat3_zero(mat3 mat) { + CGLM_ALIGN_MAT mat3 t = GLM_MAT3_ZERO_INIT; + glm_mat3_copy(t, mat); +} + +/*! + * @brief multiply m1 and m2 to dest + * + * m1, m2 and dest matrices can be same matrix, it is possible to write this: + * + * @code + * mat3 m = GLM_MAT3_IDENTITY_INIT; + * glm_mat3_mul(m, m, m); + * @endcode + * + * @param[in] m1 left matrix + * @param[in] m2 right matrix + * @param[out] dest destination matrix + */ +CGLM_INLINE +void +glm_mat3_mul(mat3 m1, mat3 m2, mat3 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glm_mat3_mul_wasm(m1, m2, dest); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glm_mat3_mul_sse2(m1, m2, dest); +#else + float a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2], + a10 = m1[1][0], a11 = m1[1][1], a12 = m1[1][2], + a20 = m1[2][0], a21 = m1[2][1], a22 = m1[2][2], + + b00 = m2[0][0], b01 = m2[0][1], b02 = m2[0][2], + b10 = m2[1][0], b11 = m2[1][1], b12 = m2[1][2], + b20 = m2[2][0], b21 = m2[2][1], b22 = m2[2][2]; + + dest[0][0] = a00 * b00 + a10 * b01 + a20 * b02; + dest[0][1] = a01 * b00 + a11 * b01 + a21 * b02; + dest[0][2] = a02 * b00 + a12 * b01 + a22 * b02; + dest[1][0] = a00 * b10 + a10 * b11 + a20 * b12; + dest[1][1] = a01 * b10 + a11 * b11 + a21 * b12; + dest[1][2] = a02 * b10 + a12 * b11 + a22 * b12; + dest[2][0] = a00 * b20 + a10 * b21 + a20 * b22; + dest[2][1] = a01 * b20 + a11 * b21 + a21 * b22; + dest[2][2] = a02 * b20 + a12 * b21 + a22 * b22; +#endif +} + +/*! + * @brief transpose mat3 and store in dest + * + * source matrix will not be transposed unless dest is m + * + * @param[in] m matrix + * @param[out] dest result + */ +CGLM_INLINE +void +glm_mat3_transpose_to(mat3 m, mat3 dest) { + dest[0][0] = m[0][0]; + dest[0][1] = m[1][0]; + dest[0][2] = m[2][0]; + dest[1][0] = m[0][1]; + dest[1][1] = m[1][1]; + dest[1][2] = m[2][1]; + dest[2][0] = m[0][2]; + dest[2][1] = m[1][2]; + dest[2][2] = m[2][2]; +} + +/*! 
+ * @brief transpose mat3 and store result in same matrix + * + * @param[in, out] m source and dest + */ +CGLM_INLINE +void +glm_mat3_transpose(mat3 m) { + CGLM_ALIGN_MAT mat3 tmp; + + tmp[0][1] = m[1][0]; + tmp[0][2] = m[2][0]; + tmp[1][0] = m[0][1]; + tmp[1][2] = m[2][1]; + tmp[2][0] = m[0][2]; + tmp[2][1] = m[1][2]; + + m[0][1] = tmp[0][1]; + m[0][2] = tmp[0][2]; + m[1][0] = tmp[1][0]; + m[1][2] = tmp[1][2]; + m[2][0] = tmp[2][0]; + m[2][1] = tmp[2][1]; +} + +/*! + * @brief multiply mat3 with vec3 (column vector) and store in dest vector + * + * @param[in] m mat3 (left) + * @param[in] v vec3 (right, column vector) + * @param[out] dest vec3 (result, column vector) + */ +CGLM_INLINE +void +glm_mat3_mulv(mat3 m, vec3 v, vec3 dest) { + vec3 res; + res[0] = m[0][0] * v[0] + m[1][0] * v[1] + m[2][0] * v[2]; + res[1] = m[0][1] * v[0] + m[1][1] * v[1] + m[2][1] * v[2]; + res[2] = m[0][2] * v[0] + m[1][2] * v[1] + m[2][2] * v[2]; + glm_vec3_copy(res, dest); +} + +/*! + * @brief trace of matrix + * + * sum of the elements on the main diagonal from upper left to the lower right + * + * @param[in] m matrix + */ +CGLM_INLINE +float +glm_mat3_trace(mat3 m) { + return m[0][0] + m[1][1] + m[2][2]; +} + +/*! + * @brief convert mat3 to quaternion + * + * @param[in] m rotation matrix + * @param[out] dest destination quaternion + */ +CGLM_INLINE +void +glm_mat3_quat(mat3 m, versor dest) { + float trace, r, rinv; + + /* it seems using like m12 instead of m[1][2] causes extra instructions */ + + trace = m[0][0] + m[1][1] + m[2][2]; + if (trace >= 0.0f) { + r = sqrtf(1.0f + trace); + rinv = 0.5f / r; + + dest[0] = rinv * (m[1][2] - m[2][1]); + dest[1] = rinv * (m[2][0] - m[0][2]); + dest[2] = rinv * (m[0][1] - m[1][0]); + dest[3] = r * 0.5f; + } else if (m[0][0] >= m[1][1] && m[0][0] >= m[2][2]) { + r = sqrtf(1.0f - m[1][1] - m[2][2] + m[0][0]); + rinv = 0.5f / r; + + dest[0] = r * 0.5f; + dest[1] = rinv * (m[0][1] + m[1][0]); + dest[2] = rinv * (m[0][2] + m[2][0]); + dest[3] = rinv * (m[1][2] - m[2][1]); + } else if (m[1][1] >= m[2][2]) { + r = sqrtf(1.0f - m[0][0] - m[2][2] + m[1][1]); + rinv = 0.5f / r; + + dest[0] = rinv * (m[0][1] + m[1][0]); + dest[1] = r * 0.5f; + dest[2] = rinv * (m[1][2] + m[2][1]); + dest[3] = rinv * (m[2][0] - m[0][2]); + } else { + r = sqrtf(1.0f - m[0][0] - m[1][1] + m[2][2]); + rinv = 0.5f / r; + + dest[0] = rinv * (m[0][2] + m[2][0]); + dest[1] = rinv * (m[1][2] + m[2][1]); + dest[2] = r * 0.5f; + dest[3] = rinv * (m[0][1] - m[1][0]); + } +} + +/*! + * @brief scale (multiply with scalar) matrix + * + * multiply matrix with scalar + * + * @param[in, out] m matrix + * @param[in] s scalar + */ +CGLM_INLINE +void +glm_mat3_scale(mat3 m, float s) { + m[0][0] *= s; m[0][1] *= s; m[0][2] *= s; + m[1][0] *= s; m[1][1] *= s; m[1][2] *= s; + m[2][0] *= s; m[2][1] *= s; m[2][2] *= s; +} + +/*! + * @brief mat3 determinant + * + * @param[in] mat matrix + * + * @return determinant + */ +CGLM_INLINE +float +glm_mat3_det(mat3 mat) { + float a = mat[0][0], b = mat[0][1], c = mat[0][2], + d = mat[1][0], e = mat[1][1], f = mat[1][2], + g = mat[2][0], h = mat[2][1], i = mat[2][2]; + + return a * (e * i - h * f) - d * (b * i - h * c) + g * (b * f - e * c); +} + +/*! 
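glm_mat3_quat above expects a proper rotation matrix (orthonormal, determinant +1); feeding it a scaled or sheared matrix gives a meaningless versor. A minimal sketch with an identity stand-in; the function name is the editor's:

#include <cglm/cglm.h>   /* assumed include path */

void mat3_to_quat_demo(void) {
  mat3   rot = GLM_MAT3_IDENTITY_INIT;   /* stand-in for a real rotation */
  versor q;

  glm_mat3_quat(rot, q);   /* identity rotation -> q == {0, 0, 0, 1} */
}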
+ * @brief inverse mat3 and store in dest + * + * @param[in] mat matrix + * @param[out] dest inverse matrix + */ +CGLM_INLINE +void +glm_mat3_inv(mat3 mat, mat3 dest) { + float a = mat[0][0], b = mat[0][1], c = mat[0][2], + d = mat[1][0], e = mat[1][1], f = mat[1][2], + g = mat[2][0], h = mat[2][1], i = mat[2][2], + + c1 = e * i - f * h, c2 = d * i - g * f, c3 = d * h - g * e, + idt = 1.0f / (a * c1 - b * c2 + c * c3), ndt = -idt; + + dest[0][0] = idt * c1; + dest[0][1] = ndt * (b * i - h * c); + dest[0][2] = idt * (b * f - e * c); + dest[1][0] = ndt * c2; + dest[1][1] = idt * (a * i - g * c); + dest[1][2] = ndt * (a * f - d * c); + dest[2][0] = idt * c3; + dest[2][1] = ndt * (a * h - g * b); + dest[2][2] = idt * (a * e - d * b); +} + +/*! + * @brief swap two matrix columns + * + * @param[in,out] mat matrix + * @param[in] col1 col1 + * @param[in] col2 col2 + */ +CGLM_INLINE +void +glm_mat3_swap_col(mat3 mat, int col1, int col2) { + vec3 tmp; + glm_vec3_copy(mat[col1], tmp); + glm_vec3_copy(mat[col2], mat[col1]); + glm_vec3_copy(tmp, mat[col2]); +} + +/*! + * @brief swap two matrix rows + * + * @param[in,out] mat matrix + * @param[in] row1 row1 + * @param[in] row2 row2 + */ +CGLM_INLINE +void +glm_mat3_swap_row(mat3 mat, int row1, int row2) { + vec3 tmp; + tmp[0] = mat[0][row1]; + tmp[1] = mat[1][row1]; + tmp[2] = mat[2][row1]; + + mat[0][row1] = mat[0][row2]; + mat[1][row1] = mat[1][row2]; + mat[2][row1] = mat[2][row2]; + + mat[0][row2] = tmp[0]; + mat[1][row2] = tmp[1]; + mat[2][row2] = tmp[2]; +} + +/*! + * @brief helper for R (row vector) * M (matrix) * C (column vector) + * + * rmc stands for Row * Matrix * Column + * + * the result is scalar because R * M = Matrix1x3 (row vector), + * then Matrix1x3 * Vec3 (column vector) = Matrix1x1 (Scalar) + * + * @param[in] r row vector or matrix1x3 + * @param[in] m matrix3x3 + * @param[in] c column vector or matrix3x1 + * + * @return scalar value e.g. Matrix1x1 + */ +CGLM_INLINE +float +glm_mat3_rmc(vec3 r, mat3 m, vec3 c) { + vec3 tmp; + glm_mat3_mulv(m, c, tmp); + return glm_vec3_dot(r, tmp); +} + +/*! + * @brief Create mat3 matrix from pointer + * + * @param[in] src pointer to an array of floats + * @param[out] dest matrix + */ +CGLM_INLINE +void +glm_mat3_make(const float * __restrict src, mat3 dest) { + dest[0][0] = src[0]; + dest[0][1] = src[1]; + dest[0][2] = src[2]; + + dest[1][0] = src[3]; + dest[1][1] = src[4]; + dest[1][2] = src[5]; + + dest[2][0] = src[6]; + dest[2][1] = src[7]; + dest[2][2] = src[8]; +} + +/*! + * @brief Create mat3 matrix from texture transform parameters + * + * @param[in] sx scale x + * @param[in] sy scale y + * @param[in] rot rotation in radians CCW/RH + * @param[in] tx translate x + * @param[in] ty translate y + * @param[out] dest texture transform matrix + */ +CGLM_INLINE +void +glm_mat3_textrans(float sx, float sy, float rot, float tx, float ty, mat3 dest) { + float c, s; + + c = cosf(rot); + s = sinf(rot); + + glm_mat3_identity(dest); + + dest[0][0] = c * sx; + dest[0][1] = -s * sy; + dest[1][0] = s * sx; + dest[1][1] = c * sy; + dest[2][0] = tx; + dest[2][1] = ty; +} + +#endif /* cglm_mat3_h */ diff --git a/external/cglm/mat3x2.h b/external/cglm/mat3x2.h new file mode 100644 index 0000000..52173c0 --- /dev/null +++ b/external/cglm/mat3x2.h @@ -0,0 +1,148 @@ +/* + * Copyright (c), Recep Aslantas. 
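A hedged sketch of glm_mat3_textrans driving a UV transform: the scale/offset values are placeholders, and the UV is promoted to a vec3 with a trailing 1 so the translation column participates in glm_mat3_mulv:

#include <cglm/cglm.h>   /* assumed include path */

void uv_transform_demo(void) {
  mat3 t;
  vec3 uv  = {0.25f, 0.75f, 1.0f};   /* homogeneous UV */
  vec3 out;

  /* scale 2x2, no rotation, offset by (0.5, 0.0) -- illustrative values */
  glm_mat3_textrans(2.0f, 2.0f, 0.0f, 0.5f, 0.0f, t);
  glm_mat3_mulv(t, uv, out);
}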
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLM_MAT3X2_ZERO_INIT + GLM_MAT3X2_ZERO + + Functions: + CGLM_INLINE void glm_mat3x2_copy(mat3x2 src, mat3x2 dest); + CGLM_INLINE void glm_mat3x2_zero(mat3x2 m); + CGLM_INLINE void glm_mat3x2_make(const float * __restrict src, mat3x2 dest); + CGLM_INLINE void glm_mat3x2_mul(mat3x2 m1, mat2x3 m2, mat2 dest); + CGLM_INLINE void glm_mat3x2_mulv(mat3x2 m, vec3 v, vec2 dest); + CGLM_INLINE void glm_mat3x2_transpose(mat3x2 src, mat2x3 dest); + CGLM_INLINE void glm_mat3x2_scale(mat3x2 m, float s); + */ + +#ifndef cglm_mat3x2_h +#define cglm_mat3x2_h + +#include "common.h" + +#define GLM_MAT3X2_ZERO_INIT {{0.0f, 0.0f}, {0.0f, 0.0f}, {0.0f, 0.0f}} + +/* for C only */ +#define GLM_MAT3X2_ZERO GLM_MAT3X2_ZERO_INIT + +/*! + * @brief Copy mat3x2 (src) to mat3x2 (dest). + * + * @param[in] src mat3x2 (left) + * @param[out] dest destination (result, mat3x2) + */ +CGLM_INLINE +void +glm_mat3x2_copy(mat3x2 src, mat3x2 dest) { + glm_vec2_copy(src[0], dest[0]); + glm_vec2_copy(src[1], dest[1]); + glm_vec2_copy(src[2], dest[2]); +} + +/*! + * @brief Zero out the mat3x2 (m). + * + * @param[in, out] mat3x2 (src, dest) + */ +CGLM_INLINE +void +glm_mat3x2_zero(mat3x2 m) { + CGLM_ALIGN_MAT mat3x2 t = GLM_MAT3X2_ZERO_INIT; + glm_mat3x2_copy(t, m); +} + +/*! + * @brief Create mat3x2 (dest) from pointer (src). + * + * @param[in] src pointer to an array of floats (left) + * @param[out] dest destination (result, mat3x2) + */ +CGLM_INLINE +void +glm_mat3x2_make(const float * __restrict src, mat3x2 dest) { + dest[0][0] = src[0]; + dest[0][1] = src[1]; + + dest[1][0] = src[2]; + dest[1][1] = src[3]; + + dest[2][0] = src[4]; + dest[2][1] = src[5]; +} + +/*! + * @brief Multiply mat3x2 (m1) by mat2x3 (m2) and store in mat2 (dest). + * + * @code + * glm_mat3x2_mul(mat3x2, mat2x3, mat2); + * @endcode + * + * @param[in] m1 mat3x2 (left) + * @param[in] m2 mat2x3 (right) + * @param[out] dest destination (result, mat2) + */ +CGLM_INLINE +void +glm_mat3x2_mul(mat3x2 m1, mat2x3 m2, mat2 dest) { + float a00 = m1[0][0], a01 = m1[0][1], + a10 = m1[1][0], a11 = m1[1][1], + a20 = m1[2][0], a21 = m1[2][1], + + b00 = m2[0][0], b01 = m2[0][1], b02 = m2[0][2], + b10 = m2[1][0], b11 = m2[1][1], b12 = m2[1][2]; + + dest[0][0] = a00 * b00 + a10 * b01 + a20 * b02; + dest[0][1] = a01 * b00 + a11 * b01 + a21 * b02; + + dest[1][0] = a00 * b10 + a10 * b11 + a20 * b12; + dest[1][1] = a01 * b10 + a11 * b11 + a21 * b12; +} + +/*! + * @brief Multiply mat3x2 (m) by vec3 (v) and store in vec2 (dest). + * + * @param[in] m mat3x2 (left) + * @param[in] v vec3 (right, column vector) + * @param[out] dest destination (result, column vector) + */ +CGLM_INLINE +void +glm_mat3x2_mulv(mat3x2 m, vec3 v, vec2 dest) { + float v0 = v[0], v1 = v[1], v2 = v[2]; + + dest[0] = m[0][0] * v0 + m[1][0] * v1 + m[2][0] * v2; + dest[1] = m[0][1] * v0 + m[1][1] * v1 + m[2][1] * v2; +} + +/*! + * @brief Transpose mat3x2 (src) and store in mat2x3 (dest). + * + * @param[in] src mat3x2 (left) + * @param[out] dest destination (result, mat2x3) + */ +CGLM_INLINE +void +glm_mat3x2_transpose(mat3x2 src, mat2x3 dest) { + dest[0][0] = src[0][0]; dest[0][1] = src[1][0]; dest[0][2] = src[2][0]; + dest[1][0] = src[0][1]; dest[1][1] = src[1][1]; dest[1][2] = src[2][1]; +} + +/*! + * @brief Multiply mat3x2 (m) by scalar constant (s). 
+ * + * @param[in, out] m (src, dest) + * @param[in] s float (scalar) + */ +CGLM_INLINE +void +glm_mat3x2_scale(mat3x2 m, float s) { + m[0][0] *= s; m[0][1] *= s; m[1][0] *= s; + m[1][1] *= s; m[2][0] *= s; m[2][1] *= s; +} + +#endif diff --git a/external/cglm/mat3x4.h b/external/cglm/mat3x4.h new file mode 100644 index 0000000..52d8e7e --- /dev/null +++ b/external/cglm/mat3x4.h @@ -0,0 +1,177 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLM_MAT3X4_ZERO_INIT + GLM_MAT3X4_ZERO + + Functions: + CGLM_INLINE void glm_mat3x4_copy(mat3x4 src, mat3x4 dest); + CGLM_INLINE void glm_mat3x4_zero(mat3x4 m); + CGLM_INLINE void glm_mat3x4_make(const float * __restrict src, mat3x4 dest); + CGLM_INLINE void glm_mat3x4_mul(mat3x4 m1, mat4x3 m2, mat4 dest); + CGLM_INLINE void glm_mat3x4_mulv(mat3x4 m, vec3 v, vec4 dest); + CGLM_INLINE void glm_mat3x4_transpose(mat3x4 src, mat4x3 dest); + CGLM_INLINE void glm_mat3x4_scale(mat3x4 m, float s); + */ + +#ifndef cglm_mat3x4_h +#define cglm_mat3x4_h + +#include "common.h" + +#define GLM_MAT3X4_ZERO_INIT {{0.0f, 0.0f, 0.0f, 0.0f}, \ + {0.0f, 0.0f, 0.0f, 0.0f}, \ + {0.0f, 0.0f, 0.0f, 0.0f}} + +/* for C only */ +#define GLM_MAT3X4_ZERO GLM_MAT3X4_ZERO_INIT + +/*! + * @brief Copy mat3x4 (src) to mat3x4 (dest). + * + * @param[in] src mat3x4 (left) + * @param[out] dest destination (result, mat3x4) + */ +CGLM_INLINE +void +glm_mat3x4_copy(mat3x4 src, mat3x4 dest) { + glm_vec4_ucopy(src[0], dest[0]); + glm_vec4_ucopy(src[1], dest[1]); + glm_vec4_ucopy(src[2], dest[2]); +} + +/*! + * @brief Zero out the mat3x4 (m). + * + * @param[in, out] mat3x4 (src, dest) + */ +CGLM_INLINE +void +glm_mat3x4_zero(mat3x4 m) { + CGLM_ALIGN_MAT mat3x4 t = GLM_MAT3X4_ZERO_INIT; + glm_mat3x4_copy(t, m); +} + +/*! + * @brief Create mat3x4 (dest) from pointer (src). + * + * @param[in] src pointer to an array of floats (left) + * @param[out] dest destination (result, mat3x4) + */ +CGLM_INLINE +void +glm_mat3x4_make(const float * __restrict src, mat3x4 dest) { + dest[0][0] = src[0]; + dest[0][1] = src[1]; + dest[0][2] = src[2]; + dest[0][3] = src[3]; + + dest[1][0] = src[4]; + dest[1][1] = src[5]; + dest[1][2] = src[6]; + dest[1][3] = src[7]; + + dest[2][0] = src[8]; + dest[2][1] = src[9]; + dest[2][2] = src[10]; + dest[2][3] = src[11]; +} + +/*! + * @brief Multiply mat3x4 (m1) by mat4x3 (m2) and store in mat4 (dest). 
+ * + * @code + * glm_mat3x4_mul(mat3x4, mat4x3, mat4); + * @endcode + * + * @param[in] m1 mat3x4 (left) + * @param[in] m2 mat4x3 (right) + * @param[out] dest destination (result, mat4) + */ +CGLM_INLINE +void +glm_mat3x4_mul(mat3x4 m1, mat4x3 m2, mat4 dest) { + float a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2], a03 = m1[0][3], + a10 = m1[1][0], a11 = m1[1][1], a12 = m1[1][2], a13 = m1[1][3], + a20 = m1[2][0], a21 = m1[2][1], a22 = m1[2][2], a23 = m1[2][3], + + b00 = m2[0][0], b01 = m2[0][1], b02 = m2[0][2], + b10 = m2[1][0], b11 = m2[1][1], b12 = m2[1][2], + b20 = m2[2][0], b21 = m2[2][1], b22 = m2[2][2], + b30 = m2[3][0], b31 = m2[3][1], b32 = m2[3][2]; + + dest[0][0] = a00 * b00 + a10 * b01 + a20 * b02; + dest[0][1] = a01 * b00 + a11 * b01 + a21 * b02; + dest[0][2] = a02 * b00 + a12 * b01 + a22 * b02; + dest[0][3] = a03 * b00 + a13 * b01 + a23 * b02; + + dest[1][0] = a00 * b10 + a10 * b11 + a20 * b12; + dest[1][1] = a01 * b10 + a11 * b11 + a21 * b12; + dest[1][2] = a02 * b10 + a12 * b11 + a22 * b12; + dest[1][3] = a03 * b10 + a13 * b11 + a23 * b12; + + dest[2][0] = a00 * b20 + a10 * b21 + a20 * b22; + dest[2][1] = a01 * b20 + a11 * b21 + a21 * b22; + dest[2][2] = a02 * b20 + a12 * b21 + a22 * b22; + dest[2][3] = a03 * b20 + a13 * b21 + a23 * b22; + + dest[3][0] = a00 * b30 + a10 * b31 + a20 * b32; + dest[3][1] = a01 * b30 + a11 * b31 + a21 * b32; + dest[3][2] = a02 * b30 + a12 * b31 + a22 * b32; + dest[3][3] = a03 * b30 + a13 * b31 + a23 * b32; +} + +/*! + * @brief Multiply mat3x4 (m) by vec3 (v) and store in vec4 (dest). + * + * @param[in] m mat3x4 (left) + * @param[in] v vec3 (right, column vector) + * @param[out] dest destination (result, column vector) + */ +CGLM_INLINE +void +glm_mat3x4_mulv(mat3x4 m, vec3 v, vec4 dest) { + float v0 = v[0], v1 = v[1], v2 = v[2]; + + dest[0] = m[0][0] * v0 + m[1][0] * v1 + m[2][0] * v2; + dest[1] = m[0][1] * v0 + m[1][1] * v1 + m[2][1] * v2; + dest[2] = m[0][2] * v0 + m[1][2] * v1 + m[2][2] * v2; + dest[3] = m[0][3] * v0 + m[1][3] * v1 + m[2][3] * v2; +} + +/*! + * @brief Transpose mat3x4 (src) and store in mat4x3 (dest). + * + * @param[in] src mat3x4 (left) + * @param[out] dest destination (result, mat4x3) + */ +CGLM_INLINE +void +glm_mat3x4_transpose(mat3x4 src, mat4x3 dest) { + dest[0][0] = src[0][0]; dest[0][1] = src[1][0]; dest[0][2] = src[2][0]; + dest[1][0] = src[0][1]; dest[1][1] = src[1][1]; dest[1][2] = src[2][1]; + dest[2][0] = src[0][2]; dest[2][1] = src[1][2]; dest[2][2] = src[2][2]; + dest[3][0] = src[0][3]; dest[3][1] = src[1][3]; dest[3][2] = src[2][3]; +} + +/*! + * @brief Multiply mat3x4 (m) by scalar constant (s). + * + * @param[in, out] m (src, dest) + * @param[in] s float (scalar) + */ +CGLM_INLINE +void +glm_mat3x4_scale(mat3x4 m, float s) { + m[0][0] *= s; m[0][1] *= s; m[0][2] *= s; m[0][3] *= s; + m[1][0] *= s; m[1][1] *= s; m[1][2] *= s; m[1][3] *= s; + m[2][0] *= s; m[2][1] *= s; m[2][2] *= s; m[2][3] *= s; +} + +#endif diff --git a/external/cglm/mat4.h b/external/cglm/mat4.h new file mode 100644 index 0000000..c3fe7fd --- /dev/null +++ b/external/cglm/mat4.h @@ -0,0 +1,831 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/*! + * Most of functions in this header are optimized manually with SIMD + * if available. 
You dont need to call/incude SIMD headers manually + */ + +/* + Macros: + GLM_MAT4_IDENTITY_INIT + GLM_MAT4_ZERO_INIT + GLM_MAT4_IDENTITY + GLM_MAT4_ZERO + + Functions: + CGLM_INLINE void glm_mat4_ucopy(mat4 mat, mat4 dest); + CGLM_INLINE void glm_mat4_copy(mat4 mat, mat4 dest); + CGLM_INLINE void glm_mat4_identity(mat4 mat); + CGLM_INLINE void glm_mat4_identity_array(mat4 * restrict mat, size_t count); + CGLM_INLINE void glm_mat4_zero(mat4 mat); + CGLM_INLINE void glm_mat4_pick3(mat4 mat, mat3 dest); + CGLM_INLINE void glm_mat4_pick3t(mat4 mat, mat3 dest); + CGLM_INLINE void glm_mat4_ins3(mat3 mat, mat4 dest); + CGLM_INLINE void glm_mat4_mul(mat4 m1, mat4 m2, mat4 dest); + CGLM_INLINE void glm_mat4_mulN(mat4 *matrices[], int len, mat4 dest); + CGLM_INLINE void glm_mat4_mulv(mat4 m, vec4 v, vec4 dest); + CGLM_INLINE void glm_mat4_mulv3(mat4 m, vec3 v, float last, vec3 dest); + CGLM_INLINE float glm_mat4_trace(mat4 m); + CGLM_INLINE float glm_mat4_trace3(mat4 m); + CGLM_INLINE void glm_mat4_quat(mat4 m, versor dest) ; + CGLM_INLINE void glm_mat4_transpose_to(mat4 m, mat4 dest); + CGLM_INLINE void glm_mat4_transpose(mat4 m); + CGLM_INLINE void glm_mat4_scale_p(mat4 m, float s); + CGLM_INLINE void glm_mat4_scale(mat4 m, float s); + CGLM_INLINE float glm_mat4_det(mat4 mat); + CGLM_INLINE void glm_mat4_inv(mat4 mat, mat4 dest); + CGLM_INLINE void glm_mat4_inv_fast(mat4 mat, mat4 dest); + CGLM_INLINE void glm_mat4_swap_col(mat4 mat, int col1, int col2); + CGLM_INLINE void glm_mat4_swap_row(mat4 mat, int row1, int row2); + CGLM_INLINE float glm_mat4_rmc(vec4 r, mat4 m, vec4 c); + CGLM_INLINE void glm_mat4_make(float * restrict src, mat4 dest); + CGLM_INLINE void glm_mat4_textrans(float sx, float sy, float rot, float tx, float ty, mat4 dest); + */ + +#ifndef cglm_mat_h +#define cglm_mat_h + +#include "common.h" +#include "vec4.h" +#include "vec3.h" + +#ifdef CGLM_SSE_FP +# include "simd/sse2/mat4.h" +#endif + +#ifdef CGLM_AVX_FP +# include "simd/avx/mat4.h" +#endif + +#ifdef CGLM_NEON_FP +# include "simd/neon/mat4.h" +#endif + +#ifdef CGLM_SIMD_WASM +# include "simd/wasm/mat4.h" +#endif + +#ifndef NDEBUG +# include +#endif + +#define GLM_MAT4_IDENTITY_INIT {{1.0f, 0.0f, 0.0f, 0.0f}, \ + {0.0f, 1.0f, 0.0f, 0.0f}, \ + {0.0f, 0.0f, 1.0f, 0.0f}, \ + {0.0f, 0.0f, 0.0f, 1.0f}} + +#define GLM_MAT4_ZERO_INIT {{0.0f, 0.0f, 0.0f, 0.0f}, \ + {0.0f, 0.0f, 0.0f, 0.0f}, \ + {0.0f, 0.0f, 0.0f, 0.0f}, \ + {0.0f, 0.0f, 0.0f, 0.0f}} + +/* for C only */ +#define GLM_MAT4_IDENTITY ((mat4)GLM_MAT4_IDENTITY_INIT) +#define GLM_MAT4_ZERO ((mat4)GLM_MAT4_ZERO_INIT) + +/* DEPRECATED! use _copy, _ucopy versions */ +#define glm_mat4_udup(mat, dest) glm_mat4_ucopy(mat, dest) +#define glm_mat4_dup(mat, dest) glm_mat4_copy(mat, dest) + +/* DEPRECATED! default is precise now. */ +#define glm_mat4_inv_precise(mat, dest) glm_mat4_inv(mat, dest) + +/*! + * @brief copy all members of [mat] to [dest] + * + * matrix may not be aligned, u stands for unaligned, this may be useful when + * copying a matrix from external source e.g. asset importer... 
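When the 16 floats come straight from a file or an importer, as the comment above describes, the storage is often not 16/32-byte aligned; glm_mat4_make (or the u-prefixed copy) avoids the aligned SIMD loads. A sketch with an assumed flat, column-major array; the function and parameter names are the editor's:

#include <cglm/cglm.h>   /* assumed include path */

/* node_xform: 16 floats as stored by some importer (column-major assumed) */
void load_node_matrix(const float node_xform[16], mat4 out) {
  glm_mat4_make(node_xform, out);   /* element-wise, no alignment requirement */
}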
+ * + * @param[in] mat source + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_mat4_ucopy(mat4 mat, mat4 dest) { + dest[0][0] = mat[0][0]; dest[1][0] = mat[1][0]; + dest[0][1] = mat[0][1]; dest[1][1] = mat[1][1]; + dest[0][2] = mat[0][2]; dest[1][2] = mat[1][2]; + dest[0][3] = mat[0][3]; dest[1][3] = mat[1][3]; + + dest[2][0] = mat[2][0]; dest[3][0] = mat[3][0]; + dest[2][1] = mat[2][1]; dest[3][1] = mat[3][1]; + dest[2][2] = mat[2][2]; dest[3][2] = mat[3][2]; + dest[2][3] = mat[2][3]; dest[3][3] = mat[3][3]; +} + +/*! + * @brief copy all members of [mat] to [dest] + * + * @param[in] mat source + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_mat4_copy(mat4 mat, mat4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_store(dest[0], glmm_load(mat[0])); + glmm_store(dest[1], glmm_load(mat[1])); + glmm_store(dest[2], glmm_load(mat[2])); + glmm_store(dest[3], glmm_load(mat[3])); +#elif defined(__AVX__) + glmm_store256(dest[0], glmm_load256(mat[0])); + glmm_store256(dest[2], glmm_load256(mat[2])); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glmm_store(dest[0], glmm_load(mat[0])); + glmm_store(dest[1], glmm_load(mat[1])); + glmm_store(dest[2], glmm_load(mat[2])); + glmm_store(dest[3], glmm_load(mat[3])); +#elif defined(CGLM_NEON_FP) + vst1q_f32(dest[0], vld1q_f32(mat[0])); + vst1q_f32(dest[1], vld1q_f32(mat[1])); + vst1q_f32(dest[2], vld1q_f32(mat[2])); + vst1q_f32(dest[3], vld1q_f32(mat[3])); +#else + glm_mat4_ucopy(mat, dest); +#endif +} + +/*! + * @brief make given matrix identity. It is identical with below, + * but it is more easy to do that with this func especially for members + * e.g. glm_mat4_identity(aStruct->aMatrix); + * + * @code + * glm_mat4_copy(GLM_MAT4_IDENTITY, mat); // C only + * + * // or + * mat4 mat = GLM_MAT4_IDENTITY_INIT; + * @endcode + * + * @param[in, out] mat destination + */ +CGLM_INLINE +void +glm_mat4_identity(mat4 mat) { + CGLM_ALIGN_MAT mat4 t = GLM_MAT4_IDENTITY_INIT; + glm_mat4_copy(t, mat); +} + +/*! + * @brief make given matrix array's each element identity matrix + * + * @param[in, out] mat matrix array (must be aligned (16/32) + * if alignment is not disabled) + * + * @param[in] count count of matrices + */ +CGLM_INLINE +void +glm_mat4_identity_array(mat4 * __restrict mat, size_t count) { + CGLM_ALIGN_MAT mat4 t = GLM_MAT4_IDENTITY_INIT; + size_t i; + + for (i = 0; i < count; i++) { + glm_mat4_copy(t, mat[i]); + } +} + +/*! + * @brief make given matrix zero. + * + * @param[in, out] mat matrix + */ +CGLM_INLINE +void +glm_mat4_zero(mat4 mat) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_128 x0; + x0 = wasm_f32x4_const_splat(0.f); + glmm_store(mat[0], x0); + glmm_store(mat[1], x0); + glmm_store(mat[2], x0); + glmm_store(mat[3], x0); +#elif defined(__AVX__) + __m256 y0; + y0 = _mm256_setzero_ps(); + glmm_store256(mat[0], y0); + glmm_store256(mat[2], y0); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glmm_128 x0; + x0 = _mm_setzero_ps(); + glmm_store(mat[0], x0); + glmm_store(mat[1], x0); + glmm_store(mat[2], x0); + glmm_store(mat[3], x0); +#elif defined(CGLM_NEON_FP) + glmm_128 x0; + x0 = vdupq_n_f32(0.0f); + vst1q_f32(mat[0], x0); + vst1q_f32(mat[1], x0); + vst1q_f32(mat[2], x0); + vst1q_f32(mat[3], x0); +#else + CGLM_ALIGN_MAT mat4 t = GLM_MAT4_ZERO_INIT; + glm_mat4_copy(t, mat); +#endif +} + +/*! 
+ * @brief copy upper-left of mat4 to mat3 + * + * @param[in] mat source + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_mat4_pick3(mat4 mat, mat3 dest) { + dest[0][0] = mat[0][0]; + dest[0][1] = mat[0][1]; + dest[0][2] = mat[0][2]; + + dest[1][0] = mat[1][0]; + dest[1][1] = mat[1][1]; + dest[1][2] = mat[1][2]; + + dest[2][0] = mat[2][0]; + dest[2][1] = mat[2][1]; + dest[2][2] = mat[2][2]; +} + +/*! + * @brief copy upper-left of mat4 to mat3 (transposed) + * + * the postfix t stands for transpose + * + * @param[in] mat source + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_mat4_pick3t(mat4 mat, mat3 dest) { + dest[0][0] = mat[0][0]; + dest[0][1] = mat[1][0]; + dest[0][2] = mat[2][0]; + + dest[1][0] = mat[0][1]; + dest[1][1] = mat[1][1]; + dest[1][2] = mat[2][1]; + + dest[2][0] = mat[0][2]; + dest[2][1] = mat[1][2]; + dest[2][2] = mat[2][2]; +} + +/*! + * @brief copy mat3 to mat4's upper-left + * + * @param[in] mat source + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_mat4_ins3(mat3 mat, mat4 dest) { + dest[0][0] = mat[0][0]; + dest[0][1] = mat[0][1]; + dest[0][2] = mat[0][2]; + + dest[1][0] = mat[1][0]; + dest[1][1] = mat[1][1]; + dest[1][2] = mat[1][2]; + + dest[2][0] = mat[2][0]; + dest[2][1] = mat[2][1]; + dest[2][2] = mat[2][2]; +} + +/*! + * @brief multiply m1 and m2 to dest + * + * m1, m2 and dest matrices can be same matrix, it is possible to write this: + * + * @code + * mat4 m = GLM_MAT4_IDENTITY_INIT; + * glm_mat4_mul(m, m, m); + * @endcode + * + * @param[in] m1 left matrix + * @param[in] m2 right matrix + * @param[out] dest destination matrix + */ +CGLM_INLINE +void +glm_mat4_mul(mat4 m1, mat4 m2, mat4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glm_mat4_mul_wasm(m1, m2, dest); +#elif defined(__AVX__) + glm_mat4_mul_avx(m1, m2, dest); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glm_mat4_mul_sse2(m1, m2, dest); +#elif defined(CGLM_NEON_FP) + glm_mat4_mul_neon(m1, m2, dest); +#else + float a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2], a03 = m1[0][3], + a10 = m1[1][0], a11 = m1[1][1], a12 = m1[1][2], a13 = m1[1][3], + a20 = m1[2][0], a21 = m1[2][1], a22 = m1[2][2], a23 = m1[2][3], + a30 = m1[3][0], a31 = m1[3][1], a32 = m1[3][2], a33 = m1[3][3], + + b00 = m2[0][0], b01 = m2[0][1], b02 = m2[0][2], b03 = m2[0][3], + b10 = m2[1][0], b11 = m2[1][1], b12 = m2[1][2], b13 = m2[1][3], + b20 = m2[2][0], b21 = m2[2][1], b22 = m2[2][2], b23 = m2[2][3], + b30 = m2[3][0], b31 = m2[3][1], b32 = m2[3][2], b33 = m2[3][3]; + + dest[0][0] = a00 * b00 + a10 * b01 + a20 * b02 + a30 * b03; + dest[0][1] = a01 * b00 + a11 * b01 + a21 * b02 + a31 * b03; + dest[0][2] = a02 * b00 + a12 * b01 + a22 * b02 + a32 * b03; + dest[0][3] = a03 * b00 + a13 * b01 + a23 * b02 + a33 * b03; + dest[1][0] = a00 * b10 + a10 * b11 + a20 * b12 + a30 * b13; + dest[1][1] = a01 * b10 + a11 * b11 + a21 * b12 + a31 * b13; + dest[1][2] = a02 * b10 + a12 * b11 + a22 * b12 + a32 * b13; + dest[1][3] = a03 * b10 + a13 * b11 + a23 * b12 + a33 * b13; + dest[2][0] = a00 * b20 + a10 * b21 + a20 * b22 + a30 * b23; + dest[2][1] = a01 * b20 + a11 * b21 + a21 * b22 + a31 * b23; + dest[2][2] = a02 * b20 + a12 * b21 + a22 * b22 + a32 * b23; + dest[2][3] = a03 * b20 + a13 * b21 + a23 * b22 + a33 * b23; + dest[3][0] = a00 * b30 + a10 * b31 + a20 * b32 + a30 * b33; + dest[3][1] = a01 * b30 + a11 * b31 + a21 * b32 + a31 * b33; + dest[3][2] = a02 * b30 + a12 * b31 + a22 * b32 + a32 * b33; + dest[3][3] = a03 * b30 + a13 * b31 + a23 * b32 + a33 * b33; +#endif +} + +/*! 
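glm_mat4_pick3/pick3t above are convenient for deriving a 3x3 normal matrix from a model matrix; with non-uniform scale the usual recipe is the transposed upper-left of the inverse, sketched here. The wrapper name is the editor's, and glm_mat4_inv is defined further down in this header:

#include <cglm/cglm.h>   /* assumed include path */

void make_normal_matrix(mat4 model, mat3 normal_out) {
  mat4 inv;

  glm_mat4_inv(model, inv);         /* full inverse of the model matrix */
  glm_mat4_pick3t(inv, normal_out); /* upper-left of the inverse, transposed */
}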
+ * @brief mupliply N mat4 matrices and store result in dest + * + * this function lets you multiply multiple (more than two or more...) matrices + *


multiplication will be done in loop, this may reduce instructions + * size but if len is too small then compiler may unroll whole loop, + * usage: + * @code + * mat4 m1, m2, m3, m4, res; + * + * glm_mat4_mulN((mat4 *[]){&m1, &m2, &m3, &m4}, 4, res); + * @endcode + * + * @warning matrices parameter is pointer array not mat4 array! + * + * @param[in] matrices mat4 * array + * @param[in] len matrices count + * @param[out] dest result + */ +CGLM_INLINE +void +glm_mat4_mulN(mat4 * __restrict matrices[], uint32_t len, mat4 dest) { + uint32_t i; + +#ifndef NDEBUG + assert(len > 1 && "there must be least 2 matrices to go!"); +#endif + + glm_mat4_mul(*matrices[0], *matrices[1], dest); + + for (i = 2; i < len; i++) + glm_mat4_mul(dest, *matrices[i], dest); +} + +/*! + * @brief multiply mat4 with vec4 (column vector) and store in dest vector + * + * @param[in] m mat4 (left) + * @param[in] v vec4 (right, column vector) + * @param[out] dest vec4 (result, column vector) + */ +CGLM_INLINE +void +glm_mat4_mulv(mat4 m, vec4 v, vec4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glm_mat4_mulv_wasm(m, v, dest); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glm_mat4_mulv_sse2(m, v, dest); +#elif defined(CGLM_NEON_FP) + glm_mat4_mulv_neon(m, v, dest); +#else + vec4 res; + res[0] = m[0][0] * v[0] + m[1][0] * v[1] + m[2][0] * v[2] + m[3][0] * v[3]; + res[1] = m[0][1] * v[0] + m[1][1] * v[1] + m[2][1] * v[2] + m[3][1] * v[3]; + res[2] = m[0][2] * v[0] + m[1][2] * v[1] + m[2][2] * v[2] + m[3][2] * v[3]; + res[3] = m[0][3] * v[0] + m[1][3] * v[1] + m[2][3] * v[2] + m[3][3] * v[3]; + glm_vec4_copy(res, dest); +#endif +} + +/*! + * @brief trace of matrix + * + * sum of the elements on the main diagonal from upper left to the lower right + * + * @param[in] m matrix + */ +CGLM_INLINE +float +glm_mat4_trace(mat4 m) { + return m[0][0] + m[1][1] + m[2][2] + m[3][3]; +} + +/*! + * @brief trace of matrix (rotation part) + * + * sum of the elements on the main diagonal from upper left to the lower right + * + * @param[in] m matrix + */ +CGLM_INLINE +float +glm_mat4_trace3(mat4 m) { + return m[0][0] + m[1][1] + m[2][2]; +} + +/*! + * @brief convert mat4's rotation part to quaternion + * + * @param[in] m affine matrix + * @param[out] dest destination quaternion + */ +CGLM_INLINE +void +glm_mat4_quat(mat4 m, versor dest) { + float trace, r, rinv; + + /* it seems using like m12 instead of m[1][2] causes extra instructions */ + + trace = m[0][0] + m[1][1] + m[2][2]; + if (trace >= 0.0f) { + r = sqrtf(1.0f + trace); + rinv = 0.5f / r; + + dest[0] = rinv * (m[1][2] - m[2][1]); + dest[1] = rinv * (m[2][0] - m[0][2]); + dest[2] = rinv * (m[0][1] - m[1][0]); + dest[3] = r * 0.5f; + } else if (m[0][0] >= m[1][1] && m[0][0] >= m[2][2]) { + r = sqrtf(1.0f - m[1][1] - m[2][2] + m[0][0]); + rinv = 0.5f / r; + + dest[0] = r * 0.5f; + dest[1] = rinv * (m[0][1] + m[1][0]); + dest[2] = rinv * (m[0][2] + m[2][0]); + dest[3] = rinv * (m[1][2] - m[2][1]); + } else if (m[1][1] >= m[2][2]) { + r = sqrtf(1.0f - m[0][0] - m[2][2] + m[1][1]); + rinv = 0.5f / r; + + dest[0] = rinv * (m[0][1] + m[1][0]); + dest[1] = r * 0.5f; + dest[2] = rinv * (m[1][2] + m[2][1]); + dest[3] = rinv * (m[2][0] - m[0][2]); + } else { + r = sqrtf(1.0f - m[0][0] - m[1][1] + m[2][2]); + rinv = 0.5f / r; + + dest[0] = rinv * (m[0][2] + m[2][0]); + dest[1] = rinv * (m[1][2] + m[2][1]); + dest[2] = r * 0.5f; + dest[3] = rinv * (m[0][1] - m[1][0]); + } +} + +/*! 
+ * @brief multiply vector with mat4 + * + * actually the result is vec4, after multiplication the last component + * is trimmed. if you need it don't use this func. + * + * @param[in] m mat4(affine transform) + * @param[in] v vec3 + * @param[in] last 4th item to make it vec4 + * @param[out] dest result vector (vec3) + */ +CGLM_INLINE +void +glm_mat4_mulv3(mat4 m, vec3 v, float last, vec3 dest) { + vec4 res; + glm_vec4(v, last, res); + glm_mat4_mulv(m, res, res); + glm_vec3(res, dest); +} + +/*! + * @brief transpose mat4 and store in dest + * + * source matrix will not be transposed unless dest is m + * + * @param[in] m matrix + * @param[out] dest result + */ +CGLM_INLINE +void +glm_mat4_transpose_to(mat4 m, mat4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glm_mat4_transp_wasm(m, dest); +#elif defined(__AVX__) + glm_mat4_transp_avx(m, dest); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glm_mat4_transp_sse2(m, dest); +#elif defined(CGLM_NEON_FP) + glm_mat4_transp_neon(m, dest); +#else + dest[0][0] = m[0][0]; dest[1][0] = m[0][1]; + dest[0][1] = m[1][0]; dest[1][1] = m[1][1]; + dest[0][2] = m[2][0]; dest[1][2] = m[2][1]; + dest[0][3] = m[3][0]; dest[1][3] = m[3][1]; + dest[2][0] = m[0][2]; dest[3][0] = m[0][3]; + dest[2][1] = m[1][2]; dest[3][1] = m[1][3]; + dest[2][2] = m[2][2]; dest[3][2] = m[2][3]; + dest[2][3] = m[3][2]; dest[3][3] = m[3][3]; +#endif +} + +/*! + * @brief transpose mat4 and store result in same matrix + * + * @param[in, out] m source and dest + */ +CGLM_INLINE +void +glm_mat4_transpose(mat4 m) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glm_mat4_transp_wasm(m, m); +#elif defined(__AVX__) + glm_mat4_transp_avx(m, m); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glm_mat4_transp_sse2(m, m); +#elif defined(CGLM_NEON_FP) + glm_mat4_transp_neon(m, m); +#else + mat4 d; + glm_mat4_transpose_to(m, d); + glm_mat4_ucopy(d, m); +#endif +} + +/*! + * @brief scale (multiply with scalar) matrix without simd optimization + * + * multiply matrix with scalar + * + * @param[in, out] m matrix + * @param[in] s scalar + */ +CGLM_INLINE +void +glm_mat4_scale_p(mat4 m, float s) { + m[0][0] *= s; m[0][1] *= s; m[0][2] *= s; m[0][3] *= s; + m[1][0] *= s; m[1][1] *= s; m[1][2] *= s; m[1][3] *= s; + m[2][0] *= s; m[2][1] *= s; m[2][2] *= s; m[2][3] *= s; + m[3][0] *= s; m[3][1] *= s; m[3][2] *= s; m[3][3] *= s; +} + +/*! + * @brief scale (multiply with scalar) matrix + * + * multiply matrix with scalar + * + * @param[in, out] m matrix + * @param[in] s scalar + */ +CGLM_INLINE +void +glm_mat4_scale(mat4 m, float s) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glm_mat4_scale_wasm(m, s); +#elif defined(__AVX__) + glm_mat4_scale_avx(m, s); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glm_mat4_scale_sse2(m, s); +#elif defined(CGLM_NEON_FP) + glm_mat4_scale_neon(m, s); +#else + glm_mat4_scale_p(m, s); +#endif +} + +/*! 
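The `last` argument of glm_mat4_mulv3 above is the w component appended before the multiply, so with an affine transform 1.0f treats the vector as a point (translation applies) and 0.0f as a direction (translation is ignored). A hedged sketch; the function name is the editor's:

#include <cglm/cglm.h>   /* assumed include path */

void transform_point_and_dir(mat4 model, vec3 p, vec3 d) {
  vec3 p_ws, d_ws;

  glm_mat4_mulv3(model, p, 1.0f, p_ws);   /* point: picks up translation */
  glm_mat4_mulv3(model, d, 0.0f, d_ws);   /* direction: rotation/scale only */
}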
+ * @brief mat4 determinant + * + * @param[in] mat matrix + * + * @return determinant + */ +CGLM_INLINE +float +glm_mat4_det(mat4 mat) { +#if defined(__wasm__) && defined(__wasm_simd128__) + return glm_mat4_det_wasm(mat); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + return glm_mat4_det_sse2(mat); +#elif defined(CGLM_NEON_FP) + return glm_mat4_det_neon(mat); +#else + /* [square] det(A) = det(At) */ + float t[6]; + float a = mat[0][0], b = mat[0][1], c = mat[0][2], d = mat[0][3], + e = mat[1][0], f = mat[1][1], g = mat[1][2], h = mat[1][3], + i = mat[2][0], j = mat[2][1], k = mat[2][2], l = mat[2][3], + m = mat[3][0], n = mat[3][1], o = mat[3][2], p = mat[3][3]; + + t[0] = k * p - o * l; + t[1] = j * p - n * l; + t[2] = j * o - n * k; + t[3] = i * p - m * l; + t[4] = i * o - m * k; + t[5] = i * n - m * j; + + return a * (f * t[0] - g * t[1] + h * t[2]) + - b * (e * t[0] - g * t[3] + h * t[4]) + + c * (e * t[1] - f * t[3] + h * t[5]) + - d * (e * t[2] - f * t[4] + g * t[5]); +#endif +} + +/*! + * @brief inverse mat4 and store in dest + * + * @param[in] mat matrix + * @param[out] dest inverse matrix + */ +CGLM_INLINE +void +glm_mat4_inv(mat4 mat, mat4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glm_mat4_inv_wasm(mat, dest); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glm_mat4_inv_sse2(mat, dest); +#elif defined(CGLM_NEON_FP) + glm_mat4_inv_neon(mat, dest); +#else + float a = mat[0][0], b = mat[0][1], c = mat[0][2], d = mat[0][3], + e = mat[1][0], f = mat[1][1], g = mat[1][2], h = mat[1][3], + i = mat[2][0], j = mat[2][1], k = mat[2][2], l = mat[2][3], + m = mat[3][0], n = mat[3][1], o = mat[3][2], p = mat[3][3], + + c1 = k * p - l * o, c2 = c * h - d * g, c3 = i * p - l * m, + c4 = a * h - d * e, c5 = j * p - l * n, c6 = b * h - d * f, + c7 = i * n - j * m, c8 = a * f - b * e, c9 = j * o - k * n, + c10 = b * g - c * f, c11 = i * o - k * m, c12 = a * g - c * e, + + idt = 1.0f/(c8*c1+c4*c9+c10*c3+c2*c7-c12*c5-c6*c11), ndt = -idt; + + dest[0][0] = (f * c1 - g * c5 + h * c9) * idt; + dest[0][1] = (b * c1 - c * c5 + d * c9) * ndt; + dest[0][2] = (n * c2 - o * c6 + p * c10) * idt; + dest[0][3] = (j * c2 - k * c6 + l * c10) * ndt; + + dest[1][0] = (e * c1 - g * c3 + h * c11) * ndt; + dest[1][1] = (a * c1 - c * c3 + d * c11) * idt; + dest[1][2] = (m * c2 - o * c4 + p * c12) * ndt; + dest[1][3] = (i * c2 - k * c4 + l * c12) * idt; + + dest[2][0] = (e * c5 - f * c3 + h * c7) * idt; + dest[2][1] = (a * c5 - b * c3 + d * c7) * ndt; + dest[2][2] = (m * c6 - n * c4 + p * c8) * idt; + dest[2][3] = (i * c6 - j * c4 + l * c8) * ndt; + + dest[3][0] = (e * c9 - f * c11 + g * c7) * ndt; + dest[3][1] = (a * c9 - b * c11 + c * c7) * idt; + dest[3][2] = (m * c10 - n * c12 + o * c8) * ndt; + dest[3][3] = (i * c10 - j * c12 + k * c8) * idt; +#endif +} + +/*! + * @brief inverse mat4 and store in dest + * + * this func uses reciprocal approximation without extra corrections + * e.g Newton-Raphson. this should work faster than normal, + * to get more precise use glm_mat4_inv version. + * + * NOTE: You will lose precision, glm_mat4_inv is more accurate + * + * @param[in] mat matrix + * @param[out] dest inverse matrix + */ +CGLM_INLINE +void +glm_mat4_inv_fast(mat4 mat, mat4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glm_mat4_inv_fast_wasm(mat, dest); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glm_mat4_inv_fast_sse2(mat, dest); +#else + glm_mat4_inv(mat, dest); +#endif +} + +/*! 
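As with the 2x2 and 3x3 versions, glm_mat4_inv divides by the determinant without a guard, and glm_mat4_inv_fast trades precision for speed on SSE. A hedged helper that checks invertibility first; the wrapper name and tolerance are the editor's assumptions:

#include <math.h>
#include <stdbool.h>
#include <cglm/cglm.h>   /* assumed include path */

bool mat4_try_inv(mat4 m, mat4 dest, bool fast) {
  if (fabsf(glm_mat4_det(m)) < 1e-6f)   /* tolerance is illustrative */
    return false;

  if (fast)
    glm_mat4_inv_fast(m, dest);   /* reciprocal approximation on SSE */
  else
    glm_mat4_inv(m, dest);
  return true;
}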
+ * @brief swap two matrix columns + * + * @param[in,out] mat matrix + * @param[in] col1 col1 + * @param[in] col2 col2 + */ +CGLM_INLINE +void +glm_mat4_swap_col(mat4 mat, int col1, int col2) { + CGLM_ALIGN(16) vec4 tmp; + glm_vec4_copy(mat[col1], tmp); + glm_vec4_copy(mat[col2], mat[col1]); + glm_vec4_copy(tmp, mat[col2]); +} + +/*! + * @brief swap two matrix rows + * + * @param[in,out] mat matrix + * @param[in] row1 row1 + * @param[in] row2 row2 + */ +CGLM_INLINE +void +glm_mat4_swap_row(mat4 mat, int row1, int row2) { + CGLM_ALIGN(16) vec4 tmp; + tmp[0] = mat[0][row1]; + tmp[1] = mat[1][row1]; + tmp[2] = mat[2][row1]; + tmp[3] = mat[3][row1]; + + mat[0][row1] = mat[0][row2]; + mat[1][row1] = mat[1][row2]; + mat[2][row1] = mat[2][row2]; + mat[3][row1] = mat[3][row2]; + + mat[0][row2] = tmp[0]; + mat[1][row2] = tmp[1]; + mat[2][row2] = tmp[2]; + mat[3][row2] = tmp[3]; +} + +/*! + * @brief helper for R (row vector) * M (matrix) * C (column vector) + * + * rmc stands for Row * Matrix * Column + * + * the result is scalar because R * M = Matrix1x4 (row vector), + * then Matrix1x4 * Vec4 (column vector) = Matrix1x1 (Scalar) + * + * @param[in] r row vector or matrix1x4 + * @param[in] m matrix4x4 + * @param[in] c column vector or matrix4x1 + * + * @return scalar value e.g. B(s) + */ +CGLM_INLINE +float +glm_mat4_rmc(vec4 r, mat4 m, vec4 c) { + vec4 tmp; + glm_mat4_mulv(m, c, tmp); + return glm_vec4_dot(r, tmp); +} + +/*! + * @brief Create mat4 matrix from pointer + * + * @param[in] src pointer to an array of floats + * @param[out] dest matrix + */ +CGLM_INLINE +void +glm_mat4_make(const float * __restrict src, mat4 dest) { + dest[0][0] = src[0]; dest[1][0] = src[4]; + dest[0][1] = src[1]; dest[1][1] = src[5]; + dest[0][2] = src[2]; dest[1][2] = src[6]; + dest[0][3] = src[3]; dest[1][3] = src[7]; + + dest[2][0] = src[8]; dest[3][0] = src[12]; + dest[2][1] = src[9]; dest[3][1] = src[13]; + dest[2][2] = src[10]; dest[3][2] = src[14]; + dest[2][3] = src[11]; dest[3][3] = src[15]; +} + +/*! + * @brief Create mat4 matrix from texture transform parameters + * + * @param[in] sx scale x + * @param[in] sy scale y + * @param[in] rot rotation in radians CCW/RH + * @param[in] tx translate x + * @param[in] ty translate y + * @param[out] dest texture transform matrix + */ +CGLM_INLINE +void +glm_mat4_textrans(float sx, float sy, float rot, float tx, float ty, mat4 dest) { + float c, s; + + c = cosf(rot); + s = sinf(rot); + + glm_mat4_identity(dest); + + dest[0][0] = c * sx; + dest[0][1] = -s * sy; + dest[1][0] = s * sx; + dest[1][1] = c * sy; + dest[3][0] = tx; + dest[3][1] = ty; +} + +#endif /* cglm_mat_h */ diff --git a/external/cglm/mat4x2.h b/external/cglm/mat4x2.h new file mode 100644 index 0000000..91684e4 --- /dev/null +++ b/external/cglm/mat4x2.h @@ -0,0 +1,153 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLM_MAT4X2_ZERO_INIT + GLM_MAT4X2_ZERO + + Functions: + CGLM_INLINE void glm_mat4x2_copy(mat4x2 src, mat4x2 dest); + CGLM_INLINE void glm_mat4x2_zero(mat4x2 m); + CGLM_INLINE void glm_mat4x2_make(const float * __restrict src, mat4x2 dest); + CGLM_INLINE void glm_mat4x2_mul(mat4x2 m1, mat2x4 m2, mat2 dest); + CGLM_INLINE void glm_mat4x2_mulv(mat4x2 m, vec4 v, vec2 dest); + CGLM_INLINE void glm_mat4x2_transpose(mat4x2 src, mat2x4 dest); + CGLM_INLINE void glm_mat4x2_scale(mat4x2 m, float s); + */ + +#ifndef cglm_mat4x2_h +#define cglm_mat4x2_h + +#include "common.h" + +#define GLM_MAT4X2_ZERO_INIT {{0.0f, 0.0f}, {0.0f, 0.0f}, {0.0f, 0.0f}, {0.0f, 0.0f}} + +/* for C only */ +#define GLM_MAT4X2_ZERO GLM_MAT4X2_ZERO_INIT + +/*! + * @brief Copy mat4x2 (src) to mat4x2 (dest). + * + * @param[in] src mat4x2 (left) + * @param[out] dest destination (result, mat4x2) + */ +CGLM_INLINE +void +glm_mat4x2_copy(mat4x2 src, mat4x2 dest) { + glm_vec2_copy(src[0], dest[0]); + glm_vec2_copy(src[1], dest[1]); + glm_vec2_copy(src[2], dest[2]); + glm_vec2_copy(src[3], dest[3]); +} + +/*! + * @brief Zero out the mat4x2 (m). + * + * @param[in, out] mat4x2 (src, dest) + */ +CGLM_INLINE +void +glm_mat4x2_zero(mat4x2 m) { + CGLM_ALIGN_MAT mat4x2 t = GLM_MAT4X2_ZERO_INIT; + glm_mat4x2_copy(t, m); +} + +/*! + * @brief Create mat4x2 (dest) from pointer (src). + * + * @param[in] src pointer to an array of floats (left) + * @param[out] dest destination (result, mat4x2) + */ +CGLM_INLINE +void +glm_mat4x2_make(const float * __restrict src, mat4x2 dest) { + dest[0][0] = src[0]; + dest[0][1] = src[1]; + + dest[1][0] = src[2]; + dest[1][1] = src[3]; + + dest[2][0] = src[4]; + dest[2][1] = src[5]; + + dest[3][0] = src[6]; + dest[3][1] = src[7]; +} + +/*! + * @brief Multiply mat4x2 (m1) by mat2x4 (m2) and store in mat2 (dest). + * + * @code + * glm_mat4x2_mul(mat4x2, mat2x4, mat2); + * @endcode + * + * @param[in] m1 mat4x2 (left) + * @param[in] m2 mat2x4 (right) + * @param[out] dest destination (result, mat2) + */ +CGLM_INLINE +void +glm_mat4x2_mul(mat4x2 m1, mat2x4 m2, mat2 dest) { + float a00 = m1[0][0], a01 = m1[0][1], + a10 = m1[1][0], a11 = m1[1][1], + a20 = m1[2][0], a21 = m1[2][1], + a30 = m1[3][0], a31 = m1[3][1], + + b00 = m2[0][0], b01 = m2[0][1], b02 = m2[0][2], b03 = m2[0][3], + b10 = m2[1][0], b11 = m2[1][1], b12 = m2[1][2], b13 = m2[1][3]; + + dest[0][0] = a00 * b00 + a10 * b01 + a20 * b02 + a30 * b03; + dest[0][1] = a01 * b00 + a11 * b01 + a21 * b02 + a31 * b03; + + dest[1][0] = a00 * b10 + a10 * b11 + a20 * b12 + a30 * b13; + dest[1][1] = a01 * b10 + a11 * b11 + a21 * b12 + a31 * b13; +} + +/*! + * @brief Multiply mat4x2 (m) by vec4 (v) and store in vec2 (dest). + * + * @param[in] m mat4x2 (left) + * @param[in] v vec4 (right, column vector) + * @param[out] dest destination (result, column vector) + */ +CGLM_INLINE +void +glm_mat4x2_mulv(mat4x2 m, vec4 v, vec2 dest) { + float v0 = v[0], v1 = v[1], v2 = v[2], v3 = v[3]; + + dest[0] = m[0][0] * v0 + m[1][0] * v1 + m[2][0] * v2 + m[3][0] * v3; + dest[1] = m[0][1] * v0 + m[1][1] * v1 + m[2][1] * v2 + m[3][1] * v3; +} + +/*! + * @brief Transpose mat4x2 (src) and store in mat2x4 (dest). 
+ * + * @param[in] src mat4x2 (left) + * @param[out] dest destination (result, mat2x4) + */ +CGLM_INLINE +void +glm_mat4x2_transpose(mat4x2 m, mat2x4 dest) { + dest[0][0] = m[0][0]; dest[0][1] = m[1][0]; dest[0][2] = m[2][0]; dest[0][3] = m[3][0]; + dest[1][0] = m[0][1]; dest[1][1] = m[1][1]; dest[1][2] = m[2][1]; dest[1][3] = m[3][1]; +} + +/*! + * @brief Multiply mat4x2 (m) by scalar constant (s). + * + * @param[in, out] m (src, dest) + * @param[in] s float (scalar) + */ +CGLM_INLINE +void +glm_mat4x2_scale(mat4x2 m, float s) { + m[0][0] *= s; m[0][1] *= s; m[1][0] *= s; m[1][1] *= s; + m[2][0] *= s; m[2][1] *= s; m[3][0] *= s; m[3][1] *= s; +} + +#endif diff --git a/external/cglm/mat4x3.h b/external/cglm/mat4x3.h new file mode 100644 index 0000000..a429437 --- /dev/null +++ b/external/cglm/mat4x3.h @@ -0,0 +1,168 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLM_MAT4X3_ZERO_INIT + GLM_MAT4X3_ZERO + + Functions: + CGLM_INLINE void glm_mat4x3_copy(mat4x3 src, mat4x3 dest); + CGLM_INLINE void glm_mat4x3_zero(mat4x3 m); + CGLM_INLINE void glm_mat4x3_make(const float * __restrict src, mat4x3 dest); + CGLM_INLINE void glm_mat4x3_mul(mat4x3 m1, mat3x4 m2, mat3 dest); + CGLM_INLINE void glm_mat4x3_mulv(mat4x3 m, vec4 v, vec3 dest); + CGLM_INLINE void glm_mat4x3_transpose(mat4x3 src, mat3x4 dest); + CGLM_INLINE void glm_mat4x3_scale(mat4x3 m, float s); + */ + +#ifndef cglm_mat4x3_h +#define cglm_mat4x3_h + +#include "common.h" + +#define GLM_MAT4X3_ZERO_INIT {{0.0f, 0.0f, 0.0f}, {0.0f, 0.0f, 0.0f}, \ + {0.0f, 0.0f, 0.0f}, {0.0f, 0.0f, 0.0f}} + +/* for C only */ +#define GLM_MAT4X3_ZERO GLM_MAT4X3_ZERO_INIT + +/*! + * @brief Copy mat4x3 (src) to mat4x3 (dest). + * + * @param[in] src mat4x3 (left) + * @param[out] dest destination (result, mat4x3) + */ +CGLM_INLINE +void +glm_mat4x3_copy(mat4x3 src, mat4x3 dest) { + glm_vec3_copy(src[0], dest[0]); + glm_vec3_copy(src[1], dest[1]); + glm_vec3_copy(src[2], dest[2]); + glm_vec3_copy(src[3], dest[3]); +} + +/*! + * @brief Zero out the mat4x3 (m). + * + * @param[in, out] mat4x3 (src, dest) + */ +CGLM_INLINE +void +glm_mat4x3_zero(mat4x3 m) { + CGLM_ALIGN_MAT mat4x3 t = GLM_MAT4X3_ZERO_INIT; + glm_mat4x3_copy(t, m); +} + +/*! + * @brief Create mat4x3 (dest) from pointer (src). + * + * @param[in] src pointer to an array of floats (left) + * @param[out] dest destination (result, mat4x3) + */ +CGLM_INLINE +void +glm_mat4x3_make(const float * __restrict src, mat4x3 dest) { + dest[0][0] = src[0]; + dest[0][1] = src[1]; + dest[0][2] = src[2]; + + dest[1][0] = src[3]; + dest[1][1] = src[4]; + dest[1][2] = src[5]; + + dest[2][0] = src[6]; + dest[2][1] = src[7]; + dest[2][2] = src[8]; + + dest[3][0] = src[9]; + dest[3][1] = src[10]; + dest[3][2] = src[11]; +} + +/*! + * @brief Multiply mat4x3 (m1) by mat3x4 (m2) and store in mat3 (dest). 
+ * + * @code + * glm_mat4x3_mul(mat4x3, mat3x4, mat3); + * @endcode + * + * @param[in] m1 mat4x3 (left) + * @param[in] m2 mat3x4 (right) + * @param[out] dest destination (result, mat3) + */ +CGLM_INLINE +void +glm_mat4x3_mul(mat4x3 m1, mat3x4 m2, mat3 dest) { + float a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2], + a10 = m1[1][0], a11 = m1[1][1], a12 = m1[1][2], + a20 = m1[2][0], a21 = m1[2][1], a22 = m1[2][2], + a30 = m1[3][0], a31 = m1[3][1], a32 = m1[3][2], + + b00 = m2[0][0], b01 = m2[0][1], b02 = m2[0][2], b03 = m2[0][3], + b10 = m2[1][0], b11 = m2[1][1], b12 = m2[1][2], b13 = m2[1][3], + b20 = m2[2][0], b21 = m2[2][1], b22 = m2[2][2], b23 = m2[2][3]; + + dest[0][0] = a00 * b00 + a10 * b01 + a20 * b02 + a30 * b03; + dest[0][1] = a01 * b00 + a11 * b01 + a21 * b02 + a31 * b03; + dest[0][2] = a02 * b00 + a12 * b01 + a22 * b02 + a32 * b03; + + dest[1][0] = a00 * b10 + a10 * b11 + a20 * b12 + a30 * b13; + dest[1][1] = a01 * b10 + a11 * b11 + a21 * b12 + a31 * b13; + dest[1][2] = a02 * b10 + a12 * b11 + a22 * b12 + a32 * b13; + + dest[2][0] = a00 * b20 + a10 * b21 + a20 * b22 + a30 * b23; + dest[2][1] = a01 * b20 + a11 * b21 + a21 * b22 + a31 * b23; + dest[2][2] = a02 * b20 + a12 * b21 + a22 * b22 + a32 * b23; +} + +/*! + * @brief Multiply mat4x3 (m) by vec4 (v) and store in vec3 (dest). + * + * @param[in] m mat4x3 (left) + * @param[in] v vec3 (right, column vector) + * @param[out] dest destination (result, column vector) + */ +CGLM_INLINE +void +glm_mat4x3_mulv(mat4x3 m, vec4 v, vec3 dest) { + float v0 = v[0], v1 = v[1], v2 = v[2], v3 = v[3]; + + dest[0] = m[0][0] * v0 + m[1][0] * v1 + m[2][0] * v2 + m[3][0] * v3; + dest[1] = m[0][1] * v0 + m[1][1] * v1 + m[2][1] * v2 + m[3][1] * v3; + dest[2] = m[0][2] * v0 + m[1][2] * v1 + m[2][2] * v2 + m[3][2] * v3; +} + +/*! + * @brief Transpose mat4x3 (src) and store in mat3x4 (dest). + * + * @param[in] src mat4x3 (left) + * @param[out] dest destination (result, mat3x4) + */ +CGLM_INLINE +void +glm_mat4x3_transpose(mat4x3 src, mat3x4 dest) { + dest[0][0] = src[0][0]; dest[0][1] = src[1][0]; dest[0][2] = src[2][0]; dest[0][3] = src[3][0]; + dest[1][0] = src[0][1]; dest[1][1] = src[1][1]; dest[1][2] = src[2][1]; dest[1][3] = src[3][1]; + dest[2][0] = src[0][2]; dest[2][1] = src[1][2]; dest[2][2] = src[2][2]; dest[2][3] = src[3][2]; +} + +/*! + * @brief Multiply mat4x3 (m) by scalar constant (s). + * + * @param[in, out] m (src, dest) + * @param[in] s float (scalar) + */ +CGLM_INLINE +void +glm_mat4x3_scale(mat4x3 m, float s) { + m[0][0] *= s; m[0][1] *= s; m[0][2] *= s; m[1][0] *= s; + m[1][1] *= s; m[1][2] *= s; m[2][0] *= s; m[2][1] *= s; + m[2][2] *= s; m[3][0] *= s; m[3][1] *= s; m[3][2] *= s; +} + +#endif /* cglm_mat4x3_h */ diff --git a/external/cglm/noise.h b/external/cglm/noise.h new file mode 100644 index 0000000..bec12e9 --- /dev/null +++ b/external/cglm/noise.h @@ -0,0 +1,734 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + * + * Based on the work of Stefan Gustavson and Ashima Arts on "webgl-noise": + * https://github.com/stegu/webgl-noise + * Following Stefan Gustavson's paper "Simplex noise demystified": + * http://www.itn.liu.se/~stegu/simplexnoise/simplexnoise.pdf + * + * Implementation based on glm::perlin function: + * https://github.com/g-truc/glm/blob/master/glm/gtc/noise.inl + */ + +#ifndef cglm_noise_h +#define cglm_noise_h + +#include "vec4.h" +#include "vec4-ext.h" + +#include "vec3.h" +#include "vec3-ext.h" + +#include "vec2.h" +#include "vec2-ext.h" + +#define glm__noiseDetail_mod289(x) (x - floorf(x * (1.0f / 289.0f)) * 289.0f) + +/* glm__noiseDetail_permute(vec4 x, vec4 dest) */ +#define glm__noiseDetail_permute(x, dest) { \ + dest[0] = glm__noiseDetail_mod289((x[0] * 34.0f + 1.0f) * x[0]); \ + dest[1] = glm__noiseDetail_mod289((x[1] * 34.0f + 1.0f) * x[1]); \ + dest[2] = glm__noiseDetail_mod289((x[2] * 34.0f + 1.0f) * x[2]); \ + dest[3] = glm__noiseDetail_mod289((x[3] * 34.0f + 1.0f) * x[3]); \ +} + +/* glm__noiseDetail_fade_vec4(vec4 t, vec4 dest) */ +#define glm__noiseDetail_fade_vec4(t, dest) { \ + /* dest = (t * t * t) * (t * (t * 6.0f - 15.0f) + 10.0f) */ \ + vec4 temp; \ + glm_vec4_mul(t, t, temp); \ + glm_vec4_mul(temp, t, temp); \ + /* dest = (t * (t * 6.0f - 15.0f) + 10.0f) */ \ + glm_vec4_scale(t, 6.0f, dest); \ + glm_vec4_subs(dest, 15.0f, dest); \ + glm_vec4_mul(t, dest, dest); \ + glm_vec4_adds(dest, 10.0f, dest); \ + /* dest = temp * dest */ \ + glm_vec4_mul(temp, dest, dest); \ +} + +/* glm__noiseDetail_fade_vec3(vec3 t, vec3 dest) */ +#define glm__noiseDetail_fade_vec3(t, dest) { \ + /* dest = (t * t * t) * (t * (t * 6.0f - 15.0f) + 10.0f) */ \ + /* temp = t * t * t */ \ + vec3 temp; \ + glm_vec3_mul(t, t, temp); \ + glm_vec3_mul(temp, t, temp); \ + /* dest = (t * (t * 6.0f - 15.0f) + 10.0f) */ \ + glm_vec3_scale(t, 6.0f, dest); \ + glm_vec3_subs(dest, 15.0f, dest); \ + glm_vec3_mul(t, dest, dest); \ + glm_vec3_adds(dest, 10.0f, dest); \ + /* dest = temp * dest */ \ + glm_vec3_mul(temp, dest, dest); \ +} + +/* glm__noiseDetail_fade_vec2(vec2 t, vec2 dest) */ +#define glm__noiseDetail_fade_vec2(t, dest) { \ + /* dest = (t * t * t) * (t * (t * 6.0f - 15.0f) + 10.0f) */ \ + /* temp = t * t * t */ \ + vec2 temp; \ + glm_vec2_mul(t, t, temp); \ + glm_vec2_mul(temp, t, temp); \ + /* dest = (t * (t * 6.0f - 15.0f) + 10.0f) */ \ + glm_vec2_scale(t, 6.0f, dest); \ + glm_vec2_subs(dest, 15.0f, dest); \ + glm_vec2_mul(t, dest, dest); \ + glm_vec2_adds(dest, 10.0f, dest); \ + /* dest = temp * dest */ \ + glm_vec2_mul(temp, dest, dest); \ +} + +/* glm__noiseDetail_taylorInvSqrt(vec4 x, vec4 dest) */ +#define glm__noiseDetail_taylorInvSqrt(x, dest) { \ + /* dest = 1.79284291400159f - 0.85373472095314f * x */ \ + vec4 temp; \ + glm_vec4_scale(x, 0.85373472095314f, temp); /* temp = 0.853...f * x */ \ + glm_vec4_fill(dest, 1.79284291400159f); /* dest = 1.792...f */ \ + glm_vec4_sub(dest, temp, dest); /* dest = 1.79284291400159f - temp */ \ +} + +/* norm = taylorInvSqrt(vec4( + * dot(g00__, g00__), + * dot(g01__, g01__), + * dot(g10__, g10__), + * dot(g11__, g11__) + * )); +*/ + +/* glm__noiseDetail_gradNorm_vec4(vec4 g00__, vec4 g01__, vec4 g10__, vec4 g11__) */ +#define glm__noiseDetail_gradNorm_vec4(g00__, g01__, g10__, g11__) { \ + vec4 norm; \ + norm[0] = glm_vec4_dot(g00__, g00__); /* norm.x = dot(g00__, g00__) */ \ + norm[1] = glm_vec4_dot(g01__, g01__); /* norm.y = 
dot(g01__, g01__) */ \ + norm[2] = glm_vec4_dot(g10__, g10__); /* norm.z = dot(g10__, g10__) */ \ + norm[3] = glm_vec4_dot(g11__, g11__); /* norm.w = dot(g11__, g11__) */ \ + glm__noiseDetail_taylorInvSqrt(norm, norm); /* norm = taylorInvSqrt(norm) */ \ + \ + glm_vec4_scale(g00__, norm[0], g00__); /* g00__ *= norm.x */ \ + glm_vec4_scale(g01__, norm[1], g01__); /* g01__ *= norm.y */ \ + glm_vec4_scale(g10__, norm[2], g10__); /* g10__ *= norm.z */ \ + glm_vec4_scale(g11__, norm[3], g11__); /* g11__ *= norm.w */ \ +} + +/* glm__noiseDetail_gradNorm_vec3(vec3 g00_, vec3 g01_, vec3 g10_, vec3 g11_) */ +#define glm__noiseDetail_gradNorm_vec3(g00_, g01_, g10_, g11_) { \ + vec4 norm; \ + norm[0] = glm_vec3_dot(g00_, g00_); /* norm.x = dot(g00_, g00_) */ \ + norm[1] = glm_vec3_dot(g01_, g01_); /* norm.y = dot(g01_, g01_) */ \ + norm[2] = glm_vec3_dot(g10_, g10_); /* norm.z = dot(g10_, g10_) */ \ + norm[3] = glm_vec3_dot(g11_, g11_); /* norm.w = dot(g11_, g11_) */ \ + glm__noiseDetail_taylorInvSqrt(norm, norm); /* norm = taylorInvSqrt(norm) */ \ + \ + glm_vec3_scale(g00_, norm[0], g00_); /* g00_ *= norm.x */ \ + glm_vec3_scale(g01_, norm[1], g01_); /* g01_ *= norm.y */ \ + glm_vec3_scale(g10_, norm[2], g10_); /* g10_ *= norm.z */ \ + glm_vec3_scale(g11_, norm[3], g11_); /* g11_ *= norm.w */ \ +} + +/* glm__noiseDetail_gradNorm_vec2(vec2 g00, vec2 g01, vec2 g10, vec2 g11) */ +#define glm__noiseDetail_gradNorm_vec2(g00, g01, g10, g11) { \ + vec4 norm; \ + norm[0] = glm_vec2_dot(g00, g00); /* norm.x = dot(g00, g00) */ \ + norm[1] = glm_vec2_dot(g01, g01); /* norm.y = dot(g01, g01) */ \ + norm[2] = glm_vec2_dot(g10, g10); /* norm.z = dot(g10, g10) */ \ + norm[3] = glm_vec2_dot(g11, g11); /* norm.w = dot(g11, g11) */ \ + glm__noiseDetail_taylorInvSqrt(norm, norm); /* norm = taylorInvSqrt(norm) */ \ + \ + glm_vec2_scale(g00, norm[0], g00); /* g00 *= norm.x */ \ + glm_vec2_scale(g01, norm[1], g01); /* g01 *= norm.y */ \ + glm_vec2_scale(g10, norm[2], g10); /* g10 *= norm.z */ \ + glm_vec2_scale(g11, norm[3], g11); /* g11 *= norm.w */ \ +} + +/* glm__noiseDetail_i2gxyzw(vec4 ixy, vec4 gx, vec4 gy, vec4 gz, vec4 gw) */ +#define glm__noiseDetail_i2gxyzw(ixy, gx, gy, gz, gw) { \ + /* gx = ixy / 7.0 */ \ + glm_vec4_divs(ixy, 7.0f, gx); /* gx = ixy / 7.0 */ \ + \ + /* gy = fract(gx) / 7.0 */ \ + glm_vec4_floor(gx, gy); /* gy = floor(gx) */ \ + glm_vec4_divs(gy, 7.0f, gy); /* gy /= 7.0 */ \ + \ + /* gz = floor(gy) / 6.0 */ \ + glm_vec4_floor(gy, gz); /* gz = floor(gy) */ \ + glm_vec4_divs(gz, 6.0f, gz); /* gz /= 6.0 */ \ + \ + /* gx = fract(gx) - 0.5f */ \ + glm_vec4_fract(gx, gx); /* gx = fract(gx) */ \ + glm_vec4_subs(gx, 0.5f, gx); /* gx -= 0.5f */ \ + \ + /* gy = fract(gy) - 0.5f */ \ + glm_vec4_fract(gy, gy); /* gy = fract(gy) */ \ + glm_vec4_subs(gy, 0.5f, gy); /* gy -= 0.5f */ \ + \ + /* gz = fract(gz) - 0.5f */ \ + glm_vec4_fract(gz, gz); /* gz = fract(gz) */ \ + glm_vec4_subs(gz, 0.5f, gz); /* gz -= 0.5f */ \ + \ + /* abs(gx), abs(gy), abs(gz) */ \ + vec4 gxa, gya, gza; \ + glm_vec4_abs(gx, gxa); /* gxa = abs(gx) */ \ + glm_vec4_abs(gy, gya); /* gya = abs(gy) */ \ + glm_vec4_abs(gz, gza); /* gza = abs(gz) */ \ + \ + /* gw = 0.75 - abs(gx) - abs(gy) - abs(gz) */ \ + glm_vec4_fill(gw, 0.75f); /* gw = 0.75 */ \ + glm_vec4_sub(gw, gxa, gw); /* gw -= gxa */ \ + glm_vec4_sub(gw, gza, gw); /* gw -= gza */ \ + glm_vec4_sub(gw, gya, gw); /* gw -= gya */ \ + \ + /* sw = step(gw, 0.0); */ \ + vec4 sw; \ + glm_vec4_stepr(gw, 0.0f, sw); /* sw = step(gw, 0.0) */ \ + \ + /* gx -= sw * (step(vec4(0), gx) - T(0.5)); */ 
\ + vec4 temp = {0.0f}; /* temp = 0.0 */ \ + glm_vec4_step(temp, gx, temp); /* temp = step(temp, gx) */ \ + glm_vec4_subs(temp, 0.5f, temp); /* temp -= 0.5 */ \ + glm_vec4_mul(sw, temp, temp); /* temp *= sw */ \ + glm_vec4_sub(gx, temp, gx); /* gx -= temp */ \ + \ + /* gy -= sw * (step(vec4(0), gy) - T(0.5)); */ \ + glm_vec4_zero(temp); /* reset temp */ \ + glm_vec4_step(temp, gy, temp); /* temp = step(temp, gy) */ \ + glm_vec4_subs(temp, 0.5f, temp); /* temp -= 0.5 */ \ + glm_vec4_mul(sw, temp, temp); /* temp *= sw */ \ + glm_vec4_sub(gy, temp, gy); /* gy -= temp */ \ +} + +/* NOTE: This function is not *quite* analogous to glm__noiseDetail_i2gxyzw + * to try to match the output of glm::perlin. I think it might be a bug in + * in the original implementation, but for now I'm keeping it consistent. -MK + * + * Follow up: The original implementation (glm v 1.0.1) does: + * + * vec<4, T, Q> gx0 = ixy0 * T(1.0 / 7.0); + * + * as opposed to: + * + * vec<4, T, Q> gx0 = ixy0 / T(7); + * + * This ends up mapping to different simd instructions, at least on AMD. + * The delta is tiny but it gets amplified by the rest of the noise function. + * Hence we too need to do `glm_vec4_scale` as opposed to `glm_vec4_divs`, to + * match it. -MK + */ + +/* glm__noiseDetail_i2gxyz(vec4 i, vec4 gx, vec4 gy, vec4 gz) */ +#define glm__noiseDetail_i2gxyz(ixy, gx, gy, gz) { \ + /* gx = ixy / 7.0 */ \ + glm_vec4_scale(ixy, 1.0f / 7.0f, gx); /* gx = ixy * (1/7.0) */\ + \ + /* gy = fract(floor(gx0) / 7.0)) - 0.5; */ \ + glm_vec4_floor(gx, gy); /* gy = floor(gx) */ \ + glm_vec4_scale(gy, 1.0f / 7.0f, gy); /* gy *= 1 / 7.0 */ \ + glm_vec4_fract(gy, gy); /* gy = fract(gy) */ \ + glm_vec4_subs(gy, 0.5f, gy); /* gy -= 0.5f */ \ + \ + /* gx = fract(gx); */ \ + glm_vec4_fract(gx, gx); /* gx = fract(gx) */ \ + \ + /* abs(gx), abs(gy) */ \ + vec4 gxa, gya; \ + glm_vec4_abs(gx, gxa); /* gxa = abs(gx) */ \ + glm_vec4_abs(gy, gya); /* gya = abs(gy) */ \ + \ + /* gz = vec4(0.5) - abs(gx0) - abs(gy0); */ \ + glm_vec4_fill(gz, 0.5f); /* gz = 0.5 */ \ + glm_vec4_sub(gz, gxa, gz); /* gz -= gxa */ \ + glm_vec4_sub(gz, gya, gz); /* gz -= gya */ \ + \ + /* sz = step(gw, 0.0); */ \ + vec4 sz; \ + glm_vec4_stepr(gz, 0.0f, sz); /* sz = step(gz, 0.0) */ \ + \ + /* gx0 -= sz0 * (step(0.0, gx0) - T(0.5)); */ \ + vec4 temp = {0.0f}; /* temp = 0.0 */ \ + glm_vec4_step(temp, gx, temp); /* temp = step(temp, gx) */ \ + glm_vec4_subs(temp, 0.5f, temp); /* temp -= 0.5 */ \ + glm_vec4_mul(sz, temp, temp); /* temp *= sz */ \ + glm_vec4_sub(gx, temp, gx); /* gx -= temp */ \ + \ + /* gy0 -= sz0 * (step(0.0, gy0) - T(0.5)); */ \ + glm_vec4_zero(temp); /* reset temp */ \ + glm_vec4_step(temp, gy, temp); /* temp = step(temp, gy) */ \ + glm_vec4_subs(temp, 0.5f, temp); /* temp -= 0.5 */ \ + glm_vec4_mul(sz, temp, temp); /* temp *= sz */ \ + glm_vec4_sub(gy, temp, gy); /* gy -= temp */ \ +} + +/* glm__noiseDetail_i2gxy(vec4 i, vec4 gx, vec4 gy) */ +#define glm__noiseDetail_i2gxy(i, gx, gy) { \ + /* gx = 2.0 * fract(i / 41.0) - 1.0; */ \ + glm_vec4_divs(i, 41.0f, gx); /* gx = i / 41.0 */ \ + glm_vec4_fract(gx, gx); /* gx = fract(gx) */ \ + glm_vec4_scale(gx, 2.0f, gx); /* gx *= 2.0 */ \ + glm_vec4_subs(gx, 1.0f, gx); /* gx -= 1.0 */ \ + \ + /* gy = abs(gx) - 0.5; */ \ + glm_vec4_abs(gx, gy); /* gy = abs(gx) */ \ + glm_vec4_subs(gy, 0.5f, gy); /* gy -= 0.5 */ \ + \ + /* tx = floor(gx + 0.5); */ \ + vec4 tx; \ + glm_vec4_adds(gx, 0.5f, tx); /* tx = gx + 0.5 */ \ + glm_vec4_floor(tx, tx); /* tx = floor(tx) */ \ + \ + /* gx = gx - tx; */ \ + glm_vec4_sub(gx, tx, 
gx); /* gx -= tx */ \ +} + +/* ============================================================================ + * Classic perlin noise + * ============================================================================ + */ + +/*! + * @brief Classic perlin noise + * + * @param[in] point 4D vector + * @returns perlin noise value + */ +CGLM_INLINE +float +glm_perlin_vec4(vec4 point) { + /* Integer part of p for indexing */ + vec4 Pi0; + glm_vec4_floor(point, Pi0); /* Pi0 = floor(point); */ + + /* Integer part + 1 */ + vec4 Pi1; + glm_vec4_adds(Pi0, 1.0f, Pi1); /* Pi1 = Pi0 + 1.0f; */ + + glm_vec4_mods(Pi0, 289.0f, Pi0); /* Pi0 = mod(Pi0, 289.0f); */ + glm_vec4_mods(Pi1, 289.0f, Pi1); /* Pi1 = mod(Pi1, 289.0f); */ + + /* Fractional part of p for interpolation */ + vec4 Pf0; + glm_vec4_fract(point, Pf0); + + /* Fractional part - 1.0 */ + vec4 Pf1; + glm_vec4_subs(Pf0, 1.0f, Pf1); + + vec4 ix = {Pi0[0], Pi1[0], Pi0[0], Pi1[0]}; + vec4 iy = {Pi0[1], Pi0[1], Pi1[1], Pi1[1]}; + vec4 iz0 = {Pi0[2], Pi0[2], Pi0[2], Pi0[2]}; /* iz0 = vec4(Pi0.z); */ + vec4 iz1 = {Pi1[2], Pi1[2], Pi1[2], Pi1[2]}; /* iz1 = vec4(Pi1.z); */ + vec4 iw0 = {Pi0[3], Pi0[3], Pi0[3], Pi0[3]}; /* iw0 = vec4(Pi0.w); */ + vec4 iw1 = {Pi1[3], Pi1[3], Pi1[3], Pi1[3]}; /* iw1 = vec4(Pi1.w); */ + + /* ------------ */ + + /* ixy = permute(permute(ix) + iy) */ + vec4 ixy; + glm__noiseDetail_permute(ix, ixy); /* ixy = permute(ix) */ + glm_vec4_add(ixy, iy, ixy); /* ixy += iy; */ + glm__noiseDetail_permute(ixy, ixy); /* ixy = permute(ixy) */ + + /* ixy0 = permute(ixy + iz0) */ + vec4 ixy0; + glm_vec4_add(ixy, iz0, ixy0); /* ixy0 = ixy + iz0 */ + glm__noiseDetail_permute(ixy0, ixy0); /* ixy0 = permute(ixy0) */ + + /* ixy1 = permute(ixy + iz1) */ + vec4 ixy1; + glm_vec4_add(ixy, iz1, ixy1); /* ixy1 = ixy, iz1 */ + glm__noiseDetail_permute(ixy1, ixy1); /* ixy1 = permute(ixy1) */ + + /* ixy00 = permute(ixy0 + iw0) */ + vec4 ixy00; + glm_vec4_add(ixy0, iw0, ixy00); /* ixy00 = ixy0 + iw0 */ + glm__noiseDetail_permute(ixy00, ixy00); /* ixy00 = permute(ixy00) */ + + /* ixy01 = permute(ixy0 + iw1) */ + vec4 ixy01; + glm_vec4_add(ixy0, iw1, ixy01); /* ixy01 = ixy0 + iw1 */ + glm__noiseDetail_permute(ixy01, ixy01); /* ixy01 = permute(ixy01) */ + + /* ixy10 = permute(ixy1 + iw0) */ + vec4 ixy10; + glm_vec4_add(ixy1, iw0, ixy10); /* ixy10 = ixy1 + iw0 */ + glm__noiseDetail_permute(ixy10, ixy10); /* ixy10 = permute(ixy10) */ + + /* ixy11 = permute(ixy1 + iw1) */ + vec4 ixy11; + glm_vec4_add(ixy1, iw1, ixy11); /* ixy11 = ixy1 + iw1 */ + glm__noiseDetail_permute(ixy11, ixy11); /* ixy11 = permute(ixy11) */ + + /* ------------ */ + + vec4 gx00, gy00, gz00, gw00; + glm__noiseDetail_i2gxyzw(ixy00, gx00, gy00, gz00, gw00); + + vec4 gx01, gy01, gz01, gw01; + glm__noiseDetail_i2gxyzw(ixy01, gx01, gy01, gz01, gw01); + + vec4 gx10, gy10, gz10, gw10; + glm__noiseDetail_i2gxyzw(ixy10, gx10, gy10, gz10, gw10); + + vec4 gx11, gy11, gz11, gw11; + glm__noiseDetail_i2gxyzw(ixy11, gx11, gy11, gz11, gw11); + + /* ------------ */ + + vec4 g0000 = {gx00[0], gy00[0], gz00[0], gw00[0]}; /* g0000 = vec4(gx00.x, gy00.x, gz00.x, gw00.x); */ + vec4 g0100 = {gx00[2], gy00[2], gz00[2], gw00[2]}; /* g0100 = vec4(gx00.z, gy00.z, gz00.z, gw00.z); */ + vec4 g1000 = {gx00[1], gy00[1], gz00[1], gw00[1]}; /* g1000 = vec4(gx00.y, gy00.y, gz00.y, gw00.y); */ + vec4 g1100 = {gx00[3], gy00[3], gz00[3], gw00[3]}; /* g1100 = vec4(gx00.w, gy00.w, gz00.w, gw00.w); */ + + vec4 g0001 = {gx01[0], gy01[0], gz01[0], gw01[0]}; /* g0001 = vec4(gx01.x, gy01.x, gz01.x, gw01.x); */ + vec4 g0101 = {gx01[2], 
gy01[2], gz01[2], gw01[2]}; /* g0101 = vec4(gx01.z, gy01.z, gz01.z, gw01.z); */ + vec4 g1001 = {gx01[1], gy01[1], gz01[1], gw01[1]}; /* g1001 = vec4(gx01.y, gy01.y, gz01.y, gw01.y); */ + vec4 g1101 = {gx01[3], gy01[3], gz01[3], gw01[3]}; /* g1101 = vec4(gx01.w, gy01.w, gz01.w, gw01.w); */ + + vec4 g0010 = {gx10[0], gy10[0], gz10[0], gw10[0]}; /* g0010 = vec4(gx10.x, gy10.x, gz10.x, gw10.x); */ + vec4 g0110 = {gx10[2], gy10[2], gz10[2], gw10[2]}; /* g0110 = vec4(gx10.z, gy10.z, gz10.z, gw10.z); */ + vec4 g1010 = {gx10[1], gy10[1], gz10[1], gw10[1]}; /* g1010 = vec4(gx10.y, gy10.y, gz10.y, gw10.y); */ + vec4 g1110 = {gx10[3], gy10[3], gz10[3], gw10[3]}; /* g1110 = vec4(gx10.w, gy10.w, gz10.w, gw10.w); */ + + vec4 g0011 = {gx11[0], gy11[0], gz11[0], gw11[0]}; /* g0011 = vec4(gx11.x, gy11.x, gz11.x, gw11.x); */ + vec4 g0111 = {gx11[2], gy11[2], gz11[2], gw11[2]}; /* g0111 = vec4(gx11.z, gy11.z, gz11.z, gw11.z); */ + vec4 g1011 = {gx11[1], gy11[1], gz11[1], gw11[1]}; /* g1011 = vec4(gx11.y, gy11.y, gz11.y, gw11.y); */ + vec4 g1111 = {gx11[3], gy11[3], gz11[3], gw11[3]}; /* g1111 = vec4(gx11.w, gy11.w, gz11.w, gw11.w); */ + + glm__noiseDetail_gradNorm_vec4(g0000, g0100, g1000, g1100); + glm__noiseDetail_gradNorm_vec4(g0001, g0101, g1001, g1101); + glm__noiseDetail_gradNorm_vec4(g0010, g0110, g1010, g1110); + glm__noiseDetail_gradNorm_vec4(g0011, g0111, g1011, g1111); + + /* ------------ */ + + float n0000 = glm_vec4_dot(g0000, Pf0); /* n0000 = dot(g0000, Pf0) */ + + /* n1000 = dot(g1000, vec4(Pf1.x, Pf0.y, Pf0.z, Pf0.w)) */ + vec4 n1000d = {Pf1[0], Pf0[1], Pf0[2], Pf0[3]}; + float n1000 = glm_vec4_dot(g1000, n1000d); + + /* n0100 = dot(g0100, vec4(Pf0.x, Pf1.y, Pf0.z, Pf0.w)) */ + vec4 n0100d = {Pf0[0], Pf1[1], Pf0[2], Pf0[3]}; + float n0100 = glm_vec4_dot(g0100, n0100d); + + /* n1100 = dot(g1100, vec4(Pf1.x, Pf1.y, Pf0.z, Pf0.w)) */ + vec4 n1100d = {Pf1[0], Pf1[1], Pf0[2], Pf0[3]}; + float n1100 = glm_vec4_dot(g1100, n1100d); + + /* n0010 = dot(g0010, vec4(Pf0.x, Pf0.y, Pf1.z, Pf0.w)) */ + vec4 n0010d = {Pf0[0], Pf0[1], Pf1[2], Pf0[3]}; + float n0010 = glm_vec4_dot(g0010, n0010d); + + /* n1010 = dot(g1010, vec4(Pf1.x, Pf0.y, Pf1.z, Pf0.w)) */ + vec4 n1010d = {Pf1[0], Pf0[1], Pf1[2], Pf0[3]}; + float n1010 = glm_vec4_dot(g1010, n1010d); + + /* n0110 = dot(g0110, vec4(Pf0.x, Pf1.y, Pf1.z, Pf0.w)) */ + vec4 n0110d = {Pf0[0], Pf1[1], Pf1[2], Pf0[3]}; + float n0110 = glm_vec4_dot(g0110, n0110d); + + /* n1110 = dot(g1110, vec4(Pf1.x, Pf1.y, Pf1.z, Pf0.w)) */ + vec4 n1110d = {Pf1[0], Pf1[1], Pf1[2], Pf0[3]}; + float n1110 = glm_vec4_dot(g1110, n1110d); + + /* n0001 = dot(g0001, vec4(Pf0.x, Pf0.y, Pf0.z, Pf1.w)) */ + vec4 n0001d = {Pf0[0], Pf0[1], Pf0[2], Pf1[3]}; + float n0001 = glm_vec4_dot(g0001, n0001d); + + /* n1001 = dot(g1001, vec4(Pf1.x, Pf0.y, Pf0.z, Pf1.w)) */ + vec4 n1001d = {Pf1[0], Pf0[1], Pf0[2], Pf1[3]}; + float n1001 = glm_vec4_dot(g1001, n1001d); + + /* n0101 = dot(g0101, vec4(Pf0.x, Pf1.y, Pf0.z, Pf1.w)) */ + vec4 n0101d = {Pf0[0], Pf1[1], Pf0[2], Pf1[3]}; + float n0101 = glm_vec4_dot(g0101, n0101d); + + /* n1101 = dot(g1101, vec4(Pf1.x, Pf1.y, Pf0.z, Pf1.w)) */ + vec4 n1101d = {Pf1[0], Pf1[1], Pf0[2], Pf1[3]}; + float n1101 = glm_vec4_dot(g1101, n1101d); + + /* n0011 = dot(g0011, vec4(Pf0.x, Pf0.y, Pf1.z, Pf1.w)) */ + vec4 n0011d = {Pf0[0], Pf0[1], Pf1[2], Pf1[3]}; + float n0011 = glm_vec4_dot(g0011, n0011d); + + /* n1011 = dot(g1011, vec4(Pf1.x, Pf0.y, Pf1.z, Pf1.w)) */ + vec4 n1011d = {Pf1[0], Pf0[1], Pf1[2], Pf1[3]}; + float n1011 = glm_vec4_dot(g1011, n1011d); + + /* n0111 = 
dot(g0111, vec4(Pf0.x, Pf1.y, Pf1.z, Pf1.w)) */ + vec4 n0111d = {Pf0[0], Pf1[1], Pf1[2], Pf1[3]}; + float n0111 = glm_vec4_dot(g0111, n0111d); + + float n1111 = glm_vec4_dot(g1111, Pf1); /* n1111 = dot(g1111, Pf1) */ + + /* ------------ */ + + vec4 fade_xyzw; + glm__noiseDetail_fade_vec4(Pf0, fade_xyzw); /* fade_xyzw = fade(Pf0) */ + + /* n_0w = lerp(vec4(n0000, n1000, n0100, n1100), vec4(n0001, n1001, n0101, n1101), fade_xyzw.w) */ + vec4 n_0w1 = {n0000, n1000, n0100, n1100}; + vec4 n_0w2 = {n0001, n1001, n0101, n1101}; + vec4 n_0w; + glm_vec4_lerp(n_0w1, n_0w2, fade_xyzw[3], n_0w); + + /* n_1w = lerp(vec4(n0010, n1010, n0110, n1110), vec4(n0011, n1011, n0111, n1111), fade_xyzw.w) */ + vec4 n_1w1 = {n0010, n1010, n0110, n1110}; + vec4 n_1w2 = {n0011, n1011, n0111, n1111}; + vec4 n_1w; + glm_vec4_lerp(n_1w1, n_1w2, fade_xyzw[3], n_1w); + + /* n_zw = lerp(n_0w, n_1w, fade_xyzw.z) */ + vec4 n_zw; + glm_vec4_lerp(n_0w, n_1w, fade_xyzw[2], n_zw); + + /* n_yzw = lerp(vec2(n_zw.x, n_zw.y), vec2(n_zw.z, n_zw.w), fade_xyzw.y) */ + vec2 n_yzw; + vec2 n_yzw1 = {n_zw[0], n_zw[1]}; + vec2 n_yzw2 = {n_zw[2], n_zw[3]}; + glm_vec2_lerp(n_yzw1, n_yzw2, fade_xyzw[1], n_yzw); + + /* n_xyzw = lerp(n_yzw.x, n_yzw.y, fade_xyzw.x) */ + float n_xyzw = glm_lerp(n_yzw[0], n_yzw[1], fade_xyzw[0]); + + return n_xyzw * 2.2f; +} + + +/*! + * @brief Classic perlin noise + * + * @param[in] point 3D vector + * @returns perlin noise value + */ +CGLM_INLINE +float +glm_perlin_vec3(vec3 point) { + /* Integer part of p for indexing */ + vec3 Pi0; + glm_vec3_floor(point, Pi0); /* Pi0 = floor(point); */ + + /* Integer part + 1 */ + vec3 Pi1; + glm_vec3_adds(Pi0, 1.0f, Pi1); /* Pi1 = Pi0 + 1.0f; */ + + glm_vec3_mods(Pi0, 289.0f, Pi0); /* Pi0 = mod(Pi0, 289.0f); */ + glm_vec3_mods(Pi1, 289.0f, Pi1); /* Pi1 = mod(Pi1, 289.0f); */ + + /* Fractional part of p for interpolation */ + vec3 Pf0; + glm_vec3_fract(point, Pf0); + + /* Fractional part - 1.0 */ + vec3 Pf1; + glm_vec3_subs(Pf0, 1.0f, Pf1); + + vec4 ix = {Pi0[0], Pi1[0], Pi0[0], Pi1[0]}; + vec4 iy = {Pi0[1], Pi0[1], Pi1[1], Pi1[1]}; + vec4 iz0 = {Pi0[2], Pi0[2], Pi0[2], Pi0[2]}; /* iz0 = vec4(Pi0.z); */ + vec4 iz1 = {Pi1[2], Pi1[2], Pi1[2], Pi1[2]}; /* iz1 = vec4(Pi1.z); */ + + /* ------------ */ + + /* ixy = permute(permute(ix) + iy) */ + vec4 ixy; + glm__noiseDetail_permute(ix, ixy); /* ixy = permute(ix) */ + glm_vec4_add(ixy, iy, ixy); /* ixy += iy; */ + glm__noiseDetail_permute(ixy, ixy); /* ixy = permute(ixy) */ + + /* ixy0 = permute(ixy + iz0) */ + vec4 ixy0; + glm_vec4_add(ixy, iz0, ixy0); /* ixy0 = ixy + iz0 */ + glm__noiseDetail_permute(ixy0, ixy0); /* ixy0 = permute(ixy0) */ + + /* ixy1 = permute(ixy + iz1) */ + vec4 ixy1; + glm_vec4_add(ixy, iz1, ixy1); /* ixy1 = ixy, iz1 */ + glm__noiseDetail_permute(ixy1, ixy1); /* ixy1 = permute(ixy1) */ + + /* ------------ */ + + vec4 gx0, gy0, gz0; + glm__noiseDetail_i2gxyz(ixy0, gx0, gy0, gz0); + + vec4 gx1, gy1, gz1; + glm__noiseDetail_i2gxyz(ixy1, gx1, gy1, gz1); + + /* ------------ */ + + vec3 g000 = {gx0[0], gy0[0], gz0[0]}; /* g000 = vec3(gx0.x, gy0.x, gz0.x); */ + vec3 g100 = {gx0[1], gy0[1], gz0[1]}; /* g100 = vec3(gx0.y, gy0.y, gz0.y); */ + vec3 g010 = {gx0[2], gy0[2], gz0[2]}; /* g010 = vec3(gx0.z, gy0.z, gz0.z); */ + vec3 g110 = {gx0[3], gy0[3], gz0[3]}; /* g110 = vec3(gx0.w, gy0.w, gz0.w); */ + + vec3 g001 = {gx1[0], gy1[0], gz1[0]}; /* g001 = vec3(gx1.x, gy1.x, gz1.x); */ + vec3 g101 = {gx1[1], gy1[1], gz1[1]}; /* g101 = vec3(gx1.y, gy1.y, gz1.y); */ + vec3 g011 = {gx1[2], gy1[2], gz1[2]}; /* g011 = vec3(gx1.z, 
gy1.z, gz1.z); */ + vec3 g111 = {gx1[3], gy1[3], gz1[3]}; /* g111 = vec3(gx1.w, gy1.w, gz1.w); */ + + glm__noiseDetail_gradNorm_vec3(g000, g010, g100, g110); + glm__noiseDetail_gradNorm_vec3(g001, g011, g101, g111); + + /* ------------ */ + + float n000 = glm_vec3_dot(g000, Pf0); /* n000 = dot(g000, Pf0) */ + + /* n100 = dot(g100, vec3(Pf1.x, Pf0.y, Pf0.z)) */ + vec3 n100d = {Pf1[0], Pf0[1], Pf0[2]}; + float n100 = glm_vec3_dot(g100, n100d); + + /* n010 = dot(g010, vec3(Pf0.x, Pf1.y, Pf0.z)) */ + vec3 n010d = {Pf0[0], Pf1[1], Pf0[2]}; + float n010 = glm_vec3_dot(g010, n010d); + + /* n110 = dot(g110, vec3(Pf1.x, Pf1.y, Pf0.z)) */ + vec3 n110d = {Pf1[0], Pf1[1], Pf0[2]}; + float n110 = glm_vec3_dot(g110, n110d); + + /* n001 = dot(g001, vec3(Pf0.x, Pf0.y, Pf1.z)) */ + vec3 n001d = {Pf0[0], Pf0[1], Pf1[2]}; + float n001 = glm_vec3_dot(g001, n001d); + + /* n101 = dot(g101, vec3(Pf1.x, Pf0.y, Pf1.z)) */ + vec3 n101d = {Pf1[0], Pf0[1], Pf1[2]}; + float n101 = glm_vec3_dot(g101, n101d); + + /* n011 = dot(g011, vec3(Pf0.x, Pf1.y, Pf1.z)) */ + vec3 n011d = {Pf0[0], Pf1[1], Pf1[2]}; + float n011 = glm_vec3_dot(g011, n011d); + + float n111 = glm_vec3_dot(g111, Pf1); /* n111 = dot(g111, Pf1) */ + + /* ------------ */ + + vec3 fade_xyz; + glm__noiseDetail_fade_vec3(Pf0, fade_xyz); /* fade_xyz = fade(Pf0) */ + + /* n_z = lerp(vec4(n000, n100, n010, n110), vec4(n001, n101, n011, n111), fade_xyz.z); */ + vec4 n_z; + vec4 n_z1 = {n000, n100, n010, n110}; + vec4 n_z2 = {n001, n101, n011, n111}; + glm_vec4_lerp(n_z1, n_z2, fade_xyz[2], n_z); + + /* vec2 n_yz = lerp(vec2(n_z.x, n_z.y), vec2(n_z.z, n_z.w), fade_xyz.y); */ + vec2 n_yz; + vec2 n_yz1 = {n_z[0], n_z[1]}; + vec2 n_yz2 = {n_z[2], n_z[3]}; + glm_vec2_lerp(n_yz1, n_yz2, fade_xyz[1], n_yz); + + /* n_xyz = lerp(n_yz.x, n_yz.y, fade_xyz.x); */ + float n_xyz = glm_lerp(n_yz[0], n_yz[1], fade_xyz[0]); + + return n_xyz * 2.2f; +} + +/*! 
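glm_perlin_vec3 is self-contained, so sampling it over a grid is enough to get a usable heightfield. A small sketch, assuming the headers resolve as "cglm/noise.h"; the output is roughly in [-1, 1]:

#include <stdio.h>
#include "cglm/noise.h"             /* assumed include path */

int main(void) {
  for (int y = 0; y < 4; y++) {
    for (int x = 0; x < 4; x++) {
      vec3 p = {x * 0.1f, y * 0.1f, 0.5f};  /* scale coordinates down; integer inputs land on lattice points */
      printf("%6.3f ", glm_perlin_vec3(p));
    }
    printf("\n");
  }
  return 0;
}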
+ * @brief Classic perlin noise + * + * @param[in] point 2D vector + * @returns perlin noise value + */ +CGLM_INLINE +float +glm_perlin_vec2(vec2 point) { + + /* Integer part of p for indexing */ + /* Pi = floor(vec4(point.x, point.y, point.x, point.y)) + vec4(0.0, 0.0, 1.0, 1.0); */ + vec4 Pi = {point[0], point[1], point[0], point[1]}; /* Pi = vec4(point.x, point.y, point.x, point.y) */ + glm_vec4_floor(Pi, Pi); /* Pi = floor(Pi) */ + Pi[2] += 1.0f; /* Pi.z += 1.0 */ + Pi[3] += 1.0f; /* Pi.w += 1.0 */ + + /* Fractional part of p for interpolation */ + /* vec<4, T, Q> Pf = glm::fract(vec<4, T, Q>(Position.x, Position.y, Position.x, Position.y)) - vec<4, T, Q>(0.0, 0.0, 1.0, 1.0); */ + vec4 Pf = {point[0], point[1], point[0], point[1]}; /* Pf = vec4(point.x, point.y, point.x, point.y) */ + glm_vec4_fract(Pf, Pf); /* Pf = fract(Pf) */ + Pf[2] -= 1.0f; /* Pf.z -= 1.0 */ + Pf[3] -= 1.0f; /* Pf.w -= 1.0 */ + + /* Mod to avoid truncation effects in permutation */ + glm_vec4_mods(Pi, 289.0f, Pi); /* Pi = mod(Pi, 289.0f); */ + + vec4 ix = {Pi[0], Pi[2], Pi[0], Pi[2]}; /* ix = vec4(Pi.x, Pi.z, Pi.x, Pi.z) */ + vec4 iy = {Pi[1], Pi[1], Pi[3], Pi[3]}; /* iy = vec4(Pi.y, Pi.y, Pi.w, Pi.w) */ + vec4 fx = {Pf[0], Pf[2], Pf[0], Pf[2]}; /* fx = vec4(Pf.x, Pf.z, Pf.x, Pf.z) */ + vec4 fy = {Pf[1], Pf[1], Pf[3], Pf[3]}; /* fy = vec4(Pf.y, Pf.y, Pf.w, Pf.w) */ + + /* ------------ */ + + /* i = permute(permute(ix) + iy); */ + vec4 i; + glm__noiseDetail_permute(ix, i); /* i = permute(ix) */ + glm_vec4_add(i, iy, i); /* i += iy; */ + glm__noiseDetail_permute(i, i); /* i = permute(i) */ + + /* ------------ */ + + vec4 gx, gy; + glm__noiseDetail_i2gxy(i, gx, gy); + + /* ------------ */ + + vec2 g00 = {gx[0], gy[0]}; /* g00 = vec2(gx.x, gy.x) */ + vec2 g10 = {gx[1], gy[1]}; /* g10 = vec2(gx.y, gy.y) */ + vec2 g01 = {gx[2], gy[2]}; /* g01 = vec2(gx.z, gy.z) */ + vec2 g11 = {gx[3], gy[3]}; /* g11 = vec2(gx.w, gy.w) */ + + glm__noiseDetail_gradNorm_vec2(g00, g01, g10, g11); + + /* ------------ */ + + /* n00 = dot(g00, vec2(fx.x, fy.x)) */ + vec2 n00d = {fx[0], fy[0]}; /* n00d = vec2(fx.x, fy.x) */ + float n00 = glm_vec2_dot(g00, n00d); /* n00 = dot(g00, n00d) */ + + /* n10 = dot(g10, vec2(fx.y, fy.y)) */ + vec2 n10d = {fx[1], fy[1]}; /* n10d = vec2(fx.y, fy.y) */ + float n10 = glm_vec2_dot(g10, n10d); /* n10 = dot(g10, n10d) */ + + /* n01 = dot(g01, vec2(fx.z, fy.z)) */ + vec2 n01d = {fx[2], fy[2]}; /* n01d = vec2(fx.z, fy.z) */ + float n01 = glm_vec2_dot(g01, n01d); /* n01 = dot(g01, n01d) */ + + /* n11 = dot(g11, vec2(fx.w, fy.w)) */ + vec2 n11d = {fx[3], fy[3]}; /* n11d = vec2(fx.w, fy.w) */ + float n11 = glm_vec2_dot(g11, n11d); /* n11 = dot(g11, n11d) */ + + /* ------------ */ + + /* fade_xyz = fade(vec2(Pf.x, Pf.y)) */ + vec2 fade_xy; + vec2 temp2 = {Pf[0], Pf[1]}; /* temp = vec2(Pf.x, Pf.y) */ + glm__noiseDetail_fade_vec2(temp2, fade_xy); /* fade_xy = fade(temp) */ + + /* n_x = lerp(vec2(n00, n01), vec2(n10, n11), fade_xy.x); */ + vec2 n_x; + vec2 n_x1 = {n00, n01}; /* n_x1 = vec2(n00, n01) */ + vec2 n_x2 = {n10, n11}; /* n_x2 = vec2(n10, n11) */ + glm_vec2_lerp(n_x1, n_x2, fade_xy[0], n_x); /* n_x = lerp(n_x1, n_x2, fade_xy.x) */ + + /* T n_xy = mix(n_x.x, n_x.y, fade_xy.y); */ + /* n_xy = lerp(n_x.x, n_x.y, fade_xy.y); */ + float n_xy = glm_lerp(n_x[0], n_x[1], fade_xy[1]); + + return n_xy * 2.3f; +} + +/* Undefine all helper macros */ + +#undef glm__noiseDetail_mod289 +#undef glm__noiseDetail_permute +#undef glm__noiseDetail_fade_vec4 +#undef glm__noiseDetail_fade_vec3 +#undef glm__noiseDetail_fade_vec2 
+#undef glm__noiseDetail_taylorInvSqrt +#undef glm__noiseDetail_gradNorm_vec4 +#undef glm__noiseDetail_gradNorm_vec3 +#undef glm__noiseDetail_gradNorm_vec2 +#undef glm__noiseDetail_i2gxyzw +#undef glm__noiseDetail_i2gxyz +#undef glm__noiseDetail_i2gxy + +#endif /* cglm_noise_h */ diff --git a/external/cglm/plane.h b/external/cglm/plane.h new file mode 100644 index 0000000..9efabb7 --- /dev/null +++ b/external/cglm/plane.h @@ -0,0 +1,44 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_plane_h +#define cglm_plane_h + +#include "common.h" +#include "vec3.h" +#include "vec4.h" + +/* + Plane equation: Ax + By + Cz + D = 0; + + It stored in vec4 as [A, B, C, D]. (A, B, C) is normal and D is distance +*/ + +/* + Functions: + CGLM_INLINE void glm_plane_normalize(vec4 plane); + */ + +/*! + * @brief normalizes a plane + * + * @param[in, out] plane plane to normalize + */ +CGLM_INLINE +void +glm_plane_normalize(vec4 plane) { + float norm; + + if (CGLM_UNLIKELY((norm = glm_vec3_norm(plane)) < FLT_EPSILON)) { + glm_vec4_zero(plane); + return; + } + + glm_vec4_scale(plane, 1.0f / norm, plane); +} + +#endif /* cglm_plane_h */ diff --git a/external/cglm/project.h b/external/cglm/project.h new file mode 100644 index 0000000..1d0a4e5 --- /dev/null +++ b/external/cglm/project.h @@ -0,0 +1,172 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_project_h +#define cglm_project_h + +#include "common.h" +#include "vec3.h" +#include "vec4.h" +#include "mat4.h" + +#ifndef CGLM_CLIPSPACE_INCLUDE_ALL +# if CGLM_CONFIG_CLIP_CONTROL & CGLM_CLIP_CONTROL_ZO_BIT +# include "clipspace/project_zo.h" +# elif CGLM_CONFIG_CLIP_CONTROL & CGLM_CLIP_CONTROL_NO_BIT +# include "clipspace/project_no.h" +# endif +#else +# include "clipspace/project_zo.h" +# include "clipspace/project_no.h" +#endif + +/*! + * @brief maps the specified viewport coordinates into specified space [1] + * the matrix should contain projection matrix. + * + * if you don't have ( and don't want to have ) an inverse matrix then use + * glm_unproject version. You may use existing inverse of matrix in somewhere + * else, this is why glm_unprojecti exists to save save inversion cost + * + * [1] space: + * 1- if m = invProj: View Space + * 2- if m = invViewProj: World Space + * 3- if m = invMVP: Object Space + * + * You probably want to map the coordinates into object space + * so use invMVP as m + * + * Computing viewProj: + * glm_mat4_mul(proj, view, viewProj); + * glm_mat4_mul(viewProj, model, MVP); + * glm_mat4_inv(viewProj, invMVP); + * + * @param[in] pos point/position in viewport coordinates + * @param[in] invMat matrix (see brief) + * @param[in] vp viewport as [x, y, width, height] + * @param[out] dest unprojected coordinates + */ +CGLM_INLINE +void +glm_unprojecti(vec3 pos, mat4 invMat, vec4 vp, vec3 dest) { +#if CGLM_CONFIG_CLIP_CONTROL & CGLM_CLIP_CONTROL_ZO_BIT + glm_unprojecti_zo(pos, invMat, vp, dest); +#elif CGLM_CONFIG_CLIP_CONTROL & CGLM_CLIP_CONTROL_NO_BIT + glm_unprojecti_no(pos, invMat, vp, dest); +#endif +} + +/*! + * @brief maps the specified viewport coordinates into specified space [1] + * the matrix should contain projection matrix. + * + * this is same as glm_unprojecti except this function get inverse matrix for + * you. 
+ * + * [1] space: + * 1- if m = proj: View Space + * 2- if m = viewProj: World Space + * 3- if m = MVP: Object Space + * + * You probably want to map the coordinates into object space + * so use MVP as m + * + * Computing viewProj and MVP: + * glm_mat4_mul(proj, view, viewProj); + * glm_mat4_mul(viewProj, model, MVP); + * + * @param[in] pos point/position in viewport coordinates + * @param[in] m matrix (see brief) + * @param[in] vp viewport as [x, y, width, height] + * @param[out] dest unprojected coordinates + */ +CGLM_INLINE +void +glm_unproject(vec3 pos, mat4 m, vec4 vp, vec3 dest) { + mat4 inv; + glm_mat4_inv(m, inv); + glm_unprojecti(pos, inv, vp, dest); +} + +/*! + * @brief map object coordinates to window coordinates + * + * Computing MVP: + * glm_mat4_mul(proj, view, viewProj); + * glm_mat4_mul(viewProj, model, MVP); + * + * @param[in] pos object coordinates + * @param[in] m MVP matrix + * @param[in] vp viewport as [x, y, width, height] + * @param[out] dest projected coordinates + */ +CGLM_INLINE +void +glm_project(vec3 pos, mat4 m, vec4 vp, vec3 dest) { +#if CGLM_CONFIG_CLIP_CONTROL & CGLM_CLIP_CONTROL_ZO_BIT + glm_project_zo(pos, m, vp, dest); +#elif CGLM_CONFIG_CLIP_CONTROL & CGLM_CLIP_CONTROL_NO_BIT + glm_project_no(pos, m, vp, dest); +#endif +} + +/*! + * @brief map object's z coordinate to window coordinates + * + * Computing MVP: + * glm_mat4_mul(proj, view, viewProj); + * glm_mat4_mul(viewProj, model, MVP); + * + * @param[in] v object coordinates + * @param[in] m MVP matrix + * + * @returns projected z coordinate + */ +CGLM_INLINE +float +glm_project_z(vec3 v, mat4 m) { +#if CGLM_CONFIG_CLIP_CONTROL & CGLM_CLIP_CONTROL_ZO_BIT + return glm_project_z_zo(v, m); +#elif CGLM_CONFIG_CLIP_CONTROL & CGLM_CLIP_CONTROL_NO_BIT + return glm_project_z_no(v, m); +#endif +} + +/*! + * @brief define a picking region + * + * @param[in] center center [x, y] of a picking region in window coordinates + * @param[in] size size [width, height] of the picking region in window coordinates + * @param[in] vp viewport as [x, y, width, height] + * @param[out] dest projected coordinates + */ +CGLM_INLINE +void +glm_pickmatrix(vec2 center, vec2 size, vec4 vp, mat4 dest) { + mat4 res; + vec3 v; + + if (size[0] <= 0.0f || size[1] <= 0.0f) + return; + + /* Translate and scale the picked region to the entire window */ + v[0] = (vp[2] - 2.0f * (center[0] - vp[0])) / size[0]; + v[1] = (vp[3] - 2.0f * (center[1] - vp[1])) / size[1]; + v[2] = 0.0f; + + glm_translate_make(res, v); + + v[0] = vp[2] / size[0]; + v[1] = vp[3] / size[1]; + v[2] = 1.0f; + + glm_scale(res, v); + + glm_mat4_copy(res, dest); +} + +#endif /* cglm_project_h */ diff --git a/external/cglm/quat.h b/external/cglm/quat.h new file mode 100644 index 0000000..cf1f325 --- /dev/null +++ b/external/cglm/quat.h @@ -0,0 +1,949 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLM_QUAT_IDENTITY_INIT + GLM_QUAT_IDENTITY + + Functions: + CGLM_INLINE void glm_quat_identity(versor q); + CGLM_INLINE void glm_quat_init(versor q, float x, float y, float z, float w); + CGLM_INLINE void glm_quat(versor q, float angle, float x, float y, float z); + CGLM_INLINE void glm_quatv(versor q, float angle, vec3 axis); + CGLM_INLINE void glm_quat_copy(versor q, versor dest); + CGLM_INLINE void glm_quat_from_vecs(vec3 a, vec3 b, versor dest); + CGLM_INLINE float glm_quat_norm(versor q); + CGLM_INLINE void glm_quat_normalize(versor q); + CGLM_INLINE void glm_quat_normalize_to(versor q, versor dest); + CGLM_INLINE float glm_quat_dot(versor p, versor q); + CGLM_INLINE void glm_quat_conjugate(versor q, versor dest); + CGLM_INLINE void glm_quat_inv(versor q, versor dest); + CGLM_INLINE void glm_quat_add(versor p, versor q, versor dest); + CGLM_INLINE void glm_quat_sub(versor p, versor q, versor dest); + CGLM_INLINE float glm_quat_real(versor q); + CGLM_INLINE void glm_quat_imag(versor q, vec3 dest); + CGLM_INLINE void glm_quat_imagn(versor q, vec3 dest); + CGLM_INLINE float glm_quat_imaglen(versor q); + CGLM_INLINE float glm_quat_angle(versor q); + CGLM_INLINE void glm_quat_axis(versor q, vec3 dest); + CGLM_INLINE void glm_quat_mul(versor p, versor q, versor dest); + CGLM_INLINE void glm_quat_mat4(versor q, mat4 dest); + CGLM_INLINE void glm_quat_mat4t(versor q, mat4 dest); + CGLM_INLINE void glm_quat_mat3(versor q, mat3 dest); + CGLM_INLINE void glm_quat_mat3t(versor q, mat3 dest); + CGLM_INLINE void glm_quat_lerp(versor from, versor to, float t, versor dest); + CGLM_INLINE void glm_quat_lerpc(versor from, versor to, float t, versor dest); + CGLM_INLINE void glm_quat_slerp(versor q, versor r, float t, versor dest); + CGLM_INLINE void glm_quat_slerp_longest(versor q, versor r, float t, versor dest); + CGLM_INLINE void glm_quat_nlerp(versor q, versor r, float t, versor dest); + CGLM_INLINE void glm_quat_look(vec3 eye, versor ori, mat4 dest); + CGLM_INLINE void glm_quat_for(vec3 dir, vec3 fwd, vec3 up, versor dest); + CGLM_INLINE void glm_quat_forp(vec3 from, + vec3 to, + vec3 fwd, + vec3 up, + versor dest); + CGLM_INLINE void glm_quat_rotatev(versor q, vec3 v, vec3 dest); + CGLM_INLINE void glm_quat_rotate(mat4 m, versor q, mat4 dest); + CGLM_INLINE void glm_quat_make(float * restrict src, versor dest); + */ + +#ifndef cglm_quat_h +#define cglm_quat_h + +#include "common.h" +#include "vec3.h" +#include "vec4.h" +#include "mat4.h" +#include "mat3.h" +#include "affine-mat.h" +#include "affine.h" + +#ifdef CGLM_SSE_FP +# include "simd/sse2/quat.h" +#endif + +#ifdef CGLM_NEON_FP +# include "simd/neon/quat.h" +#endif + +#ifdef CGLM_SIMD_WASM +# include "simd/wasm/quat.h" +#endif + +CGLM_INLINE void glm_quat_normalize(versor q); + +/* + * IMPORTANT: + * ---------------------------------------------------------------------------- + * cglm stores quat as [x, y, z, w] since v0.3.6 + * + * it was [w, x, y, z] before v0.3.6 it has been changed to [x, y, z, w] + * with v0.3.6 version. + * ---------------------------------------------------------------------------- + */ + +#define GLM_QUAT_IDENTITY_INIT {0.0f, 0.0f, 0.0f, 1.0f} +#define GLM_QUAT_IDENTITY ((versor)GLM_QUAT_IDENTITY_INIT) + +/*! 
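The storage-order note above is the main thing to get right when constructing quaternions by hand: the real part is the last component. A short sketch, assuming the header resolves as "cglm/quat.h":

#include <stdio.h>
#include "cglm/quat.h"              /* assumed include path */

int main(void) {
  versor q = GLM_QUAT_IDENTITY_INIT;         /* {0, 0, 0, 1}: x, y, z first, w (real part) last */
  printf("w = %f\n", q[3]);                  /* 1.0 */

  glm_quat_init(q, 0.0f, 0.0f, 0.0f, 1.0f);  /* same identity, spelled out in x, y, z, w order */
  printf("real = %f\n", glm_quat_real(q));   /* 1.0, simply reads q[3] */
  return 0;
}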
+ * @brief makes given quat to identity + * + * @param[in, out] q quaternion + */ +CGLM_INLINE +void +glm_quat_identity(versor q) { + CGLM_ALIGN(16) versor v = GLM_QUAT_IDENTITY_INIT; + glm_vec4_copy(v, q); +} + +/*! + * @brief make given quaternion array's each element identity quaternion + * + * @param[in, out] q quat array (must be aligned (16) + * if alignment is not disabled) + * + * @param[in] count count of quaternions + */ +CGLM_INLINE +void +glm_quat_identity_array(versor * __restrict q, size_t count) { + CGLM_ALIGN(16) versor v = GLM_QUAT_IDENTITY_INIT; + size_t i; + + for (i = 0; i < count; i++) { + glm_vec4_copy(v, q[i]); + } +} + +/*! + * @brief inits quaternion with raw values + * + * @param[out] q quaternion + * @param[in] x x + * @param[in] y y + * @param[in] z z + * @param[in] w w (real part) + */ +CGLM_INLINE +void +glm_quat_init(versor q, float x, float y, float z, float w) { + q[0] = x; + q[1] = y; + q[2] = z; + q[3] = w; +} + +/*! + * @brief creates NEW quaternion with axis vector + * + * @param[out] q quaternion + * @param[in] angle angle (radians) + * @param[in] axis axis + */ +CGLM_INLINE +void +glm_quatv(versor q, float angle, vec3 axis) { + CGLM_ALIGN(8) vec3 k; + float a, c, s; + + a = angle * 0.5f; + c = cosf(a); + s = sinf(a); + + glm_normalize_to(axis, k); + + q[0] = s * k[0]; + q[1] = s * k[1]; + q[2] = s * k[2]; + q[3] = c; +} + +/*! + * @brief creates NEW quaternion with individual axis components + * + * @param[out] q quaternion + * @param[in] angle angle (radians) + * @param[in] x axis.x + * @param[in] y axis.y + * @param[in] z axis.z + */ +CGLM_INLINE +void +glm_quat(versor q, float angle, float x, float y, float z) { + CGLM_ALIGN(8) vec3 axis = {x, y, z}; + glm_quatv(q, angle, axis); +} + +/*! + * @brief copy quaternion to another one + * + * @param[in] q quaternion + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_quat_copy(versor q, versor dest) { + glm_vec4_copy(q, dest); +} + +/*! + * @brief compute quaternion rotating vector A to vector B + * + * @param[in] a vec3 (must have unit length) + * @param[in] b vec3 (must have unit length) + * @param[out] dest quaternion (of unit length) + */ +CGLM_INLINE +void +glm_quat_from_vecs(vec3 a, vec3 b, versor dest) { + CGLM_ALIGN(8) vec3 axis; + float cos_theta; + float cos_half_theta; + + cos_theta = glm_vec3_dot(a, b); + if (cos_theta >= 1.f - GLM_FLT_EPSILON) { /* a ∥ b */ + glm_quat_identity(dest); + return; + } + if (cos_theta < -1.f + GLM_FLT_EPSILON) { /* angle(a, b) = π */ + glm_vec3_ortho(a, axis); + cos_half_theta = 0.f; /* cos π/2 */ + } else { + glm_vec3_cross(a, b, axis); + cos_half_theta = 1.0f + cos_theta; /* cos 0 + cos θ */ + } + + glm_quat_init(dest, axis[0], axis[1], axis[2], cos_half_theta); + glm_quat_normalize(dest); +} + +/*! + * @brief returns norm (magnitude) of quaternion + * + * @param[in] q quaternion + */ +CGLM_INLINE +float +glm_quat_norm(versor q) { + return glm_vec4_norm(q); +} + +/*! 
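A small sketch of the two common construction paths, axis-angle via glm_quatv and vector-to-vector via glm_quat_from_vecs ("cglm/quat.h" include path assumed):

#include <stdio.h>
#include "cglm/quat.h"              /* assumed include path */

int main(void) {
  vec3 axis = {0.0f, 1.0f, 0.0f};
  versor q;
  glm_quatv(q, 1.5707963f, axis);            /* ~90 degrees (radians) about +Y; axis is normalized internally */
  printf("norm = %f\n", glm_quat_norm(q));   /* ~1.0 */

  vec3 a = {1.0f, 0.0f, 0.0f}, b = {0.0f, 0.0f, 1.0f};
  versor r;
  glm_quat_from_vecs(a, b, r);               /* unit quaternion rotating +X onto +Z; inputs must be unit length */
  return 0;
}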
+ * @brief normalize quaternion and store result in dest + * + * @param[in] q quaternion to normalze + * @param[out] dest destination quaternion + */ +CGLM_INLINE +void +glm_quat_normalize_to(versor q, versor dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_128 xdot, x0; + float dot; + + x0 = glmm_load(q); + xdot = glmm_vdot(x0, x0); + /* dot = _mm_cvtss_f32(xdot); */ + dot = wasm_f32x4_extract_lane(xdot, 0); + + if (dot <= 0.0f) { + glm_quat_identity(dest); + return; + } + + glmm_store(dest, wasm_f32x4_div(x0, wasm_f32x4_sqrt(xdot))); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + __m128 xdot, x0; + float dot; + + x0 = glmm_load(q); + xdot = glmm_vdot(x0, x0); + dot = _mm_cvtss_f32(xdot); + + if (dot <= 0.0f) { + glm_quat_identity(dest); + return; + } + + glmm_store(dest, _mm_div_ps(x0, _mm_sqrt_ps(xdot))); +#else + float dot; + + dot = glm_vec4_norm2(q); + + if (dot <= 0.0f) { + glm_quat_identity(dest); + return; + } + + glm_vec4_scale(q, 1.0f / sqrtf(dot), dest); +#endif +} + +/*! + * @brief normalize quaternion + * + * @param[in, out] q quaternion + */ +CGLM_INLINE +void +glm_quat_normalize(versor q) { + glm_quat_normalize_to(q, q); +} + +/*! + * @brief dot product of two quaternion + * + * @param[in] p quaternion 1 + * @param[in] q quaternion 2 + */ +CGLM_INLINE +float +glm_quat_dot(versor p, versor q) { + return glm_vec4_dot(p, q); +} + +/*! + * @brief conjugate of quaternion + * + * @param[in] q quaternion + * @param[out] dest conjugate + */ +CGLM_INLINE +void +glm_quat_conjugate(versor q, versor dest) { + glm_vec4_negate_to(q, dest); + dest[3] = -dest[3]; +} + +/*! + * @brief inverse of non-zero quaternion + * + * @param[in] q quaternion + * @param[out] dest inverse quaternion + */ +CGLM_INLINE +void +glm_quat_inv(versor q, versor dest) { + CGLM_ALIGN(16) versor conj; + glm_quat_conjugate(q, conj); + glm_vec4_scale(conj, 1.0f / glm_vec4_norm2(q), dest); +} + +/*! + * @brief add (componentwise) two quaternions and store result in dest + * + * @param[in] p quaternion 1 + * @param[in] q quaternion 2 + * @param[out] dest result quaternion + */ +CGLM_INLINE +void +glm_quat_add(versor p, versor q, versor dest) { + glm_vec4_add(p, q, dest); +} + +/*! + * @brief subtract (componentwise) two quaternions and store result in dest + * + * @param[in] p quaternion 1 + * @param[in] q quaternion 2 + * @param[out] dest result quaternion + */ +CGLM_INLINE +void +glm_quat_sub(versor p, versor q, versor dest) { + glm_vec4_sub(p, q, dest); +} + +/*! + * @brief returns real part of quaternion + * + * @param[in] q quaternion + */ +CGLM_INLINE +float +glm_quat_real(versor q) { + return q[3]; +} + +/*! + * @brief returns imaginary part of quaternion + * + * @param[in] q quaternion + * @param[out] dest imag + */ +CGLM_INLINE +void +glm_quat_imag(versor q, vec3 dest) { + dest[0] = q[0]; + dest[1] = q[1]; + dest[2] = q[2]; +} + +/*! + * @brief returns normalized imaginary part of quaternion + * + * @param[in] q quaternion + */ +CGLM_INLINE +void +glm_quat_imagn(versor q, vec3 dest) { + glm_normalize_to(q, dest); +} + +/*! + * @brief returns length of imaginary part of quaternion + * + * @param[in] q quaternion + */ +CGLM_INLINE +float +glm_quat_imaglen(versor q) { + return glm_vec3_norm(q); +} + +/*! 
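For unit quaternions the conjugate and the inverse coincide, since glm_quat_inv divides the conjugate by the squared norm. A quick check of that relationship ("cglm/quat.h" include path assumed):

#include <stdio.h>
#include "cglm/quat.h"              /* assumed include path */

int main(void) {
  versor q, conj, inv;
  glm_quat(q, 0.7f, 0.0f, 0.0f, 1.0f);   /* 0.7 rad about +Z; has unit norm by construction */

  glm_quat_conjugate(q, conj);           /* negates the imaginary part */
  glm_quat_inv(q, inv);                  /* conjugate scaled by 1 / |q|^2 */

  printf("dot = %f\n", glm_quat_dot(conj, inv));  /* ~1.0: the two results agree */
  return 0;
}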
+ * @brief returns angle of quaternion + * + * @param[in] q quaternion + */ +CGLM_INLINE +float +glm_quat_angle(versor q) { + /* + sin(theta / 2) = length(x*x + y*y + z*z) + cos(theta / 2) = w + theta = 2 * atan(sin(theta / 2) / cos(theta / 2)) + */ + return 2.0f * atan2f(glm_quat_imaglen(q), glm_quat_real(q)); +} + +/*! + * @brief axis of quaternion + * + * @param[in] q quaternion + * @param[out] dest axis of quaternion + */ +CGLM_INLINE +void +glm_quat_axis(versor q, vec3 dest) { + glm_quat_imagn(q, dest); +} + +/*! + * @brief multiplies two quaternion and stores result in dest + * this is also called Hamilton Product + * + * According to WikiPedia: + * The product of two rotation quaternions [clarification needed] will be + * equivalent to the rotation q followed by the rotation p + * + * @param[in] p quaternion 1 + * @param[in] q quaternion 2 + * @param[out] dest result quaternion + */ +CGLM_INLINE +void +glm_quat_mul(versor p, versor q, versor dest) { + /* + + (a1 b2 + b1 a2 + c1 d2 − d1 c2)i + + (a1 c2 − b1 d2 + c1 a2 + d1 b2)j + + (a1 d2 + b1 c2 − c1 b2 + d1 a2)k + a1 a2 − b1 b2 − c1 c2 − d1 d2 + */ +#if defined(__wasm__) && defined(__wasm_simd128__) + glm_quat_mul_wasm(p, q, dest); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glm_quat_mul_sse2(p, q, dest); +#elif defined(CGLM_NEON_FP) + glm_quat_mul_neon(p, q, dest); +#else + dest[0] = p[3] * q[0] + p[0] * q[3] + p[1] * q[2] - p[2] * q[1]; + dest[1] = p[3] * q[1] - p[0] * q[2] + p[1] * q[3] + p[2] * q[0]; + dest[2] = p[3] * q[2] + p[0] * q[1] - p[1] * q[0] + p[2] * q[3]; + dest[3] = p[3] * q[3] - p[0] * q[0] - p[1] * q[1] - p[2] * q[2]; +#endif +} + +/*! + * @brief convert quaternion to mat4 + * + * @param[in] q quaternion + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_quat_mat4(versor q, mat4 dest) { + float w, x, y, z, + xx, yy, zz, + xy, yz, xz, + wx, wy, wz, norm, s; + + norm = glm_quat_norm(q); + s = norm > 0.0f ? 2.0f / norm : 0.0f; + + x = q[0]; + y = q[1]; + z = q[2]; + w = q[3]; + + xx = s * x * x; xy = s * x * y; wx = s * w * x; + yy = s * y * y; yz = s * y * z; wy = s * w * y; + zz = s * z * z; xz = s * x * z; wz = s * w * z; + + dest[0][0] = 1.0f - yy - zz; + dest[1][1] = 1.0f - xx - zz; + dest[2][2] = 1.0f - xx - yy; + + dest[0][1] = xy + wz; + dest[1][2] = yz + wx; + dest[2][0] = xz + wy; + + dest[1][0] = xy - wz; + dest[2][1] = yz - wx; + dest[0][2] = xz - wy; + + dest[0][3] = 0.0f; + dest[1][3] = 0.0f; + dest[2][3] = 0.0f; + dest[3][0] = 0.0f; + dest[3][1] = 0.0f; + dest[3][2] = 0.0f; + dest[3][3] = 1.0f; +} + +/*! + * @brief convert quaternion to mat4 (transposed) + * + * @param[in] q quaternion + * @param[out] dest result matrix as transposed + */ +CGLM_INLINE +void +glm_quat_mat4t(versor q, mat4 dest) { + float w, x, y, z, + xx, yy, zz, + xy, yz, xz, + wx, wy, wz, norm, s; + + norm = glm_quat_norm(q); + s = norm > 0.0f ? 2.0f / norm : 0.0f; + + x = q[0]; + y = q[1]; + z = q[2]; + w = q[3]; + + xx = s * x * x; xy = s * x * y; wx = s * w * x; + yy = s * y * y; yz = s * y * z; wy = s * w * y; + zz = s * z * z; xz = s * x * z; wz = s * w * z; + + dest[0][0] = 1.0f - yy - zz; + dest[1][1] = 1.0f - xx - zz; + dest[2][2] = 1.0f - xx - yy; + + dest[1][0] = xy + wz; + dest[2][1] = yz + wx; + dest[0][2] = xz + wy; + + dest[0][1] = xy - wz; + dest[1][2] = yz - wx; + dest[2][0] = xz - wy; + + dest[0][3] = 0.0f; + dest[1][3] = 0.0f; + dest[2][3] = 0.0f; + dest[3][0] = 0.0f; + dest[3][1] = 0.0f; + dest[3][2] = 0.0f; + dest[3][3] = 1.0f; +} + +/*! 
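The Hamilton-product ordering matters when chaining rotations: glm_quat_mul(p, q, dest) applies q first, then p. A composition sketch that ends in a matrix for rendering ("cglm/quat.h" include path assumed):

#include "cglm/quat.h"              /* assumed include path */

int main(void) {
  versor pitch, yaw, ori;
  glm_quat(pitch, 0.3f, 1.0f, 0.0f, 0.0f);   /* 0.3 rad about +X */
  glm_quat(yaw,   0.5f, 0.0f, 1.0f, 0.0f);   /* 0.5 rad about +Y */

  glm_quat_mul(pitch, yaw, ori);             /* yaw applied first, then pitch */

  mat4 rot;
  glm_quat_mat4(ori, rot);                   /* column-major rotation matrix, e.g. for a model matrix */
  return 0;
}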
+ * @brief convert quaternion to mat3 + * + * @param[in] q quaternion + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_quat_mat3(versor q, mat3 dest) { + float w, x, y, z, + xx, yy, zz, + xy, yz, xz, + wx, wy, wz, norm, s; + + norm = glm_quat_norm(q); + s = norm > 0.0f ? 2.0f / norm : 0.0f; + + x = q[0]; + y = q[1]; + z = q[2]; + w = q[3]; + + xx = s * x * x; xy = s * x * y; wx = s * w * x; + yy = s * y * y; yz = s * y * z; wy = s * w * y; + zz = s * z * z; xz = s * x * z; wz = s * w * z; + + dest[0][0] = 1.0f - yy - zz; + dest[1][1] = 1.0f - xx - zz; + dest[2][2] = 1.0f - xx - yy; + + dest[0][1] = xy + wz; + dest[1][2] = yz + wx; + dest[2][0] = xz + wy; + + dest[1][0] = xy - wz; + dest[2][1] = yz - wx; + dest[0][2] = xz - wy; +} + +/*! + * @brief convert quaternion to mat3 (transposed) + * + * @param[in] q quaternion + * @param[out] dest result matrix + */ +CGLM_INLINE +void +glm_quat_mat3t(versor q, mat3 dest) { + float w, x, y, z, + xx, yy, zz, + xy, yz, xz, + wx, wy, wz, norm, s; + + norm = glm_quat_norm(q); + s = norm > 0.0f ? 2.0f / norm : 0.0f; + + x = q[0]; + y = q[1]; + z = q[2]; + w = q[3]; + + xx = s * x * x; xy = s * x * y; wx = s * w * x; + yy = s * y * y; yz = s * y * z; wy = s * w * y; + zz = s * z * z; xz = s * x * z; wz = s * w * z; + + dest[0][0] = 1.0f - yy - zz; + dest[1][1] = 1.0f - xx - zz; + dest[2][2] = 1.0f - xx - yy; + + dest[1][0] = xy + wz; + dest[2][1] = yz + wx; + dest[0][2] = xz + wy; + + dest[0][1] = xy - wz; + dest[1][2] = yz - wx; + dest[2][0] = xz - wy; +} + +/*! + * @brief interpolates between two quaternions + * using linear interpolation (LERP) + * + * @param[in] from from + * @param[in] to to + * @param[in] t interpolant (amount) + * @param[out] dest result quaternion + */ +CGLM_INLINE +void +glm_quat_lerp(versor from, versor to, float t, versor dest) { + glm_vec4_lerp(from, to, t, dest); +} + +/*! + * @brief interpolates between two quaternions + * using linear interpolation (LERP) + * + * @param[in] from from + * @param[in] to to + * @param[in] t interpolant (amount) clamped between 0 and 1 + * @param[out] dest result quaternion + */ +CGLM_INLINE +void +glm_quat_lerpc(versor from, versor to, float t, versor dest) { + glm_vec4_lerpc(from, to, t, dest); +} + +/*! + * @brief interpolates between two quaternions + * taking the shortest rotation path using + * normalized linear interpolation (NLERP) + * + * @param[in] from from + * @param[in] to to + * @param[in] t interpolant (amount) + * @param[out] dest result quaternion + */ +CGLM_INLINE +void +glm_quat_nlerp(versor from, versor to, float t, versor dest) { + versor target; + float dot; + + dot = glm_vec4_dot(from, to); + + glm_vec4_scale(to, (dot >= 0) ? 1.0f : -1.0f, target); + glm_quat_lerp(from, target, t, dest); + glm_quat_normalize(dest); +} + +/*! 
+ * @brief interpolates between two quaternions + * using spherical linear interpolation (SLERP) + * + * @param[in] from from + * @param[in] to to + * @param[in] t amount + * @param[out] dest result quaternion + */ +CGLM_INLINE +void +glm_quat_slerp(versor from, versor to, float t, versor dest) { + CGLM_ALIGN(16) vec4 q1, q2; + float cosTheta, sinTheta, angle; + + cosTheta = glm_quat_dot(from, to); + glm_quat_copy(from, q1); + + if (fabsf(cosTheta) >= 1.0f) { + glm_quat_copy(q1, dest); + return; + } + + if (cosTheta < 0.0f) { + glm_vec4_negate(q1); + cosTheta = -cosTheta; + } + + sinTheta = sqrtf(1.0f - cosTheta * cosTheta); + + /* LERP to avoid zero division */ + if (fabsf(sinTheta) < 0.001f) { + glm_quat_lerp(from, to, t, dest); + return; + } + + /* SLERP */ + angle = acosf(cosTheta); + glm_vec4_scale(q1, sinf((1.0f - t) * angle), q1); + glm_vec4_scale(to, sinf(t * angle), q2); + + glm_vec4_add(q1, q2, q1); + glm_vec4_scale(q1, 1.0f / sinTheta, dest); +} + +/*! + * @brief interpolates between two quaternions + * using spherical linear interpolation (SLERP) and always takes the long path + * + * @param[in] from from + * @param[in] to to + * @param[in] t amount + * @param[out] dest result quaternion + */ +CGLM_INLINE +void +glm_quat_slerp_longest(versor from, versor to, float t, versor dest) { + CGLM_ALIGN(16) vec4 q1, q2; + float cosTheta, sinTheta, angle; + + cosTheta = glm_quat_dot(from, to); + glm_quat_copy(from, q1); + + if (fabsf(cosTheta) >= 1.0f) { + glm_quat_copy(q1, dest); + return; + } + + /* longest path */ + if (!(cosTheta < 0.0f)) { + glm_vec4_negate(q1); + cosTheta = -cosTheta; + } + + sinTheta = sqrtf(1.0f - cosTheta * cosTheta); + + /* LERP to avoid zero division */ + if (fabsf(sinTheta) < 0.001f) { + glm_quat_lerp(from, to, t, dest); + return; + } + + /* SLERP */ + angle = acosf(cosTheta); + glm_vec4_scale(q1, sinf((1.0f - t) * angle), q1); + glm_vec4_scale(to, sinf(t * angle), q2); + + glm_vec4_add(q1, q2, q1); + glm_vec4_scale(q1, 1.0f / sinTheta, dest); +} + +/*! + * @brief creates view matrix using quaternion as camera orientation + * + * @param[in] eye eye + * @param[in] ori orientation in world space as quaternion + * @param[out] dest view matrix + */ +CGLM_INLINE +void +glm_quat_look(vec3 eye, versor ori, mat4 dest) { + /* orientation */ + glm_quat_mat4t(ori, dest); + + /* translate */ + glm_mat4_mulv3(dest, eye, 1.0f, dest[3]); + glm_vec3_negate(dest[3]); +} + +/*! + * @brief creates look rotation quaternion + * + * @param[in] dir direction to look + * @param[in] up up vector + * @param[out] dest destination quaternion + */ +CGLM_INLINE +void +glm_quat_for(vec3 dir, vec3 up, versor dest) { + CGLM_ALIGN_MAT mat3 m; + + glm_vec3_normalize_to(dir, m[2]); + + /* No need to negate in LH, but we use RH here */ + glm_vec3_negate(m[2]); + + glm_vec3_crossn(up, m[2], m[0]); + glm_vec3_cross(m[2], m[0], m[1]); + + glm_mat3_quat(m, dest); +} + +/*! + * @brief creates look rotation quaternion using source and + * destination positions p suffix stands for position + * + * @param[in] from source point + * @param[in] to destination point + * @param[in] up up vector + * @param[out] dest destination quaternion + */ +CGLM_INLINE +void +glm_quat_forp(vec3 from, vec3 to, vec3 up, versor dest) { + CGLM_ALIGN(8) vec3 dir; + glm_vec3_sub(to, from, dir); + glm_quat_for(dir, up, dest); +} + +/*! 
+ * @brief rotate vector using using quaternion + * + * @param[in] q quaternion + * @param[in] v vector to rotate + * @param[out] dest rotated vector + */ +CGLM_INLINE +void +glm_quat_rotatev(versor q, vec3 v, vec3 dest) { + CGLM_ALIGN(16) versor p; + CGLM_ALIGN(8) vec3 u, v1, v2; + float s; + + glm_quat_normalize_to(q, p); + glm_quat_imag(p, u); + s = glm_quat_real(p); + + glm_vec3_scale(u, 2.0f * glm_vec3_dot(u, v), v1); + glm_vec3_scale(v, s * s - glm_vec3_dot(u, u), v2); + glm_vec3_add(v1, v2, v1); + + glm_vec3_cross(u, v, v2); + glm_vec3_scale(v2, 2.0f * s, v2); + + glm_vec3_add(v1, v2, dest); +} + +/*! + * @brief rotate existing transform matrix using quaternion + * + * @param[in] m existing transform matrix + * @param[in] q quaternion + * @param[out] dest rotated matrix/transform + */ +CGLM_INLINE +void +glm_quat_rotate(mat4 m, versor q, mat4 dest) { + CGLM_ALIGN_MAT mat4 rot; + glm_quat_mat4(q, rot); + glm_mul_rot(m, rot, dest); +} + +/*! + * @brief rotate existing transform matrix using quaternion at pivot point + * + * @param[in, out] m existing transform matrix + * @param[in] q quaternion + * @param[out] pivot pivot + */ +CGLM_INLINE +void +glm_quat_rotate_at(mat4 m, versor q, vec3 pivot) { + CGLM_ALIGN(8) vec3 pivotInv; + + glm_vec3_negate_to(pivot, pivotInv); + + glm_translate(m, pivot); + glm_quat_rotate(m, q, m); + glm_translate(m, pivotInv); +} + +/*! + * @brief rotate NEW transform matrix using quaternion at pivot point + * + * this creates rotation matrix, it assumes you don't have a matrix + * + * this should work faster than glm_quat_rotate_at because it reduces + * one glm_translate. + * + * @param[out] m existing transform matrix + * @param[in] q quaternion + * @param[in] pivot pivot + */ +CGLM_INLINE +void +glm_quat_rotate_atm(mat4 m, versor q, vec3 pivot) { + CGLM_ALIGN(8) vec3 pivotInv; + + glm_vec3_negate_to(pivot, pivotInv); + + glm_translate_make(m, pivot); + glm_quat_rotate(m, q, m); + glm_translate(m, pivotInv); +} + +/*! + * @brief Create quaternion from pointer + * + * @param[in] src pointer to an array of floats + * @param[out] dest quaternion + */ +CGLM_INLINE +void +glm_quat_make(const float * __restrict src, versor dest) { + dest[0] = src[0]; dest[1] = src[1]; + dest[2] = src[2]; dest[3] = src[3]; +} + +#endif /* cglm_quat_h */ diff --git a/external/cglm/ray.h b/external/cglm/ray.h new file mode 100644 index 0000000..d7831bc --- /dev/null +++ b/external/cglm/ray.h @@ -0,0 +1,174 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE bool glm_ray_triangle(vec3 origin, + vec3 direction, + vec3 v0, + vec3 v1, + vec3 v2, + float *d); + CGLM_INLINE bool glm_ray_sphere(vec3 origin, + vec3 dir, + vec4 s, + float * __restrict t1, + float * __restrict t2) + CGLM_INLINE void glm_ray_at(vec3 orig, vec3 dir, float t, vec3 point); +*/ + +#ifndef cglm_ray_h +#define cglm_ray_h + +#include "vec3.h" + +/*! 
+ * @brief Möller–Trumbore ray-triangle intersection algorithm + * + * @param[in] origin origin of ray + * @param[in] direction direction of ray + * @param[in] v0 first vertex of triangle + * @param[in] v1 second vertex of triangle + * @param[in] v2 third vertex of triangle + * @param[in, out] d distance to intersection + * @return whether there is intersection + */ +CGLM_INLINE +bool +glm_ray_triangle(vec3 origin, + vec3 direction, + vec3 v0, + vec3 v1, + vec3 v2, + float *d) { + vec3 edge1, edge2, p, t, q; + float det, inv_det, u, v, dist; + const float epsilon = 0.000001f; + + glm_vec3_sub(v1, v0, edge1); + glm_vec3_sub(v2, v0, edge2); + glm_vec3_cross(direction, edge2, p); + + det = glm_vec3_dot(edge1, p); + if (det > -epsilon && det < epsilon) + return false; + + inv_det = 1.0f / det; + + glm_vec3_sub(origin, v0, t); + + u = inv_det * glm_vec3_dot(t, p); + if (u < 0.0f || u > 1.0f) + return false; + + glm_vec3_cross(t, edge1, q); + + v = inv_det * glm_vec3_dot(direction, q); + if (v < 0.0f || u + v > 1.0f) + return false; + + dist = inv_det * glm_vec3_dot(edge2, q); + + if (d) + *d = dist; + + return dist > epsilon; +} + +/*! + * @brief ray sphere intersection + * + * returns false if there is no intersection if true: + * + * - t1 > 0, t2 > 0: ray intersects the sphere at t1 and t2 both ahead of the origin + * - t1 < 0, t2 > 0: ray starts inside the sphere, exits at t2 + * - t1 < 0, t2 < 0: no intersection ahead of the ray ( returns false ) + * - the caller can check if the intersection points (t1 and t2) fall within a + * specific range (for example, tmin < t1, t2 < tmax) to determine if the + * intersections are within a desired segment of the ray + * + * @param[in] origin ray origin + * @param[out] dir normalized ray direction + * @param[in] s sphere [center.x, center.y, center.z, radii] + * @param[in] t1 near point1 (closer to origin) + * @param[in] t2 far point2 (farther from origin) + * + * @returns whether there is intersection + */ +CGLM_INLINE +bool +glm_ray_sphere(vec3 origin, + vec3 dir, + vec4 s, + float * __restrict t1, + float * __restrict t2) { + vec3 dp; + float r2, ddp, dpp, dscr, q, tmp, _t1, _t2; + + glm_vec3_sub(s, origin, dp); + + ddp = glm_vec3_dot(dir, dp); + dpp = glm_vec3_norm2(dp); + + /* compute the remedy term for numerical stability */ + glm_vec3_mulsubs(dir, ddp, dp); /* dp: remedy term */ + + r2 = s[3] * s[3]; + dscr = r2 - glm_vec3_norm2(dp); + + if (dscr < 0.0f) { + /* no intersection */ + return false; + } + + dscr = sqrtf(dscr); + q = (ddp >= 0.0f) ? (ddp + dscr) : (ddp - dscr); + + /* + include Press, William H., Saul A. Teukolsky, + William T. Vetterling, and Brian P. Flannery, + "Numerical Recipes in C," Cambridge University Press, 1992. + */ + _t1 = q; + _t2 = (dpp - r2) / q; + + /* adjust t1 and t2 to ensure t1 is the closer intersection */ + if (_t1 > _t2) { + tmp = _t1; + _t1 = _t2; + _t2 = tmp; + } + + *t1 = _t1; + *t2 = _t2; + + /* check if the closest intersection (t1) is behind the ray's origin */ + if (_t1 < 0.0f && _t2 < 0.0f) { + /* both intersections are behind the ray, no visible intersection */ + return false; + } + + return true; +} + +/*! 
+ * @brief point using t by 𝐏(𝑡)=𝐀+𝑡𝐛 + * + * @param[in] orig origin of ray + * @param[in] dir direction of ray + * @param[in] t parameter + * @param[out] point point at t + */ +CGLM_INLINE +void +glm_ray_at(vec3 orig, vec3 dir, float t, vec3 point) { + vec3 dst; + glm_vec3_scale(dir, t, dst); + glm_vec3_add(orig, dst, point); +} + +#endif diff --git a/external/cglm/simd/arm.h b/external/cglm/simd/arm.h new file mode 100644 index 0000000..9f51742 --- /dev/null +++ b/external/cglm/simd/arm.h @@ -0,0 +1,206 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_simd_arm_h +#define cglm_simd_arm_h +#include "intrin.h" +#ifdef CGLM_SIMD_ARM + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || defined(__aarch64__) +# define CGLM_ARM64 1 +#else +# define CGLM_ARM64 0 +#endif + +#define glmm_load(p) vld1q_f32(p) +#define glmm_store(p, a) vst1q_f32(p, a) + +#define glmm_set1(x) vdupq_n_f32(x) +#define glmm_set1_ptr(x) vdupq_n_f32(*x) +#define glmm_set1_rval(x) vdupq_n_f32(x) +#define glmm_128 float32x4_t + +#define glmm_splat_x(x) vdupq_lane_f32(vget_low_f32(x), 0) +#define glmm_splat_y(x) vdupq_lane_f32(vget_low_f32(x), 1) +#define glmm_splat_z(x) vdupq_lane_f32(vget_high_f32(x), 0) +#define glmm_splat_w(x) vdupq_lane_f32(vget_high_f32(x), 1) + +#define glmm_xor(a, b) \ + vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(a), \ + vreinterpretq_s32_f32(b))) + +#define glmm_swplane(v) vextq_f32(v, v, 2) +#define glmm_low(x) vget_low_f32(x) +#define glmm_high(x) vget_high_f32(x) + +#define glmm_combine_ll(x, y) vcombine_f32(vget_low_f32(x), vget_low_f32(y)) +#define glmm_combine_hl(x, y) vcombine_f32(vget_high_f32(x), vget_low_f32(y)) +#define glmm_combine_lh(x, y) vcombine_f32(vget_low_f32(x), vget_high_f32(y)) +#define glmm_combine_hh(x, y) vcombine_f32(vget_high_f32(x), vget_high_f32(y)) + +#if defined(_WIN32) && defined(_MSC_VER) +/* # define glmm_float32x4_init(x, y, z, w) { .n128_f32 = { x, y, z, w } } */ +CGLM_INLINE +float32x4_t +glmm_float32x4_init(float x, float y, float z, float w) { + CGLM_ALIGN(16) float v[4] = {x, y, z, w}; + return vld1q_f32(v); +} +#else +# define glmm_float32x4_init(x, y, z, w) { x, y, z, w } +#endif + +#define glmm_float32x4_SIGNMASK_PNPN glmm_float32x4_init( 0.f, -0.f, 0.f, -0.f) +#define glmm_float32x4_SIGNMASK_NPNP glmm_float32x4_init(-0.f, 0.f, -0.f, 0.f) +#define glmm_float32x4_SIGNMASK_NPPN glmm_float32x4_init(-0.f, 0.f, 0.f, -0.f) + +static inline float32x4_t glmm_abs(float32x4_t v) { return vabsq_f32(v); } +static inline float32x4_t glmm_min(float32x4_t a, float32x4_t b) { return vminq_f32(a, b); } +static inline float32x4_t glmm_max(float32x4_t a, float32x4_t b) { return vmaxq_f32(a, b); } + +static inline +float32x4_t +glmm_vhadd(float32x4_t v) { +#if CGLM_ARM64 + float32x4_t p; + p = vpaddq_f32(v, v); /* [a+b, c+d, a+b, c+d] */ + return vpaddq_f32(p, p); /* [t, t, t, t] */; +#else + return vaddq_f32(vaddq_f32(glmm_splat_x(v), glmm_splat_y(v)), + vaddq_f32(glmm_splat_z(v), glmm_splat_w(v))); +#endif + /* TODO: measure speed of this compare to above */ + /* return vdupq_n_f32(vaddvq_f32(v)); */ + + /* + return vaddq_f32(vaddq_f32(glmm_splat_x(v), glmm_splat_y(v)), + vaddq_f32(glmm_splat_z(v), glmm_splat_w(v))); + */ + /* + this seems slower: + v = vaddq_f32(v, vrev64q_f32(v)); + return vaddq_f32(v, vcombine_f32(vget_high_f32(v), vget_low_f32(v))); + */ +} + +static inline +float +glmm_hadd(float32x4_t v) { 
+#if CGLM_ARM64 + return vaddvq_f32(v); +#else + v = vaddq_f32(v, vrev64q_f32(v)); + v = vaddq_f32(v, vcombine_f32(vget_high_f32(v), vget_low_f32(v))); + return vgetq_lane_f32(v, 0); +#endif +} + +static inline +float +glmm_hmin(float32x4_t v) { + float32x2_t t; + t = vpmin_f32(vget_low_f32(v), vget_high_f32(v)); + t = vpmin_f32(t, t); + return vget_lane_f32(t, 0); +} + +static inline +float +glmm_hmax(float32x4_t v) { + float32x2_t t; + t = vpmax_f32(vget_low_f32(v), vget_high_f32(v)); + t = vpmax_f32(t, t); + return vget_lane_f32(t, 0); +} + +static inline +float +glmm_dot(float32x4_t a, float32x4_t b) { + return glmm_hadd(vmulq_f32(a, b)); +} + +static inline +float32x4_t +glmm_vdot(float32x4_t a, float32x4_t b) { + return glmm_vhadd(vmulq_f32(a, b)); +} + +static inline +float +glmm_norm(float32x4_t a) { + return sqrtf(glmm_dot(a, a)); +} + +static inline +float +glmm_norm2(float32x4_t a) { + return glmm_dot(a, a); +} + +static inline +float +glmm_norm_one(float32x4_t a) { + return glmm_hadd(glmm_abs(a)); +} + +static inline +float +glmm_norm_inf(float32x4_t a) { + return glmm_hmax(glmm_abs(a)); +} + +static inline +float32x4_t +glmm_div(float32x4_t a, float32x4_t b) { +#if CGLM_ARM64 + return vdivq_f32(a, b); +#else + /* 2 iterations of Newton-Raphson refinement of reciprocal */ + float32x4_t r0, r1; + r0 = vrecpeq_f32(b); + r1 = vrecpsq_f32(r0, b); + r0 = vmulq_f32(r1, r0); + r1 = vrecpsq_f32(r0, b); + r0 = vmulq_f32(r1, r0); + return vmulq_f32(a, r0); +#endif +} + +static inline +float32x4_t +glmm_fmadd(float32x4_t a, float32x4_t b, float32x4_t c) { +#if CGLM_ARM64 + return vfmaq_f32(c, a, b); /* why vfmaq_f32 is slower than vmlaq_f32 ??? */ +#else + return vmlaq_f32(c, a, b); +#endif +} + +static inline +float32x4_t +glmm_fnmadd(float32x4_t a, float32x4_t b, float32x4_t c) { +#if CGLM_ARM64 + return vfmsq_f32(c, a, b); +#else + return vmlsq_f32(c, a, b); +#endif +} + +static inline +float32x4_t +glmm_fmsub(float32x4_t a, float32x4_t b, float32x4_t c) { + return glmm_fmadd(a, b, vnegq_f32(c)); +} + +static inline +float32x4_t +glmm_fnmsub(float32x4_t a, float32x4_t b, float32x4_t c) { + return vsubq_f32(vdupq_n_f32(0.0f), glmm_fmadd(a, b, c)); +} + +#endif +#endif /* cglm_simd_arm_h */ diff --git a/external/cglm/simd/avx/affine.h b/external/cglm/simd/avx/affine.h new file mode 100644 index 0000000..b02ff0c --- /dev/null +++ b/external/cglm/simd/avx/affine.h @@ -0,0 +1,66 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_affine_mat_avx_h +#define cglm_affine_mat_avx_h +#ifdef __AVX__ + +#include "../../common.h" +#include "../intrin.h" + +#include + +CGLM_INLINE +void +glm_mul_avx(mat4 m1, mat4 m2, mat4 dest) { + /* D = R * L (Column-Major) */ + + __m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9; + + y0 = glmm_load256(m2[0]); /* h g f e d c b a */ + y1 = glmm_load256(m2[2]); /* p o n m l k j i */ + + y2 = glmm_load256(m1[0]); /* h g f e d c b a */ + y3 = glmm_load256(m1[2]); /* p o n m l k j i */ + + /* 0x03: 0b00000011 */ + y4 = _mm256_permute2f128_ps(y2, y2, 0x03); /* d c b a h g f e */ + y5 = _mm256_permute2f128_ps(y3, y3, 0x03); /* l k j i p o n m */ + + /* f f f f a a a a */ + /* h h h h c c c c */ + /* e e e e b b b b */ + /* g g g g d d d d */ + y6 = _mm256_permutevar_ps(y0, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0)); + y7 = _mm256_permutevar_ps(y0, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2)); + y8 = _mm256_permutevar_ps(y0, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1)); + y9 = _mm256_permutevar_ps(y0, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3)); + + glmm_store256(dest[0], + _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6), + _mm256_mul_ps(y3, y7)), + _mm256_add_ps(_mm256_mul_ps(y4, y8), + _mm256_mul_ps(y5, y9)))); + + /* n n n n i i i i */ + /* p p p p k k k k */ + /* m m m m j j j j */ + /* o o o o l l l l */ + y6 = _mm256_permutevar_ps(y1, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0)); + y7 = _mm256_permutevar_ps(y1, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2)); + y8 = _mm256_permutevar_ps(y1, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1)); + y9 = _mm256_permutevar_ps(y1, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3)); + + glmm_store256(dest[2], + _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6), + _mm256_mul_ps(y3, y7)), + _mm256_add_ps(_mm256_mul_ps(y4, y8), + _mm256_mul_ps(y5, y9)))); +} + +#endif +#endif /* cglm_affine_mat_avx_h */ diff --git a/external/cglm/simd/avx/mat4.h b/external/cglm/simd/avx/mat4.h new file mode 100644 index 0000000..33771c2 --- /dev/null +++ b/external/cglm/simd/avx/mat4.h @@ -0,0 +1,115 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_mat_simd_avx_h +#define cglm_mat_simd_avx_h +#ifdef __AVX__ + +#include "../../common.h" +#include "../intrin.h" + +CGLM_INLINE +void +glm_mat4_scale_avx(mat4 m, float s) { + __m256 y0, y1, y2, y3, y4; + + y0 = glmm_load256(m[0]); /* h g f e d c b a */ + y1 = glmm_load256(m[2]); /* p o n m l k j i */ + + y2 = _mm256_broadcast_ss(&s); + + y3 = _mm256_mul_ps(y0, y2); + y4 = _mm256_mul_ps(y1, y2); + + glmm_store256(m[0], y3); + glmm_store256(m[2], y4); +} + +/* TODO: this must be tested and compared to SSE version, may be slower!!! 
*/ +CGLM_INLINE +void +glm_mat4_transp_avx(mat4 m, mat4 dest) { + __m256 y0, y1, y2, y3; + + y0 = glmm_load256(m[0]); /* h g f e d c b a */ + y1 = glmm_load256(m[2]); /* p o n m l k j i */ + + y2 = _mm256_unpacklo_ps(y0, y1); /* n f m e j b i a */ + y3 = _mm256_unpackhi_ps(y0, y1); /* p h o g l d k c */ + + y0 = _mm256_permute2f128_ps(y2, y3, 0x20); /* l d k c j b i a */ + y1 = _mm256_permute2f128_ps(y2, y3, 0x31); /* p h o g n f m e */ + + y2 = _mm256_unpacklo_ps(y0, y1); /* o k g c m i e a */ + y3 = _mm256_unpackhi_ps(y0, y1); /* p l h d n j f b */ + + y0 = _mm256_permute2f128_ps(y2, y3, 0x20); /* n j f b m i e a */ + y1 = _mm256_permute2f128_ps(y2, y3, 0x31); /* p l h d o k g c */ + + glmm_store256(dest[0], y0); + glmm_store256(dest[2], y1); +} + +CGLM_INLINE +void +glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) { + /* D = R * L (Column-Major) */ + + __m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13; + __m256i yi0, yi1, yi2, yi3; + + y0 = glmm_load256(m2[0]); /* h g f e d c b a */ + y1 = glmm_load256(m2[2]); /* p o n m l k j i */ + + y2 = glmm_load256(m1[0]); /* h g f e d c b a */ + y3 = glmm_load256(m1[2]); /* p o n m l k j i */ + + /* 0x03: 0b00000011 */ + y4 = _mm256_permute2f128_ps(y2, y2, 0x03); /* d c b a h g f e */ + y5 = _mm256_permute2f128_ps(y3, y3, 0x03); /* l k j i p o n m */ + + yi0 = _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0); + yi1 = _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2); + yi2 = _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1); + yi3 = _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3); + + /* f f f f a a a a */ + /* h h h h c c c c */ + /* e e e e b b b b */ + /* g g g g d d d d */ + y6 = _mm256_permutevar_ps(y0, yi0); + y7 = _mm256_permutevar_ps(y0, yi1); + y8 = _mm256_permutevar_ps(y0, yi2); + y9 = _mm256_permutevar_ps(y0, yi3); + + /* n n n n i i i i */ + /* p p p p k k k k */ + /* m m m m j j j j */ + /* o o o o l l l l */ + y10 = _mm256_permutevar_ps(y1, yi0); + y11 = _mm256_permutevar_ps(y1, yi1); + y12 = _mm256_permutevar_ps(y1, yi2); + y13 = _mm256_permutevar_ps(y1, yi3); + + y0 = _mm256_mul_ps(y2, y6); + y1 = _mm256_mul_ps(y2, y10); + + y0 = glmm256_fmadd(y3, y7, y0); + y1 = glmm256_fmadd(y3, y11, y1); + + y0 = glmm256_fmadd(y4, y8, y0); + y1 = glmm256_fmadd(y4, y12, y1); + + y0 = glmm256_fmadd(y5, y9, y0); + y1 = glmm256_fmadd(y5, y13, y1); + + glmm_store256(dest[0], y0); + glmm_store256(dest[2], y1); +} + +#endif +#endif /* cglm_mat_simd_avx_h */ diff --git a/external/cglm/simd/intrin.h b/external/cglm/simd/intrin.h new file mode 100644 index 0000000..c477f34 --- /dev/null +++ b/external/cglm/simd/intrin.h @@ -0,0 +1,153 @@ +/* + * Copyright (c), Recep Aslantas. 
+ *
+ * MIT License (MIT), http://opensource.org/licenses/MIT
+ * Full license can be found in the LICENSE file
+ */
+
+#ifndef cglm_intrin_h
+#define cglm_intrin_h
+
+#if defined(_MSC_VER) && !defined(_M_ARM64EC)
+#  if (defined(_M_AMD64) || defined(_M_X64)) || _M_IX86_FP == 2
+#    ifndef __SSE__
+#      define __SSE__
+#    endif
+#    ifndef __SSE2__
+#      define __SSE2__
+#    endif
+#  elif _M_IX86_FP == 1
+#    ifndef __SSE__
+#      define __SSE__
+#    endif
+#  endif
+/* do not use alignment for older visual studio versions */
+/* ARM32 also causes a similar error, disable it for now on ARM32 too */
+#  if _MSC_VER < 1913 || _M_ARM /* Visual Studio 2017 version 15.6 */
+#    define CGLM_ALL_UNALIGNED
+#  endif
+#endif
+
+#ifdef __AVX__
+#  include <immintrin.h>
+#  define CGLM_AVX_FP 1
+#  ifndef __SSE2__
+#    define __SSE2__
+#  endif
+#  ifndef __SSE3__
+#    define __SSE3__
+#  endif
+#  ifndef __SSE4__
+#    define __SSE4__
+#  endif
+#  ifndef __SSE4_1__
+#    define __SSE4_1__
+#  endif
+#  ifndef __SSE4_2__
+#    define __SSE4_2__
+#  endif
+#  ifndef CGLM_SIMD_x86
+#    define CGLM_SIMD_x86
+#  endif
+#endif
+
+#if defined(__SSE__)
+#  include <xmmintrin.h>
+#  define CGLM_SSE_FP 1
+#  ifndef CGLM_SIMD_x86
+#    define CGLM_SIMD_x86
+#  endif
+#endif
+
+#if defined(__SSE2__)
+#  include <emmintrin.h>
+#  define CGLM_SSE2_FP 1
+#  ifndef CGLM_SIMD_x86
+#    define CGLM_SIMD_x86
+#  endif
+#endif
+
+#if defined(__SSE3__)
+#  include <pmmintrin.h>
+#  ifndef CGLM_SIMD_x86
+#    define CGLM_SIMD_x86
+#  endif
+#endif
+
+#if defined(__SSE4_1__)
+#  include <smmintrin.h>
+#  ifndef CGLM_SIMD_x86
+#    define CGLM_SIMD_x86
+#  endif
+#endif
+
+#if defined(__SSE4_2__)
+#  include <nmmintrin.h>
+#  ifndef CGLM_SIMD_x86
+#    define CGLM_SIMD_x86
+#  endif
+#endif
+
+/* ARM Neon */
+#if defined(_WIN32) && defined(_MSC_VER)
+/* TODO: non-ARM stuff already imported, would this be a better option? */
+/* #  include <intrin.h> */
+
+#  if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)
+#    include <arm64intr.h>
+#    include <arm64_neon.h>
+#    ifndef CGLM_NEON_FP
+#      define CGLM_NEON_FP 1
+#    endif
+#    ifndef CGLM_SIMD_ARM
+#      define CGLM_SIMD_ARM
+#    endif
+#  elif defined(_M_ARM)
+#    include <armintr.h>
+#    include <arm_neon.h>
+#    ifndef CGLM_NEON_FP
+#      define CGLM_NEON_FP 1
+#    endif
+#    ifndef CGLM_SIMD_ARM
+#      define CGLM_SIMD_ARM
+#    endif
+#  endif
+
+#else /* non-windows */
+#  if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#    include <arm_neon.h>
+#    if defined(__ARM_NEON_FP) || defined(__ARM_FP)
+#      define CGLM_NEON_FP 1
+#    endif
+#    ifndef CGLM_SIMD_ARM
+#      define CGLM_SIMD_ARM
+#    endif
+#  endif
+#endif
+
+/* WebAssembly */
+#if defined(__wasm__) && defined(__wasm_simd128__)
+#  ifndef CGLM_SIMD_WASM
+#    define CGLM_SIMD_WASM
+#  endif
+#endif
+
+#if defined(CGLM_SIMD_x86) || defined(CGLM_SIMD_ARM) || defined(CGLM_SIMD_WASM)
+#  ifndef CGLM_SIMD
+#    define CGLM_SIMD
+#  endif
+#endif
+
+#if defined(CGLM_SIMD_x86) && !defined(CGLM_SIMD_WASM)
+#  include "x86.h"
+#endif
+
+#if defined(CGLM_SIMD_ARM)
+#  include "arm.h"
+#endif
+
+#if defined(CGLM_SIMD_WASM)
+#  include "wasm.h"
+#endif
+
+#endif /* cglm_intrin_h */
diff --git a/external/cglm/simd/neon/affine.h b/external/cglm/simd/neon/affine.h
new file mode 100644
index 0000000..b0a65a6
--- /dev/null
+++ b/external/cglm/simd/neon/affine.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c), Recep Aslantas.
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_affine_neon_h +#define cglm_affine_neon_h +#if defined(CGLM_NEON_FP) + +#include "../../common.h" +#include "../intrin.h" + +CGLM_INLINE +void +glm_mul_neon(mat4 m1, mat4 m2, mat4 dest) { + /* D = R * L (Column-Major) */ + + glmm_128 l, r0, r1, r2, r3, v0, v1, v2, v3; + + l = glmm_load(m1[0]); + r0 = glmm_load(m2[0]); + r1 = glmm_load(m2[1]); + r2 = glmm_load(m2[2]); + r3 = glmm_load(m2[3]); + + v0 = vmulq_f32(glmm_splat_x(r0), l); + v1 = vmulq_f32(glmm_splat_x(r1), l); + v2 = vmulq_f32(glmm_splat_x(r2), l); + v3 = vmulq_f32(glmm_splat_x(r3), l); + + l = glmm_load(m1[1]); + v0 = glmm_fmadd(glmm_splat_y(r0), l, v0); + v1 = glmm_fmadd(glmm_splat_y(r1), l, v1); + v2 = glmm_fmadd(glmm_splat_y(r2), l, v2); + v3 = glmm_fmadd(glmm_splat_y(r3), l, v3); + + l = glmm_load(m1[2]); + v0 = glmm_fmadd(glmm_splat_z(r0), l, v0); + v1 = glmm_fmadd(glmm_splat_z(r1), l, v1); + v2 = glmm_fmadd(glmm_splat_z(r2), l, v2); + v3 = glmm_fmadd(glmm_splat_z(r3), l, v3); + + v3 = glmm_fmadd(glmm_splat_w(r3), glmm_load(m1[3]), v3); + + glmm_store(dest[0], v0); + glmm_store(dest[1], v1); + glmm_store(dest[2], v2); + glmm_store(dest[3], v3); +} + +CGLM_INLINE +void +glm_mul_rot_neon(mat4 m1, mat4 m2, mat4 dest) { + /* D = R * L (Column-Major) */ + + glmm_128 l, r0, r1, r2, v0, v1, v2; + + l = glmm_load(m1[0]); + r0 = glmm_load(m2[0]); + r1 = glmm_load(m2[1]); + r2 = glmm_load(m2[2]); + + v0 = vmulq_f32(glmm_splat_x(r0), l); + v1 = vmulq_f32(glmm_splat_x(r1), l); + v2 = vmulq_f32(glmm_splat_x(r2), l); + + l = glmm_load(m1[1]); + v0 = glmm_fmadd(glmm_splat_y(r0), l, v0); + v1 = glmm_fmadd(glmm_splat_y(r1), l, v1); + v2 = glmm_fmadd(glmm_splat_y(r2), l, v2); + + l = glmm_load(m1[2]); + v0 = glmm_fmadd(glmm_splat_z(r0), l, v0); + v1 = glmm_fmadd(glmm_splat_z(r1), l, v1); + v2 = glmm_fmadd(glmm_splat_z(r2), l, v2); + + glmm_store(dest[0], v0); + glmm_store(dest[1], v1); + glmm_store(dest[2], v2); + glmm_store(dest[3], glmm_load(m1[3])); +} + +CGLM_INLINE +void +glm_inv_tr_neon(mat4 mat) { + float32x4x4_t vmat; + glmm_128 r0, r1, r2, x0; + + vmat = vld4q_f32(mat[0]); + r0 = vmat.val[0]; + r1 = vmat.val[1]; + r2 = vmat.val[2]; + + x0 = glmm_fmadd(r0, glmm_splat_w(r0), + glmm_fmadd(r1, glmm_splat_w(r1), + vmulq_f32(r2, glmm_splat_w(r2)))); + x0 = vnegq_f32(x0); + + glmm_store(mat[0], r0); + glmm_store(mat[1], r1); + glmm_store(mat[2], r2); + glmm_store(mat[3], x0); + + mat[0][3] = 0.0f; + mat[1][3] = 0.0f; + mat[2][3] = 0.0f; + mat[3][3] = 1.0f; + + /* TODO: ? + zo = vget_high_f32(r3); + vst1_lane_f32(&mat[0][3], zo, 0); + vst1_lane_f32(&mat[1][3], zo, 0); + vst1_lane_f32(&mat[2][3], zo, 0); + vst1_lane_f32(&mat[3][3], zo, 1); + */ +} + +#endif +#endif /* cglm_affine_neon_h */ diff --git a/external/cglm/simd/neon/mat2.h b/external/cglm/simd/neon/mat2.h new file mode 100644 index 0000000..7d0d9eb --- /dev/null +++ b/external/cglm/simd/neon/mat2.h @@ -0,0 +1,44 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_mat2_neon_h +#define cglm_mat2_neon_h +#if defined(CGLM_NEON_FP) + +#include "../../common.h" +#include "../intrin.h" + +CGLM_INLINE +void +glm_mat2_mul_neon(mat2 m1, mat2 m2, mat2 dest) { + float32x4x2_t a1; + glmm_128 x0, x1, x2; + float32x2_t dc, ba; + + x1 = glmm_load(m1[0]); /* d c b a */ + x2 = glmm_load(m2[0]); /* h g f e */ + + dc = vget_high_f32(x1); + ba = vget_low_f32(x1); + + /* g g e e, h h f f */ + a1 = vtrnq_f32(x2, x2); + + /* + dest[0][0] = a * e + c * f; + dest[0][1] = b * e + d * f; + dest[1][0] = a * g + c * h; + dest[1][1] = b * g + d * h; + */ + x0 = glmm_fmadd(vcombine_f32(ba, ba), a1.val[0], + vmulq_f32(vcombine_f32(dc, dc), a1.val[1])); + + glmm_store(dest[0], x0); +} + +#endif +#endif /* cglm_mat2_neon_h */ diff --git a/external/cglm/simd/neon/mat4.h b/external/cglm/simd/neon/mat4.h new file mode 100644 index 0000000..6cf9811 --- /dev/null +++ b/external/cglm/simd/neon/mat4.h @@ -0,0 +1,468 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_mat4_neon_h +#define cglm_mat4_neon_h +#if defined(CGLM_NEON_FP) + +#include "../../common.h" +#include "../intrin.h" + +CGLM_INLINE +void +glm_mat4_scale_neon(mat4 m, float s) { + float32x4_t v0; + + v0 = vdupq_n_f32(s); + + vst1q_f32(m[0], vmulq_f32(vld1q_f32(m[0]), v0)); + vst1q_f32(m[1], vmulq_f32(vld1q_f32(m[1]), v0)); + vst1q_f32(m[2], vmulq_f32(vld1q_f32(m[2]), v0)); + vst1q_f32(m[3], vmulq_f32(vld1q_f32(m[3]), v0)); +} + +CGLM_INLINE +void +glm_mat4_transp_neon(mat4 m, mat4 dest) { + float32x4x4_t vmat; + + vmat = vld4q_f32(m[0]); + + vst1q_f32(dest[0], vmat.val[0]); + vst1q_f32(dest[1], vmat.val[1]); + vst1q_f32(dest[2], vmat.val[2]); + vst1q_f32(dest[3], vmat.val[3]); +} + +CGLM_INLINE +void +glm_mat4_mul_neon(mat4 m1, mat4 m2, mat4 dest) { + /* D = R * L (Column-Major) */ + + glmm_128 l, r0, r1, r2, r3, v0, v1, v2, v3; + + l = glmm_load(m1[0]); + r0 = glmm_load(m2[0]); + r1 = glmm_load(m2[1]); + r2 = glmm_load(m2[2]); + r3 = glmm_load(m2[3]); + + v0 = vmulq_f32(glmm_splat_x(r0), l); + v1 = vmulq_f32(glmm_splat_x(r1), l); + v2 = vmulq_f32(glmm_splat_x(r2), l); + v3 = vmulq_f32(glmm_splat_x(r3), l); + + l = glmm_load(m1[1]); + v0 = glmm_fmadd(glmm_splat_y(r0), l, v0); + v1 = glmm_fmadd(glmm_splat_y(r1), l, v1); + v2 = glmm_fmadd(glmm_splat_y(r2), l, v2); + v3 = glmm_fmadd(glmm_splat_y(r3), l, v3); + + l = glmm_load(m1[2]); + v0 = glmm_fmadd(glmm_splat_z(r0), l, v0); + v1 = glmm_fmadd(glmm_splat_z(r1), l, v1); + v2 = glmm_fmadd(glmm_splat_z(r2), l, v2); + v3 = glmm_fmadd(glmm_splat_z(r3), l, v3); + + l = glmm_load(m1[3]); + v0 = glmm_fmadd(glmm_splat_w(r0), l, v0); + v1 = glmm_fmadd(glmm_splat_w(r1), l, v1); + v2 = glmm_fmadd(glmm_splat_w(r2), l, v2); + v3 = glmm_fmadd(glmm_splat_w(r3), l, v3); + + glmm_store(dest[0], v0); + glmm_store(dest[1], v1); + glmm_store(dest[2], v2); + glmm_store(dest[3], v3); +} + +CGLM_INLINE +void +glm_mat4_mulv_neon(mat4 m, vec4 v, vec4 dest) { + float32x4_t l0, l1, l2, l3; + float32x2_t vlo, vhi; + + l0 = vld1q_f32(m[0]); + l1 = vld1q_f32(m[1]); + l2 = vld1q_f32(m[2]); + l3 = vld1q_f32(m[3]); + + vlo = vld1_f32(&v[0]); + vhi = vld1_f32(&v[2]); + + l0 = vmulq_lane_f32(l0, vlo, 0); + l0 = vmlaq_lane_f32(l0, l1, vlo, 1); + l0 = vmlaq_lane_f32(l0, l2, vhi, 0); + l0 = vmlaq_lane_f32(l0, l3, vhi, 1); + + vst1q_f32(dest, l0); +} + +CGLM_INLINE 
+float +glm_mat4_det_neon(mat4 mat) { + float32x4_t r0, r1, r2, r3, x0, x1, x2; + float32x2_t ij, op, mn, kl, nn, mm, jj, ii, gh, ef, t12, t34; + float32x4x2_t a1; + float32x4_t x3 = glmm_float32x4_SIGNMASK_PNPN; + + /* 127 <- 0, [square] det(A) = det(At) */ + r0 = glmm_load(mat[0]); /* d c b a */ + r1 = vrev64q_f32(glmm_load(mat[1])); /* g h e f */ + r2 = vrev64q_f32(glmm_load(mat[2])); /* l k i j */ + r3 = vrev64q_f32(glmm_load(mat[3])); /* o p m n */ + + gh = vget_high_f32(r1); + ef = vget_low_f32(r1); + kl = vget_high_f32(r2); + ij = vget_low_f32(r2); + op = vget_high_f32(r3); + mn = vget_low_f32(r3); + mm = vdup_lane_f32(mn, 1); + nn = vdup_lane_f32(mn, 0); + ii = vdup_lane_f32(ij, 1); + jj = vdup_lane_f32(ij, 0); + + /* + t[1] = j * p - n * l; + t[2] = j * o - n * k; + t[3] = i * p - m * l; + t[4] = i * o - m * k; + */ + x0 = glmm_fnmadd(vcombine_f32(kl, kl), vcombine_f32(nn, mm), + vmulq_f32(vcombine_f32(op, op), vcombine_f32(jj, ii))); + + t12 = vget_low_f32(x0); + t34 = vget_high_f32(x0); + + /* 1 3 1 3 2 4 2 4 */ + a1 = vuzpq_f32(x0, x0); + + /* + t[0] = k * p - o * l; + t[0] = k * p - o * l; + t[5] = i * n - m * j; + t[5] = i * n - m * j; + */ + x1 = glmm_fnmadd(vcombine_f32(vdup_lane_f32(kl, 0), jj), + vcombine_f32(vdup_lane_f32(op, 1), mm), + vmulq_f32(vcombine_f32(vdup_lane_f32(op, 0), nn), + vcombine_f32(vdup_lane_f32(kl, 1), ii))); + + /* + a * (f * t[0] - g * t[1] + h * t[2]) + - b * (e * t[0] - g * t[3] + h * t[4]) + + c * (e * t[1] - f * t[3] + h * t[5]) + - d * (e * t[2] - f * t[4] + g * t[5]) + */ + x2 = glmm_fnmadd(vcombine_f32(vdup_lane_f32(gh, 1), vdup_lane_f32(ef, 0)), + vcombine_f32(vget_low_f32(a1.val[0]), t34), + vmulq_f32(vcombine_f32(ef, vdup_lane_f32(ef, 1)), + vcombine_f32(vget_low_f32(x1), t12))); + + x2 = glmm_fmadd(vcombine_f32(vdup_lane_f32(gh, 0), gh), + vcombine_f32(vget_low_f32(a1.val[1]), vget_high_f32(x1)), x2); + + x2 = glmm_xor(x2, x3); + + return glmm_hadd(vmulq_f32(x2, r0)); +} + +/* old one */ +#if 0 +CGLM_INLINE +void +glm_mat4_inv_neon(mat4 mat, mat4 dest) { + float32x4_t r0, r1, r2, r3, + v0, v1, v2, v3, + t0, t1, t2, t3, t4, t5, + x0, x1, x2, x3, x4, x5, x6, x7, x8; + float32x4x2_t a1; + float32x2_t lp, ko, hg, jn, im, fe, ae, bf, cg, dh; + float32x4_t x9 = glmm_float32x4_SIGNMASK_NPNP; + + x8 = vrev64q_f32(x9); + + /* 127 <- 0 */ + r0 = glmm_load(mat[0]); /* d c b a */ + r1 = glmm_load(mat[1]); /* h g f e */ + r2 = glmm_load(mat[2]); /* l k j i */ + r3 = glmm_load(mat[3]); /* p o n m */ + + /* l p k o, j n i m */ + a1 = vzipq_f32(r3, r2); + + jn = vget_high_f32(a1.val[0]); + im = vget_low_f32(a1.val[0]); + lp = vget_high_f32(a1.val[1]); + ko = vget_low_f32(a1.val[1]); + hg = vget_high_f32(r1); + + x1 = vcombine_f32(vdup_lane_f32(lp, 0), lp); /* l p p p */ + x2 = vcombine_f32(vdup_lane_f32(ko, 0), ko); /* k o o o */ + x0 = vcombine_f32(vdup_lane_f32(lp, 1), vdup_lane_f32(hg, 1)); /* h h l l */ + x3 = vcombine_f32(vdup_lane_f32(ko, 1), vdup_lane_f32(hg, 0)); /* g g k k */ + + /* t1[0] = k * p - o * l; + t1[0] = k * p - o * l; + t2[0] = g * p - o * h; + t3[0] = g * l - k * h; */ + t0 = glmm_fnmadd(x2, x0, vmulq_f32(x3, x1)); + + fe = vget_low_f32(r1); + x4 = vcombine_f32(vdup_lane_f32(jn, 0), jn); /* j n n n */ + x5 = vcombine_f32(vdup_lane_f32(jn, 1), vdup_lane_f32(fe, 1)); /* f f j j */ + + /* t1[1] = j * p - n * l; + t1[1] = j * p - n * l; + t2[1] = f * p - n * h; + t3[1] = f * l - j * h; */ + t1 = glmm_fnmadd(x4, x0, vmulq_f32(x5, x1)); + + /* t1[2] = j * o - n * k + t1[2] = j * o - n * k; + t2[2] = f * o - n * g; + t3[2] = f * k - j * g; 
*/ + t2 = glmm_fnmadd(x4, x3, vmulq_f32(x5, x2)); + + x6 = vcombine_f32(vdup_lane_f32(im, 1), vdup_lane_f32(fe, 0)); /* e e i i */ + x7 = vcombine_f32(vdup_lane_f32(im, 0), im); /* i m m m */ + + /* t1[3] = i * p - m * l; + t1[3] = i * p - m * l; + t2[3] = e * p - m * h; + t3[3] = e * l - i * h; */ + t3 = glmm_fnmadd(x7, x0, vmulq_f32(x6, x1)); + + /* t1[4] = i * o - m * k; + t1[4] = i * o - m * k; + t2[4] = e * o - m * g; + t3[4] = e * k - i * g; */ + t4 = glmm_fnmadd(x7, x3, vmulq_f32(x6, x2)); + + /* t1[5] = i * n - m * j; + t1[5] = i * n - m * j; + t2[5] = e * n - m * f; + t3[5] = e * j - i * f; */ + t5 = glmm_fnmadd(x7, x5, vmulq_f32(x6, x4)); + + /* h d f b, g c e a */ + a1 = vtrnq_f32(r0, r1); + + x4 = vrev64q_f32(a1.val[0]); /* c g a e */ + x5 = vrev64q_f32(a1.val[1]); /* d h b f */ + + ae = vget_low_f32(x4); + cg = vget_high_f32(x4); + bf = vget_low_f32(x5); + dh = vget_high_f32(x5); + + x0 = vcombine_f32(ae, vdup_lane_f32(ae, 1)); /* a a a e */ + x1 = vcombine_f32(bf, vdup_lane_f32(bf, 1)); /* b b b f */ + x2 = vcombine_f32(cg, vdup_lane_f32(cg, 1)); /* c c c g */ + x3 = vcombine_f32(dh, vdup_lane_f32(dh, 1)); /* d d d h */ + + /* + dest[0][0] = f * t1[0] - g * t1[1] + h * t1[2]; + dest[0][1] =-(b * t1[0] - c * t1[1] + d * t1[2]); + dest[0][2] = b * t2[0] - c * t2[1] + d * t2[2]; + dest[0][3] =-(b * t3[0] - c * t3[1] + d * t3[2]); */ + v0 = glmm_xor(glmm_fmadd(x3, t2, glmm_fnmadd(x2, t1, vmulq_f32(x1, t0))), x8); + + /* + dest[2][0] = e * t1[1] - f * t1[3] + h * t1[5]; + dest[2][1] =-(a * t1[1] - b * t1[3] + d * t1[5]); + dest[2][2] = a * t2[1] - b * t2[3] + d * t2[5]; + dest[2][3] =-(a * t3[1] - b * t3[3] + d * t3[5]);*/ + v2 = glmm_xor(glmm_fmadd(x3, t5, glmm_fnmadd(x1, t3, vmulq_f32(x0, t1))), x8); + + /* + dest[1][0] =-(e * t1[0] - g * t1[3] + h * t1[4]); + dest[1][1] = a * t1[0] - c * t1[3] + d * t1[4]; + dest[1][2] =-(a * t2[0] - c * t2[3] + d * t2[4]); + dest[1][3] = a * t3[0] - c * t3[3] + d * t3[4]; */ + v1 = glmm_xor(glmm_fmadd(x3, t4, glmm_fnmadd(x2, t3, vmulq_f32(x0, t0))), x9); + + /* + dest[3][0] =-(e * t1[2] - f * t1[4] + g * t1[5]); + dest[3][1] = a * t1[2] - b * t1[4] + c * t1[5]; + dest[3][2] =-(a * t2[2] - b * t2[4] + c * t2[5]); + dest[3][3] = a * t3[2] - b * t3[4] + c * t3[5]; */ + v3 = glmm_xor(glmm_fmadd(x2, t5, glmm_fnmadd(x1, t4, vmulq_f32(x0, t2))), x9); + + /* determinant */ + x0 = vcombine_f32(vget_low_f32(vzipq_f32(v0, v1).val[0]), + vget_low_f32(vzipq_f32(v2, v3).val[0])); + + /* + x0 = glmm_div(glmm_set1_rval(1.0f), glmm_vhadd(vmulq_f32(x0, r0))); + + glmm_store(dest[0], vmulq_f32(v0, x0)); + glmm_store(dest[1], vmulq_f32(v1, x0)); + glmm_store(dest[2], vmulq_f32(v2, x0)); + glmm_store(dest[3], vmulq_f32(v3, x0)); + */ + + x0 = glmm_vhadd(vmulq_f32(x0, r0)); + + glmm_store(dest[0], glmm_div(v0, x0)); + glmm_store(dest[1], glmm_div(v1, x0)); + glmm_store(dest[2], glmm_div(v2, x0)); + glmm_store(dest[3], glmm_div(v3, x0)); +} +#endif + +CGLM_INLINE +void +glm_mat4_inv_neon(mat4 mat, mat4 dest) { + float32x4_t r0, r1, r2, r3, + v0, v1, v2, v3, v4, v5, + t0, t1, t2; + float32x4x2_t a0, a1, a2, a3, a4; + float32x4_t s1 = glmm_float32x4_SIGNMASK_PNPN, s2; + +#if !CGLM_ARM64 + float32x2_t l0, l1; +#endif + + s2 = vrev64q_f32(s1); + + /* 127 <- 0 */ + r0 = glmm_load(mat[0]); /* d c b a */ + r1 = glmm_load(mat[1]); /* h g f e */ + r2 = glmm_load(mat[2]); /* l k j i */ + r3 = glmm_load(mat[3]); /* p o n m */ + + a1 = vzipq_f32(r0, r2); /* l d k c, j b i a */ + a2 = vzipq_f32(r1, r3); /* p h o g, n f m e */ + a3 = vzipq_f32(a2.val[0], a1.val[0]); /* j n b f, i 
m a e */ + a4 = vzipq_f32(a2.val[1], a1.val[1]); /* l p d h, k o c g */ + + v0 = vextq_f32(a1.val[0], a1.val[1], 2); /* k c j b */ + v1 = vextq_f32(a2.val[0], a2.val[1], 2); /* o g n f */ + v2 = vextq_f32(a1.val[1], a2.val[0], 2); /* m e l d */ + v3 = vextq_f32(a2.val[1], a1.val[0], 2); /* i a p h */ + v4 = vextq_f32(v1, v2, 2); /* l d o g */ + v5 = vextq_f32(v0, v3, 2); /* p h k c */ + + /* c2 = c * h - g * d c12 = a * g - c * e c8 = a * f - b * e + c1 = k * p - o * l c11 = i * o - k * m c7 = i * n - j * m + c4 = h * a - d * e c6 = b * h - d * f c10 = b * g - c * f + c3 = p * i - l * m c5 = j * p - l * n c9 = j * o - k * n */ + t0 = vmulq_f32(v5, v3); + t1 = vmulq_f32(a1.val[0], a2.val[1]); + t2 = vmulq_f32(a1.val[0], v1); + + t0 = glmm_fnmadd(v4, v2, t0); + t1 = glmm_fnmadd(a1.val[1], a2.val[0], t1); + t2 = glmm_fnmadd(v0, a2.val[0], t2); + + t0 = vrev64q_f32(t0); + t1 = vrev64q_f32(t1); + t2 = vrev64q_f32(t2); + + /* det */ + v0 = vrev64q_f32(t2); + v1 = vextq_f32(t1, t1, 2); + v0 = vmulq_f32(t0, v0); + v1 = vrev64q_f32(v1); + v1 = vmulq_f32(v1, t1); + + /* c3 * c10 + c4 * c9 + c1 * c8 + c2 * c7 */ +#if CGLM_ARM64 + v0 = vpaddq_f32(v0, v0); + v0 = vpaddq_f32(v0, v0); +#else + l0 = vget_low_f32(v0); + l1 = vget_high_f32(v0); + + l0 = vpadd_f32(l0, l0); /* [a+b, a+b] */ + l1 = vpadd_f32(l1, l1); /* [c+d, c+d] */ + l0 = vadd_f32(l0, l1); /* [sum, sum] */ + + v0 = vcombine_f32(l0, l0); +#endif + + /* c5 * c12 + c6 * c11 */ +#if CGLM_ARM64 + v1 = vpaddq_f32(v1, v1); +#else + l0 = vget_low_f32(v1); + l1 = vget_high_f32(v1); + + l0 = vpadd_f32(l0, l0); /* [a+b, a+b] */ + l1 = vpadd_f32(l1, l1); /* [c+d, c+d] */ + + v1 = vcombine_f32(l0, l1); +#endif + + v0 = vsubq_f32(v0, v1); /* det */ + + /* inv div */ + v1 = vdupq_n_f32(1.0f); + v0 = glmm_div(v1, v0); /* inv div */ + + /* multiply t0,t1,t2 by idt to reduce 1mul below: 2eor+4mul vs 3mul+4eor */ + t0 = vmulq_f32(t0, v0); + t1 = vmulq_f32(t1, v0); + t2 = vmulq_f32(t2, v0); + + a0 = vzipq_f32(t0, t0); /* c4 c4 c3 c3, c2 c2 c1 c1 */ + a1 = vzipq_f32(t1, t1); /* c6 c6 c5 c5, c12 c12 c11 c11 */ + a2 = vzipq_f32(t2, t2); /* c10 c10 c9 c9, c8 c8 c7 c7 */ + + /* result */ + + /* dest[0][0] = (f * c1 - g * c5 + h * c9) * idt; + dest[0][1] = (b * c1 - c * c5 + d * c9) * ndt; + dest[0][2] = (n * c2 - o * c6 + p * c10) * idt; + dest[0][3] = (j * c2 - k * c6 + l * c10) * ndt; + + dest[1][0] = (e * c1 - g * c3 + h * c11) * ndt; + dest[1][1] = (a * c1 - c * c3 + d * c11) * idt; + dest[1][2] = (m * c2 - o * c4 + p * c12) * ndt; + dest[1][3] = (i * c2 - k * c4 + l * c12) * idt; + + dest[2][0] = (e * c5 - f * c3 + h * c7) * idt; + dest[2][1] = (a * c5 - b * c3 + d * c7) * ndt; + dest[2][2] = (m * c6 - n * c4 + p * c8) * idt; + dest[2][3] = (i * c6 - j * c4 + l * c8) * ndt; + + dest[3][0] = (e * c9 - f * c11 + g * c7) * ndt; + dest[3][1] = (a * c9 - b * c11 + c * c7) * idt; + dest[3][2] = (m * c10 - n * c12 + o * c8) * ndt; + dest[3][3] = (i * c10 - j * c12 + k * c8) * idt; */ + + r0 = vmulq_f32(a3.val[1], a0.val[0]); + r1 = vmulq_f32(a3.val[0], a0.val[0]); + r2 = vmulq_f32(a3.val[0], a1.val[1]); + r3 = vmulq_f32(a3.val[0], a2.val[1]); + + r0 = glmm_fnmadd(a4.val[0], a1.val[1], r0); + r1 = glmm_fnmadd(a4.val[0], a0.val[1], r1); + r2 = glmm_fnmadd(a3.val[1], a0.val[1], r2); + r3 = glmm_fnmadd(a3.val[1], a1.val[0], r3); + + r0 = glmm_fmadd(a4.val[1], a2.val[1], r0); + r1 = glmm_fmadd(a4.val[1], a1.val[0], r1); + r2 = glmm_fmadd(a4.val[1], a2.val[0], r2); + r3 = glmm_fmadd(a4.val[0], a2.val[0], r3); + + /* 4xor may be fastart then 4mul, see above */ + r0 = 
glmm_xor(r0, s1); + r1 = glmm_xor(r1, s2); + r2 = glmm_xor(r2, s1); + r3 = glmm_xor(r3, s2); + + glmm_store(dest[0], r0); + glmm_store(dest[1], r1); + glmm_store(dest[2], r2); + glmm_store(dest[3], r3); +} + +#endif +#endif /* cglm_mat4_neon_h */ diff --git a/external/cglm/simd/neon/quat.h b/external/cglm/simd/neon/quat.h new file mode 100644 index 0000000..55dc1da --- /dev/null +++ b/external/cglm/simd/neon/quat.h @@ -0,0 +1,57 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_quat_neon_h +#define cglm_quat_neon_h +#if defined(CGLM_NEON_FP) + +#include "../../common.h" +#include "../intrin.h" + +CGLM_INLINE +void +glm_quat_mul_neon(versor p, versor q, versor dest) { + /* + + (a1 b2 + b1 a2 + c1 d2 − d1 c2)i + + (a1 c2 − b1 d2 + c1 a2 + d1 b2)j + + (a1 d2 + b1 c2 − c1 b2 + d1 a2)k + a1 a2 − b1 b2 − c1 c2 − d1 d2 + */ + + glmm_128 xp, xq, xqr, r, x, y, z, s2, s3; + glmm_128 s1 = glmm_float32x4_SIGNMASK_NPPN; + + float32x2_t qh, ql; + + xp = glmm_load(p); /* 3 2 1 0 */ + xq = glmm_load(q); + + r = vmulq_f32(glmm_splat_w(xp), xq); + x = glmm_splat_x(xp); + y = glmm_splat_y(xp); + z = glmm_splat_z(xp); + + ql = vget_high_f32(s1); + s3 = vcombine_f32(ql, ql); + s2 = vzipq_f32(s3, s3).val[0]; + + xqr = vrev64q_f32(xq); + qh = vget_high_f32(xqr); + ql = vget_low_f32(xqr); + + r = glmm_fmadd(glmm_xor(x, s3), vcombine_f32(qh, ql), r); + + r = glmm_fmadd(glmm_xor(y, s2), vcombine_f32(vget_high_f32(xq), + vget_low_f32(xq)), r); + + r = glmm_fmadd(glmm_xor(z, s1), vcombine_f32(ql, qh), r); + + glmm_store(dest, r); +} + +#endif +#endif /* cglm_quat_neon_h */ diff --git a/external/cglm/simd/sse2/affine.h b/external/cglm/simd/sse2/affine.h new file mode 100644 index 0000000..0619995 --- /dev/null +++ b/external/cglm/simd/sse2/affine.h @@ -0,0 +1,115 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_affine_mat_sse2_h +#define cglm_affine_mat_sse2_h +#if defined( __SSE__ ) || defined( __SSE2__ ) + +#include "../../common.h" +#include "../intrin.h" + +CGLM_INLINE +void +glm_mul_sse2(mat4 m1, mat4 m2, mat4 dest) { + /* D = R * L (Column-Major) */ + glmm_128 l, r0, r1, r2, r3, v0, v1, v2, v3; + + l = glmm_load(m1[0]); + r0 = glmm_load(m2[0]); + r1 = glmm_load(m2[1]); + r2 = glmm_load(m2[2]); + r3 = glmm_load(m2[3]); + + v0 = _mm_mul_ps(glmm_splat_x(r0), l); + v1 = _mm_mul_ps(glmm_splat_x(r1), l); + v2 = _mm_mul_ps(glmm_splat_x(r2), l); + v3 = _mm_mul_ps(glmm_splat_x(r3), l); + + l = glmm_load(m1[1]); + v0 = glmm_fmadd(glmm_splat_y(r0), l, v0); + v1 = glmm_fmadd(glmm_splat_y(r1), l, v1); + v2 = glmm_fmadd(glmm_splat_y(r2), l, v2); + v3 = glmm_fmadd(glmm_splat_y(r3), l, v3); + + l = glmm_load(m1[2]); + v0 = glmm_fmadd(glmm_splat_z(r0), l, v0); + v1 = glmm_fmadd(glmm_splat_z(r1), l, v1); + v2 = glmm_fmadd(glmm_splat_z(r2), l, v2); + v3 = glmm_fmadd(glmm_splat_z(r3), l, v3); + + l = glmm_load(m1[3]); + v3 = glmm_fmadd(glmm_splat_w(r3), l, v3); + + glmm_store(dest[0], v0); + glmm_store(dest[1], v1); + glmm_store(dest[2], v2); + glmm_store(dest[3], v3); +} + +CGLM_INLINE +void +glm_mul_rot_sse2(mat4 m1, mat4 m2, mat4 dest) { + /* D = R * L (Column-Major) */ + + glmm_128 l, r0, r1, r2, v0, v1, v2; + + l = glmm_load(m1[0]); + r0 = glmm_load(m2[0]); + r1 = glmm_load(m2[1]); + r2 = glmm_load(m2[2]); + + v0 = _mm_mul_ps(glmm_splat_x(r0), l); + v1 = _mm_mul_ps(glmm_splat_x(r1), l); + v2 = _mm_mul_ps(glmm_splat_x(r2), l); + + l = glmm_load(m1[1]); + v0 = glmm_fmadd(glmm_splat_y(r0), l, v0); + v1 = glmm_fmadd(glmm_splat_y(r1), l, v1); + v2 = glmm_fmadd(glmm_splat_y(r2), l, v2); + + l = glmm_load(m1[2]); + v0 = glmm_fmadd(glmm_splat_z(r0), l, v0); + v1 = glmm_fmadd(glmm_splat_z(r1), l, v1); + v2 = glmm_fmadd(glmm_splat_z(r2), l, v2); + + glmm_store(dest[0], v0); + glmm_store(dest[1], v1); + glmm_store(dest[2], v2); + glmm_store(dest[3], glmm_load(m1[3])); +} + +CGLM_INLINE +void +glm_inv_tr_sse2(mat4 mat) { + __m128 r0, r1, r2, r3, x0, x1, x2, x3, x4, x5; + + r0 = glmm_load(mat[0]); + r1 = glmm_load(mat[1]); + r2 = glmm_load(mat[2]); + r3 = glmm_load(mat[3]); + x1 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f); + + _MM_TRANSPOSE4_PS(r0, r1, r2, x1); + + x2 = glmm_shuff1(r3, 0, 0, 0, 0); + x3 = glmm_shuff1(r3, 1, 1, 1, 1); + x4 = glmm_shuff1(r3, 2, 2, 2, 2); + x5 = glmm_float32x4_SIGNMASK_NEG; + + x0 = glmm_fmadd(r0, x2, glmm_fmadd(r1, x3, _mm_mul_ps(r2, x4))); + x0 = _mm_xor_ps(x0, x5); + + x0 = _mm_add_ps(x0, x1); + + glmm_store(mat[0], r0); + glmm_store(mat[1], r1); + glmm_store(mat[2], r2); + glmm_store(mat[3], x0); +} + +#endif +#endif /* cglm_affine_mat_sse2_h */ diff --git a/external/cglm/simd/sse2/mat2.h b/external/cglm/simd/sse2/mat2.h new file mode 100644 index 0000000..31b3a29 --- /dev/null +++ b/external/cglm/simd/sse2/mat2.h @@ -0,0 +1,48 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_mat2_sse_h +#define cglm_mat2_sse_h +#if defined( __SSE__ ) || defined( __SSE2__ ) + +#include "../../common.h" +#include "../intrin.h" + +CGLM_INLINE +void +glm_mat2_mul_sse2(mat2 m1, mat2 m2, mat2 dest) { + __m128 x0, x1, x2, x3, x4; + + x1 = glmm_load(m1[0]); /* d c b a */ + x2 = glmm_load(m2[0]); /* h g f e */ + + x3 = glmm_shuff1(x2, 2, 2, 0, 0); + x4 = glmm_shuff1(x2, 3, 3, 1, 1); + x0 = _mm_movelh_ps(x1, x1); + x2 = _mm_movehl_ps(x1, x1); + + /* + dest[0][0] = a * e + c * f; + dest[0][1] = b * e + d * f; + dest[1][0] = a * g + c * h; + dest[1][1] = b * g + d * h; + */ + x0 = glmm_fmadd(x0, x3, _mm_mul_ps(x2, x4)); + + glmm_store(dest[0], x0); +} + +CGLM_INLINE +void +glm_mat2_transp_sse2(mat2 m, mat2 dest) { + /* d c b a */ + /* d b c a */ + glmm_store(dest[0], glmm_shuff1(glmm_load(m[0]), 3, 1, 2, 0)); +} + +#endif +#endif /* cglm_mat2_sse_h */ diff --git a/external/cglm/simd/sse2/mat3.h b/external/cglm/simd/sse2/mat3.h new file mode 100644 index 0000000..f07320c --- /dev/null +++ b/external/cglm/simd/sse2/mat3.h @@ -0,0 +1,76 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_mat3_sse_h +#define cglm_mat3_sse_h +#if defined( __SSE__ ) || defined( __SSE2__ ) + +#include "../../common.h" +#include "../intrin.h" + +CGLM_INLINE +void +glm_mat3_mul_sse2(mat3 m1, mat3 m2, mat3 dest) { + __m128 l0, l1, l2, r0, r1, r2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9; + + l0 = _mm_loadu_ps(m1[0]); + l1 = _mm_loadu_ps(&m1[1][1]); + + r0 = _mm_loadu_ps(m2[0]); + r1 = _mm_loadu_ps(&m2[1][1]); + + x8 = glmm_shuff1(l0, 0, 2, 1, 0); /* a00 a02 a01 a00 */ + x1 = glmm_shuff1(r0, 3, 0, 0, 0); /* b10 b00 b00 b00 */ + x2 = _mm_shuffle_ps(l0, l1, _MM_SHUFFLE(1, 0, 3, 3)); /* a12 a11 a10 a10 */ + x3 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(2, 0, 3, 1)); /* b20 b11 b10 b01 */ + x0 = _mm_mul_ps(x8, x1); + + x6 = glmm_shuff1(l0, 1, 0, 2, 1); /* a01 a00 a02 a01 */ + x7 = glmm_shuff1(x3, 3, 3, 1, 1); /* b20 b20 b10 b10 */ + l2 = _mm_load_ss(&m1[2][2]); + r2 = _mm_load_ss(&m2[2][2]); + x1 = _mm_mul_ps(x6, x7); + l2 = glmm_shuff1(l2, 0, 0, 1, 0); /* a22 a22 0.f a22 */ + r2 = glmm_shuff1(r2, 0, 0, 1, 0); /* b22 b22 0.f b22 */ + + x4 = glmm_shuff1(x2, 0, 3, 2, 0); /* a10 a12 a11 a10 */ + x5 = glmm_shuff1(x2, 2, 0, 3, 2); /* a11 a10 a12 a11 */ + x6 = glmm_shuff1(x3, 2, 0, 0, 0); /* b11 b01 b01 b01 */ + x2 = glmm_shuff1(r1, 3, 3, 0, 0); /* b21 b21 b11 b11 */ + + x8 = _mm_unpackhi_ps(x8, x4); /* a10 a00 a12 a02 */ + x9 = _mm_unpackhi_ps(x7, x2); /* b21 b20 b21 b20 */ + + x0 = glmm_fmadd(x4, x6, x0); + x1 = glmm_fmadd(x5, x2, x1); + + x2 = _mm_movehl_ps(l2, l1); /* a22 a22 a21 a20 */ + x3 = glmm_shuff1(x2, 0, 2, 1, 0); /* a20 a22 a21 a20 */ + x2 = glmm_shuff1(x2, 1, 0, 2, 1); /* a21 a20 a22 a21 */ + x4 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1, 1, 2, 2)); /* b12 b12 b02 b02 */ + + x5 = glmm_shuff1(x4, 3, 0, 0, 0); /* b12 b02 b02 b02 */ + x4 = _mm_movehl_ps(r2, x4); /* b22 b22 b12 b12 */ + x0 = glmm_fmadd(x3, x5, x0); + x1 = glmm_fmadd(x2, x4, x1); + + /* + Dot Product : dest[2][2] = a02 * b20 + + a12 * b21 + + a22 * b22 + + 0 * 00 */ + x2 = _mm_movelh_ps(x8, l2); /* 0.f a22 a12 a02 */ + x3 = _mm_movelh_ps(x9, r2); /* 0.f b22 b21 b20 */ + x2 = glmm_vdots(x2, x3); + + _mm_storeu_ps(&dest[0][0], x0); + _mm_storeu_ps(&dest[1][1], x1); + _mm_store_ss (&dest[2][2], x2); +} + +#endif 
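+
+/* Illustrative usage sketch (not part of upstream cglm): shows how the
+   column-major SSE2 kernels above are meant to be called. Multiplying by
+   the identity matrix should return the other operand unchanged, which
+   makes a quick sanity check. Wrapped in `#if 0` so it is never compiled;
+   the example function name is hypothetical and printf is only used for
+   the demonstration. */
+#if 0
+#include <stdio.h>
+
+static void
+glm_mat3_mul_sse2_example(void) {
+  /* cglm matrices are column-major: m[col][row] */
+  CGLM_ALIGN_MAT mat3 identity = {{1.0f, 0.0f, 0.0f},
+                                  {0.0f, 1.0f, 0.0f},
+                                  {0.0f, 0.0f, 1.0f}};
+  CGLM_ALIGN_MAT mat3 m        = {{1.0f, 2.0f, 3.0f},
+                                  {4.0f, 5.0f, 6.0f},
+                                  {7.0f, 8.0f, 9.0f}};
+  CGLM_ALIGN_MAT mat3 out;
+
+  /* out = identity * m, so out should equal m */
+  glm_mat3_mul_sse2(identity, m, out);
+
+  printf("%f %f %f\n", out[0][0], out[1][1], out[2][2]); /* 1 5 9 */
+}
+#endif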
+#endif /* cglm_mat3_sse_h */ diff --git a/external/cglm/simd/sse2/mat4.h b/external/cglm/simd/sse2/mat4.h new file mode 100644 index 0000000..2127e72 --- /dev/null +++ b/external/cglm/simd/sse2/mat4.h @@ -0,0 +1,573 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_mat_sse_h +#define cglm_mat_sse_h +#if defined( __SSE__ ) || defined( __SSE2__ ) + +#include "../../common.h" +#include "../intrin.h" + +#define glm_mat4_inv_precise_sse2(mat, dest) glm_mat4_inv_sse2(mat, dest) + +CGLM_INLINE +void +glm_mat4_scale_sse2(mat4 m, float s) { + __m128 x0; + x0 = glmm_set1(s); + + glmm_store(m[0], _mm_mul_ps(glmm_load(m[0]), x0)); + glmm_store(m[1], _mm_mul_ps(glmm_load(m[1]), x0)); + glmm_store(m[2], _mm_mul_ps(glmm_load(m[2]), x0)); + glmm_store(m[3], _mm_mul_ps(glmm_load(m[3]), x0)); +} + +CGLM_INLINE +void +glm_mat4_transp_sse2(mat4 m, mat4 dest) { + __m128 r0, r1, r2, r3; + + r0 = glmm_load(m[0]); + r1 = glmm_load(m[1]); + r2 = glmm_load(m[2]); + r3 = glmm_load(m[3]); + + _MM_TRANSPOSE4_PS(r0, r1, r2, r3); + + glmm_store(dest[0], r0); + glmm_store(dest[1], r1); + glmm_store(dest[2], r2); + glmm_store(dest[3], r3); +} + +CGLM_INLINE +void +glm_mat4_mul_sse2(mat4 m1, mat4 m2, mat4 dest) { + /* D = R * L (Column-Major) */ + + glmm_128 l, r0, r1, r2, r3, v0, v1, v2, v3; + + l = glmm_load(m1[0]); + r0 = glmm_load(m2[0]); + r1 = glmm_load(m2[1]); + r2 = glmm_load(m2[2]); + r3 = glmm_load(m2[3]); + + v0 = _mm_mul_ps(glmm_splat_x(r0), l); + v1 = _mm_mul_ps(glmm_splat_x(r1), l); + v2 = _mm_mul_ps(glmm_splat_x(r2), l); + v3 = _mm_mul_ps(glmm_splat_x(r3), l); + + l = glmm_load(m1[1]); + v0 = glmm_fmadd(glmm_splat_y(r0), l, v0); + v1 = glmm_fmadd(glmm_splat_y(r1), l, v1); + v2 = glmm_fmadd(glmm_splat_y(r2), l, v2); + v3 = glmm_fmadd(glmm_splat_y(r3), l, v3); + + l = glmm_load(m1[2]); + v0 = glmm_fmadd(glmm_splat_z(r0), l, v0); + v1 = glmm_fmadd(glmm_splat_z(r1), l, v1); + v2 = glmm_fmadd(glmm_splat_z(r2), l, v2); + v3 = glmm_fmadd(glmm_splat_z(r3), l, v3); + + l = glmm_load(m1[3]); + v0 = glmm_fmadd(glmm_splat_w(r0), l, v0); + v1 = glmm_fmadd(glmm_splat_w(r1), l, v1); + v2 = glmm_fmadd(glmm_splat_w(r2), l, v2); + v3 = glmm_fmadd(glmm_splat_w(r3), l, v3); + + glmm_store(dest[0], v0); + glmm_store(dest[1], v1); + glmm_store(dest[2], v2); + glmm_store(dest[3], v3); +} + +CGLM_INLINE +void +glm_mat4_mulv_sse2(mat4 m, vec4 v, vec4 dest) { + __m128 x0, x1, m0, m1, m2, m3, v0, v1, v2, v3; + + m0 = glmm_load(m[0]); + m1 = glmm_load(m[1]); + m2 = glmm_load(m[2]); + m3 = glmm_load(m[3]); + + x0 = glmm_load(v); + v0 = glmm_splat_x(x0); + v1 = glmm_splat_y(x0); + v2 = glmm_splat_z(x0); + v3 = glmm_splat_w(x0); + + x1 = _mm_mul_ps(m3, v3); + x1 = glmm_fmadd(m2, v2, x1); + x1 = glmm_fmadd(m1, v1, x1); + x1 = glmm_fmadd(m0, v0, x1); + + glmm_store(dest, x1); +} + +CGLM_INLINE +float +glm_mat4_det_sse2(mat4 mat) { + __m128 r0, r1, r2, r3, x0, x1, x2; + + /* 127 <- 0, [square] det(A) = det(At) */ + r0 = glmm_load(mat[0]); /* d c b a */ + r1 = glmm_load(mat[1]); /* h g f e */ + r2 = glmm_load(mat[2]); /* l k j i */ + r3 = glmm_load(mat[3]); /* p o n m */ + + /* + t[1] = j * p - n * l; + t[2] = j * o - n * k; + t[3] = i * p - m * l; + t[4] = i * o - m * k; + */ + x0 = glmm_fnmadd(glmm_shuff1(r3, 0, 0, 1, 1), glmm_shuff1(r2, 2, 3, 2, 3), + _mm_mul_ps(glmm_shuff1(r2, 0, 0, 1, 1), + glmm_shuff1(r3, 2, 3, 2, 3))); + /* + t[0] = k * p - o * l; + t[0] = k * p - o * l; + t[5] = i * n - m * j; + t[5] = i * n - 
m * j; + */ + x1 = glmm_fnmadd(glmm_shuff1(r3, 0, 0, 2, 2), glmm_shuff1(r2, 1, 1, 3, 3), + _mm_mul_ps(glmm_shuff1(r2, 0, 0, 2, 2), + glmm_shuff1(r3, 1, 1, 3, 3))); + + /* + a * (f * t[0] - g * t[1] + h * t[2]) + - b * (e * t[0] - g * t[3] + h * t[4]) + + c * (e * t[1] - f * t[3] + h * t[5]) + - d * (e * t[2] - f * t[4] + g * t[5]) + */ + x2 = glmm_fnmadd(glmm_shuff1(r1, 1, 1, 2, 2), glmm_shuff1(x0, 3, 2, 2, 0), + _mm_mul_ps(glmm_shuff1(r1, 0, 0, 0, 1), + _mm_shuffle_ps(x1, x0, _MM_SHUFFLE(1, 0, 0, 0)))); + x2 = glmm_fmadd(glmm_shuff1(r1, 2, 3, 3, 3), + _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 2, 3, 1)), + x2); + + x2 = _mm_xor_ps(x2, glmm_float32x4_SIGNMASK_NPNP); + + return glmm_hadd(_mm_mul_ps(x2, r0)); +} + +CGLM_INLINE +void +glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) { + __m128 r0, r1, r2, r3, + v0, v1, v2, v3, + t0, t1, t2, t3, t4, t5, + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9; + + /* x8 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f); */ + x8 = glmm_float32x4_SIGNMASK_NPNP; + x9 = glmm_shuff1(x8, 2, 1, 2, 1); + + /* 127 <- 0 */ + r0 = glmm_load(mat[0]); /* d c b a */ + r1 = glmm_load(mat[1]); /* h g f e */ + r2 = glmm_load(mat[2]); /* l k j i */ + r3 = glmm_load(mat[3]); /* p o n m */ + + x0 = _mm_movehl_ps(r3, r2); /* p o l k */ + x3 = _mm_movelh_ps(r2, r3); /* n m j i */ + x1 = glmm_shuff1(x0, 1, 3, 3 ,3); /* l p p p */ + x2 = glmm_shuff1(x0, 0, 2, 2, 2); /* k o o o */ + x4 = glmm_shuff1(x3, 1, 3, 3, 3); /* j n n n */ + x7 = glmm_shuff1(x3, 0, 2, 2, 2); /* i m m m */ + + x6 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0, 0, 0, 0)); /* e e i i */ + x5 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(1, 1, 1, 1)); /* f f j j */ + x3 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(2, 2, 2, 2)); /* g g k k */ + x0 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(3, 3, 3, 3)); /* h h l l */ + + t0 = _mm_mul_ps(x3, x1); + t1 = _mm_mul_ps(x5, x1); + t2 = _mm_mul_ps(x5, x2); + t3 = _mm_mul_ps(x6, x1); + t4 = _mm_mul_ps(x6, x2); + t5 = _mm_mul_ps(x6, x4); + + /* t1[0] = k * p - o * l; + t1[0] = k * p - o * l; + t2[0] = g * p - o * h; + t3[0] = g * l - k * h; */ + t0 = glmm_fnmadd(x2, x0, t0); + + /* t1[1] = j * p - n * l; + t1[1] = j * p - n * l; + t2[1] = f * p - n * h; + t3[1] = f * l - j * h; */ + t1 = glmm_fnmadd(x4, x0, t1); + + /* t1[2] = j * o - n * k + t1[2] = j * o - n * k; + t2[2] = f * o - n * g; + t3[2] = f * k - j * g; */ + t2 = glmm_fnmadd(x4, x3, t2); + + /* t1[3] = i * p - m * l; + t1[3] = i * p - m * l; + t2[3] = e * p - m * h; + t3[3] = e * l - i * h; */ + t3 = glmm_fnmadd(x7, x0, t3); + + /* t1[4] = i * o - m * k; + t1[4] = i * o - m * k; + t2[4] = e * o - m * g; + t3[4] = e * k - i * g; */ + t4 = glmm_fnmadd(x7, x3, t4); + + /* t1[5] = i * n - m * j; + t1[5] = i * n - m * j; + t2[5] = e * n - m * f; + t3[5] = e * j - i * f; */ + t5 = glmm_fnmadd(x7, x5, t5); + + x4 = _mm_movelh_ps(r0, r1); /* f e b a */ + x5 = _mm_movehl_ps(r1, r0); /* h g d c */ + + x0 = glmm_shuff1(x4, 0, 0, 0, 2); /* a a a e */ + x1 = glmm_shuff1(x4, 1, 1, 1, 3); /* b b b f */ + x2 = glmm_shuff1(x5, 0, 0, 0, 2); /* c c c g */ + x3 = glmm_shuff1(x5, 1, 1, 1, 3); /* d d d h */ + + v2 = _mm_mul_ps(x0, t1); + v1 = _mm_mul_ps(x0, t0); + v3 = _mm_mul_ps(x0, t2); + v0 = _mm_mul_ps(x1, t0); + + v2 = glmm_fnmadd(x1, t3, v2); + v3 = glmm_fnmadd(x1, t4, v3); + v0 = glmm_fnmadd(x2, t1, v0); + v1 = glmm_fnmadd(x2, t3, v1); + + v3 = glmm_fmadd(x2, t5, v3); + v0 = glmm_fmadd(x3, t2, v0); + v2 = glmm_fmadd(x3, t5, v2); + v1 = glmm_fmadd(x3, t4, v1); + + /* + dest[0][0] = f * t1[0] - g * t1[1] + h * t1[2]; + dest[0][1] =-(b * t1[0] - c * t1[1] + d * t1[2]); + dest[0][2] = b 
* t2[0] - c * t2[1] + d * t2[2]; + dest[0][3] =-(b * t3[0] - c * t3[1] + d * t3[2]); */ + v0 = _mm_xor_ps(v0, x8); + + /* + dest[2][0] = e * t1[1] - f * t1[3] + h * t1[5]; + dest[2][1] =-(a * t1[1] - b * t1[3] + d * t1[5]); + dest[2][2] = a * t2[1] - b * t2[3] + d * t2[5]; + dest[2][3] =-(a * t3[1] - b * t3[3] + d * t3[5]);*/ + v2 = _mm_xor_ps(v2, x8); + + /* + dest[1][0] =-(e * t1[0] - g * t1[3] + h * t1[4]); + dest[1][1] = a * t1[0] - c * t1[3] + d * t1[4]; + dest[1][2] =-(a * t2[0] - c * t2[3] + d * t2[4]); + dest[1][3] = a * t3[0] - c * t3[3] + d * t3[4]; */ + v1 = _mm_xor_ps(v1, x9); + + /* + dest[3][0] =-(e * t1[2] - f * t1[4] + g * t1[5]); + dest[3][1] = a * t1[2] - b * t1[4] + c * t1[5]; + dest[3][2] =-(a * t2[2] - b * t2[4] + c * t2[5]); + dest[3][3] = a * t3[2] - b * t3[4] + c * t3[5]; */ + v3 = _mm_xor_ps(v3, x9); + + /* determinant */ + x0 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(0, 0, 0, 0)); + x1 = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 0, 0)); + x0 = _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 0, 2, 0)); + + x0 = _mm_rcp_ps(glmm_vhadd(_mm_mul_ps(x0, r0))); + + glmm_store(dest[0], _mm_mul_ps(v0, x0)); + glmm_store(dest[1], _mm_mul_ps(v1, x0)); + glmm_store(dest[2], _mm_mul_ps(v2, x0)); + glmm_store(dest[3], _mm_mul_ps(v3, x0)); +} + +/* old one */ +#if 0 +CGLM_INLINE +void +glm_mat4_inv_sse2(mat4 mat, mat4 dest) { + __m128 r0, r1, r2, r3, + v0, v1, v2, v3, + t0, t1, t2, t3, t4, t5, + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9; + + /* x8 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f); */ + x8 = glmm_float32x4_SIGNMASK_NPNP; + x9 = glmm_shuff1(x8, 2, 1, 2, 1); + + /* 127 <- 0 */ + r0 = glmm_load(mat[0]); /* d c b a */ + r1 = glmm_load(mat[1]); /* h g f e */ + r2 = glmm_load(mat[2]); /* l k j i */ + r3 = glmm_load(mat[3]); /* p o n m */ + + x0 = _mm_movehl_ps(r3, r2); /* p o l k */ + x3 = _mm_movelh_ps(r2, r3); /* n m j i */ + x1 = glmm_shuff1(x0, 1, 3, 3 ,3); /* l p p p */ + x2 = glmm_shuff1(x0, 0, 2, 2, 2); /* k o o o */ + x4 = glmm_shuff1(x3, 1, 3, 3, 3); /* j n n n */ + x7 = glmm_shuff1(x3, 0, 2, 2, 2); /* i m m m */ + + x6 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0, 0, 0, 0)); /* e e i i */ + x5 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(1, 1, 1, 1)); /* f f j j */ + x3 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(2, 2, 2, 2)); /* g g k k */ + x0 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(3, 3, 3, 3)); /* h h l l */ + + t0 = _mm_mul_ps(x3, x1); + t1 = _mm_mul_ps(x5, x1); + t2 = _mm_mul_ps(x5, x2); + t3 = _mm_mul_ps(x6, x1); + t4 = _mm_mul_ps(x6, x2); + t5 = _mm_mul_ps(x6, x4); + + /* t1[0] = k * p - o * l; + t1[0] = k * p - o * l; + t2[0] = g * p - o * h; + t3[0] = g * l - k * h; */ + t0 = glmm_fnmadd(x2, x0, t0); + + /* t1[1] = j * p - n * l; + t1[1] = j * p - n * l; + t2[1] = f * p - n * h; + t3[1] = f * l - j * h; */ + t1 = glmm_fnmadd(x4, x0, t1); + + /* t1[2] = j * o - n * k + t1[2] = j * o - n * k; + t2[2] = f * o - n * g; + t3[2] = f * k - j * g; */ + t2 = glmm_fnmadd(x4, x3, t2); + + /* t1[3] = i * p - m * l; + t1[3] = i * p - m * l; + t2[3] = e * p - m * h; + t3[3] = e * l - i * h; */ + t3 = glmm_fnmadd(x7, x0, t3); + + /* t1[4] = i * o - m * k; + t1[4] = i * o - m * k; + t2[4] = e * o - m * g; + t3[4] = e * k - i * g; */ + t4 = glmm_fnmadd(x7, x3, t4); + + /* t1[5] = i * n - m * j; + t1[5] = i * n - m * j; + t2[5] = e * n - m * f; + t3[5] = e * j - i * f; */ + t5 = glmm_fnmadd(x7, x5, t5); + + x4 = _mm_movelh_ps(r0, r1); /* f e b a */ + x5 = _mm_movehl_ps(r1, r0); /* h g d c */ + + x0 = glmm_shuff1(x4, 0, 0, 0, 2); /* a a a e */ + x1 = glmm_shuff1(x4, 1, 1, 1, 3); /* b b b f */ + x2 = glmm_shuff1(x5, 0, 0, 0, 
2); /* c c c g */ + x3 = glmm_shuff1(x5, 1, 1, 1, 3); /* d d d h */ + + v2 = _mm_mul_ps(x0, t1); + v1 = _mm_mul_ps(x0, t0); + v3 = _mm_mul_ps(x0, t2); + v0 = _mm_mul_ps(x1, t0); + + v2 = glmm_fnmadd(x1, t3, v2); + v3 = glmm_fnmadd(x1, t4, v3); + v0 = glmm_fnmadd(x2, t1, v0); + v1 = glmm_fnmadd(x2, t3, v1); + + v3 = glmm_fmadd(x2, t5, v3); + v0 = glmm_fmadd(x3, t2, v0); + v2 = glmm_fmadd(x3, t5, v2); + v1 = glmm_fmadd(x3, t4, v1); + + /* + dest[0][0] = f * t1[0] - g * t1[1] + h * t1[2]; + dest[0][1] =-(b * t1[0] - c * t1[1] + d * t1[2]); + dest[0][2] = b * t2[0] - c * t2[1] + d * t2[2]; + dest[0][3] =-(b * t3[0] - c * t3[1] + d * t3[2]); */ + v0 = _mm_xor_ps(v0, x8); + + /* + dest[2][0] = e * t1[1] - f * t1[3] + h * t1[5]; + dest[2][1] =-(a * t1[1] - b * t1[3] + d * t1[5]); + dest[2][2] = a * t2[1] - b * t2[3] + d * t2[5]; + dest[2][3] =-(a * t3[1] - b * t3[3] + d * t3[5]);*/ + v2 = _mm_xor_ps(v2, x8); + + /* + dest[1][0] =-(e * t1[0] - g * t1[3] + h * t1[4]); + dest[1][1] = a * t1[0] - c * t1[3] + d * t1[4]; + dest[1][2] =-(a * t2[0] - c * t2[3] + d * t2[4]); + dest[1][3] = a * t3[0] - c * t3[3] + d * t3[4]; */ + v1 = _mm_xor_ps(v1, x9); + + /* + dest[3][0] =-(e * t1[2] - f * t1[4] + g * t1[5]); + dest[3][1] = a * t1[2] - b * t1[4] + c * t1[5]; + dest[3][2] =-(a * t2[2] - b * t2[4] + c * t2[5]); + dest[3][3] = a * t3[2] - b * t3[4] + c * t3[5]; */ + v3 = _mm_xor_ps(v3, x9); + + /* determinant */ + x0 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(0, 0, 0, 0)); + x1 = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 0, 0)); + x0 = _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 0, 2, 0)); + + x0 = _mm_div_ps(glmm_set1(1.0f), glmm_vhadd(_mm_mul_ps(x0, r0))); + + glmm_store(dest[0], _mm_mul_ps(v0, x0)); + glmm_store(dest[1], _mm_mul_ps(v1, x0)); + glmm_store(dest[2], _mm_mul_ps(v2, x0)); + glmm_store(dest[3], _mm_mul_ps(v3, x0)); +} +#endif + +CGLM_INLINE +void +glm_mat4_inv_sse2(mat4 mat, mat4 dest) { + __m128 r0, r1, r2, r3, s1, s2, + v0, v1, v2, v3, v4, v5, + t0, t1, t2, + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13; + + /* s1 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f); */ + s1 = glmm_float32x4_SIGNMASK_NPNP; + s2 = glmm_shuff1(s1, 2, 1, 2, 1); + + /* 127 <- 0 */ + r1 = glmm_load(mat[1]); /* h g f e */ + r0 = glmm_load(mat[0]); /* d c b a */ + r3 = glmm_load(mat[3]); /* p o n m */ + r2 = glmm_load(mat[2]); /* l k j i */ + + x4 = _mm_unpackhi_ps(r0, r2); /* l d k c */ + x5 = _mm_unpacklo_ps(r0, r2); /* j b i a */ + x6 = _mm_unpackhi_ps(r1, r3); /* p h o g */ + x7 = _mm_unpacklo_ps(r1, r3); /* n f m e */ + + x0 = _mm_unpackhi_ps(x7, x5); /* j n b f */ + x1 = _mm_unpacklo_ps(x7, x5); /* i m a e */ + x2 = _mm_unpackhi_ps(x6, x4); /* l p d h */ + x3 = _mm_unpacklo_ps(x6, x4); /* k o c g */ + + /* c2 = c * h - d * g c12 = a * g - c * e c8 = a * f - b * e + c1 = k * p - l * o c11 = i * o - k * m c7 = i * n - j * m + c4 = a * h - d * e c6 = b * h - d * f c10 = b * g - c * f + c3 = i * p - l * m c5 = j * p - l * n c9 = j * o - k * n */ + + x8 = _mm_shuffle_ps(x0, x3, _MM_SHUFFLE(3, 1, 3, 1)); /* k c j b */ + x9 = _mm_shuffle_ps(x0, x3, _MM_SHUFFLE(2, 0, 2, 0)); /* o g n f */ + + x10 = glmm_shuff1(x2, 2, 0, 2, 0); /* p h p h */ + x11 = glmm_shuff1(x2, 3, 1, 3, 1); /* l d l d */ + +#if 0 /* TODO measure both */ + x12 = _mm_shuffle_ps(x4, x5, _MM_SHUFFLE(1, 0, 1, 0)); /* i a k c */ + x13 = _mm_shuffle_ps(x6, x7, _MM_SHUFFLE(1, 0, 1, 0)); /* m e o g */ +#else + x12 = _mm_movelh_ps(x4, x5); /* i a k c */ + x13 = _mm_movelh_ps(x6, x7); /* m e o g */ +#endif + + t0 = _mm_mul_ps(x12, x10); + t1 = _mm_mul_ps(x5, x6); + t2 = 
_mm_mul_ps(x5, x9); + + t0 = glmm_fnmadd(x11, x13, t0); + t1 = glmm_fnmadd(x4, x7, t1); + t2 = glmm_fnmadd(x8, x7, t2); + + /* det */ + /* v0: c3 * c10 + c4 * c9 + c1 * c8 + c2 * c7 */ + /* v1: c5 * c12 + c6 * c11 */ + + v5 = glmm_set1_rval(1.0f); + v0 = glmm_shuff1(t2, 2, 3, 0, 1); + v1 = glmm_shuff1(t1, 0, 1, 2, 3); + v0 = _mm_mul_ps(t0, v0); + v1 = _mm_mul_ps(t1, v1); + v2 = glmm_shuff1(v1, 1, 0, 0, 1); + v3 = glmm_shuff1(v0, 0, 1, 2, 3); + v1 = _mm_add_ps(v1, v2); + v0 = _mm_add_ps(v0, v3); + v2 = glmm_shuff1(v0, 1, 0, 0, 1); + v0 = _mm_add_ps(v0, v2); + + v0 = _mm_sub_ps(v0, v1); /* det */ + v0 = _mm_div_ps(v5, v0); /* idt */ + + /* multiply t0,t1,t2 by idt to reduce 1mul below: 2eor+4mul vs 3mul+4eor */ + t0 = _mm_mul_ps(t0, v0); + t1 = _mm_mul_ps(t1, v0); + t2 = _mm_mul_ps(t2, v0); + + v0 = glmm_shuff1(t0, 0, 0, 1, 1); /* c2 c2 c1 c1 */ + v1 = glmm_shuff1(t0, 2, 2, 3, 3); /* c4 c4 c3 c3 */ + v2 = glmm_shuff1(t1, 0, 0, 1, 1); /* c12 c12 c11 c11 */ + v3 = glmm_shuff1(t1, 2, 2, 3, 3); /* c6 c6 c5 c5 */ + v4 = glmm_shuff1(t2, 0, 0, 1, 1); /* c8 c8 c7 c7 */ + v5 = glmm_shuff1(t2, 2, 2, 3, 3); /* c10 c10 c9 c9 */ + + /* result */ + + /* dest[0][0] = (f * c1 - g * c5 + h * c9) * idt; + dest[0][1] = (b * c1 - c * c5 + d * c9) * ndt; + dest[0][2] = (n * c2 - o * c6 + p * c10) * idt; + dest[0][3] = (j * c2 - k * c6 + l * c10) * ndt; + + dest[1][0] = (e * c1 - g * c3 + h * c11) * ndt; + dest[1][1] = (a * c1 - c * c3 + d * c11) * idt; + dest[1][2] = (m * c2 - o * c4 + p * c12) * ndt; + dest[1][3] = (i * c2 - k * c4 + l * c12) * idt; + + dest[2][0] = (e * c5 - f * c3 + h * c7) * idt; + dest[2][1] = (a * c5 - b * c3 + d * c7) * ndt; + dest[2][2] = (m * c6 - n * c4 + p * c8) * idt; + dest[2][3] = (i * c6 - j * c4 + l * c8) * ndt; + + dest[3][0] = (e * c9 - f * c11 + g * c7) * ndt; + dest[3][1] = (a * c9 - b * c11 + c * c7) * idt; + dest[3][2] = (m * c10 - n * c12 + o * c8) * ndt; + dest[3][3] = (i * c10 - j * c12 + k * c8) * idt; */ + + r0 = _mm_mul_ps(x0, v0); + r1 = _mm_mul_ps(x1, v0); + r2 = _mm_mul_ps(x1, v3); + r3 = _mm_mul_ps(x1, v5); + + r0 = glmm_fnmadd(x3, v3, r0); + r1 = glmm_fnmadd(x3, v1, r1); + r2 = glmm_fnmadd(x0, v1, r2); + r3 = glmm_fnmadd(x0, v2, r3); + + r0 = glmm_fmadd(x2, v5, r0); + r1 = glmm_fmadd(x2, v2, r1); + r2 = glmm_fmadd(x2, v4, r2); + r3 = glmm_fmadd(x3, v4, r3); + + /* 4xor may be fastart then 4mul, see above */ + r0 = _mm_xor_ps(r0, s1); + r1 = _mm_xor_ps(r1, s2); + r2 = _mm_xor_ps(r2, s1); + r3 = _mm_xor_ps(r3, s2); + + glmm_store(dest[0], r0); + glmm_store(dest[1], r1); + glmm_store(dest[2], r2); + glmm_store(dest[3], r3); +} +#endif +#endif /* cglm_mat_sse_h */ diff --git a/external/cglm/simd/sse2/quat.h b/external/cglm/simd/sse2/quat.h new file mode 100644 index 0000000..def0fe2 --- /dev/null +++ b/external/cglm/simd/sse2/quat.h @@ -0,0 +1,54 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_quat_simd_h +#define cglm_quat_simd_h +#if defined( __SSE__ ) || defined( __SSE2__ ) + +#include "../../common.h" +#include "../intrin.h" + +CGLM_INLINE +void +glm_quat_mul_sse2(versor p, versor q, versor dest) { + /* + + (a1 b2 + b1 a2 + c1 d2 − d1 c2)i + + (a1 c2 − b1 d2 + c1 a2 + d1 b2)j + + (a1 d2 + b1 c2 − c1 b2 + d1 a2)k + a1 a2 − b1 b2 − c1 c2 − d1 d2 + */ + + __m128 xp, xq, x1, x2, x3, r, x, y, z; + + xp = glmm_load(p); /* 3 2 1 0 */ + xq = glmm_load(q); + x1 = glmm_float32x4_SIGNMASK_NPNP; /* TODO: _mm_set1_ss() + shuff ? 
*/ + r = _mm_mul_ps(glmm_splat_w(xp), xq); + + x2 = _mm_unpackhi_ps(x1, x1); + x3 = glmm_shuff1(x1, 3, 2, 0, 1); + x = glmm_splat_x(xp); + y = glmm_splat_y(xp); + z = glmm_splat_z(xp); + + x = _mm_xor_ps(x, x1); + y = _mm_xor_ps(y, x2); + z = _mm_xor_ps(z, x3); + + x1 = glmm_shuff1(xq, 0, 1, 2, 3); + x2 = glmm_shuff1(xq, 1, 0, 3, 2); + x3 = glmm_shuff1(xq, 2, 3, 0, 1); + + r = glmm_fmadd(x, x1, r); + r = glmm_fmadd(y, x2, r); + r = glmm_fmadd(z, x3, r); + + glmm_store(dest, r); +} + +#endif +#endif /* cglm_quat_simd_h */ diff --git a/external/cglm/simd/wasm.h b/external/cglm/simd/wasm.h new file mode 100644 index 0000000..2ced51f --- /dev/null +++ b/external/cglm/simd/wasm.h @@ -0,0 +1,198 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_simd_wasm_h +#define cglm_simd_wasm_h +#include "intrin.h" +#ifdef CGLM_SIMD_WASM +#include + +#define glmm_load(p) wasm_v128_load(p) +#define glmm_store(p, a) wasm_v128_store(p, (a)) + +#define glmm_set1(x) wasm_f32x4_splat(x) +#define glmm_set1_ptr(x) wasm_f32x4_splat(*x) +#define glmm_set1_rval(x) wasm_f32x4_splat(x) +#define glmm_128 v128_t + +#define glmm_shuff1(xmm, z, y, x, w) wasm_i32x4_shuffle(xmm, xmm, w, x, y, z) + +#define glmm_splat(x, lane) glmm_shuff1(x, lane, lane, lane, lane) + +#define glmm_splat_x(x) glmm_splat(x, 0) +#define glmm_splat_y(x) glmm_splat(x, 1) +#define glmm_splat_z(x) glmm_splat(x, 2) +#define glmm_splat_w(x) glmm_splat(x, 3) + +#define GLMM_NEGZEROf 0x80000000 /* 0x80000000 ---> -0.0f */ + +/* _mm_set_ps(X, Y, Z, W); */ +#define GLMM__SIGNMASKf(X, Y, Z, W) wasm_i32x4_const(X, Y, Z, W) + +#define glmm_float32x4_SIGNMASK_PNPN GLMM__SIGNMASKf(0, GLMM_NEGZEROf, 0, GLMM_NEGZEROf) +#define glmm_float32x4_SIGNMASK_NPNP GLMM__SIGNMASKf(GLMM_NEGZEROf, 0, GLMM_NEGZEROf, 0) +#define glmm_float32x4_SIGNMASK_NPPN GLMM__SIGNMASKf(GLMM_NEGZEROf, 0, 0, GLMM_NEGZEROf) +#define glmm_float32x4_SIGNMASK_NEG wasm_i32x4_const_splat(GLMM_NEGZEROf) + +static inline glmm_128 glmm_abs(glmm_128 x) { return wasm_f32x4_abs(x); } +static inline glmm_128 glmm_min(glmm_128 a, glmm_128 b) { return wasm_f32x4_pmin(b, a); } +static inline glmm_128 glmm_max(glmm_128 a, glmm_128 b) { return wasm_f32x4_pmax(b, a); } + +static inline +glmm_128 +glmm_vhadd(glmm_128 v) { + glmm_128 x0; + x0 = wasm_f32x4_add(v, glmm_shuff1(v, 0, 1, 2, 3)); + x0 = wasm_f32x4_add(x0, glmm_shuff1(x0, 1, 0, 0, 1)); + return x0; +} + +static inline +glmm_128 +glmm_vhadds(glmm_128 v) { + glmm_128 shuf, sums; + shuf = glmm_shuff1(v, 2, 3, 0, 1); + sums = wasm_f32x4_add(v, shuf); + /* shuf = _mm_movehl_ps(shuf, sums); */ + shuf = wasm_i32x4_shuffle(shuf, sums, 6, 7, 2, 3); + sums = wasm_i32x4_shuffle(sums, wasm_f32x4_add(sums, shuf), 4, 1, 2, 3); + return sums; +} + +static inline +float +glmm_hadd(glmm_128 v) { + return wasm_f32x4_extract_lane(glmm_vhadds(v), 0); +} + +static inline +glmm_128 +glmm_vhmin(glmm_128 v) { + glmm_128 x0, x1, x2; + x0 = glmm_shuff1(v, 2, 3, 2, 3); /* [2, 3, 2, 3] */ + x1 = wasm_f32x4_pmin(x0, v); /* [0|2, 1|3, 2|2, 3|3] */ + x2 = glmm_splat(x1, 1); /* [1|3, 1|3, 1|3, 1|3] */ + return wasm_f32x4_pmin(x1, x2); +} + +static inline +float +glmm_hmin(glmm_128 v) { + return wasm_f32x4_extract_lane(glmm_vhmin(v), 0); +} + +static inline +glmm_128 +glmm_vhmax(glmm_128 v) { + glmm_128 x0, x1, x2; + x0 = glmm_shuff1(v, 2, 3, 2, 3); /* [2, 3, 2, 3] */ + x1 = wasm_f32x4_pmax(x0, v); /* [0|2, 1|3, 2|2, 3|3] */ + x2 = glmm_splat(x1, 1); /* [1|3, 
1|3, 1|3, 1|3] */ + /* _mm_max_ss */ + return wasm_i32x4_shuffle(x1, wasm_f32x4_pmax(x1, x2), 4, 1, 2, 3); +} + +static inline +float +glmm_hmax(glmm_128 v) { + return wasm_f32x4_extract_lane(glmm_vhmax(v), 0); +} + +static inline +glmm_128 +glmm_vdots(glmm_128 a, glmm_128 b) { + return glmm_vhadds(wasm_f32x4_mul(a, b)); +} + +static inline +glmm_128 +glmm_vdot(glmm_128 a, glmm_128 b) { + glmm_128 x0; + x0 = wasm_f32x4_mul(a, b); + x0 = wasm_f32x4_add(x0, glmm_shuff1(x0, 1, 0, 3, 2)); + return wasm_f32x4_add(x0, glmm_shuff1(x0, 0, 1, 0, 1)); +} + +static inline +float +glmm_dot(glmm_128 a, glmm_128 b) { + return wasm_f32x4_extract_lane(glmm_vdots(a, b), 0); +} + +static inline +float +glmm_norm(glmm_128 a) { + glmm_128 x0; + x0 = glmm_vhadds(wasm_f32x4_mul(a, a)); + return wasm_f32x4_extract_lane( + wasm_i32x4_shuffle(x0, wasm_f32x4_sqrt(x0),4, 1, 2, 3), 0); +} + +static inline +float +glmm_norm2(glmm_128 a) { + return wasm_f32x4_extract_lane(glmm_vhadds(wasm_f32x4_mul(a, a)), 0); +} + +static inline +float +glmm_norm_one(glmm_128 a) { + return wasm_f32x4_extract_lane(glmm_vhadds(glmm_abs(a)), 0); +} + +static inline +float +glmm_norm_inf(glmm_128 a) { + return wasm_f32x4_extract_lane(glmm_vhmax(glmm_abs(a)), 0); +} + +static inline +glmm_128 +glmm_load3(float v[3]) { + glmm_128 xy = wasm_v128_load64_zero(v); + return wasm_f32x4_replace_lane(xy, 2, v[2]); +} + +static inline +void +glmm_store3(float v[3], glmm_128 vx) { + wasm_v128_store64_lane(v, vx, 0); + wasm_v128_store32_lane(&v[2], vx, 2); +} + +static inline +glmm_128 +glmm_div(glmm_128 a, glmm_128 b) { + return wasm_f32x4_div(a, b); +} + +static inline +glmm_128 +glmm_fmadd(glmm_128 a, glmm_128 b, glmm_128 c) { + return wasm_f32x4_add(c, wasm_f32x4_mul(a, b)); +} + +static inline +glmm_128 +glmm_fnmadd(glmm_128 a, glmm_128 b, glmm_128 c) { + return wasm_f32x4_sub(c, wasm_f32x4_mul(a, b)); +} + +static inline +glmm_128 +glmm_fmsub(glmm_128 a, glmm_128 b, glmm_128 c) { + return wasm_f32x4_sub(wasm_f32x4_mul(a, b), c); +} + +static inline +glmm_128 +glmm_fnmsub(glmm_128 a, glmm_128 b, glmm_128 c) { + return wasm_f32x4_neg(wasm_f32x4_add(wasm_f32x4_mul(a, b), c)); +} + +#endif +#endif /* cglm_simd_wasm_h */ diff --git a/external/cglm/simd/wasm/affine.h b/external/cglm/simd/wasm/affine.h new file mode 100644 index 0000000..80b98fb --- /dev/null +++ b/external/cglm/simd/wasm/affine.h @@ -0,0 +1,127 @@ +/* + * Copyright (c), Recep Aslantas. 
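Taken together, the glmm_* wrappers above give the WASM backend the same small vector vocabulary the rest of cglm is written against. A minimal usage sketch follows (illustrative only, not part of the cglm sources); it assumes a wasm32 target built with -msimd128 so that CGLM_SIMD_WASM is defined, and uses only the wrappers defined above (glmm_load, glmm_dot, glmm_norm, glmm_fmadd, where glmm_fmadd(a, b, c) is a*b + c per lane):

    float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float b[4] = {4.0f, 3.0f, 2.0f, 1.0f};
    glmm_128 va  = glmm_load(a), vb = glmm_load(b);
    float    dot = glmm_dot(va, vb);           // 1*4 + 2*3 + 3*2 + 4*1 = 20
    float    len = glmm_norm(va);              // sqrtf(1 + 4 + 9 + 16) = sqrtf(30)
    glmm_128 fma = glmm_fmadd(va, vb, vb);     // per-lane a*b + c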
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_affine_mat_wasm_h +#define cglm_affine_mat_wasm_h +#if defined(__wasm__) && defined(__wasm_simd128__) + +#include "../../common.h" +#include "../intrin.h" + +CGLM_INLINE +void +glm_mul_wasm(mat4 m1, mat4 m2, mat4 dest) { + /* D = R * L (Column-Major) */ + glmm_128 l, r0, r1, r2, r3, v0, v1, v2, v3; + + l = glmm_load(m1[0]); + r0 = glmm_load(m2[0]); + r1 = glmm_load(m2[1]); + r2 = glmm_load(m2[2]); + r3 = glmm_load(m2[3]); + + v0 = wasm_f32x4_mul(glmm_splat_x(r0), l); + v1 = wasm_f32x4_mul(glmm_splat_x(r1), l); + v2 = wasm_f32x4_mul(glmm_splat_x(r2), l); + v3 = wasm_f32x4_mul(glmm_splat_x(r3), l); + + l = glmm_load(m1[1]); + v0 = glmm_fmadd(glmm_splat_y(r0), l, v0); + v1 = glmm_fmadd(glmm_splat_y(r1), l, v1); + v2 = glmm_fmadd(glmm_splat_y(r2), l, v2); + v3 = glmm_fmadd(glmm_splat_y(r3), l, v3); + + l = glmm_load(m1[2]); + v0 = glmm_fmadd(glmm_splat_z(r0), l, v0); + v1 = glmm_fmadd(glmm_splat_z(r1), l, v1); + v2 = glmm_fmadd(glmm_splat_z(r2), l, v2); + v3 = glmm_fmadd(glmm_splat_z(r3), l, v3); + + l = glmm_load(m1[3]); + v3 = glmm_fmadd(glmm_splat_w(r3), l, v3); + + glmm_store(dest[0], v0); + glmm_store(dest[1], v1); + glmm_store(dest[2], v2); + glmm_store(dest[3], v3); +} + +CGLM_INLINE +void +glm_mul_rot_wasm(mat4 m1, mat4 m2, mat4 dest) { + /* D = R * L (Column-Major) */ + + glmm_128 l, r0, r1, r2, v0, v1, v2; + + l = glmm_load(m1[0]); + r0 = glmm_load(m2[0]); + r1 = glmm_load(m2[1]); + r2 = glmm_load(m2[2]); + + v0 = wasm_f32x4_mul(glmm_splat_x(r0), l); + v1 = wasm_f32x4_mul(glmm_splat_x(r1), l); + v2 = wasm_f32x4_mul(glmm_splat_x(r2), l); + + l = glmm_load(m1[1]); + v0 = glmm_fmadd(glmm_splat_y(r0), l, v0); + v1 = glmm_fmadd(glmm_splat_y(r1), l, v1); + v2 = glmm_fmadd(glmm_splat_y(r2), l, v2); + + l = glmm_load(m1[2]); + v0 = glmm_fmadd(glmm_splat_z(r0), l, v0); + v1 = glmm_fmadd(glmm_splat_z(r1), l, v1); + v2 = glmm_fmadd(glmm_splat_z(r2), l, v2); + + glmm_store(dest[0], v0); + glmm_store(dest[1], v1); + glmm_store(dest[2], v2); + glmm_store(dest[3], glmm_load(m1[3])); +} + +CGLM_INLINE +void +glm_inv_tr_wasm(mat4 mat) { + glmm_128 r0, r1, r2, r3, x0, x1, x2, x3, x4, x5; + + r0 = glmm_load(mat[0]); + r1 = glmm_load(mat[1]); + r2 = glmm_load(mat[2]); + r3 = glmm_load(mat[3]); + x1 = wasm_f32x4_const(0.0f, 0.0f, 0.0f, 1.0f); + + /* _MM_TRANSPOSE4_PS(r0, r1, r2, x1); */ + x2 = wasm_i32x4_shuffle(r0, r1, 0, 4, 1, 5); + x3 = wasm_i32x4_shuffle(r0, r1, 2, 6, 3, 7); + x4 = wasm_i32x4_shuffle(r2, x1, 0, 4, 1, 5); + x5 = wasm_i32x4_shuffle(r2, x1, 2, 6, 3, 7); + /* r0 = _mm_movelh_ps(x2, x4); */ + r0 = wasm_i32x4_shuffle(x2, x4, 0, 1, 4, 5); + /* r1 = _mm_movehl_ps(x4, x2); */ + r1 = wasm_i32x4_shuffle(x4, x2, 6, 7, 2, 3); + /* r2 = _mm_movelh_ps(x3, x5); */ + r2 = wasm_i32x4_shuffle(x3, x5, 0, 1, 4, 5); + /* x1 = _mm_movehl_ps(x5, x3); */ + x1 = wasm_i32x4_shuffle(x5, x3, 6, 7, 2, 3); + + x2 = glmm_shuff1(r3, 0, 0, 0, 0); + x3 = glmm_shuff1(r3, 1, 1, 1, 1); + x4 = glmm_shuff1(r3, 2, 2, 2, 2); + + x0 = glmm_fmadd(r0, x2, + glmm_fmadd(r1, x3, wasm_f32x4_mul(r2, x4))); + x0 = wasm_f32x4_neg(x0); + + x0 = wasm_f32x4_add(x0, x1); + + glmm_store(mat[0], r0); + glmm_store(mat[1], r1); + glmm_store(mat[2], r2); + glmm_store(mat[3], x0); +} + +#endif +#endif /* cglm_affine_mat_wasm_h */ diff --git a/external/cglm/simd/wasm/mat2.h b/external/cglm/simd/wasm/mat2.h new file mode 100644 index 0000000..80ce0fb --- /dev/null +++ b/external/cglm/simd/wasm/mat2.h @@ -0,0 +1,50 
@@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_mat2_wasm_h +#define cglm_mat2_wasm_h +#if defined(__wasm__) && defined(__wasm_simd128__) + +#include "../../common.h" +#include "../intrin.h" + +CGLM_INLINE +void +glm_mat2_mul_wasm(mat2 m1, mat2 m2, mat2 dest) { + glmm_128 x0, x1, x2, x3, x4; + + x1 = glmm_load(m1[0]); /* d c b a */ + x2 = glmm_load(m2[0]); /* h g f e */ + + x3 = glmm_shuff1(x2, 2, 2, 0, 0); + x4 = glmm_shuff1(x2, 3, 3, 1, 1); + /* x0 = _mm_movelh_ps(x1, x1); */ + x0 = wasm_i32x4_shuffle(x1, x1, 0, 1, 4, 5); + /* x2 = _mm_movehl_ps(x1, x1); */ + x2 = wasm_i32x4_shuffle(x1, x1, 6, 7, 2, 3); + + /* + dest[0][0] = a * e + c * f; + dest[0][1] = b * e + d * f; + dest[1][0] = a * g + c * h; + dest[1][1] = b * g + d * h; + */ + x0 = glmm_fmadd(x0, x3, wasm_f32x4_mul(x2, x4)); + + glmm_store(dest[0], x0); +} + +CGLM_INLINE +void +glm_mat2_transp_wasm(mat2 m, mat2 dest) { + /* d c b a */ + /* d b c a */ + glmm_store(dest[0], glmm_shuff1(glmm_load(m[0]), 3, 1, 2, 0)); +} + +#endif +#endif /* cglm_mat2_wasm_h */ diff --git a/external/cglm/simd/wasm/mat3.h b/external/cglm/simd/wasm/mat3.h new file mode 100644 index 0000000..dfe192d --- /dev/null +++ b/external/cglm/simd/wasm/mat3.h @@ -0,0 +1,85 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_mat3_wasm_h +#define cglm_mat3_wasm_h +#if defined(__wasm__) && defined(__wasm_simd128__) + +#include "../../common.h" +#include "../intrin.h" + +CGLM_INLINE +void +glm_mat3_mul_wasm(mat3 m1, mat3 m2, mat3 dest) { + glmm_128 l0, l1, l2, r0, r1, r2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9; + + l0 = wasm_v128_load(m1[0]); + l1 = wasm_v128_load(&m1[1][1]); + + r0 = wasm_v128_load(m2[0]); + r1 = wasm_v128_load(&m2[1][1]); + + x8 = glmm_shuff1(l0, 0, 2, 1, 0); /* a00 a02 a01 a00 */ + x1 = glmm_shuff1(r0, 3, 0, 0, 0); /* b10 b00 b00 b00 */ + x2 = wasm_i32x4_shuffle(l0, l1, 3, 3, 4, 5); /* a12 a11 a10 a10 */ + x3 = wasm_i32x4_shuffle(r0, r1, 1, 3, 4, 6); /* b20 b11 b10 b01 */ + x0 = wasm_f32x4_mul(x8, x1); + + x6 = glmm_shuff1(l0, 1, 0, 2, 1); /* a01 a00 a02 a01 */ + x7 = glmm_shuff1(x3, 3, 3, 1, 1); /* b20 b20 b10 b10 */ + l2 = wasm_v128_load32_zero(&m1[2][2]); + r2 = wasm_v128_load32_zero(&m2[2][2]); + x1 = wasm_f32x4_mul(x6, x7); + l2 = glmm_shuff1(l2, 0, 0, 1, 0); /* a22 a22 0.f a22 */ + r2 = glmm_shuff1(r2, 0, 0, 1, 0); /* b22 b22 0.f b22 */ + + x4 = glmm_shuff1(x2, 0, 3, 2, 0); /* a10 a12 a11 a10 */ + x5 = glmm_shuff1(x2, 2, 0, 3, 2); /* a11 a10 a12 a11 */ + x6 = glmm_shuff1(x3, 2, 0, 0, 0); /* b11 b01 b01 b01 */ + x2 = glmm_shuff1(r1, 3, 3, 0, 0); /* b21 b21 b11 b11 */ + + /* x8 = _mm_unpackhi_ps(x8, x4); */ + /* x9 = _mm_unpackhi_ps(x7, x2); */ + x8 = wasm_i32x4_shuffle(x8, x4, 2, 6, 3, 7); /* a10 a00 a12 a02 */ + x9 = wasm_i32x4_shuffle(x7, x2, 2, 6, 3, 7); /* b21 b20 b21 b20 */ + + x0 = glmm_fmadd(x4, x6, x0); + x1 = glmm_fmadd(x5, x2, x1); + + /* x2 = _mm_movehl_ps(l2, l1); */ + x2 = wasm_i32x4_shuffle(l2, l1, 6, 7, 2, 3); /* a22 a22 a21 a20 */ + x3 = glmm_shuff1(x2, 0, 2, 1, 0); /* a20 a22 a21 a20 */ + x2 = glmm_shuff1(x2, 1, 0, 2, 1); /* a21 a20 a22 a21 */ + x4 = wasm_i32x4_shuffle(r0, r1, 2, 2, 5, 5); /* b12 b12 b02 b02 */ + + x5 = glmm_shuff1(x4, 3, 0, 0, 0); /* b12 b02 b02 b02 */ + /* x4 = _mm_movehl_ps(r2, x4); */ + x4 = wasm_i32x4_shuffle(r2, x4, 6, 7, 2, 3); /* b22 b22 b12 b12 */ + x0 = 
glmm_fmadd(x3, x5, x0); + x1 = glmm_fmadd(x2, x4, x1); + + /* + Dot Product : dest[2][2] = a02 * b20 + + a12 * b21 + + a22 * b22 + + 0 * 00 */ + /* x2 = _mm_movelh_ps(x8, l2); */ + /* x3 = _mm_movelh_ps(x9, r2); */ + x2 = wasm_i32x4_shuffle(x8, l2, 0, 1, 4, 5); /* 0.f a22 a12 a02 */ + x3 = wasm_i32x4_shuffle(x9, r2, 0, 1, 4, 5); /* 0.f b22 b21 b20 */ + x2 = glmm_vdots(x2, x3); + + /* _mm_storeu_ps(&dest[0][0], x0); */ + wasm_v128_store(&dest[0][0], x0); + /* _mm_storeu_ps(&dest[1][1], x1); */ + wasm_v128_store(&dest[1][1], x1); + /* _mm_store_ss (&dest[2][2], x2); */ + wasm_v128_store32_lane(&dest[2][2], x2, 0); +} + +#endif +#endif /* cglm_mat3_wasm_h */ diff --git a/external/cglm/simd/wasm/mat4.h b/external/cglm/simd/wasm/mat4.h new file mode 100644 index 0000000..79ed688 --- /dev/null +++ b/external/cglm/simd/wasm/mat4.h @@ -0,0 +1,454 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_mat_wasm_h +#define cglm_mat_wasm_h +#if defined(__wasm__) && defined(__wasm_simd128__) + +#include "../../common.h" +#include "../intrin.h" + +#define glm_mat4_inv_precise_wasm(mat, dest) glm_mat4_inv_wasm(mat, dest) + +CGLM_INLINE +void +glm_mat4_scale_wasm(mat4 m, float s) { + glmm_128 x0; + x0 = wasm_f32x4_splat(s); + + glmm_store(m[0], wasm_f32x4_mul(glmm_load(m[0]), x0)); + glmm_store(m[1], wasm_f32x4_mul(glmm_load(m[1]), x0)); + glmm_store(m[2], wasm_f32x4_mul(glmm_load(m[2]), x0)); + glmm_store(m[3], wasm_f32x4_mul(glmm_load(m[3]), x0)); +} + +CGLM_INLINE +void +glm_mat4_transp_wasm(mat4 m, mat4 dest) { + glmm_128 r0, r1, r2, r3, tmp0, tmp1, tmp2, tmp3; + + r0 = glmm_load(m[0]); + r1 = glmm_load(m[1]); + r2 = glmm_load(m[2]); + r3 = glmm_load(m[3]); + + /* _MM_TRANSPOSE4_PS(r0, r1, r2, r3); */ + tmp0 = wasm_i32x4_shuffle(r0, r1, 0, 4, 1, 5); + tmp1 = wasm_i32x4_shuffle(r0, r1, 2, 6, 3, 7); + tmp2 = wasm_i32x4_shuffle(r2, r3, 0, 4, 1, 5); + tmp3 = wasm_i32x4_shuffle(r2, r3, 2, 6, 3, 7); + /* r0 = _mm_movelh_ps(tmp0, tmp2); */ + r0 = wasm_i32x4_shuffle(tmp0, tmp2, 0, 1, 4, 5); + /* r1 = _mm_movehl_ps(tmp2, tmp0); */ + r1 = wasm_i32x4_shuffle(tmp2, tmp0, 6, 7, 2, 3); + /* r2 = _mm_movelh_ps(tmp1, tmp3); */ + r2 = wasm_i32x4_shuffle(tmp1, tmp3, 0, 1, 4, 5); + /* r3 = _mm_movehl_ps(tmp3, tmp1); */ + r3 = wasm_i32x4_shuffle(tmp3, tmp1, 6, 7, 2, 3); + + glmm_store(dest[0], r0); + glmm_store(dest[1], r1); + glmm_store(dest[2], r2); + glmm_store(dest[3], r3); +} + +CGLM_INLINE +void +glm_mat4_mul_wasm(mat4 m1, mat4 m2, mat4 dest) { + /* D = R * L (Column-Major) */ + + glmm_128 l, r0, r1, r2, r3, v0, v1, v2, v3; + + l = glmm_load(m1[0]); + r0 = glmm_load(m2[0]); + r1 = glmm_load(m2[1]); + r2 = glmm_load(m2[2]); + r3 = glmm_load(m2[3]); + + v0 = wasm_f32x4_mul(glmm_splat_x(r0), l); + v1 = wasm_f32x4_mul(glmm_splat_x(r1), l); + v2 = wasm_f32x4_mul(glmm_splat_x(r2), l); + v3 = wasm_f32x4_mul(glmm_splat_x(r3), l); + + l = glmm_load(m1[1]); + v0 = glmm_fmadd(glmm_splat_y(r0), l, v0); + v1 = glmm_fmadd(glmm_splat_y(r1), l, v1); + v2 = glmm_fmadd(glmm_splat_y(r2), l, v2); + v3 = glmm_fmadd(glmm_splat_y(r3), l, v3); + + l = glmm_load(m1[2]); + v0 = glmm_fmadd(glmm_splat_z(r0), l, v0); + v1 = glmm_fmadd(glmm_splat_z(r1), l, v1); + v2 = glmm_fmadd(glmm_splat_z(r2), l, v2); + v3 = glmm_fmadd(glmm_splat_z(r3), l, v3); + + l = glmm_load(m1[3]); + v0 = glmm_fmadd(glmm_splat_w(r0), l, v0); + v1 = glmm_fmadd(glmm_splat_w(r1), l, v1); + v2 = glmm_fmadd(glmm_splat_w(r2), l, v2); + v3 = 
glmm_fmadd(glmm_splat_w(r3), l, v3); + + glmm_store(dest[0], v0); + glmm_store(dest[1], v1); + glmm_store(dest[2], v2); + glmm_store(dest[3], v3); +} + +CGLM_INLINE +void +glm_mat4_mulv_wasm(mat4 m, vec4 v, vec4 dest) { + glmm_128 x0, x1, m0, m1, m2, m3, v0, v1, v2, v3; + + m0 = glmm_load(m[0]); + m1 = glmm_load(m[1]); + m2 = glmm_load(m[2]); + m3 = glmm_load(m[3]); + + x0 = glmm_load(v); + v0 = glmm_splat_x(x0); + v1 = glmm_splat_y(x0); + v2 = glmm_splat_z(x0); + v3 = glmm_splat_w(x0); + + x1 = wasm_f32x4_mul(m3, v3); + x1 = glmm_fmadd(m2, v2, x1); + x1 = glmm_fmadd(m1, v1, x1); + x1 = glmm_fmadd(m0, v0, x1); + + glmm_store(dest, x1); +} + +CGLM_INLINE +float +glm_mat4_det_wasm(mat4 mat) { + glmm_128 r0, r1, r2, r3, x0, x1, x2; + + /* 127 <- 0, [square] det(A) = det(At) */ + r0 = glmm_load(mat[0]); /* d c b a */ + r1 = glmm_load(mat[1]); /* h g f e */ + r2 = glmm_load(mat[2]); /* l k j i */ + r3 = glmm_load(mat[3]); /* p o n m */ + + /* + t[1] = j * p - n * l; + t[2] = j * o - n * k; + t[3] = i * p - m * l; + t[4] = i * o - m * k; + */ + x0 = glmm_fnmadd(glmm_shuff1(r3, 0, 0, 1, 1), glmm_shuff1(r2, 2, 3, 2, 3), + wasm_f32x4_mul(glmm_shuff1(r2, 0, 0, 1, 1), + glmm_shuff1(r3, 2, 3, 2, 3))); + /* + t[0] = k * p - o * l; + t[0] = k * p - o * l; + t[5] = i * n - m * j; + t[5] = i * n - m * j; + */ + x1 = glmm_fnmadd(glmm_shuff1(r3, 0, 0, 2, 2), glmm_shuff1(r2, 1, 1, 3, 3), + wasm_f32x4_mul(glmm_shuff1(r2, 0, 0, 2, 2), + glmm_shuff1(r3, 1, 1, 3, 3))); + + /* + a * (f * t[0] - g * t[1] + h * t[2]) + - b * (e * t[0] - g * t[3] + h * t[4]) + + c * (e * t[1] - f * t[3] + h * t[5]) + - d * (e * t[2] - f * t[4] + g * t[5]) + */ + x2 = glmm_fnmadd(glmm_shuff1(r1, 1, 1, 2, 2), glmm_shuff1(x0, 3, 2, 2, 0), + wasm_f32x4_mul(glmm_shuff1(r1, 0, 0, 0, 1), + wasm_i32x4_shuffle(x1, x0, 0, 0, 4, 5))); + x2 = glmm_fmadd(glmm_shuff1(r1, 2, 3, 3, 3), + wasm_i32x4_shuffle(x0, x1, 1, 3, 6, 6), + x2); + /* x2 = wasm_v128_xor(x2, wasm_f32x4_const(0.f, -0.f, 0.f, -0.f)); */ + x2 = wasm_v128_xor(x2, glmm_float32x4_SIGNMASK_PNPN); + + return glmm_hadd(wasm_f32x4_mul(x2, r0)); +} + +CGLM_INLINE +void +glm_mat4_inv_fast_wasm(mat4 mat, mat4 dest) { + glmm_128 r0, r1, r2, r3, + v0, v1, v2, v3, + t0, t1, t2, t3, t4, t5, + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9; + + /* x8 = wasm_f32x4_const(0.f, -0.f, 0.f, -0.f); */ + x8 = glmm_float32x4_SIGNMASK_PNPN; + x9 = glmm_shuff1(x8, 2, 1, 2, 1); + + /* 127 <- 0 */ + r0 = glmm_load(mat[0]); /* d c b a */ + r1 = glmm_load(mat[1]); /* h g f e */ + r2 = glmm_load(mat[2]); /* l k j i */ + r3 = glmm_load(mat[3]); /* p o n m */ + /* x0 = _mm_movehl_ps(r3, r2); */ + x0 = wasm_i32x4_shuffle(r3, r2, 6, 7, 2, 3); /* p o l k */ + /* x3 = _mm_movelh_ps(r2, r3); */ + x3 = wasm_i32x4_shuffle(r2, r3, 0, 1, 4, 5); /* n m j i */ + x1 = glmm_shuff1(x0, 1, 3, 3 ,3); /* l p p p */ + x2 = glmm_shuff1(x0, 0, 2, 2, 2); /* k o o o */ + x4 = glmm_shuff1(x3, 1, 3, 3, 3); /* j n n n */ + x7 = glmm_shuff1(x3, 0, 2, 2, 2); /* i m m m */ + + x6 = wasm_i32x4_shuffle(r2, r1, 0, 0, 4, 4); /* e e i i */ + x5 = wasm_i32x4_shuffle(r2, r1, 1, 1, 5, 5); /* f f j j */ + x3 = wasm_i32x4_shuffle(r2, r1, 2, 2, 6, 6); /* g g k k */ + x0 = wasm_i32x4_shuffle(r2, r1, 3, 3, 7, 7); /* h h l l */ + + t0 = wasm_f32x4_mul(x3, x1); + t1 = wasm_f32x4_mul(x5, x1); + t2 = wasm_f32x4_mul(x5, x2); + t3 = wasm_f32x4_mul(x6, x1); + t4 = wasm_f32x4_mul(x6, x2); + t5 = wasm_f32x4_mul(x6, x4); + + /* t1[0] = k * p - o * l; + t1[0] = k * p - o * l; + t2[0] = g * p - o * h; + t3[0] = g * l - k * h; */ + t0 = glmm_fnmadd(x2, x0, t0); + + /* t1[1] 
= j * p - n * l; + t1[1] = j * p - n * l; + t2[1] = f * p - n * h; + t3[1] = f * l - j * h; */ + t1 = glmm_fnmadd(x4, x0, t1); + + /* t1[2] = j * o - n * k + t1[2] = j * o - n * k; + t2[2] = f * o - n * g; + t3[2] = f * k - j * g; */ + t2 = glmm_fnmadd(x4, x3, t2); + + /* t1[3] = i * p - m * l; + t1[3] = i * p - m * l; + t2[3] = e * p - m * h; + t3[3] = e * l - i * h; */ + t3 = glmm_fnmadd(x7, x0, t3); + + /* t1[4] = i * o - m * k; + t1[4] = i * o - m * k; + t2[4] = e * o - m * g; + t3[4] = e * k - i * g; */ + t4 = glmm_fnmadd(x7, x3, t4); + + /* t1[5] = i * n - m * j; + t1[5] = i * n - m * j; + t2[5] = e * n - m * f; + t3[5] = e * j - i * f; */ + t5 = glmm_fnmadd(x7, x5, t5); + /* x4 = _mm_movelh_ps(r0, r1); */ + x4 = wasm_i32x4_shuffle(r0, r1, 0, 1, 4, 5); /* f e b a */ + /* x5 = _mm_movehl_ps(r1, r0); */ + x5 = wasm_i32x4_shuffle(r1, r0, 6, 7, 2, 3); /* h g d c */ + + x0 = glmm_shuff1(x4, 0, 0, 0, 2); /* a a a e */ + x1 = glmm_shuff1(x4, 1, 1, 1, 3); /* b b b f */ + x2 = glmm_shuff1(x5, 0, 0, 0, 2); /* c c c g */ + x3 = glmm_shuff1(x5, 1, 1, 1, 3); /* d d d h */ + + v2 = wasm_f32x4_mul(x0, t1); + v1 = wasm_f32x4_mul(x0, t0); + v3 = wasm_f32x4_mul(x0, t2); + v0 = wasm_f32x4_mul(x1, t0); + + v2 = glmm_fnmadd(x1, t3, v2); + v3 = glmm_fnmadd(x1, t4, v3); + v0 = glmm_fnmadd(x2, t1, v0); + v1 = glmm_fnmadd(x2, t3, v1); + + v3 = glmm_fmadd(x2, t5, v3); + v0 = glmm_fmadd(x3, t2, v0); + v2 = glmm_fmadd(x3, t5, v2); + v1 = glmm_fmadd(x3, t4, v1); + + /* + dest[0][0] = f * t1[0] - g * t1[1] + h * t1[2]; + dest[0][1] =-(b * t1[0] - c * t1[1] + d * t1[2]); + dest[0][2] = b * t2[0] - c * t2[1] + d * t2[2]; + dest[0][3] =-(b * t3[0] - c * t3[1] + d * t3[2]); */ + v0 = wasm_v128_xor(v0, x8); + + /* + dest[2][0] = e * t1[1] - f * t1[3] + h * t1[5]; + dest[2][1] =-(a * t1[1] - b * t1[3] + d * t1[5]); + dest[2][2] = a * t2[1] - b * t2[3] + d * t2[5]; + dest[2][3] =-(a * t3[1] - b * t3[3] + d * t3[5]);*/ + v2 = wasm_v128_xor(v2, x8); + + /* + dest[1][0] =-(e * t1[0] - g * t1[3] + h * t1[4]); + dest[1][1] = a * t1[0] - c * t1[3] + d * t1[4]; + dest[1][2] =-(a * t2[0] - c * t2[3] + d * t2[4]); + dest[1][3] = a * t3[0] - c * t3[3] + d * t3[4]; */ + v1 = wasm_v128_xor(v1, x9); + + /* + dest[3][0] =-(e * t1[2] - f * t1[4] + g * t1[5]); + dest[3][1] = a * t1[2] - b * t1[4] + c * t1[5]; + dest[3][2] =-(a * t2[2] - b * t2[4] + c * t2[5]); + dest[3][3] = a * t3[2] - b * t3[4] + c * t3[5]; */ + v3 = wasm_v128_xor(v3, x9); + + /* determinant */ + x0 = wasm_i32x4_shuffle(v0, v1, 0, 0, 4, 4); + x1 = wasm_i32x4_shuffle(v2, v3, 0, 0, 4, 4); + x0 = wasm_i32x4_shuffle(x0, x1, 0, 2, 4, 6); + + /* x0 = _mm_rcp_ps(glmm_vhadd(wasm_f32x4_mul(x0, r0))); */ + x0 = wasm_f32x4_div(wasm_f32x4_const_splat(1.0f), + glmm_vhadd(wasm_f32x4_mul(x0, r0))); + + glmm_store(dest[0], wasm_f32x4_mul(v0, x0)); + glmm_store(dest[1], wasm_f32x4_mul(v1, x0)); + glmm_store(dest[2], wasm_f32x4_mul(v2, x0)); + glmm_store(dest[3], wasm_f32x4_mul(v3, x0)); +} + +CGLM_INLINE +void +glm_mat4_inv_wasm(mat4 mat, mat4 dest) { + glmm_128 r0, r1, r2, r3, + v0, v1, v2, v3, + t0, t1, t2, t3, t4, t5, + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9; + + /* x8 = wasm_f32x4_const(0.f, -0.f, 0.f, -0.f); */ + x8 = glmm_float32x4_SIGNMASK_PNPN; + x9 = glmm_shuff1(x8, 2, 1, 2, 1); + + /* 127 <- 0 */ + r0 = glmm_load(mat[0]); /* d c b a */ + r1 = glmm_load(mat[1]); /* h g f e */ + r2 = glmm_load(mat[2]); /* l k j i */ + r3 = glmm_load(mat[3]); /* p o n m */ + /* x0 = _mm_movehl_ps(r3, r2); */ + x0 = wasm_i32x4_shuffle(r3, r2, 6, 7, 2, 3); /* p o l k */ + /* x3 = 
_mm_movelh_ps(r2, r3); */ + x3 = wasm_i32x4_shuffle(r2, r3, 0, 1, 4, 5); /* n m j i */ + x1 = glmm_shuff1(x0, 1, 3, 3 ,3); /* l p p p */ + x2 = glmm_shuff1(x0, 0, 2, 2, 2); /* k o o o */ + x4 = glmm_shuff1(x3, 1, 3, 3, 3); /* j n n n */ + x7 = glmm_shuff1(x3, 0, 2, 2, 2); /* i m m m */ + + x6 = wasm_i32x4_shuffle(r2, r1, 0, 0, 4, 4); /* e e i i */ + x5 = wasm_i32x4_shuffle(r2, r1, 1, 1, 5, 5); /* f f j j */ + x3 = wasm_i32x4_shuffle(r2, r1, 2, 2, 6, 6); /* g g k k */ + x0 = wasm_i32x4_shuffle(r2, r1, 3, 3, 7, 7); /* h h l l */ + + t0 = wasm_f32x4_mul(x3, x1); + t1 = wasm_f32x4_mul(x5, x1); + t2 = wasm_f32x4_mul(x5, x2); + t3 = wasm_f32x4_mul(x6, x1); + t4 = wasm_f32x4_mul(x6, x2); + t5 = wasm_f32x4_mul(x6, x4); + + /* t1[0] = k * p - o * l; + t1[0] = k * p - o * l; + t2[0] = g * p - o * h; + t3[0] = g * l - k * h; */ + t0 = glmm_fnmadd(x2, x0, t0); + + /* t1[1] = j * p - n * l; + t1[1] = j * p - n * l; + t2[1] = f * p - n * h; + t3[1] = f * l - j * h; */ + t1 = glmm_fnmadd(x4, x0, t1); + + /* t1[2] = j * o - n * k + t1[2] = j * o - n * k; + t2[2] = f * o - n * g; + t3[2] = f * k - j * g; */ + t2 = glmm_fnmadd(x4, x3, t2); + + /* t1[3] = i * p - m * l; + t1[3] = i * p - m * l; + t2[3] = e * p - m * h; + t3[3] = e * l - i * h; */ + t3 = glmm_fnmadd(x7, x0, t3); + + /* t1[4] = i * o - m * k; + t1[4] = i * o - m * k; + t2[4] = e * o - m * g; + t3[4] = e * k - i * g; */ + t4 = glmm_fnmadd(x7, x3, t4); + + /* t1[5] = i * n - m * j; + t1[5] = i * n - m * j; + t2[5] = e * n - m * f; + t3[5] = e * j - i * f; */ + t5 = glmm_fnmadd(x7, x5, t5); + /* x4 = _mm_movelh_ps(r0, r1); */ + x4 = wasm_i32x4_shuffle(r0, r1, 0, 1, 4, 5); /* f e b a */ + /* x5 = _mm_movehl_ps(r1, r0); */ + x5 = wasm_i32x4_shuffle(r1, r0, 6, 7, 2, 3); /* h g d c */ + + x0 = glmm_shuff1(x4, 0, 0, 0, 2); /* a a a e */ + x1 = glmm_shuff1(x4, 1, 1, 1, 3); /* b b b f */ + x2 = glmm_shuff1(x5, 0, 0, 0, 2); /* c c c g */ + x3 = glmm_shuff1(x5, 1, 1, 1, 3); /* d d d h */ + + v2 = wasm_f32x4_mul(x0, t1); + v1 = wasm_f32x4_mul(x0, t0); + v3 = wasm_f32x4_mul(x0, t2); + v0 = wasm_f32x4_mul(x1, t0); + + v2 = glmm_fnmadd(x1, t3, v2); + v3 = glmm_fnmadd(x1, t4, v3); + v0 = glmm_fnmadd(x2, t1, v0); + v1 = glmm_fnmadd(x2, t3, v1); + + v3 = glmm_fmadd(x2, t5, v3); + v0 = glmm_fmadd(x3, t2, v0); + v2 = glmm_fmadd(x3, t5, v2); + v1 = glmm_fmadd(x3, t4, v1); + + /* + dest[0][0] = f * t1[0] - g * t1[1] + h * t1[2]; + dest[0][1] =-(b * t1[0] - c * t1[1] + d * t1[2]); + dest[0][2] = b * t2[0] - c * t2[1] + d * t2[2]; + dest[0][3] =-(b * t3[0] - c * t3[1] + d * t3[2]); */ + v0 = wasm_v128_xor(v0, x8); + + /* + dest[2][0] = e * t1[1] - f * t1[3] + h * t1[5]; + dest[2][1] =-(a * t1[1] - b * t1[3] + d * t1[5]); + dest[2][2] = a * t2[1] - b * t2[3] + d * t2[5]; + dest[2][3] =-(a * t3[1] - b * t3[3] + d * t3[5]);*/ + v2 = wasm_v128_xor(v2, x8); + + /* + dest[1][0] =-(e * t1[0] - g * t1[3] + h * t1[4]); + dest[1][1] = a * t1[0] - c * t1[3] + d * t1[4]; + dest[1][2] =-(a * t2[0] - c * t2[3] + d * t2[4]); + dest[1][3] = a * t3[0] - c * t3[3] + d * t3[4]; */ + v1 = wasm_v128_xor(v1, x9); + + /* + dest[3][0] =-(e * t1[2] - f * t1[4] + g * t1[5]); + dest[3][1] = a * t1[2] - b * t1[4] + c * t1[5]; + dest[3][2] =-(a * t2[2] - b * t2[4] + c * t2[5]); + dest[3][3] = a * t3[2] - b * t3[4] + c * t3[5]; */ + v3 = wasm_v128_xor(v3, x9); + + /* determinant */ + x0 = wasm_i32x4_shuffle(v0, v1, 0, 0, 4, 4); + x1 = wasm_i32x4_shuffle(v2, v3, 0, 0, 4, 4); + x0 = wasm_i32x4_shuffle(x0, x1, 0, 2, 4, 6); + + x0 = wasm_f32x4_div(wasm_f32x4_splat(1.0f), glmm_vhadd(wasm_f32x4_mul(x0, 
r0))); + + glmm_store(dest[0], wasm_f32x4_mul(v0, x0)); + glmm_store(dest[1], wasm_f32x4_mul(v1, x0)); + glmm_store(dest[2], wasm_f32x4_mul(v2, x0)); + glmm_store(dest[3], wasm_f32x4_mul(v3, x0)); +} + +#endif +#endif /* cglm_mat_wasm_h */ diff --git a/external/cglm/simd/wasm/quat.h b/external/cglm/simd/wasm/quat.h new file mode 100644 index 0000000..8d72546 --- /dev/null +++ b/external/cglm/simd/wasm/quat.h @@ -0,0 +1,55 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_quat_wasm_h +#define cglm_quat_wasm_h +#if defined(__wasm__) && defined(__wasm_simd128__) + +#include "../../common.h" +#include "../intrin.h" + +CGLM_INLINE +void +glm_quat_mul_wasm(versor p, versor q, versor dest) { + /* + + (a1 b2 + b1 a2 + c1 d2 − d1 c2)i + + (a1 c2 − b1 d2 + c1 a2 + d1 b2)j + + (a1 d2 + b1 c2 − c1 b2 + d1 a2)k + a1 a2 − b1 b2 − c1 c2 − d1 d2 + */ + + glmm_128 xp, xq, x1, x2, x3, r, x, y, z; + + xp = glmm_load(p); /* 3 2 1 0 */ + xq = glmm_load(q); + /* x1 = wasm_f32x4_const(0.f, -0.f, 0.f, -0.f); */ + x1 = glmm_float32x4_SIGNMASK_PNPN; /* TODO: _mm_set1_ss() + shuff ? */ + r = wasm_f32x4_mul(glmm_splat_w(xp), xq); + /* x2 = _mm_unpackhi_ps(x1, x1); */ + x2 = wasm_i32x4_shuffle(x1, x1, 2, 6, 3, 7); + x3 = glmm_shuff1(x1, 3, 2, 0, 1); + x = glmm_splat_x(xp); + y = glmm_splat_y(xp); + z = glmm_splat_z(xp); + + x = wasm_v128_xor(x, x1); + y = wasm_v128_xor(y, x2); + z = wasm_v128_xor(z, x3); + + x1 = glmm_shuff1(xq, 0, 1, 2, 3); + x2 = glmm_shuff1(xq, 1, 0, 3, 2); + x3 = glmm_shuff1(xq, 2, 3, 0, 1); + + r = glmm_fmadd(x, x1, r); + r = glmm_fmadd(y, x2, r); + r = glmm_fmadd(z, x3, r); + + glmm_store(dest, r); +} + +#endif +#endif /* cglm_quat_wasm_h */ diff --git a/external/cglm/simd/x86.h b/external/cglm/simd/x86.h new file mode 100644 index 0000000..2410d0f --- /dev/null +++ b/external/cglm/simd/x86.h @@ -0,0 +1,365 @@ +/* + * Copyright (c), Recep Aslantas. 
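For reference, this is the Hamilton product that glm_quat_mul_wasm above (and glm_quat_mul_sse2 earlier) vectorize, written out in scalar form with cglm's [x, y, z, w] versor layout; in the commented formula a maps to w, b to x, c to y, d to z. The function name quat_mul_ref is only illustrative, not a cglm symbol:

    void quat_mul_ref(const float p[4], const float q[4], float dest[4]) {
      dest[0] = p[3]*q[0] + p[0]*q[3] + p[1]*q[2] - p[2]*q[1];   // i
      dest[1] = p[3]*q[1] - p[0]*q[2] + p[1]*q[3] + p[2]*q[0];   // j
      dest[2] = p[3]*q[2] + p[0]*q[1] - p[1]*q[0] + p[2]*q[3];   // k
      dest[3] = p[3]*q[3] - p[0]*q[0] - p[1]*q[1] - p[2]*q[2];   // real part
    }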
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_simd_x86_h +#define cglm_simd_x86_h +#include "intrin.h" +#ifdef CGLM_SIMD_x86 + +#ifdef CGLM_ALL_UNALIGNED +# define glmm_load(p) _mm_loadu_ps(p) +# define glmm_store(p, a) _mm_storeu_ps(p, a) +#else +# define glmm_load(p) _mm_load_ps(p) +# define glmm_store(p, a) _mm_store_ps(p, a) +#endif + +#define glmm_128 __m128 + +#ifdef __AVX__ +# define glmm_shuff1(xmm, z, y, x, w) \ + _mm_permute_ps((xmm), _MM_SHUFFLE(z, y, x, w)) +#else +# if !defined(CGLM_NO_INT_DOMAIN) && defined(__SSE2__) +# define glmm_shuff1(xmm, z, y, x, w) \ + _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xmm), \ + _MM_SHUFFLE(z, y, x, w))) +# else +# define glmm_shuff1(xmm, z, y, x, w) \ + _mm_shuffle_ps(xmm, xmm, _MM_SHUFFLE(z, y, x, w)) +# endif +#endif + +#define glmm_splat(x, lane) glmm_shuff1(x, lane, lane, lane, lane) + +#ifdef __AVX__ +# define glmm_set1(x) _mm_broadcast_ss(&x) +# define glmm_set1_ptr(x) _mm_broadcast_ss(x) +# define glmm_set1_rval(x) _mm_set1_ps(x) +# ifdef __AVX2__ +# define glmm_splat_x(x) _mm_broadcastss_ps(x) +# else +# define glmm_splat_x(x) _mm_permute_ps(x, _MM_SHUFFLE(0, 0, 0, 0)) +# endif +# define glmm_splat_y(x) _mm_permute_ps(x, _MM_SHUFFLE(1, 1, 1, 1)) +# define glmm_splat_z(x) _mm_permute_ps(x, _MM_SHUFFLE(2, 2, 2, 2)) +# define glmm_splat_w(x) _mm_permute_ps(x, _MM_SHUFFLE(3, 3, 3, 3)) +#else +# define glmm_set1(x) _mm_set1_ps(x) +# define glmm_set1_ptr(x) _mm_set1_ps(*x) +# define glmm_set1_rval(x) _mm_set1_ps(x) + +# define glmm_splat_x(x) glmm_splat(x, 0) +# define glmm_splat_y(x) glmm_splat(x, 1) +# define glmm_splat_z(x) glmm_splat(x, 2) +# define glmm_splat_w(x) glmm_splat(x, 3) +#endif + +#ifdef __AVX__ +# ifdef CGLM_ALL_UNALIGNED +# define glmm_load256(p) _mm256_loadu_ps(p) +# define glmm_store256(p, a) _mm256_storeu_ps(p, a) +# else +# define glmm_load256(p) _mm256_load_ps(p) +# define glmm_store256(p, a) _mm256_store_ps(p, a) +# endif +#endif + +/* Note that `0x80000000` corresponds to `INT_MIN` for a 32-bit int. 
*/ + +#if defined(__SSE2__) +# define GLMM_NEGZEROf ((int)0x80000000) /* 0x80000000 ---> -0.0f */ +# define GLMM_POSZEROf ((int)0x00000000) /* 0x00000000 ---> +0.0f */ +#else +# ifdef CGLM_FAST_MATH + union { int i; float f; } static GLMM_NEGZEROf_TU = { .i = (int)0x80000000 }; +# define GLMM_NEGZEROf GLMM_NEGZEROf_TU.f +# define GLMM_POSZEROf 0.0f +# else +# define GLMM_NEGZEROf -0.0f +# define GLMM_POSZEROf 0.0f +# endif +#endif + +#if defined(__SSE2__) +# define GLMM__SIGNMASKf(X, Y, Z, W) \ + _mm_castsi128_ps(_mm_set_epi32(X, Y, Z, W)) + /* _mm_set_ps(X, Y, Z, W); */ +#else +# define GLMM__SIGNMASKf(X, Y, Z, W) _mm_set_ps(X, Y, Z, W) +#endif + +#define glmm_float32x4_SIGNMASK_PNPN GLMM__SIGNMASKf(GLMM_POSZEROf, GLMM_NEGZEROf, GLMM_POSZEROf, GLMM_NEGZEROf) +#define glmm_float32x4_SIGNMASK_NPNP GLMM__SIGNMASKf(GLMM_NEGZEROf, GLMM_POSZEROf, GLMM_NEGZEROf, GLMM_POSZEROf) +#define glmm_float32x4_SIGNMASK_NPPN GLMM__SIGNMASKf(GLMM_NEGZEROf, GLMM_POSZEROf, GLMM_POSZEROf, GLMM_NEGZEROf) + +/* fasth math prevents -0.0f to work */ +#if defined(__SSE2__) +# define glmm_float32x4_SIGNMASK_NEG _mm_castsi128_ps(_mm_set1_epi32(GLMM_NEGZEROf)) /* _mm_set1_ps(-0.0f) */ +#else +# define glmm_float32x4_SIGNMASK_NEG glmm_set1(GLMM_NEGZEROf) +#endif + +#define glmm_float32x8_SIGNMASK_NEG _mm256_castsi256_ps(_mm256_set1_epi32(GLMM_NEGZEROf)) + +static inline +__m128 +glmm_abs(__m128 x) { + return _mm_andnot_ps(glmm_float32x4_SIGNMASK_NEG, x); +} + +static inline __m128 glmm_min(__m128 a, __m128 b) { return _mm_min_ps(a, b); } +static inline __m128 glmm_max(__m128 a, __m128 b) { return _mm_max_ps(a, b); } + +static inline +__m128 +glmm_vhadd(__m128 v) { + __m128 x0; + x0 = _mm_add_ps(v, glmm_shuff1(v, 0, 1, 2, 3)); + x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 0, 0, 1)); + return x0; +} + +static inline +__m128 +glmm_vhadds(__m128 v) { +#if defined(__SSE3__) + __m128 shuf, sums; + shuf = _mm_movehdup_ps(v); + sums = _mm_add_ps(v, shuf); + shuf = _mm_movehl_ps(shuf, sums); + sums = _mm_add_ss(sums, shuf); + return sums; +#else + __m128 shuf, sums; + shuf = glmm_shuff1(v, 2, 3, 0, 1); + sums = _mm_add_ps(v, shuf); + shuf = _mm_movehl_ps(shuf, sums); + sums = _mm_add_ss(sums, shuf); + return sums; +#endif +} + +static inline +float +glmm_hadd(__m128 v) { + return _mm_cvtss_f32(glmm_vhadds(v)); +} + +static inline +__m128 +glmm_vhmin(__m128 v) { + __m128 x0, x1, x2; + x0 = _mm_movehl_ps(v, v); /* [2, 3, 2, 3] */ + x1 = _mm_min_ps(x0, v); /* [0|2, 1|3, 2|2, 3|3] */ + x2 = glmm_splat(x1, 1); /* [1|3, 1|3, 1|3, 1|3] */ + return _mm_min_ss(x1, x2); +} + +static inline +float +glmm_hmin(__m128 v) { + return _mm_cvtss_f32(glmm_vhmin(v)); +} + +static inline +__m128 +glmm_vhmax(__m128 v) { + __m128 x0, x1, x2; + x0 = _mm_movehl_ps(v, v); /* [2, 3, 2, 3] */ + x1 = _mm_max_ps(x0, v); /* [0|2, 1|3, 2|2, 3|3] */ + x2 = glmm_splat(x1, 1); /* [1|3, 1|3, 1|3, 1|3] */ + return _mm_max_ss(x1, x2); +} + +static inline +float +glmm_hmax(__m128 v) { + return _mm_cvtss_f32(glmm_vhmax(v)); +} + +static inline +__m128 +glmm_vdots(__m128 a, __m128 b) { +#if (defined(__SSE4_1__) || defined(__SSE4_2__)) && defined(CGLM_SSE4_DOT) + return _mm_dp_ps(a, b, 0xFF); +#elif defined(__SSE3__) && defined(CGLM_SSE3_DOT) + __m128 x0, x1; + x0 = _mm_mul_ps(a, b); + x1 = _mm_hadd_ps(x0, x0); + return _mm_hadd_ps(x1, x1); +#else + return glmm_vhadds(_mm_mul_ps(a, b)); +#endif +} + +static inline +__m128 +glmm_vdot(__m128 a, __m128 b) { +#if (defined(__SSE4_1__) || defined(__SSE4_2__)) && defined(CGLM_SSE4_DOT) + return _mm_dp_ps(a, b, 0xFF); +#elif 
defined(__SSE3__) && defined(CGLM_SSE3_DOT) + __m128 x0, x1; + x0 = _mm_mul_ps(a, b); + x1 = _mm_hadd_ps(x0, x0); + return _mm_hadd_ps(x1, x1); +#else + __m128 x0; + x0 = _mm_mul_ps(a, b); + x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 0, 3, 2)); + return _mm_add_ps(x0, glmm_shuff1(x0, 0, 1, 0, 1)); +#endif +} + +static inline +float +glmm_dot(__m128 a, __m128 b) { + return _mm_cvtss_f32(glmm_vdots(a, b)); +} + +static inline +float +glmm_norm(__m128 a) { + return _mm_cvtss_f32(_mm_sqrt_ss(glmm_vhadds(_mm_mul_ps(a, a)))); +} + +static inline +float +glmm_norm2(__m128 a) { + return _mm_cvtss_f32(glmm_vhadds(_mm_mul_ps(a, a))); +} + +static inline +float +glmm_norm_one(__m128 a) { + return _mm_cvtss_f32(glmm_vhadds(glmm_abs(a))); +} + +static inline +float +glmm_norm_inf(__m128 a) { + return _mm_cvtss_f32(glmm_vhmax(glmm_abs(a))); +} + +#if defined(__SSE2__) +static inline +__m128 +glmm_load3(float v[3]) { + __m128i xy; + __m128 z; + + xy = _mm_loadl_epi64(CGLM_CASTPTR_ASSUME_ALIGNED(v, const __m128i)); + z = _mm_load_ss(&v[2]); + + return _mm_movelh_ps(_mm_castsi128_ps(xy), z); +} + +static inline +void +glmm_store3(float v[3], __m128 vx) { + _mm_storel_pi(CGLM_CASTPTR_ASSUME_ALIGNED(v, __m64), vx); + _mm_store_ss(&v[2], glmm_shuff1(vx, 2, 2, 2, 2)); +} +#endif + +static inline +__m128 +glmm_div(__m128 a, __m128 b) { + return _mm_div_ps(a, b); +} + +/* enable FMA macro for MSVC? */ +#if defined(_MSC_VER) && !defined(__FMA__) && defined(__AVX2__) +# define __FMA__ 1 +#endif + +static inline +__m128 +glmm_fmadd(__m128 a, __m128 b, __m128 c) { +#ifdef __FMA__ + return _mm_fmadd_ps(a, b, c); +#else + return _mm_add_ps(c, _mm_mul_ps(a, b)); +#endif +} + +static inline +__m128 +glmm_fnmadd(__m128 a, __m128 b, __m128 c) { +#ifdef __FMA__ + return _mm_fnmadd_ps(a, b, c); +#else + return _mm_sub_ps(c, _mm_mul_ps(a, b)); +#endif +} + +static inline +__m128 +glmm_fmsub(__m128 a, __m128 b, __m128 c) { +#ifdef __FMA__ + return _mm_fmsub_ps(a, b, c); +#else + return _mm_sub_ps(_mm_mul_ps(a, b), c); +#endif +} + +static inline +__m128 +glmm_fnmsub(__m128 a, __m128 b, __m128 c) { +#ifdef __FMA__ + return _mm_fnmsub_ps(a, b, c); +#else + return _mm_xor_ps(_mm_add_ps(_mm_mul_ps(a, b), c), + glmm_float32x4_SIGNMASK_NEG); +#endif +} + +#if defined(__AVX__) +static inline +__m256 +glmm256_fmadd(__m256 a, __m256 b, __m256 c) { +#ifdef __FMA__ + return _mm256_fmadd_ps(a, b, c); +#else + return _mm256_add_ps(c, _mm256_mul_ps(a, b)); +#endif +} + +static inline +__m256 +glmm256_fnmadd(__m256 a, __m256 b, __m256 c) { +#ifdef __FMA__ + return _mm256_fnmadd_ps(a, b, c); +#else + return _mm256_sub_ps(c, _mm256_mul_ps(a, b)); +#endif +} + +static inline +__m256 +glmm256_fmsub(__m256 a, __m256 b, __m256 c) { +#ifdef __FMA__ + return _mm256_fmsub_ps(a, b, c); +#else + return _mm256_sub_ps(_mm256_mul_ps(a, b), c); +#endif +} + +static inline +__m256 +glmm256_fnmsub(__m256 a, __m256 b, __m256 c) { +#ifdef __FMA__ + return _mm256_fmsub_ps(a, b, c); +#else + return _mm256_xor_ps(_mm256_sub_ps(_mm256_mul_ps(a, b), c), + glmm_float32x8_SIGNMASK_NEG); +#endif +} +#endif + +#endif +#endif /* cglm_simd_x86_h */ diff --git a/external/cglm/sphere.h b/external/cglm/sphere.h new file mode 100644 index 0000000..334b83a --- /dev/null +++ b/external/cglm/sphere.h @@ -0,0 +1,99 @@ +/* + * Copyright (c), Recep Aslantas. 
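The four fused helpers above keep one contract per 128-bit lane whether the hardware FMA path or the mul/add fallback is taken; spelled out as a sketch of the semantics, derived from the fallback expressions shown above:

    glmm_fmadd(a, b, c)  ==  a*b + c
    glmm_fnmadd(a, b, c) ==  c - a*b
    glmm_fmsub(a, b, c)  ==  a*b - c
    glmm_fnmsub(a, b, c) == -(a*b + c)

This is also the pattern behind the 2x2 cofactors in the mat4 inverse routines earlier, e.g. t0 = _mm_mul_ps(x3, x1) followed by t0 = glmm_fnmadd(x2, x0, t0) evaluates k*p - o*l lane by lane.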
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_sphere_h +#define cglm_sphere_h + +#include "common.h" +#include "mat4.h" + +/* + Sphere Representation in cglm: [center.x, center.y, center.z, radii] + + You could use this representation or you can convert it to vec4 before call + any function + */ + +/*! + * @brief helper for getting sphere radius + * + * @param[in] s sphere + * + * @return returns radii + */ +CGLM_INLINE +float +glm_sphere_radii(vec4 s) { + return s[3]; +} + +/*! + * @brief apply transform to sphere, it is just wrapper for glm_mat4_mulv3 + * + * @param[in] s sphere + * @param[in] m transform matrix + * @param[out] dest transformed sphere + */ +CGLM_INLINE +void +glm_sphere_transform(vec4 s, mat4 m, vec4 dest) { + glm_mat4_mulv3(m, s, 1.0f, dest); + dest[3] = s[3]; +} + +/*! + * @brief merges two spheres and creates a new one + * + * two sphere must be in same space, for instance if one in world space then + * the other must be in world space too, not in local space. + * + * @param[in] s1 sphere 1 + * @param[in] s2 sphere 2 + * @param[out] dest merged/extended sphere + */ +CGLM_INLINE +void +glm_sphere_merge(vec4 s1, vec4 s2, vec4 dest) { + float dist, radii; + + dist = glm_vec3_distance(s1, s2); + radii = dist + s1[3] + s2[3]; + + radii = glm_max(radii, s1[3]); + radii = glm_max(radii, s2[3]); + + glm_vec3_center(s1, s2, dest); + dest[3] = radii; +} + +/*! + * @brief check if two sphere intersects + * + * @param[in] s1 sphere + * @param[in] s2 other sphere + */ +CGLM_INLINE +bool +glm_sphere_sphere(vec4 s1, vec4 s2) { + return glm_vec3_distance2(s1, s2) <= glm_pow2(s1[3] + s2[3]); +} + +/*! + * @brief check if sphere intersects with point + * + * @param[in] s sphere + * @param[in] point point + */ +CGLM_INLINE +bool +glm_sphere_point(vec4 s, vec3 point) { + float rr; + rr = s[3] * s[3]; + return glm_vec3_distance2(point, s) <= rr; +} + +#endif /* cglm_sphere_h */ diff --git a/external/cglm/struct.h b/external/cglm/struct.h new file mode 100644 index 0000000..31ca4e2 --- /dev/null +++ b/external/cglm/struct.h @@ -0,0 +1,50 @@ +/* + * Copyright (c), Recep Aslantas. 
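The sphere helpers above operate on plain vec4 values, so they can be exercised directly. A small hedged sketch (values chosen only for illustration), showing merge and intersection exactly as the code above computes them:

    vec4 s1 = {0.0f, 0.0f, 0.0f, 1.0f};      // unit sphere at the origin
    vec4 s2 = {2.0f, 0.0f, 0.0f, 1.0f};      // unit sphere centered at x = 2
    vec4 merged;
    glm_sphere_merge(s1, s2, merged);         // center = midpoint (1,0,0), radius = dist + r1 + r2 = 4
    bool hit = glm_sphere_sphere(s1, s2);     // dist2 = 4 <= (1 + 1)^2 = 4, so the spheres touch: true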
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_structs_h +#define cglm_structs_h +#ifdef __cplusplus +extern "C" { +#endif + +#include "cglm.h" +#include "types-struct.h" +#include "struct/vec2.h" +#include "struct/vec3.h" +#include "struct/vec4.h" +#include "struct/ivec2.h" +#include "struct/ivec3.h" +#include "struct/ivec4.h" +#include "struct/mat2.h" +#include "struct/mat2x3.h" +#include "struct/mat2x4.h" +#include "struct/mat3.h" +#include "struct/mat3x2.h" +#include "struct/mat3x4.h" +#include "struct/mat4.h" +#include "struct/mat4x2.h" +#include "struct/mat4x3.h" +#include "struct/affine.h" +#include "struct/frustum.h" +#include "struct/plane.h" +#include "struct/noise.h" +#include "struct/box.h" +#include "struct/color.h" +#include "struct/io.h" +#include "struct/cam.h" +#include "struct/quat.h" +#include "struct/euler.h" +#include "struct/project.h" +#include "struct/sphere.h" +#include "struct/curve.h" +#include "struct/affine2d.h" +#include "struct/ray.h" + +#ifdef __cplusplus +} +#endif +#endif /* cglm_structs_h */ diff --git a/external/cglm/struct/aabb2d.h b/external/cglm/struct/aabb2d.h new file mode 100644 index 0000000..9077069 --- /dev/null +++ b/external/cglm/struct/aabb2d.h @@ -0,0 +1,253 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglms_aabb2ds_h +#define cglms_aabb2ds_h + +#include "../common.h" +#include "../types-struct.h" +#include "../aabb2d.h" +#include "vec2.h" +#include "vec4.h" +#include "mat4.h" + +/* api definition */ +#define glms_aabb2d_(NAME) CGLM_STRUCTAPI(aabb2d, NAME) + +/*! + * @brief apply transform to Axis-Aligned Bounding Box + * + * @param[in] aabb bounding box + * @param[in] m transform matrix + * @param[out] dest transformed bounding box + */ +CGLM_INLINE +void +glms_aabb2d_(transform)(vec2s aabb[2], mat3s m, vec2s dest[2]) { + vec2 rawAabb[2]; + vec2 rawDest[2]; + + glms_vec2_(unpack)(rawAabb, aabb, 2); + glm_aabb2d_transform(rawAabb, m.raw, rawDest); + glms_vec2_(pack)(dest, rawDest, 2); +} + +/*! + * @brief merges two AABB bounding box and creates new one + * + * two box must be in same space, if one of box is in different space then + * you should consider to convert it's space by glm_box_space + * + * @param[in] aabb1 bounding box 1 + * @param[in] aabb2 bounding box 2 + * @param[out] dest merged bounding box + */ +CGLM_INLINE +void +glms_aabb2d_(merge)(vec2s aabb1[2], vec2s aabb2[2], vec2s dest[2]) { + vec2 rawAabb1[2]; + vec2 rawAabb2[2]; + vec2 rawDest[2]; + + glms_vec2_(unpack)(rawAabb1, aabb1, 2); + glms_vec2_(unpack)(rawAabb2, aabb2, 2); + glm_aabb2d_merge(rawAabb1, rawAabb2, rawDest); + glms_vec2_(pack)(dest, rawDest, 2); +} + +/*! + * @brief crops a bounding box with another one. + * + * this could be useful for getting a bbox which fits with view frustum and + * object bounding boxes. In this case you crop view frustum box with objects + * box + * + * @param[in] aabb bounding box 1 + * @param[in] cropAabb crop box + * @param[out] dest cropped bounding box + */ +CGLM_INLINE +void +glms_aabb2d_(crop)(vec2s aabb[2], vec2s cropAabb[2], vec2s dest[2]) { + vec2 rawAabb[2]; + vec2 rawCropAabb[2]; + vec2 rawDest[2]; + + glms_vec2_(unpack)(rawAabb, aabb, 2); + glms_vec2_(unpack)(rawCropAabb, cropAabb, 2); + glm_aabb2d_crop(rawAabb, rawCropAabb, rawDest); + glms_vec2_(pack)(dest, rawDest, 2); +} + +/*! 
+ * @brief crops a bounding box with another one. + * + * this could be useful for getting a bbox which fits with view frustum and + * object bounding boxes. In this case you crop view frustum box with objects + * box + * + * @param[in] aabb bounding box + * @param[in] cropAabb crop box + * @param[in] clampAabb minimum box + * @param[out] dest cropped bounding box + */ +CGLM_INLINE +void +glms_aabb2d_(crop_until)(vec2s aabb[2], + vec2s cropAabb[2], + vec2s clampAabb[2], + vec2s dest[2]) { + glms_aabb2d_(crop)(aabb, cropAabb, dest); + glms_aabb2d_(merge)(clampAabb, dest, dest); +} + +/*! + * @brief invalidate AABB min and max values + * + * @param[in, out] aabb bounding box + */ +CGLM_INLINE +void +glms_aabb2d_(invalidate)(vec2s box[2]) { + box[0] = glms_vec2_(fill)(FLT_MAX); + box[1] = glms_vec2_(fill)(-FLT_MAX); +} + +/*! + * @brief check if AABB is valid or not + * + * @param[in] aabb bounding box + */ +CGLM_INLINE +bool +glms_aabb2d_(isvalid)(vec2s aabb[2]) { + vec2 rawAabb[2]; + glms_vec2_(unpack)(rawAabb, aabb, 2); + return glm_aabb2d_isvalid(rawAabb); +} + +/*! + * @brief distance between of min and max + * + * @param[in] aabb bounding box + */ +CGLM_INLINE +float +glms_aabb2d_(diag)(vec2s aabb[2]) { + vec2 rawAabb[2]; + glms_vec2_(unpack)(rawAabb, aabb, 2); + return glm_aabb2d_diag(rawAabb); +} + + +/*! + * @brief size of aabb + * + * @param[in] aabb bounding aabb + * @param[out] dest size + */ +CGLM_INLINE +vec2s +glms_aabb2d_(sizev)(vec2s aabb[2]) { + vec2s size; + vec2 rawAabb[2]; + glms_vec2_(unpack)(rawAabb, aabb, 2); + glm_aabb2d_sizev(rawAabb, size.raw); + return size; +} + +/*! + * @brief radius of sphere which surrounds AABB + * + * @param[in] aabb bounding box + */ +CGLM_INLINE +float +glms_aabb2d_(radius)(vec2s aabb[2]) { + return glms_aabb2d_(size)(aabb) * 0.5f; +} + +/*! + * @brief computes center point of AABB + * + * @param[in] aabb bounding box + * @returns center of bounding box + */ +CGLM_INLINE +vec2s +glms_aabb2d_(center)(vec2s aabb[2]) { + return glms_vec2_(center)(aabb[0], aabb[1]); +} + +/*! + * @brief check if two AABB intersects + * + * @param[in] aabb bounding box + * @param[in] other other bounding box + */ +CGLM_INLINE +bool +glms_aabb2d_(aabb)(vec2s aabb[2], vec2s other[2]) { + vec2 rawAabb[2]; + vec2 rawOther[2]; + + glms_vec2_(unpack)(rawAabb, aabb, 2); + glms_vec2_(unpack)(rawOther, other, 2); + return glm_aabb2d_aabb(rawAabb, rawOther); +} + +/*! + * @brief check if AABB intersects with a circle + * + * https://github.com/erich666/GraphicsGems/blob/master/gems/BoxSphere.c + * Solid Box - Solid Sphere test. + * + * @param[in] aabb solid bounding box + * @param[in] s solid sphere + */ +CGLM_INLINE +bool +glms_aabb2d_(circle)(vec2s aabb[2], vec3s c) { + vec2 rawAabb[2]; + + glms_vec2_(unpack)(rawAabb, aabb, 2); + return glm_aabb2d_circle(rawAabb, c.raw); +} + +/*! + * @brief check if point is inside of AABB + * + * @param[in] aabb bounding box + * @param[in] point point + */ +CGLM_INLINE +bool +glms_aabb2d_(point)(vec2s aabb[2], vec2s point) { + vec2 rawAabb[2]; + + glms_vec2_(unpack)(rawAabb, aabb, 2); + return glm_aabb2d_point(rawAabb, point.raw); +} + +/*! 
+ * @brief check if AABB contains other AABB + * + * @param[in] box bounding box + * @param[in] other other bounding box + */ +CGLM_INLINE +bool +glms_aabb2d_(contains)(vec2s aabb[2], vec2s other[2]) { + vec2 rawAabb[2]; + vec2 rawOther[2]; + + glms_vec2_(unpack)(rawAabb, aabb, 2); + glms_vec2_(unpack)(rawOther, other, 2); + return glm_aabb2d_contains(rawAabb, rawOther); +} + +#endif /* cglms_aabb2ds_h */ diff --git a/external/cglm/struct/affine-mat.h b/external/cglm/struct/affine-mat.h new file mode 100644 index 0000000..e1d4ff3 --- /dev/null +++ b/external/cglm/struct/affine-mat.h @@ -0,0 +1,90 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE mat4s glms_mul(mat4 m1, mat4 m2); + CGLM_INLINE mat4s glms_mul_rot(mat4 m1, mat4 m2); + CGLM_INLINE mat4s glms_inv_tr(); + */ + +#ifndef cglms_affine_mat_h +#define cglms_affine_mat_h + +#include "../common.h" +#include "../types-struct.h" +#include "../affine-mat.h" +#include "vec3.h" +#include "vec4.h" +#include "mat4.h" + +/*! + * @brief this is similar to glms_mat4_mul but specialized to affine transform + * + * Matrix format should be: + * R R R X + * R R R Y + * R R R Z + * 0 0 0 W + * + * this reduces some multiplications. It should be faster than mat4_mul. + * if you are not sure about matrix format then DON'T use this! use mat4_mul + * + * @param[in] m1 affine matrix 1 + * @param[in] m2 affine matrix 2 + * @returns destination matrix + */ +CGLM_INLINE +mat4s +glms_mul(mat4s m1, mat4s m2){ + mat4s r; + glm_mul(m1.raw, m2.raw, r.raw); + return r; +} + +/*! + * @brief this is similar to glm_mat4_mul but specialized to affine transform + * + * Right Matrix format should be: + * R R R 0 + * R R R 0 + * R R R 0 + * 0 0 0 1 + * + * this reduces some multiplications. It should be faster than mat4_mul. + * if you are not sure about matrix format then DON'T use this! use mat4_mul + * + * @param[in] m1 affine matrix 1 + * @param[in] m2 affine matrix 2 + * @returns destination matrix + */ +CGLM_INLINE +mat4s +glms_mul_rot(mat4s m1, mat4s m2){ + mat4s r; + glm_mul_rot(m1.raw, m2.raw, r.raw); + return r; +} + +/*! + * @brief inverse orthonormal rotation + translation matrix (ridig-body) + * + * @code + * X = | R T | X' = | R' -R'T | + * | 0 1 | | 0 1 | + * @endcode + * + * @param[in] m matrix + * @returns destination matrix + */ +CGLM_INLINE +mat4s +glms_inv_tr(mat4s m){ + glm_inv_tr(m.raw); + return m; +} +#endif /* cglms_affine_mat_h */ diff --git a/external/cglm/struct/affine-post.h b/external/cglm/struct/affine-post.h new file mode 100644 index 0000000..e155660 --- /dev/null +++ b/external/cglm/struct/affine-post.h @@ -0,0 +1,184 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE mat4s glms_translated(mat4s m, vec3s v); + CGLM_INLINE mat4s glms_translated_x(mat4s m, float x); + CGLM_INLINE mat4s glms_translated_y(mat4s m, float y); + CGLM_INLINE mat4s glms_translated_z(mat4s m, float z); + CGLM_INLINE mat4s glms_rotated_x(mat4s m, float angle); + CGLM_INLINE mat4s glms_rotated_y(mat4s m, float angle); + CGLM_INLINE mat4s glms_rotated_z(mat4s m, float angle); + CGLM_INLINE mat4s glms_rotated(mat4s m, float angle, vec3s axis); + CGLM_INLINE mat4s glms_rotated_at(mat4s m, vec3s pivot, float angle, vec3s axis); + CGLM_INLINE mat4s glms_spinned(mat4s m, float angle, vec3s axis); + */ + +#ifndef cglms_affines_post_h +#define cglms_affines_post_h + +#include "../common.h" +#include "../types-struct.h" +#include "../affine.h" +#include "vec3.h" +#include "vec4.h" +#include "mat4.h" + +/*! + * @brief translate existing transform matrix by v vector + * and stores result in same matrix + * + * @param[in] m affine transform + * @param[in] v translate vector [x, y, z] + * @returns affine transform + */ +CGLM_INLINE +mat4s +glms_translated(mat4s m, vec3s v) { + glm_translated(m.raw, v.raw); + return m; +} + +/*! + * @brief translate existing transform matrix by x factor + * + * @param[in] m affine transform + * @param[in] x x factor + * @returns affine transform + */ +CGLM_INLINE +mat4s +glms_translated_x(mat4s m, float x) { + glm_translated_x(m.raw, x); + return m; +} + +/*! + * @brief translate existing transform matrix by y factor + * + * @param[in] m affine transform + * @param[in] y y factor + * @returns affine transform + */ +CGLM_INLINE +mat4s +glms_translated_y(mat4s m, float y) { + glm_translated_y(m.raw, y); + return m; +} + +/*! + * @brief translate existing transform matrix by z factor + * + * @param[in] m affine transform + * @param[in] z z factor + * @returns affine transform + */ +CGLM_INLINE +mat4s +glms_translated_z(mat4s m, float z) { + glm_translated_z(m.raw, z); + return m; +} + +/*! + * @brief rotate existing transform matrix around X axis by angle + * and store result in dest + * + * @param[in] m affine transform + * @param[in] angle angle (radians) + * @returns rotated matrix + */ +CGLM_INLINE +mat4s +glms_rotated_x(mat4s m, float angle) { + mat4s r; + glm_rotated_x(m.raw, angle, r.raw); + return r; +} + +/*! + * @brief rotate existing transform matrix around Y axis by angle + * and store result in dest + * + * @param[in] m affine transform + * @param[in] angle angle (radians) + * @returns rotated matrix + */ +CGLM_INLINE +mat4s +glms_rotated_y(mat4s m, float angle) { + mat4s r; + glm_rotated_y(m.raw, angle, r.raw); + return r; +} + +/*! + * @brief rotate existing transform matrix around Z axis by angle + * and store result in dest + * + * @param[in] m affine transform + * @param[in] angle angle (radians) + * @returns rotated matrix + */ +CGLM_INLINE +mat4s +glms_rotated_z(mat4s m, float angle) { + mat4s r; + glm_rotated_z(m.raw, angle, r.raw); + return r; +} + +/*! + * @brief rotate existing transform matrix around given axis by angle + * + * @param[in] m affine transform + * @param[in] angle angle (radians) + * @param[in] axis axis + * @returns affine transform + */ +CGLM_INLINE +mat4s +glms_rotated(mat4s m, float angle, vec3s axis) { + glm_rotated(m.raw, angle, axis.raw); + return m; +} + +/*! 
+ * @brief rotate existing transform + * around given axis by angle at given pivot point (rotation center) + * + * @param[in] m affine transform + * @param[in] pivot rotation center + * @param[in] angle angle (radians) + * @param[in] axis axis + * @returns affine transform + */ +CGLM_INLINE +mat4s +glms_rotated_at(mat4s m, vec3s pivot, float angle, vec3s axis) { + glm_rotated_at(m.raw, pivot.raw, angle, axis.raw); + return m; +} + +/*! + * @brief rotate existing transform matrix around given axis by angle around self (doesn't affected by position) + * + * @param[in] m affine transform + * @param[in] angle angle (radians) + * @param[in] axis axis + * @returns affine transform + */ +CGLM_INLINE +mat4s +glms_spinned(mat4s m, float angle, vec3s axis) { + glm_spinned(m.raw, angle, axis.raw); + return m; +} + +#endif /* cglms_affines_post_h */ diff --git a/external/cglm/struct/affine-pre.h b/external/cglm/struct/affine-pre.h new file mode 100644 index 0000000..e323ffa --- /dev/null +++ b/external/cglm/struct/affine-pre.h @@ -0,0 +1,184 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE mat4s glms_translate(mat4s m, vec3s v); + CGLM_INLINE mat4s glms_translate_x(mat4s m, float x); + CGLM_INLINE mat4s glms_translate_y(mat4s m, float y); + CGLM_INLINE mat4s glms_translate_z(mat4s m, float z); + CGLM_INLINE mat4s glms_rotate_x(mat4s m, float angle); + CGLM_INLINE mat4s glms_rotate_y(mat4s m, float angle); + CGLM_INLINE mat4s glms_rotate_z(mat4s m, float angle); + CGLM_INLINE mat4s glms_rotate(mat4s m, float angle, vec3s axis); + CGLM_INLINE mat4s glms_rotate_at(mat4s m, vec3s pivot, float angle, vec3s axis); + CGLM_INLINE mat4s glms_spin(mat4s m, float angle, vec3s axis); + */ + +#ifndef cglms_affines_pre_h +#define cglms_affines_pre_h + +#include "../common.h" +#include "../types-struct.h" +#include "../affine.h" +#include "vec3.h" +#include "vec4.h" +#include "mat4.h" + +/*! + * @brief translate existing transform matrix by v vector + * and stores result in same matrix + * + * @param[in] m affine transform + * @param[in] v translate vector [x, y, z] + * @returns affine transform + */ +CGLM_INLINE +mat4s +glms_translate(mat4s m, vec3s v) { + glm_translate(m.raw, v.raw); + return m; +} + +/*! + * @brief translate existing transform matrix by x factor + * + * @param[in] m affine transform + * @param[in] x x factor + * @returns affine transform + */ +CGLM_INLINE +mat4s +glms_translate_x(mat4s m, float x) { + glm_translate_x(m.raw, x); + return m; +} + +/*! + * @brief translate existing transform matrix by y factor + * + * @param[in] m affine transform + * @param[in] y y factor + * @returns affine transform + */ +CGLM_INLINE +mat4s +glms_translate_y(mat4s m, float y) { + glm_translate_y(m.raw, y); + return m; +} + +/*! + * @brief translate existing transform matrix by z factor + * + * @param[in] m affine transform + * @param[in] z z factor + * @returns affine transform + */ +CGLM_INLINE +mat4s +glms_translate_z(mat4s m, float z) { + glm_translate_z(m.raw, z); + return m; +} + +/*! + * @brief rotate existing transform matrix around X axis by angle + * and store result in dest + * + * @param[in] m affine transform + * @param[in] angle angle (radians) + * @returns rotated matrix + */ +CGLM_INLINE +mat4s +glms_rotate_x(mat4s m, float angle) { + mat4s r; + glm_rotate_x(m.raw, angle, r.raw); + return r; +} + +/*! 
+ * @brief rotate existing transform matrix around Y axis by angle + * and store result in dest + * + * @param[in] m affine transform + * @param[in] angle angle (radians) + * @returns rotated matrix + */ +CGLM_INLINE +mat4s +glms_rotate_y(mat4s m, float angle) { + mat4s r; + glm_rotate_y(m.raw, angle, r.raw); + return r; +} + +/*! + * @brief rotate existing transform matrix around Z axis by angle + * and store result in dest + * + * @param[in] m affine transform + * @param[in] angle angle (radians) + * @returns rotated matrix + */ +CGLM_INLINE +mat4s +glms_rotate_z(mat4s m, float angle) { + mat4s r; + glm_rotate_z(m.raw, angle, r.raw); + return r; +} + +/*! + * @brief rotate existing transform matrix around given axis by angle + * + * @param[in] m affine transform + * @param[in] angle angle (radians) + * @param[in] axis axis + * @returns affine transform + */ +CGLM_INLINE +mat4s +glms_rotate(mat4s m, float angle, vec3s axis) { + glm_rotate(m.raw, angle, axis.raw); + return m; +} + +/*! + * @brief rotate existing transform + * around given axis by angle at given pivot point (rotation center) + * + * @param[in] m affine transform + * @param[in] pivot rotation center + * @param[in] angle angle (radians) + * @param[in] axis axis + * @returns affine transform + */ +CGLM_INLINE +mat4s +glms_rotate_at(mat4s m, vec3s pivot, float angle, vec3s axis) { + glm_rotate_at(m.raw, pivot.raw, angle, axis.raw); + return m; +} + +/*! + * @brief rotate existing transform matrix around given axis by angle around self (doesn't affected by position) + * + * @param[in] m affine transform + * @param[in] angle angle (radians) + * @param[in] axis axis + * @returns affine transform + */ +CGLM_INLINE +mat4s +glms_spin(mat4s m, float angle, vec3s axis) { + glm_spin(m.raw, angle, axis.raw); + return m; +} + +#endif /* cglms_affines_pre_h */ diff --git a/external/cglm/struct/affine.h b/external/cglm/struct/affine.h new file mode 100644 index 0000000..37f11be --- /dev/null +++ b/external/cglm/struct/affine.h @@ -0,0 +1,201 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE mat4s glms_translate(mat4s m, vec3s v); + CGLM_INLINE mat4s glms_translate_x(mat4s m, float x); + CGLM_INLINE mat4s glms_translate_y(mat4s m, float y); + CGLM_INLINE mat4s glms_translate_z(mat4s m, float z); + CGLM_INLINE mat4s glms_translate_make(vec3s v); + CGLM_INLINE mat4s glms_scale_to(mat4s m, vec3s v); + CGLM_INLINE mat4s glms_scale_make(vec3s v); + CGLM_INLINE mat4s glms_scale(mat4s m, vec3s v); + CGLM_INLINE mat4s glms_scale_uni(mat4s m, float s); + CGLM_INLINE mat4s glms_rotate_x(mat4s m, float angle); + CGLM_INLINE mat4s glms_rotate_y(mat4s m, float angle); + CGLM_INLINE mat4s glms_rotate_z(mat4s m, float angle); + CGLM_INLINE mat4s glms_rotate_make(float angle, vec3s axis); + CGLM_INLINE mat4s glms_rotate(mat4s m, float angle, vec3s axis); + CGLM_INLINE mat4s glms_rotate_at(mat4s m, vec3s pivot, float angle, vec3s axis); + CGLM_INLINE mat4s glms_rotate_atm(vec3s pivot, float angle, vec3s axis); + CGLM_INLINE mat4s glms_spin(mat4s m, float angle, vec3s axis); + CGLM_INLINE vec3s glms_decompose_scalev(mat4s m); + CGLM_INLINE bool glms_uniscaled(mat4s m); + CGLM_INLINE void glms_decompose_rs(mat4s m, mat4s * r, vec3s * s); + CGLM_INLINE void glms_decompose(mat4s m, vec4s t, mat4s * r, vec3s * s); + */ + +#ifndef cglms_affines_h +#define cglms_affines_h + +#include "../common.h" +#include "../types-struct.h" +#include "../affine.h" +#include "vec3.h" +#include "vec4.h" +#include "mat4.h" +#include "affine-mat.h" + +/*! + * @brief creates NEW translate transform matrix by v vector + * + * @param[in] v translate vector [x, y, z] + * @returns affine transform + */ +CGLM_INLINE +mat4s +glms_translate_make(vec3s v) { + mat4s m; + glm_translate_make(m.raw, v.raw); + return m; +} + +/*! + * @brief creates NEW scale matrix by v vector + * + * @param[in] v scale vector [x, y, z] + * @returns affine transform + */ +CGLM_INLINE +mat4s +glms_scale_make(vec3s v) { + mat4s m; + glm_scale_make(m.raw, v.raw); + return m; +} + +/*! + * @brief scales existing transform matrix by v vector + * and stores result in same matrix + * + * @param[in] m affine transform + * @param[in] v scale vector [x, y, z] + * @returns affine transform + */ +CGLM_INLINE +mat4s +glms_scale(mat4s m, vec3s v) { + mat4s r; + glm_scale_to(m.raw, v.raw, r.raw); + return r; +} + +/*! + * @brief applies uniform scale to existing transform matrix v = [s, s, s] + * and stores result in same matrix + * + * @param[in] m affine transform + * @param[in] s scale factor + * @returns affine transform + */ +CGLM_INLINE +mat4s +glms_scale_uni(mat4s m, float s) { + glm_scale_uni(m.raw, s); + return m; +} + +/*! + * @brief creates NEW rotation matrix by angle and axis + * + * axis will be normalized so you don't need to normalize it + * + * @param[in] angle angle (radians) + * @param[in] axis axis + * @returns affine transform + */ +CGLM_INLINE +mat4s +glms_rotate_make(float angle, vec3s axis) { + mat4s m; + glm_rotate_make(m.raw, angle, axis.raw); + return m; +} + +/*! + * @brief creates NEW rotation matrix by angle and axis at given point + * + * this creates rotation matrix, it assumes you don't have a matrix + * + * this should work faster than glm_rotate_at because it reduces + * one glm_translate. 
+ * + * @param[in] pivot rotation center + * @param[in] angle angle (radians) + * @param[in] axis axis + * @returns affine transform + */ +CGLM_INLINE +mat4s +glms_rotate_atm(vec3s pivot, float angle, vec3s axis) { + mat4s m; + glm_rotate_atm(m.raw, pivot.raw, angle, axis.raw); + return m; +} + +/*! + * @brief decompose scale vector + * + * @param[in] m affine transform + * @returns scale vector (Sx, Sy, Sz) + */ +CGLM_INLINE +vec3s +glms_decompose_scalev(mat4s m) { + vec3s r; + glm_decompose_scalev(m.raw, r.raw); + return r; +} + +/*! + * @brief returns true if matrix is uniform scaled. This is helpful for + * creating normal matrix. + * + * @param[in] m m + * + * @return boolean + */ +CGLM_INLINE +bool +glms_uniscaled(mat4s m) { + return glm_uniscaled(m.raw); +} + +/*! + * @brief decompose rotation matrix (mat4) and scale vector [Sx, Sy, Sz] + * DON'T pass projected matrix here + * + * @param[in] m affine transform + * @param[out] r rotation matrix + * @param[out] s scale matrix + */ +CGLM_INLINE +void +glms_decompose_rs(mat4s m, mat4s * __restrict r, vec3s * __restrict s) { + glm_decompose_rs(m.raw, r->raw, s->raw); +} + +/*! + * @brief decompose affine transform, TODO: extract shear factors. + * DON'T pass projected matrix here + * + * @param[in] m affine transform + * @param[out] t translation vector + * @param[out] r rotation matrix (mat4) + * @param[out] s scaling vector [X, Y, Z] + */ +CGLM_INLINE +void +glms_decompose(mat4s m, vec4s * __restrict t, mat4s * __restrict r, vec3s * __restrict s) { + glm_decompose(m.raw, t->raw, r->raw, s->raw); +} + +#include "affine-pre.h" +#include "affine-post.h" + +#endif /* cglms_affines_h */ diff --git a/external/cglm/struct/affine2d.h b/external/cglm/struct/affine2d.h new file mode 100644 index 0000000..ade7c32 --- /dev/null +++ b/external/cglm/struct/affine2d.h @@ -0,0 +1,177 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE mat3s glms_translate2d(mat3 m, vec2 v) + CGLM_INLINE mat3s glms_translate2d_x(mat3s m, float x) + CGLM_INLINE mat3s glms_translate2d_y(mat3s m, float y) + CGLM_INLINE mat3s glms_translate2d_make(vec2s v) + CGLM_INLINE mat3s glms_scale2d_make(vec2s v) + CGLM_INLINE mat3s glms_scale2d(mat3s m, vec2s v) + CGLM_INLINE mat3s glms_scale2d_uni(mat3s m, float s) + CGLM_INLINE mat3s glms_rotate2d_make(float angle) + CGLM_INLINE mat3s glms_rotate2d(mat3s m, float angle) + CGLM_INLINE mat3s glms_rotate2d_to(mat3s m, float angle) + */ + +#ifndef cglms_affine2ds_h +#define cglms_affine2ds_h + +#include "../common.h" +#include "../types-struct.h" +#include "../affine2d.h" +#include "vec3.h" +#include "mat3.h" + +/*! + * @brief translate existing 2d transform matrix by v vector + * and stores result in same matrix + * + * @param[in] m affine transform + * @param[in] v translate vector [x, y] + * @returns affine transform + */ +CGLM_INLINE +mat3s +glms_translate2d(mat3s m, vec2s v) { + glm_translate2d(m.raw, v.raw); + return m; +} + +/*! + * @brief translate existing 2d transform matrix by x factor + * + * @param[in] m affine transform + * @param[in] x x factor + * @returns affine transform + */ +CGLM_INLINE +mat3s +glms_translate2d_x(mat3s m, float x) { + glm_translate2d_x(m.raw, x); + return m; +} + +/*! 
+ * @brief translate existing 2d transform matrix by y factor + * + * @param[in] m affine transform + * @param[in] y y factor + * @returns affine transform + */ +CGLM_INLINE +mat3s +glms_translate2d_y(mat3s m, float y) { + glm_translate2d_y(m.raw, y); + return m; +} + +/*! + * @brief creates NEW translate 2d transform matrix by v vector + * + * @param[in] v translate vector [x, y] + * @returns affine transform + */ +CGLM_INLINE +mat3s +glms_translate2d_make(vec2s v) { + mat3s m; + glm_translate2d_make(m.raw, v.raw); + return m; +} + +/*! + * @brief creates NEW 2d scale matrix by v vector + * + * @param[in] v scale vector [x, y] + * @returns affine transform + */ +CGLM_INLINE +mat3s +glms_scale2d_make(vec2s v) { + mat3s m; + glm_scale2d_make(m.raw, v.raw); + return m; +} + +/*! + * @brief scales existing 2d transform matrix by v vector + * and stores result in same matrix + * + * @param[in] m affine transform + * @param[in] v scale vector [x, y, z] + * @returns affine transform + */ +CGLM_INLINE +mat3s +glms_scale2d(mat3s m, vec2s v) { + mat3s r; + glm_scale2d_to(m.raw, v.raw, r.raw); + return r; +} + +/*! + * @brief applies uniform scale to existing 2d transform matrix v = [s, s, s] + * and stores result in same matrix + * + * @param[in] m affine transform + * @param[in] s scale factor + * @returns affine transform + */ +CGLM_INLINE +mat3s +glms_scale2d_uni(mat3s m, float s) { + glm_scale2d_uni(m.raw, s); + return m; +} + +/*! + * @brief creates NEW 2d rotation matrix by angle and axis + * + * axis will be normalized so you don't need to normalize it + * + * @param[in] angle angle (radians) + * @returns affine transform + */ +CGLM_INLINE +mat3s +glms_rotate2d_make(float angle) { + mat3s m; + glm_rotate2d_make(m.raw, angle); + return m; +} + +/*! + * @brief rotate existing 2d transform matrix around given axis by angle + * + * @param[in] m affine transform + * @param[in] angle angle (radians) + * @returns affine transform + */ +CGLM_INLINE +mat3s +glms_rotate2d(mat3s m, float angle) { + glm_rotate2d(m.raw, angle); + return m; +} + +/*! + * @brief rotate existing 2d transform matrix around given axis by angle + * + * @param[in] m affine transform + * @param[in] angle angle (radians) + * @returns affine transform + */ +CGLM_INLINE +mat3s +glms_rotate2d_to(mat3s m, float angle) { + glm_rotate2d(m.raw, angle); + return m; +} + +#endif /* cglms_affine2ds_h */ diff --git a/external/cglm/struct/box.h b/external/cglm/struct/box.h new file mode 100644 index 0000000..ac32328 --- /dev/null +++ b/external/cglm/struct/box.h @@ -0,0 +1,259 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglms_boxs_h +#define cglms_boxs_h + +#include "../common.h" +#include "../types-struct.h" +#include "../box.h" +#include "vec3.h" +#include "vec4.h" +#include "mat4.h" + +/* api definition */ +#define glms_aabb_(NAME) CGLM_STRUCTAPI(aabb, NAME) + +/*! + * @brief apply transform to Axis-Aligned Bounding Box + * + * @param[in] box bounding box + * @param[in] m transform matrix + * @param[out] dest transformed bounding box + */ +CGLM_INLINE +void +glms_aabb_(transform)(vec3s box[2], mat4s m, vec3s dest[2]) { + vec3 rawBox[2]; + vec3 rawDest[2]; + + glms_vec3_(unpack)(rawBox, box, 2); + glm_aabb_transform(rawBox, m.raw, rawDest); + glms_vec3_(pack)(dest, rawDest, 2); +} + +/*! 
+ * @brief merges two AABB bounding box and creates new one + * + * two box must be in same space, if one of box is in different space then + * you should consider to convert it's space by glm_box_space + * + * @param[in] box1 bounding box 1 + * @param[in] box2 bounding box 2 + * @param[out] dest merged bounding box + */ +CGLM_INLINE +void +glms_aabb_(merge)(vec3s box1[2], vec3s box2[2], vec3s dest[2]) { + vec3 rawBox1[2]; + vec3 rawBox2[2]; + vec3 rawDest[2]; + + glms_vec3_(unpack)(rawBox1, box1, 2); + glms_vec3_(unpack)(rawBox2, box2, 2); + glm_aabb_merge(rawBox1, rawBox2, rawDest); + glms_vec3_(pack)(dest, rawDest, 2); +} + +/*! + * @brief crops a bounding box with another one. + * + * this could be useful for getting a bbox which fits with view frustum and + * object bounding boxes. In this case you crop view frustum box with objects + * box + * + * @param[in] box bounding box 1 + * @param[in] cropBox crop box + * @param[out] dest cropped bounding box + */ +CGLM_INLINE +void +glms_aabb_(crop)(vec3s box[2], vec3s cropBox[2], vec3s dest[2]) { + vec3 rawBox[2]; + vec3 rawCropBox[2]; + vec3 rawDest[2]; + + glms_vec3_(unpack)(rawBox, box, 2); + glms_vec3_(unpack)(rawCropBox, cropBox, 2); + glm_aabb_crop(rawBox, rawCropBox, rawDest); + glms_vec3_(pack)(dest, rawDest, 2); +} + +/*! + * @brief crops a bounding box with another one. + * + * this could be useful for getting a bbox which fits with view frustum and + * object bounding boxes. In this case you crop view frustum box with objects + * box + * + * @param[in] box bounding box + * @param[in] cropBox crop box + * @param[in] clampBox minimum box + * @param[out] dest cropped bounding box + */ +CGLM_INLINE +void +glms_aabb_(crop_until)(vec3s box[2], + vec3s cropBox[2], + vec3s clampBox[2], + vec3s dest[2]) { + glms_aabb_(crop)(box, cropBox, dest); + glms_aabb_(merge)(clampBox, dest, dest); +} + +/*! + * @brief check if AABB intersects with frustum planes + * + * this could be useful for frustum culling using AABB. + * + * OPTIMIZATION HINT: + * if planes order is similar to LEFT, RIGHT, BOTTOM, TOP, NEAR, FAR + * then this method should run even faster because it would only use two + * planes if object is not inside the two planes + * fortunately cglm extracts planes as this order! just pass what you got! + * + * @param[in] box bounding box + * @param[in] planes frustum planes + */ +CGLM_INLINE +bool +glms_aabb_(frustum)(vec3s box[2], vec4s planes[6]) { + vec3 rawBox[2]; + vec4 rawPlanes[6]; + + glms_vec3_(unpack)(rawBox, box, 2); + glms_vec4_(unpack)(rawPlanes, planes, 6); + return glm_aabb_frustum(rawBox, rawPlanes); +} + +/*! + * @brief invalidate AABB min and max values + * + * @param[in, out] box bounding box + */ +CGLM_INLINE +void +glms_aabb_(invalidate)(vec3s box[2]) { + box[0] = glms_vec3_(broadcast)(FLT_MAX); + box[1] = glms_vec3_(broadcast)(-FLT_MAX); +} + +/*! + * @brief check if AABB is valid or not + * + * @param[in] box bounding box + */ +CGLM_INLINE +bool +glms_aabb_(isvalid)(vec3s box[2]) { + vec3 rawBox[2]; + glms_vec3_(unpack)(rawBox, box, 2); + return glm_aabb_isvalid(rawBox); +} + +/*! + * @brief distance between of min and max + * + * @param[in] box bounding box + */ +CGLM_INLINE +float +glms_aabb_(size)(vec3s box[2]) { + return glm_vec3_distance(box[0].raw, box[1].raw); +} + +/*! + * @brief radius of sphere which surrounds AABB + * + * @param[in] box bounding box + */ +CGLM_INLINE +float +glms_aabb_(radius)(vec3s box[2]) { + return glms_aabb_(size)(box) * 0.5f; +} + +/*! 
+ * @brief computes center point of AABB + * + * @param[in] box bounding box + * @returns center of bounding box + */ +CGLM_INLINE +vec3s +glms_aabb_(center)(vec3s box[2]) { + return glms_vec3_(center)(box[0], box[1]); +} + +/*! + * @brief check if two AABB intersects + * + * @param[in] box bounding box + * @param[in] other other bounding box + */ +CGLM_INLINE +bool +glms_aabb_(aabb)(vec3s box[2], vec3s other[2]) { + vec3 rawBox[2]; + vec3 rawOther[2]; + + glms_vec3_(unpack)(rawBox, box, 2); + glms_vec3_(unpack)(rawOther, other, 2); + return glm_aabb_aabb(rawBox, rawOther); +} + +/*! + * @brief check if AABB intersects with sphere + * + * https://github.com/erich666/GraphicsGems/blob/master/gems/BoxSphere.c + * Solid Box - Solid Sphere test. + * + * @param[in] box solid bounding box + * @param[in] s solid sphere + */ +CGLM_INLINE +bool +glms_aabb_(sphere)(vec3s box[2], vec4s s) { + vec3 rawBox[2]; + + glms_vec3_(unpack)(rawBox, box, 2); + return glm_aabb_sphere(rawBox, s.raw); +} + +/*! + * @brief check if point is inside of AABB + * + * @param[in] box bounding box + * @param[in] point point + */ +CGLM_INLINE +bool +glms_aabb_(point)(vec3s box[2], vec3s point) { + vec3 rawBox[2]; + + glms_vec3_(unpack)(rawBox, box, 2); + return glm_aabb_point(rawBox, point.raw); +} + +/*! + * @brief check if AABB contains other AABB + * + * @param[in] box bounding box + * @param[in] other other bounding box + */ +CGLM_INLINE +bool +glms_aabb_(contains)(vec3s box[2], vec3s other[2]) { + vec3 rawBox[2]; + vec3 rawOther[2]; + + glms_vec3_(unpack)(rawBox, box, 2); + glms_vec3_(unpack)(rawOther, other, 2); + return glm_aabb_contains(rawBox, rawOther); +} + +#endif /* cglms_boxs_h */ diff --git a/external/cglm/struct/cam.h b/external/cglm/struct/cam.h new file mode 100644 index 0000000..ab6cbbb --- /dev/null +++ b/external/cglm/struct/cam.h @@ -0,0 +1,646 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE mat4s glms_frustum(float left, float right, + float bottom, float top, + float nearZ, float farZ) + CGLM_INLINE mat4s glms_ortho(float left, float right, + float bottom, float top, + float nearZ, float farZ) + CGLM_INLINE mat4s glms_ortho_aabb(vec3s box[2]); + CGLM_INLINE mat4s glms_ortho_aabb_p(vec3s box[2], float padding); + CGLM_INLINE mat4s glms_ortho_aabb_pz(vec3s box[2], float padding); + CGLM_INLINE mat4s glms_ortho_default(float aspect) + CGLM_INLINE mat4s glms_ortho_default_s(float aspect, float size) + CGLM_INLINE mat4s glms_perspective(float fovy, + float aspect, + float nearZ, + float farZ) + CGLM_INLINE void glms_persp_move_far(mat4s proj, float deltaFar) + CGLM_INLINE mat4s glms_perspective_default(float aspect) + CGLM_INLINE void glms_perspective_resize(mat4s proj, float aspect) + CGLM_INLINE mat4s glms_lookat(vec3s eye, vec3s center, vec3s up) + CGLM_INLINE mat4s glms_look(vec3s eye, vec3s dir, vec3s up) + CGLM_INLINE mat4s glms_look_anyup(vec3s eye, vec3s dir) + CGLM_INLINE void glms_persp_decomp(mat4s proj, + float *nearv, float *farv, + float *top, float *bottom, + float *left, float *right) + CGLM_INLINE void glms_persp_decompv(mat4s proj, float dest[6]) + CGLM_INLINE void glms_persp_decomp_x(mat4s proj, float *left, float *right) + CGLM_INLINE void glms_persp_decomp_y(mat4s proj, float *top, float *bottom) + CGLM_INLINE void glms_persp_decomp_z(mat4s proj, float *nearv, float *farv) + CGLM_INLINE void glms_persp_decomp_far(mat4s proj, float *farZ) + CGLM_INLINE void glms_persp_decomp_near(mat4s proj, float *nearZ) + CGLM_INLINE float glms_persp_fovy(mat4s proj) + CGLM_INLINE float glms_persp_aspect(mat4s proj) + CGLM_INLINE vec4s glms_persp_sizes(mat4s proj, float fovy) + */ + +#ifndef cglms_cam_h +#define cglms_cam_h + +#include "../common.h" +#include "../types-struct.h" +#include "../plane.h" +#include "../cam.h" + +#ifndef CGLM_CLIPSPACE_INCLUDE_ALL +# if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO +# include "clipspace/ortho_lh_zo.h" +# include "clipspace/persp_lh_zo.h" +# include "clipspace/view_lh_zo.h" +# elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO +# include "clipspace/ortho_lh_no.h" +# include "clipspace/persp_lh_no.h" +# include "clipspace/view_lh_no.h" +# elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO +# include "clipspace/ortho_rh_zo.h" +# include "clipspace/persp_rh_zo.h" +# include "clipspace/view_rh_zo.h" +# elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO +# include "clipspace/ortho_rh_no.h" +# include "clipspace/persp_rh_no.h" +# include "clipspace/view_rh_no.h" +# endif +#else +# include "clipspace/ortho_lh_zo.h" +# include "clipspace/persp_lh_zo.h" +# include "clipspace/ortho_lh_no.h" +# include "clipspace/persp_lh_no.h" +# include "clipspace/ortho_rh_zo.h" +# include "clipspace/persp_rh_zo.h" +# include "clipspace/ortho_rh_no.h" +# include "clipspace/persp_rh_no.h" +# include "clipspace/view_lh_zo.h" +# include "clipspace/view_lh_no.h" +# include "clipspace/view_rh_zo.h" +# include "clipspace/view_rh_no.h" +#endif + +/*! 
+ * @brief set up perspective projection matrix + * + * @param[in] left viewport.left + * @param[in] right viewport.right + * @param[in] bottom viewport.bottom + * @param[in] top viewport.top + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping plane + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_frustum(float left, float right, + float bottom, float top, + float nearZ, float farZ) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + return glms_frustum_lh_zo(left, right, bottom, top, nearZ, farZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + return glms_frustum_lh_no(left, right, bottom, top, nearZ, farZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + return glms_frustum_rh_zo(left, right, bottom, top, nearZ, farZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + return glms_frustum_rh_no(left, right, bottom, top, nearZ, farZ); +#endif +} + +/*! + * @brief set up orthographic projection matrix + * + * @param[in] left viewport.left + * @param[in] right viewport.right + * @param[in] bottom viewport.bottom + * @param[in] top viewport.top + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping plane + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho(float left, float right, + float bottom, float top, + float nearZ, float farZ) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + return glms_ortho_lh_zo(left, right, bottom, top, nearZ, farZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + return glms_ortho_lh_no(left, right, bottom, top, nearZ, farZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + return glms_ortho_rh_zo(left, right, bottom, top, nearZ, farZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + return glms_ortho_rh_no(left, right, bottom, top, nearZ, farZ); +#endif +} + +/*! + * @brief set up orthographic projection matrix using bounding box + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_aabb(vec3s box[2]) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + return glms_ortho_aabb_lh_zo(box); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + return glms_ortho_aabb_lh_no(box); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + return glms_ortho_aabb_rh_zo(box); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + return glms_ortho_aabb_rh_no(box); +#endif +} + +/*! + * @brief set up orthographic projection matrix using bounding box + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @param[in] padding padding + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_aabb_p(vec3s box[2], float padding) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + return glms_ortho_aabb_p_lh_zo(box, padding); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + return glms_ortho_aabb_p_lh_no(box, padding); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + return glms_ortho_aabb_p_rh_zo(box, padding); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + return glms_ortho_aabb_p_rh_no(box, padding); +#endif +} + +/*! + * @brief set up orthographic projection matrix using bounding box + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @param[in] padding padding for near and far + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_aabb_pz(vec3s box[2], float padding) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + return glms_ortho_aabb_pz_lh_zo(box, padding); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + return glms_ortho_aabb_pz_lh_no(box, padding); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + return glms_ortho_aabb_pz_rh_zo(box, padding); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + return glms_ortho_aabb_pz_rh_no(box, padding); +#endif +} + +/*! + * @brief set up unit orthographic projection matrix + * + * @param[in] aspect aspect ratio ( width / height ) + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_default(float aspect) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + return glms_ortho_default_lh_zo(aspect); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + return glms_ortho_default_lh_no(aspect); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + return glms_ortho_default_rh_zo(aspect); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + return glms_ortho_default_rh_no(aspect); +#endif +} + +/*! + * @brief set up orthographic projection matrix with given CUBE size + * + * @param[in] aspect aspect ratio ( width / height ) + * @param[in] size cube size + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_default_s(float aspect, float size) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + return glms_ortho_default_s_lh_zo(aspect, size); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + return glms_ortho_default_s_lh_no(aspect, size); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + return glms_ortho_default_s_rh_zo(aspect, size); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + return glms_ortho_default_s_rh_no(aspect, size); +#endif +} + +/*! + * @brief set up perspective projection matrix + * + * @param[in] fovy field of view angle + * @param[in] aspect aspect ratio ( width / height ) + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping plane + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_perspective(float fovy, float aspect, float nearZ, float farZ) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + return glms_perspective_lh_zo(fovy, aspect, nearZ, farZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + return glms_perspective_lh_no(fovy, aspect, nearZ, farZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + return glms_perspective_rh_zo(fovy, aspect, nearZ, farZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + return glms_perspective_rh_no(fovy, aspect, nearZ, farZ); +#endif +} + +/*! + * @brief extend perspective projection matrix's far distance + * + * NOTE: if you don't want to create a new matrix then use the array api on struct.raw + * like glm_persp_move_far(proj.raw, deltaFar) to avoid creating a new mat4 + * each time + * + * this function does not guarantee far >= near, be aware of that! + * + * @param[in, out] proj projection matrix to extend + * @param[in] deltaFar distance from existing far (negative to shrink) + */ +CGLM_INLINE +mat4s +glms_persp_move_far(mat4s proj, float deltaFar) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + return glms_persp_move_far_lh_zo(proj, deltaFar); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + return glms_persp_move_far_lh_no(proj, deltaFar); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + return glms_persp_move_far_rh_zo(proj, deltaFar); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + return glms_persp_move_far_rh_no(proj, deltaFar); +#endif +} + +/*! + * @brief set up perspective projection matrix with default near/far + * and angle values + * + * @param[in] aspect aspect ratio ( width / height ) + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_perspective_default(float aspect) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + return glms_perspective_default_lh_zo(aspect); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + return glms_perspective_default_lh_no(aspect); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + return glms_perspective_default_rh_zo(aspect); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + return glms_perspective_default_rh_no(aspect); +#endif +} + +/*! + * @brief resize perspective matrix by aspect ratio ( width / height ) + * this makes it very easy to resize the proj matrix when the window / viewport + * is resized + * + * NOTE: if you don't want to create a new matrix then use the array api on struct.raw + * like glm_perspective_resize(proj.raw, aspect) to avoid creating a new mat4 + * each time + * + * @param[in, out] proj perspective projection matrix + * @param[in] aspect aspect ratio ( width / height ) + */ +CGLM_INLINE +mat4s +glms_perspective_resize(mat4s proj, float aspect) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + return glms_perspective_resize_lh_zo(proj, aspect); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + return glms_perspective_resize_lh_no(proj, aspect); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + return glms_perspective_resize_rh_zo(proj, aspect); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + return glms_perspective_resize_rh_no(proj, aspect); +#endif +} + +/*! + * @brief set up view matrix + * + * NOTE: The UP vector must not be parallel to the line of sight from + * the eye point to the reference point + * + * @param[in] eye eye vector + * @param[in] center center vector + * @param[in] up up vector + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_lookat(vec3s eye, vec3s center, vec3s up) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + return glms_lookat_lh_zo(eye, center, up); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + return glms_lookat_lh_no(eye, center, up); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + return glms_lookat_rh_zo(eye, center, up); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + return glms_lookat_rh_no(eye, center, up); +#endif +} + +/*! + * @brief set up view matrix + * + * convenient wrapper for lookat: if you only have a direction and not a target, + * this might be useful, since the target has to be computed from the direction.
+ * + * NOTE: The UP vector must not be parallel to the line of sight from + * the eye point to the reference point + * + * @param[in] eye eye vector + * @param[in] dir direction vector + * @param[in] up up vector + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_look(vec3s eye, vec3s dir, vec3s up) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + return glms_look_lh_zo(eye, dir, up); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + return glms_look_lh_no(eye, dir, up); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + return glms_look_rh_zo(eye, dir, up); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + return glms_look_rh_no(eye, dir, up); +#endif +} + +/*! + * @brief set up view matrix + * + * convenient wrapper for look: if you only have direction and if you don't + * care what UP vector is then this might be useful to create view matrix + * + * @param[in] eye eye vector + * @param[in] dir direction vector + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_look_anyup(vec3s eye, vec3s dir) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + return glms_look_anyup_lh_zo(eye, dir); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + return glms_look_anyup_lh_no(eye, dir); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + return glms_look_anyup_rh_zo(eye, dir); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + return glms_look_anyup_rh_no(eye, dir); +#endif +} + +/*! + * @brief decomposes frustum values of perspective projection. + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + * @param[out] farZ far + * @param[out] top top + * @param[out] bottom bottom + * @param[out] left left + * @param[out] right right + */ +CGLM_INLINE +void +glms_persp_decomp(mat4s proj, + float * __restrict nearZ, float * __restrict farZ, + float * __restrict top, float * __restrict bottom, + float * __restrict left, float * __restrict right) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + glms_persp_decomp_lh_zo(proj, nearZ, farZ, top, bottom, left, right); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + glms_persp_decomp_lh_no(proj, nearZ, farZ, top, bottom, left, right); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + glms_persp_decomp_rh_zo(proj, nearZ, farZ, top, bottom, left, right); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + glms_persp_decomp_rh_no(proj, nearZ, farZ, top, bottom, left, right); +#endif +} + +/*! + * @brief decomposes frustum values of perspective projection. + * this makes easy to get all values at once + * + * @param[in] proj perspective projection matrix + * @param[out] dest array + */ +CGLM_INLINE +void +glms_persp_decompv(mat4s proj, float dest[6]) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + glms_persp_decompv_lh_zo(proj, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + glms_persp_decompv_lh_no(proj, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + glms_persp_decompv_rh_zo(proj, dest); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + glms_persp_decompv_rh_no(proj, dest); +#endif +} + +/*! + * @brief decomposes left and right values of perspective projection. 
+ * x stands for x axis (left / right axis) + * + * @param[in] proj perspective projection matrix + * @param[out] left left + * @param[out] right right + */ +CGLM_INLINE +void +glms_persp_decomp_x(mat4s proj, + float * __restrict left, + float * __restrict right) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + glms_persp_decomp_x_lh_zo(proj, left, right); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + glms_persp_decomp_x_lh_no(proj, left, right); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + glms_persp_decomp_x_rh_zo(proj, left, right); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + glms_persp_decomp_x_rh_no(proj, left, right); +#endif +} + +/*! + * @brief decomposes top and bottom values of perspective projection. + * y stands for y axis (top / bottom axis) + * + * @param[in] proj perspective projection matrix + * @param[out] top top + * @param[out] bottom bottom + */ +CGLM_INLINE +void +glms_persp_decomp_y(mat4s proj, + float * __restrict top, + float * __restrict bottom) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + glms_persp_decomp_y_lh_zo(proj, top, bottom); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + glms_persp_decomp_y_lh_no(proj, top, bottom); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + glms_persp_decomp_y_rh_zo(proj, top, bottom); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + glms_persp_decomp_y_rh_no(proj, top, bottom); +#endif +} + +/*! + * @brief decomposes near and far values of perspective projection. + * z stands for z axis (near / far axis) + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + * @param[out] farZ far + */ +CGLM_INLINE +void +glms_persp_decomp_z(mat4s proj, + float * __restrict nearZ, + float * __restrict farZ) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + glms_persp_decomp_z_lh_zo(proj, nearZ, farZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + glms_persp_decomp_z_lh_no(proj, nearZ, farZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + glms_persp_decomp_z_rh_zo(proj, nearZ, farZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + glms_persp_decomp_z_rh_no(proj, nearZ, farZ); +#endif +} + +/*! + * @brief decomposes far value of perspective projection. + * + * @param[in] proj perspective projection matrix + * @param[out] farZ far + */ +CGLM_INLINE +void +glms_persp_decomp_far(mat4s proj, float * __restrict farZ) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + glms_persp_decomp_far_lh_zo(proj, farZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + glms_persp_decomp_far_lh_no(proj, farZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + glms_persp_decomp_far_rh_zo(proj, farZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + glms_persp_decomp_far_rh_no(proj, farZ); +#endif +} + +/*! + * @brief decomposes near value of perspective projection. 
+ * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + */ +CGLM_INLINE +void +glms_persp_decomp_near(mat4s proj, float * __restrict nearZ) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + glms_persp_decomp_near_lh_zo(proj, nearZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + glms_persp_decomp_near_lh_no(proj, nearZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + glms_persp_decomp_near_rh_zo(proj, nearZ); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + glms_persp_decomp_near_rh_no(proj, nearZ); +#endif +} + +/*! + * @brief returns field of view angle along the Y-axis (in radians) + * + * if you need to degrees, use glm_deg to convert it or use this: + * fovy_deg = glm_deg(glm_persp_fovy(projMatrix)) + * + * @param[in] proj perspective projection matrix + */ +CGLM_INLINE +float +glms_persp_fovy(mat4s proj) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + return glms_persp_fovy_lh_zo(proj); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + return glms_persp_fovy_lh_no(proj); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + return glms_persp_fovy_rh_zo(proj); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + return glms_persp_fovy_rh_no(proj); +#endif +} + +/*! + * @brief returns aspect ratio of perspective projection + * + * @param[in] proj perspective projection matrix + */ +CGLM_INLINE +float +glms_persp_aspect(mat4s proj) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + return glms_persp_aspect_lh_zo(proj); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + return glms_persp_aspect_lh_no(proj); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + return glms_persp_aspect_rh_zo(proj); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + return glms_persp_aspect_rh_no(proj); +#endif +} + +/*! + * @brief returns sizes of near and far planes of perspective projection + * + * @param[in] proj perspective projection matrix + * @param[in] fovy fovy (see brief) + * @returns sizes as vector, sizes order: [Wnear, Hnear, Wfar, Hfar] + */ +CGLM_INLINE +vec4s +glms_persp_sizes(mat4s proj, float fovy) { +#if CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_ZO + return glms_persp_sizes_lh_zo(proj, fovy); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_LH_NO + return glms_persp_sizes_lh_no(proj, fovy); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_ZO + return glms_persp_sizes_rh_zo(proj, fovy); +#elif CGLM_CONFIG_CLIP_CONTROL == CGLM_CLIP_CONTROL_RH_NO + return glms_persp_sizes_rh_no(proj, fovy); +#endif +} + +#endif /* cglms_cam_h */ diff --git a/external/cglm/struct/clipspace/ortho_lh_no.h b/external/cglm/struct/clipspace/ortho_lh_no.h new file mode 100644 index 0000000..a743fdf --- /dev/null +++ b/external/cglm/struct/clipspace/ortho_lh_no.h @@ -0,0 +1,154 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE mat4s glms_ortho_lh_no(float left, float right, + float bottom, float top, + float nearZ, float farZ) + CGLM_INLINE mat4s glms_ortho_aabb_lh_no(vec3s box[2]); + CGLM_INLINE mat4s glms_ortho_aabb_p_lh_no(vec3s box[2], float padding); + CGLM_INLINE mat4s glms_ortho_aabb_pz_lh_no(vec3s box[2], float padding); + CGLM_INLINE mat4s glms_ortho_default_lh_no(float aspect) + CGLM_INLINE mat4s glms_ortho_default_s_lh_no(float aspect, float size) + */ + +#ifndef cglms_ortho_lh_no_h +#define cglms_ortho_lh_no_h + +#include "../../common.h" +#include "../../types-struct.h" +#include "../../plane.h" +#include "../../cam.h" +#include "../vec3.h" +#include "../../clipspace/ortho_lh_no.h" + +/*! + * @brief set up orthographic projection matrix + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] left viewport.left + * @param[in] right viewport.right + * @param[in] bottom viewport.bottom + * @param[in] top viewport.top + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping plane + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_lh_no(float left, float right, + float bottom, float top, + float nearZ, float farZ) { + mat4s dest; + glm_ortho_lh_no(left, right, bottom, top, nearZ, farZ, dest.raw); + return dest; +} + +/*! + * @brief set up orthographic projection matrix using bounding box + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_aabb_lh_no(vec3s box[2]) { + mat4s dest; + vec3 rawBox[2]; + + glms_vec3_(unpack)(rawBox, box, 2); + glm_ortho_aabb_lh_no(rawBox, dest.raw); + + return dest; +} + +/*! + * @brief set up orthographic projection matrix using bounding box + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @param[in] padding padding + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_aabb_p_lh_no(vec3s box[2], float padding) { + mat4s dest; + vec3 rawBox[2]; + + glms_vec3_(unpack)(rawBox, box, 2); + glm_ortho_aabb_p_lh_no(rawBox, padding, dest.raw); + + return dest; +} + +/*! + * @brief set up orthographic projection matrix using bounding box + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @param[in] padding padding for near and far + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_aabb_pz_lh_no(vec3s box[2], float padding) { + mat4s dest; + vec3 rawBox[2]; + + glms_vec3_(unpack)(rawBox, box, 2); + glm_ortho_aabb_pz_lh_no(rawBox, padding, dest.raw); + + return dest; +} + +/*! + * @brief set up unit orthographic projection matrix + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] aspect aspect ration ( width / height ) + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_default_lh_no(float aspect) { + mat4s dest; + glm_ortho_default_lh_no(aspect, dest.raw); + return dest; +} + +/*! + * @brief set up orthographic projection matrix with given CUBE size + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. 
+ * + * @param[in] aspect aspect ratio ( width / height ) + * @param[in] size cube size + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_default_s_lh_no(float aspect, float size) { + mat4s dest; + glm_ortho_default_s_lh_no(aspect, size, dest.raw); + return dest; +} + +#endif /* cglms_ortho_lh_no_h */ diff --git a/external/cglm/struct/clipspace/ortho_lh_zo.h b/external/cglm/struct/clipspace/ortho_lh_zo.h new file mode 100644 index 0000000..4f15656 --- /dev/null +++ b/external/cglm/struct/clipspace/ortho_lh_zo.h @@ -0,0 +1,154 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE mat4s glms_ortho_lh_zo(float left, float right, + float bottom, float top, + float nearZ, float farZ) + CGLM_INLINE mat4s glms_ortho_aabb_lh_zo(vec3s box[2]); + CGLM_INLINE mat4s glms_ortho_aabb_p_lh_zo(vec3s box[2], float padding); + CGLM_INLINE mat4s glms_ortho_aabb_pz_lh_zo(vec3s box[2], float padding); + CGLM_INLINE mat4s glms_ortho_default_lh_zo(float aspect) + CGLM_INLINE mat4s glms_ortho_default_s_lh_zo(float aspect, float size) + */ + +#ifndef cglms_ortho_lh_zo_h +#define cglms_ortho_lh_zo_h + +#include "../../common.h" +#include "../../types-struct.h" +#include "../../plane.h" +#include "../../cam.h" +#include "../vec3.h" +#include "../../clipspace/ortho_lh_zo.h" + +/*! + * @brief set up orthographic projection matrix + * with a left-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] left viewport.left + * @param[in] right viewport.right + * @param[in] bottom viewport.bottom + * @param[in] top viewport.top + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping plane + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_lh_zo(float left, float right, + float bottom, float top, + float nearZ, float farZ) { + mat4s dest; + glm_ortho_lh_zo(left, right, bottom, top, nearZ, farZ, dest.raw); + return dest; +} + +/*! + * @brief set up orthographic projection matrix using bounding box + * with a left-hand coordinate system and a + * clip-space of [0, 1]. + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_aabb_lh_zo(vec3s box[2]) { + mat4s dest; + vec3 rawBox[2]; + + glms_vec3_(unpack)(rawBox, box, 2); + glm_ortho_aabb_lh_zo(rawBox, dest.raw); + + return dest; +} + +/*! + * @brief set up orthographic projection matrix using bounding box + * with a left-hand coordinate system and a + * clip-space of [0, 1]. + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @param[in] padding padding + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_aabb_p_lh_zo(vec3s box[2], float padding) { + mat4s dest; + vec3 rawBox[2]; + + glms_vec3_(unpack)(rawBox, box, 2); + glm_ortho_aabb_p_lh_zo(rawBox, padding, dest.raw); + + return dest; +} + +/*! + * @brief set up orthographic projection matrix using bounding box + * with a left-hand coordinate system and a + * clip-space of [0, 1]. + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @param[in] padding padding for near and far + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_aabb_pz_lh_zo(vec3s box[2], float padding) { + mat4s dest; + vec3 rawBox[2]; + + glms_vec3_(unpack)(rawBox, box, 2); + glm_ortho_aabb_pz_lh_zo(rawBox, padding, dest.raw); + + return dest; +} + +/*! 
+ * @brief set up unit orthographic projection matrix + * with a left-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] aspect aspect ration ( width / height ) + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_default_lh_zo(float aspect) { + mat4s dest; + glm_ortho_default_lh_zo(aspect, dest.raw); + return dest; +} + +/*! + * @brief set up orthographic projection matrix with given CUBE size + * with a left-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] aspect aspect ratio ( width / height ) + * @param[in] size cube size + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_default_s_lh_zo(float aspect, float size) { + mat4s dest; + glm_ortho_default_s_lh_zo(aspect, size, dest.raw); + return dest; +} + +#endif /* cglms_ortho_lh_zo_h */ diff --git a/external/cglm/struct/clipspace/ortho_rh_no.h b/external/cglm/struct/clipspace/ortho_rh_no.h new file mode 100644 index 0000000..ecb4d32 --- /dev/null +++ b/external/cglm/struct/clipspace/ortho_rh_no.h @@ -0,0 +1,154 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE mat4s glms_ortho_rh_no(float left, float right, + float bottom, float top, + float nearZ, float farZ) + CGLM_INLINE mat4s glms_ortho_aabb_rh_no(vec3s box[2]); + CGLM_INLINE mat4s glms_ortho_aabb_p_rh_no(vec3s box[2], float padding); + CGLM_INLINE mat4s glms_ortho_aabb_pz_rh_no(vec3s box[2], float padding); + CGLM_INLINE mat4s glms_ortho_default_rh_no(float aspect) + CGLM_INLINE mat4s glms_ortho_default_s_rh_no(float aspect, float size) + */ + +#ifndef cglms_ortho_rh_no_h +#define cglms_ortho_rh_no_h + +#include "../../common.h" +#include "../../types-struct.h" +#include "../../plane.h" +#include "../../cam.h" +#include "../vec3.h" +#include "../../clipspace/ortho_rh_no.h" + +/*! + * @brief set up orthographic projection matrix + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] left viewport.left + * @param[in] right viewport.right + * @param[in] bottom viewport.bottom + * @param[in] top viewport.top + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping plane + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_rh_no(float left, float right, + float bottom, float top, + float nearZ, float farZ) { + mat4s dest; + glm_ortho_rh_no(left, right, bottom, top, nearZ, farZ, dest.raw); + return dest; +} + +/*! + * @brief set up orthographic projection matrix using bounding box + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_aabb_rh_no(vec3s box[2]) { + mat4s dest; + vec3 rawBox[2]; + + glms_vec3_(unpack)(rawBox, box, 2); + glm_ortho_aabb_rh_no(rawBox, dest.raw); + + return dest; +} + +/*! + * @brief set up orthographic projection matrix using bounding box + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @param[in] padding padding + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_aabb_p_rh_no(vec3s box[2], float padding) { + mat4s dest; + vec3 rawBox[2]; + + glms_vec3_(unpack)(rawBox, box, 2); + glm_ortho_aabb_p_rh_no(rawBox, padding, dest.raw); + + return dest; +} + +/*! 
+ * @brief set up orthographic projection matrix using bounding box + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @param[in] padding padding for near and far + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_aabb_pz_rh_no(vec3s box[2], float padding) { + mat4s dest; + vec3 rawBox[2]; + + glms_vec3_(unpack)(rawBox, box, 2); + glm_ortho_aabb_pz_rh_no(rawBox, padding, dest.raw); + + return dest; +} + +/*! + * @brief set up unit orthographic projection matrix + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] aspect aspect ration ( width / height ) + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_default_rh_no(float aspect) { + mat4s dest; + glm_ortho_default_rh_no(aspect, dest.raw); + return dest; +} + +/*! + * @brief set up orthographic projection matrix with given CUBE size + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] aspect aspect ratio ( width / height ) + * @param[in] size cube size + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_default_s_rh_no(float aspect, float size) { + mat4s dest; + glm_ortho_default_s_rh_no(aspect, size, dest.raw); + return dest; +} + +#endif /* cglms_ortho_rh_no_h */ diff --git a/external/cglm/struct/clipspace/ortho_rh_zo.h b/external/cglm/struct/clipspace/ortho_rh_zo.h new file mode 100644 index 0000000..2d50ee1 --- /dev/null +++ b/external/cglm/struct/clipspace/ortho_rh_zo.h @@ -0,0 +1,154 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE mat4s glms_ortho_rh_zo(float left, float right, + float bottom, float top, + float nearZ, float farZ) + CGLM_INLINE mat4s glms_ortho_aabb_rh_zo(vec3s box[2]); + CGLM_INLINE mat4s glms_ortho_aabb_p_rh_zo(vec3s box[2], float padding); + CGLM_INLINE mat4s glms_ortho_aabb_pz_rh_zo(vec3s box[2], float padding); + CGLM_INLINE mat4s glms_ortho_default_rh_zo(float aspect) + CGLM_INLINE mat4s glms_ortho_default_s_rh_zo(float aspect, float size) + */ + +#ifndef cglms_ortho_rh_zo_h +#define cglms_ortho_rh_zo_h + +#include "../../common.h" +#include "../../types-struct.h" +#include "../../plane.h" +#include "../../cam.h" +#include "../vec3.h" +#include "../../clipspace/ortho_rh_zo.h" + +/*! + * @brief set up orthographic projection matrix + * with a right-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] left viewport.left + * @param[in] right viewport.right + * @param[in] bottom viewport.bottom + * @param[in] top viewport.top + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping plane + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_rh_zo(float left, float right, + float bottom, float top, + float nearZ, float farZ) { + mat4s dest; + glm_ortho_rh_zo(left, right, bottom, top, nearZ, farZ, dest.raw); + return dest; +} + +/*! + * @brief set up orthographic projection matrix using bounding box + * with a right-hand coordinate system and a + * clip-space of [0, 1]. + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_aabb_rh_zo(vec3s box[2]) { + mat4s dest; + vec3 rawBox[2]; + + glms_vec3_(unpack)(rawBox, box, 2); + glm_ortho_aabb_rh_zo(rawBox, dest.raw); + + return dest; +} + +/*! 
+ * @brief set up orthographic projection matrix using bounding box + * with a right-hand coordinate system and a + * clip-space of [0, 1]. + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @param[in] padding padding + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_aabb_p_rh_zo(vec3s box[2], float padding) { + mat4s dest; + vec3 rawBox[2]; + + glms_vec3_(unpack)(rawBox, box, 2); + glm_ortho_aabb_p_rh_zo(rawBox, padding, dest.raw); + + return dest; +} + +/*! + * @brief set up orthographic projection matrix using bounding box + * with a right-hand coordinate system and a + * clip-space of [0, 1]. + * + * bounding box (AABB) must be in view space + * + * @param[in] box AABB + * @param[in] padding padding for near and far + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_aabb_pz_rh_zo(vec3s box[2], float padding) { + mat4s dest; + vec3 rawBox[2]; + + glms_vec3_(unpack)(rawBox, box, 2); + glm_ortho_aabb_pz_rh_zo(rawBox, padding, dest.raw); + + return dest; +} + +/*! + * @brief set up unit orthographic projection matrix + * with a right-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] aspect aspect ration ( width / height ) + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_default_rh_zo(float aspect) { + mat4s dest; + glm_ortho_default_rh_zo(aspect, dest.raw); + return dest; +} + +/*! + * @brief set up orthographic projection matrix with given CUBE size + * with a right-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] aspect aspect ratio ( width / height ) + * @param[in] size cube size + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_ortho_default_s_rh_zo(float aspect, float size) { + mat4s dest; + glm_ortho_default_s_rh_zo(aspect, size, dest.raw); + return dest; +} + +#endif /* cglms_ortho_rh_zo_h */ diff --git a/external/cglm/struct/clipspace/persp_lh_no.h b/external/cglm/struct/clipspace/persp_lh_no.h new file mode 100644 index 0000000..bc35ca0 --- /dev/null +++ b/external/cglm/struct/clipspace/persp_lh_no.h @@ -0,0 +1,312 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE mat4s glms_frustum_lh_no(float left, float right, + float bottom, float top, + float nearZ, float farZ) + CGLM_INLINE mat4s glms_perspective_lh_no(float fovy, + float aspect, + float nearZ, + float farZ) + CGLM_INLINE void glms_persp_move_far_lh_no(mat4s proj, float deltaFar) + CGLM_INLINE mat4s glms_perspective_default_lh_no(float aspect) + CGLM_INLINE void glms_perspective_resize_lh_no(mat4s proj, float aspect) + CGLM_INLINE void glms_persp_decomp_lh_no(mat4s proj, + float *nearv, float *farv, + float *top, float *bottom, + float *left, float *right) + CGLM_INLINE void glms_persp_decompv_lh_no(mat4s proj, float dest[6]) + CGLM_INLINE void glms_persp_decomp_x_lh_no(mat4s proj, float *left, float *right) + CGLM_INLINE void glms_persp_decomp_y_lh_no(mat4s proj, float *top, float *bottom) + CGLM_INLINE void glms_persp_decomp_z_lh_no(mat4s proj, float *nearv, float *farv) + CGLM_INLINE void glms_persp_decomp_far_lh_no(mat4s proj, float *farZ) + CGLM_INLINE void glms_persp_decomp_near_lh_no(mat4s proj, float *nearZ) + CGLM_INLINE float glms_persp_fovy_lh_no(mat4s proj) + CGLM_INLINE float glms_persp_aspect_lh_no(mat4s proj) + CGLM_INLINE vec4s glms_persp_sizes_lh_no(mat4s proj, float fovy) + */ + +#ifndef cglms_persp_lh_no_h +#define cglms_persp_lh_no_h + +#include "../../common.h" +#include "../../types-struct.h" +#include "../../plane.h" +#include "../../cam.h" +#include "../../clipspace/persp_lh_no.h" + +/*! + * @brief set up perspective peprojection matrix + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] left viewport.left + * @param[in] right viewport.right + * @param[in] bottom viewport.bottom + * @param[in] top viewport.top + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping plane + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_frustum_lh_no(float left, float right, + float bottom, float top, + float nearZ, float farZ) { + mat4s dest; + glm_frustum_lh_no(left, right, bottom, top, nearZ, farZ, dest.raw); + return dest; +} + +/*! + * @brief set up perspective projection matrix + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] fovy field of view angle + * @param[in] aspect aspect ratio ( width / height ) + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping planes + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_perspective_lh_no(float fovy, float aspect, float nearZ, float farZ) { + mat4s dest; + glm_perspective_lh_no(fovy, aspect, nearZ, farZ, dest.raw); + return dest; +} + +/*! + * @brief extend perspective projection matrix's far distance + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * NOTE: if you dodn't want to create new matrix then use array api on struct.raw + * like glms_persp_move_far_lh_no(prooj.raw, deltaFar) to avoid create new mat4 + * each time + * + * this function does not guarantee far >= near, be aware of that! + * + * @param[in, out] proj projection matrix to extend + * @param[in] deltaFar distance from existing far (negative to shink) + */ +CGLM_INLINE +mat4s +glms_persp_move_far_lh_no(mat4s proj, float deltaFar) { + mat4s dest; + dest = proj; + glm_persp_move_far_lh_no(dest.raw, deltaFar); + return dest; +} + +/*! 
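+ * Editor's note (not part of cglm): a usage sketch for the perspective
+ * constructor above; the values are illustrative and glm_rad() is cglm's
+ * degrees-to-radians helper.
+ *
+ *   mat4s proj = glms_perspective_lh_no(glm_rad(45.0f),    /* fovy      */
+ *                                       1280.0f / 720.0f,  /* aspect    */
+ *                                       0.1f, 100.0f);     /* near, far */
+ */
+
+/*!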
+ * @brief set up perspective projection matrix with default near/far + * and angle values with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] aspect aspect ratio ( width / height ) + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_perspective_default_lh_no(float aspect) { + mat4s dest; + glm_perspective_default_lh_no(aspect, dest.raw); + return dest; +} + +/*! + * @brief resize perspective matrix by aspect ratio ( width / height ) + * this makes very easy to resize proj matrix when window /viewport + * reized with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * NOTE: if you dodn't want to create new matrix then use array api on struct.raw + * like glm_perspective_resize_lh_no(proj.raw, aspect) to avoid create new mat4 + * each time + * + * @param[in, out] proj perspective projection matrix + * @param[in] aspect aspect ratio ( width / height ) + */ +CGLM_INLINE +mat4s +glms_perspective_resize_lh_no(mat4s proj, float aspect) { + mat4s dest; + dest = proj; + glm_perspective_resize_lh_no(aspect, dest.raw); + return dest; +} + +/*! + * @brief decomposes frustum values of perspective projection. + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + * @param[out] farZ far + * @param[out] top top + * @param[out] bottom bottom + * @param[out] left left + * @param[out] right right + */ +CGLM_INLINE +void +glms_persp_decomp_lh_no(mat4s proj, + float * __restrict nearZ, float * __restrict farZ, + float * __restrict top, float * __restrict bottom, + float * __restrict left, float * __restrict right) { + glm_persp_decomp_lh_no(proj.raw, nearZ, farZ, top, bottom, left, right); +} + +/*! + * @brief decomposes frustum values of perspective projection. + * this makes easy to get all values at once + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] dest array + */ +CGLM_INLINE +void +glms_persp_decompv_lh_no(mat4s proj, float dest[6]) { + glm_persp_decompv_lh_no(proj.raw, dest); +} + +/*! + * @brief decomposes left and right values of perspective projection + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * x stands for x axis (left / right axis) + * + * @param[in] proj perspective projection matrix + * @param[out] left left + * @param[out] right right + */ +CGLM_INLINE +void +glms_persp_decomp_x_lh_no(mat4s proj, + float * __restrict left, + float * __restrict right) { + glm_persp_decomp_x_lh_no(proj.raw, left, right); +} + +/*! + * @brief decomposes top and bottom values of perspective projection + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * y stands for y axis (top / bottom axis) + * + * @param[in] proj perspective projection matrix + * @param[out] top top + * @param[out] bottom bottom + */ +CGLM_INLINE +void +glms_persp_decomp_y_lh_no(mat4s proj, + float * __restrict top, + float * __restrict bottom) { + glm_persp_decomp_y_lh_no(proj.raw, top, bottom); +} + +/*! + * @brief decomposes near and far values of perspective projection + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. 
+ * z stands for z axis (near / far axis) + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + * @param[out] farZ far + */ +CGLM_INLINE +void +glms_persp_decomp_z_lh_no(mat4s proj, + float * __restrict nearZ, + float * __restrict farZ) { + glm_persp_decomp_z_lh_no(proj.raw, nearZ, farZ); +} + +/*! + * @brief decomposes far value of perspective projection + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] farZ far + */ +CGLM_INLINE +void +glms_persp_decomp_far_lh_no(mat4s proj, float * __restrict farZ) { + glm_persp_decomp_far_lh_no(proj.raw, farZ); +} + +/*! + * @brief decomposes near value of perspective projection + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + */ +CGLM_INLINE +void +glms_persp_decomp_near_lh_no(mat4s proj, float * __restrict nearZ) { + glm_persp_decomp_near_lh_no(proj.raw, nearZ); +} + +/*! + * @brief returns field of view angle along the Y-axis (in radians) + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * if you need to degrees, use glm_deg to convert it or use this: + * fovy_deg = glm_deg(glm_persp_fovy(projMatrix)) + * + * @param[in] proj perspective projection matrix + */ +CGLM_INLINE +float +glms_persp_fovy_lh_no(mat4s proj) { + return glm_persp_fovy_lh_no(proj.raw); +} + +/*! + * @brief returns aspect ratio of perspective projection + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] proj perspective projection matrix + */ +CGLM_INLINE +float +glms_persp_aspect_lh_no(mat4s proj) { + return glm_persp_aspect_lh_no(proj.raw); +} + +/*! + * @brief returns sizes of near and far planes of perspective projection + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] proj perspective projection matrix + * @param[in] fovy fovy (see brief) + * @returns sizes as vector, sizes order: [Wnear, Hnear, Wfar, Hfar] + */ +CGLM_INLINE +vec4s +glms_persp_sizes_lh_no(mat4s proj, float fovy) { + vec4s dest; + glm_persp_sizes_lh_no(proj.raw, fovy, dest.raw); + return dest; +} + +#endif /* cglms_persp_lh_no_h */ diff --git a/external/cglm/struct/clipspace/persp_lh_zo.h b/external/cglm/struct/clipspace/persp_lh_zo.h new file mode 100644 index 0000000..29af065 --- /dev/null +++ b/external/cglm/struct/clipspace/persp_lh_zo.h @@ -0,0 +1,312 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE mat4s glms_frustum_lh_zo(float left, float right, + float bottom, float top, + float nearZ, float farZ) + CGLM_INLINE mat4s glms_perspective_lh_zo(float fovy, + float aspect, + float nearZ, + float farZ) + CGLM_INLINE void glms_persp_move_far_lh_zo(mat4s proj, float deltaFar) + CGLM_INLINE mat4s glms_perspective_default_lh_zo(float aspect) + CGLM_INLINE void glms_perspective_resize_lh_zo(mat4s proj, float aspect) + CGLM_INLINE void glms_persp_decomp_lh_zo(mat4s proj, + float *nearv, float *farv, + float *top, float *bottom, + float *left, float *right) + CGLM_INLINE void glms_persp_decompv_lh_zo(mat4s proj, float dest[6]) + CGLM_INLINE void glms_persp_decomp_x_lh_zo(mat4s proj, float *left, float *right) + CGLM_INLINE void glms_persp_decomp_y_lh_zo(mat4s proj, float *top, float *bottom) + CGLM_INLINE void glms_persp_decomp_z_lh_zo(mat4s proj, float *nearv, float *farv) + CGLM_INLINE void glms_persp_decomp_far_lh_zo(mat4s proj, float *farZ) + CGLM_INLINE void glms_persp_decomp_near_lh_zo(mat4s proj, float *nearZ) + CGLM_INLINE float glms_persp_fovy_lh_zo(mat4s proj) + CGLM_INLINE float glms_persp_aspect_lh_zo(mat4s proj) + CGLM_INLINE vec4s glms_persp_sizes_lh_zo(mat4s proj, float fovy) + */ + +#ifndef cglms_persp_lh_zo_h +#define cglms_persp_lh_zo_h + +#include "../../common.h" +#include "../../types-struct.h" +#include "../../plane.h" +#include "../../cam.h" +#include "../../clipspace/persp_lh_zo.h" + +/*! + * @brief set up perspective peprojection matrix + * with a left-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] left viewport.left + * @param[in] right viewport.right + * @param[in] bottom viewport.bottom + * @param[in] top viewport.top + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping plane + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_frustum_lh_zo(float left, float right, + float bottom, float top, + float nearZ, float farZ) { + mat4s dest; + glm_frustum_lh_zo(left, right, bottom, top, nearZ, farZ, dest.raw); + return dest; +} + +/*! + * @brief set up perspective projection matrix + * with a left-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] fovy field of view angle + * @param[in] aspect aspect ratio ( width / height ) + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping planes + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_perspective_lh_zo(float fovy, float aspect, float nearZ, float farZ) { + mat4s dest; + glm_perspective_lh_zo(fovy, aspect, nearZ, farZ, dest.raw); + return dest; +} + +/*! + * @brief extend perspective projection matrix's far distance + * with a left-hand coordinate system and a + * clip-space of [0, 1]. + * + * NOTE: if you dodn't want to create new matrix then use array api on struct.raw + * like glms_persp_move_far_lh_zo(prooj.raw, deltaFar) to avoid create new mat4 + * each time + * + * this function does not guarantee far >= near, be aware of that! + * + * @param[in, out] proj projection matrix to extend + * @param[in] deltaFar distance from existing far (negative to shink) + */ +CGLM_INLINE +mat4s +glms_persp_move_far_lh_zo(mat4s proj, float deltaFar) { + mat4s dest; + dest = proj; + glm_persp_move_far_lh_zo(dest.raw, deltaFar); + return dest; +} + +/*! 
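+ * Editor's note (not part of cglm): unlike the array API, the struct wrapper
+ * above returns the adjusted matrix rather than mutating its argument, e.g.
+ *
+ *   proj = glms_persp_move_far_lh_zo(proj, 250.0f);  /* push the far plane out */
+ */
+
+/*!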
+ * @brief set up perspective projection matrix with default near/far + * and angle values with a left-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] aspect aspect ratio ( width / height ) + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_perspective_default_lh_zo(float aspect) { + mat4s dest; + glm_perspective_default_lh_zo(aspect, dest.raw); + return dest; +} + +/*! + * @brief resize perspective matrix by aspect ratio ( width / height ) + * this makes very easy to resize proj matrix when window /viewport + * reized with a left-hand coordinate system and a + * clip-space of [0, 1]. + * + * NOTE: if you dodn't want to create new matrix then use array api on struct.raw + * like glms_perspective_resize_lh_zo(proj.raw, aspect) to avoid create new mat4 + * each time + * + * @param[in, out] proj perspective projection matrix + * @param[in] aspect aspect ratio ( width / height ) + */ +CGLM_INLINE +mat4s +glms_perspective_resize_lh_zo(mat4s proj, float aspect) { + mat4s dest; + dest = proj; + glm_perspective_resize_lh_zo(aspect, dest.raw); + return dest; +} + +/*! + * @brief decomposes frustum values of perspective projection. + * with a left-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + * @param[out] farZ far + * @param[out] top top + * @param[out] bottom bottom + * @param[out] left left + * @param[out] right right + */ +CGLM_INLINE +void +glms_persp_decomp_lh_zo(mat4s proj, + float * __restrict nearZ, float * __restrict farZ, + float * __restrict top, float * __restrict bottom, + float * __restrict left, float * __restrict right) { + glm_persp_decomp_lh_zo(proj.raw, nearZ, farZ, top, bottom, left, right); +} + +/*! + * @brief decomposes frustum values of perspective projection. + * this makes easy to get all values at once + * with a left-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] dest array + */ +CGLM_INLINE +void +glms_persp_decompv_lh_zo(mat4s proj, float dest[6]) { + glm_persp_decompv_lh_zo(proj.raw, dest); +} + +/*! + * @brief decomposes left and right values of perspective projection + * with a left-hand coordinate system and a + * clip-space of [0, 1]. + * x stands for x axis (left / right axis) + * + * @param[in] proj perspective projection matrix + * @param[out] left left + * @param[out] right right + */ +CGLM_INLINE +void +glms_persp_decomp_x_lh_zo(mat4s proj, + float * __restrict left, + float * __restrict right) { + glm_persp_decomp_x_lh_zo(proj.raw, left, right); +} + +/*! + * @brief decomposes top and bottom values of perspective projection + * with a left-hand coordinate system and a + * clip-space of [0, 1]. + * y stands for y axis (top / bottom axis) + * + * @param[in] proj perspective projection matrix + * @param[out] top top + * @param[out] bottom bottom + */ +CGLM_INLINE +void +glms_persp_decomp_y_lh_zo(mat4s proj, + float * __restrict top, + float * __restrict bottom) { + glm_persp_decomp_y_lh_zo(proj.raw, top, bottom); +} + +/*! + * @brief decomposes near and far values of perspective projection + * with a left-hand coordinate system and a + * clip-space of [0, 1]. 
+ * z stands for z axis (near / far axis) + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + * @param[out] farZ far + */ +CGLM_INLINE +void +glms_persp_decomp_z_lh_zo(mat4s proj, + float * __restrict nearZ, + float * __restrict farZ) { + glm_persp_decomp_z_lh_zo(proj.raw, nearZ, farZ); +} + +/*! + * @brief decomposes far value of perspective projection + * with a left-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] farZ far + */ +CGLM_INLINE +void +glms_persp_decomp_far_lh_zo(mat4s proj, float * __restrict farZ) { + glm_persp_decomp_far_lh_zo(proj.raw, farZ); +} + +/*! + * @brief decomposes near value of perspective projection + * with a left-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + */ +CGLM_INLINE +void +glms_persp_decomp_near_lh_zo(mat4s proj, float * __restrict nearZ) { + glm_persp_decomp_near_lh_zo(proj.raw, nearZ); +} + +/*! + * @brief returns field of view angle along the Y-axis (in radians) + * with a left-hand coordinate system and a + * clip-space of [0, 1]. + * + * if you need to degrees, use glm_deg to convert it or use this: + * fovy_deg = glm_deg(glm_persp_fovy(projMatrix)) + * + * @param[in] proj perspective projection matrix + */ +CGLM_INLINE +float +glms_persp_fovy_lh_zo(mat4s proj) { + return glm_persp_fovy_lh_zo(proj.raw); +} + +/*! + * @brief returns aspect ratio of perspective projection + * with a left-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] proj perspective projection matrix + */ +CGLM_INLINE +float +glms_persp_aspect_lh_zo(mat4s proj) { + return glm_persp_aspect_lh_zo(proj.raw); +} + +/*! + * @brief returns sizes of near and far planes of perspective projection + * with a left-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] proj perspective projection matrix + * @param[in] fovy fovy (see brief) + * @returns sizes as vector, sizes order: [Wnear, Hnear, Wfar, Hfar] + */ +CGLM_INLINE +vec4s +glms_persp_sizes_lh_zo(mat4s proj, float fovy) { + vec4s dest; + glm_persp_sizes_lh_zo(proj.raw, fovy, dest.raw); + return dest; +} + +#endif /* cglms_persp_lh_zo_h */ diff --git a/external/cglm/struct/clipspace/persp_rh_no.h b/external/cglm/struct/clipspace/persp_rh_no.h new file mode 100644 index 0000000..7120fdf --- /dev/null +++ b/external/cglm/struct/clipspace/persp_rh_no.h @@ -0,0 +1,312 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE mat4s glms_frustum_rh_no(float left, float right, + float bottom, float top, + float nearZ, float farZ) + CGLM_INLINE mat4s glms_perspective_rh_no(float fovy, + float aspect, + float nearZ, + float farZ) + CGLM_INLINE void glms_persp_move_far_rh_no(mat4s proj, float deltaFar) + CGLM_INLINE mat4s glms_perspective_default_rh_no(float aspect) + CGLM_INLINE void glms_perspective_resize_rh_no(mat4s proj, float aspect) + CGLM_INLINE void glms_persp_decomp_rh_no(mat4s proj, + float *nearv, float *farv, + float *top, float *bottom, + float *left, float *right) + CGLM_INLINE void glms_persp_decompv_rh_no(mat4s proj, float dest[6]) + CGLM_INLINE void glms_persp_decomp_x_rh_no(mat4s proj, float *left, float *right) + CGLM_INLINE void glms_persp_decomp_y_rh_no(mat4s proj, float *top, float *bottom) + CGLM_INLINE void glms_persp_decomp_z_rh_no(mat4s proj, float *nearv, float *farv) + CGLM_INLINE void glms_persp_decomp_far_rh_no(mat4s proj, float *farZ) + CGLM_INLINE void glms_persp_decomp_near_rh_no(mat4s proj, float *nearZ) + CGLM_INLINE float glms_persp_fovy_rh_no(mat4s proj) + CGLM_INLINE float glms_persp_aspect_rh_no(mat4s proj) + CGLM_INLINE vec4s glms_persp_sizes_rh_no(mat4s proj, float fovy) + */ + +#ifndef cglms_persp_rh_no_h +#define cglms_persp_rh_no_h + +#include "../../common.h" +#include "../../types-struct.h" +#include "../../plane.h" +#include "../../cam.h" +#include "../../clipspace/persp_rh_no.h" + +/*! + * @brief set up perspective peprojection matrix + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] left viewport.left + * @param[in] right viewport.right + * @param[in] bottom viewport.bottom + * @param[in] top viewport.top + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping plane + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_frustum_rh_no(float left, float right, + float bottom, float top, + float nearZ, float farZ) { + mat4s dest; + glm_frustum_rh_no(left, right, bottom, top, nearZ, farZ, dest.raw); + return dest; +} + +/*! + * @brief set up perspective projection matrix + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] fovy field of view angle + * @param[in] aspect aspect ratio ( width / height ) + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping planes + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_perspective_rh_no(float fovy, float aspect, float nearZ, float farZ) { + mat4s dest; + glm_perspective_rh_no(fovy, aspect, nearZ, farZ, dest.raw); + return dest; +} + +/*! + * @brief extend perspective projection matrix's far distance + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * NOTE: if you dodn't want to create new matrix then use array api on struct.raw + * like glms_persp_move_far_rh_no(prooj.raw, deltaFar) to avoid create new mat4 + * each time + * s + * this function does not guarantee far >= near, be aware of that! + * + * @param[in, out] proj projection matrix to extend + * @param[in] deltaFar distance from existing far (negative to shink) + */ +CGLM_INLINE +mat4s +glms_persp_move_far_rh_no(mat4s proj, float deltaFar) { + mat4s dest; + dest = proj; + glm_persp_move_far_rh_no(dest.raw, deltaFar); + return dest; +} + +/*! 
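+ * Editor's note (not part of cglm): the decomposition helpers declared
+ * further below recover frustum values from a matrix built with the matching
+ * _rh_no constructor, e.g.
+ *
+ *   float nearZ, farZ;
+ *   glms_persp_decomp_z_rh_no(proj, &nearZ, &farZ);
+ */
+
+/*!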
+ * @brief set up perspective projection matrix with default near/far + * and angle values with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] aspect aspect ratio ( width / height ) + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_perspective_default_rh_no(float aspect) { + mat4s dest; + glm_perspective_default_rh_no(aspect, dest.raw); + return dest; +} + +/*! + * @brief resize perspective matrix by aspect ratio ( width / height ) + * this makes very easy to resize proj matrix when window /viewport + * reized with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * NOTE: if you dodn't want to create new matrix then use array api on struct.raw + * like glm_perspective_resize_rh_no(proj.raw, aspect) to avoid create new mat4 + * each time + * + * @param[in, out] proj perspective projection matrix + * @param[in] aspect aspect ratio ( width / height ) + */ +CGLM_INLINE +mat4s +glms_perspective_resize_rh_no(mat4s proj, float aspect) { + mat4s dest; + dest = proj; + glm_perspective_resize_rh_no(aspect, dest.raw); + return dest; +} + +/*! + * @brief decomposes frustum values of perspective projection. + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + * @param[out] farZ far + * @param[out] top top + * @param[out] bottom bottom + * @param[out] left left + * @param[out] right right + */ +CGLM_INLINE +void +glms_persp_decomp_rh_no(mat4s proj, + float * __restrict nearZ, float * __restrict farZ, + float * __restrict top, float * __restrict bottom, + float * __restrict left, float * __restrict right) { + glm_persp_decomp_rh_no(proj.raw, nearZ, farZ, top, bottom, left, right); +} + +/*! + * @brief decomposes frustum values of perspective projection. + * this makes easy to get all values at once + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] dest array + */ +CGLM_INLINE +void +glms_persp_decompv_rh_no(mat4s proj, float dest[6]) { + glm_persp_decompv_rh_no(proj.raw, dest); +} + +/*! + * @brief decomposes left and right values of perspective projection + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * x stands for x axis (left / right axis) + * + * @param[in] proj perspective projection matrix + * @param[out] left left + * @param[out] right right + */ +CGLM_INLINE +void +glms_persp_decomp_x_rh_no(mat4s proj, + float * __restrict left, + float * __restrict right) { + glm_persp_decomp_x_rh_no(proj.raw, left, right); +} + +/*! + * @brief decomposes top and bottom values of perspective projection + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * y stands for y axis (top / bottom axis) + * + * @param[in] proj perspective projection matrix + * @param[out] top top + * @param[out] bottom bottom + */ +CGLM_INLINE +void +glms_persp_decomp_y_rh_no(mat4s proj, + float * __restrict top, + float * __restrict bottom) { + glm_persp_decomp_y_rh_no(proj.raw, top, bottom); +} + +/*! + * @brief decomposes near and far values of perspective projection + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. 
+ * z stands for z axis (near / far axis) + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + * @param[out] farZ far + */ +CGLM_INLINE +void +glms_persp_decomp_z_rh_no(mat4s proj, + float * __restrict nearZ, + float * __restrict farZ) { + glm_persp_decomp_z_rh_no(proj.raw, nearZ, farZ); +} + +/*! + * @brief decomposes far value of perspective projection + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] farZ far + */ +CGLM_INLINE +void +glms_persp_decomp_far_rh_no(mat4s proj, float * __restrict farZ) { + glm_persp_decomp_far_rh_no(proj.raw, farZ); +} + +/*! + * @brief decomposes near value of perspective projection + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + */ +CGLM_INLINE +void +glms_persp_decomp_near_rh_no(mat4s proj, float * __restrict nearZ) { + glm_persp_decomp_near_rh_no(proj.raw, nearZ); +} + +/*! + * @brief returns field of view angle along the Y-axis (in radians) + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * if you need to degrees, use glm_deg to convert it or use this: + * fovy_deg = glm_deg(glm_persp_fovy(projMatrix)) + * + * @param[in] proj perspective projection matrix + */ +CGLM_INLINE +float +glms_persp_fovy_rh_no(mat4s proj) { + return glm_persp_fovy_rh_no(proj.raw); +} + +/*! + * @brief returns aspect ratio of perspective projection + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] proj perspective projection matrix + */ +CGLM_INLINE +float +glms_persp_aspect_rh_no(mat4s proj) { + return glm_persp_aspect_rh_no(proj.raw); +} + +/*! + * @brief returns sizes of near and far planes of perspective projection + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * @param[in] proj perspective projection matrix + * @param[in] fovy fovy (see brief) + * @returns sizes as vector, sizes order: [Wnear, Hnear, Wfar, Hfar] + */ +CGLM_INLINE +vec4s +glms_persp_sizes_rh_no(mat4s proj, float fovy) { + vec4s dest; + glm_persp_sizes_rh_no(proj.raw, fovy, dest.raw); + return dest; +} + +#endif /* cglms_persp_rh_no_h */ diff --git a/external/cglm/struct/clipspace/persp_rh_zo.h b/external/cglm/struct/clipspace/persp_rh_zo.h new file mode 100644 index 0000000..e3585a2 --- /dev/null +++ b/external/cglm/struct/clipspace/persp_rh_zo.h @@ -0,0 +1,312 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE mat4s glms_frustum_rh_zo(float left, float right, + float bottom, float top, + float nearZ, float farZ) + CGLM_INLINE mat4s glms_perspective_rh_zo(float fovy, + float aspect, + float nearZ, + float farZ) + CGLM_INLINE void glms_persp_move_far_rh_zo(mat4s proj, float deltaFar) + CGLM_INLINE mat4s glms_perspective_default_rh_zo(float aspect) + CGLM_INLINE void glms_perspective_resize_rh_zo(mat4s proj, float aspect) + CGLM_INLINE void glms_persp_decomp_rh_zo(mat4s proj, + float *nearv, float *farv, + float *top, float *bottom, + float *left, float *right) + CGLM_INLINE void glms_persp_decompv_rh_zo(mat4s proj, float dest[6]) + CGLM_INLINE void glms_persp_decomp_x_rh_zo(mat4s proj, float *left, float *right) + CGLM_INLINE void glms_persp_decomp_y_rh_zo(mat4s proj, float *top, float *bottom) + CGLM_INLINE void glms_persp_decomp_z_rh_zo(mat4s proj, float *nearv, float *farv) + CGLM_INLINE void glms_persp_decomp_far_rh_zo(mat4s proj, float *farZ) + CGLM_INLINE void glms_persp_decomp_near_rh_zo(mat4s proj, float *nearZ) + CGLM_INLINE float glms_persp_fovy_rh_zo(mat4s proj) + CGLM_INLINE float glms_persp_aspect_rh_zo(mat4s proj) + CGLM_INLINE vec4s glms_persp_sizes_rh_zo(mat4s proj, float fovy) + */ + +#ifndef cglms_persp_rh_zo_h +#define cglms_persp_rh_zo_h + +#include "../../common.h" +#include "../../types-struct.h" +#include "../../plane.h" +#include "../../cam.h" +#include "../../clipspace/persp_rh_zo.h" + +/*! + * @brief set up perspective peprojection matrix + * with a right-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] left viewport.left + * @param[in] right viewport.right + * @param[in] bottom viewport.bottom + * @param[in] top viewport.top + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping plane + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_frustum_rh_zo(float left, float right, + float bottom, float top, + float nearZ, float farZ) { + mat4s dest; + glm_frustum_rh_zo(left, right, bottom, top, nearZ, farZ, dest.raw); + return dest; +} + +/*! + * @brief set up perspective projection matrix + * with a right-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] fovy field of view angle + * @param[in] aspect aspect ratio ( width / height ) + * @param[in] nearZ near clipping plane + * @param[in] farZ far clipping planes + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_perspective_rh_zo(float fovy, float aspect, float nearZ, float farZ) { + mat4s dest; + glm_perspective_rh_zo(fovy, aspect, nearZ, farZ, dest.raw); + return dest; +} + +/*! + * @brief extend perspective projection matrix's far distance + * with a right-hand coordinate system and a + * clip-space of [0, 1]. + * + * NOTE: if you dodn't want to create new matrix then use array api on struct.raw + * like glms_persp_move_far_rh_zo(prooj.raw, deltaFar) to avoid create new mat4 + * each time + * + * this function does not guarantee far >= near, be aware of that! + * + * @param[in, out] proj projection matrix to extend + * @param[in] deltaFar distance from existing far (negative to shink) + */ +CGLM_INLINE +mat4s +glms_persp_move_far_rh_zo(mat4s proj, float deltaFar) { + mat4s dest; + dest = proj; + glm_persp_move_far_rh_zo(dest.raw, deltaFar); + return dest; +} + +/*! 
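+ * Editor's note (not part of cglm): on a window resize only the aspect ratio
+ * needs updating; the resize wrapper below returns the adjusted matrix
+ * (newAspect is an illustrative name for width / height as a float):
+ *
+ *   proj = glms_perspective_resize_rh_zo(proj, newAspect);
+ */
+
+/*!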
+ * @brief set up perspective projection matrix with default near/far + * and angle values with a right-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] aspect aspect ratio ( width / height ) + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_perspective_default_rh_zo(float aspect) { + mat4s dest; + glm_perspective_default_rh_zo(aspect, dest.raw); + return dest; +} + +/*! + * @brief resize perspective matrix by aspect ratio ( width / height ) + * this makes very easy to resize proj matrix when window /viewport + * reized with a right-hand coordinate system and a + * clip-space of [0, 1]. + * + * NOTE: if you dodn't want to create new matrix then use array api on struct.raw + * like glm_perspective_resize_rh_zo(proj.raw, aspect) to avoid create new mat4 + * each time + * + * @param[in, out] proj perspective projection matrix + * @param[in] aspect aspect ratio ( width / height ) + */ +CGLM_INLINE +mat4s +glms_perspective_resize_rh_zo(mat4s proj, float aspect) { + mat4s dest; + dest = proj; + glm_perspective_resize_rh_zo(aspect, dest.raw); + return dest; +} + +/*! + * @brief decomposes frustum values of perspective projection. + * with a right-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + * @param[out] farZ far + * @param[out] top top + * @param[out] bottom bottom + * @param[out] left left + * @param[out] right right + */ +CGLM_INLINE +void +glms_persp_decomp_rh_zo(mat4s proj, + float * __restrict nearZ, float * __restrict farZ, + float * __restrict top, float * __restrict bottom, + float * __restrict left, float * __restrict right) { + glm_persp_decomp_rh_zo(proj.raw, nearZ, farZ, top, bottom, left, right); +} + +/*! + * @brief decomposes frustum values of perspective projection. + * this makes easy to get all values at once + * with a right-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] dest array + */ +CGLM_INLINE +void +glms_persp_decompv_rh_zo(mat4s proj, float dest[6]) { + glm_persp_decompv_rh_zo(proj.raw, dest); +} + +/*! + * @brief decomposes left and right values of perspective projection + * with a right-hand coordinate system and a + * clip-space of [0, 1]. + * x stands for x axis (left / right axis) + * + * @param[in] proj perspective projection matrix + * @param[out] left left + * @param[out] right right + */ +CGLM_INLINE +void +glms_persp_decomp_x_rh_zo(mat4s proj, + float * __restrict left, + float * __restrict right) { + glm_persp_decomp_x_rh_zo(proj.raw, left, right); +} + +/*! + * @brief decomposes top and bottom values of perspective projection + * with a right-hand coordinate system and a + * clip-space of [0, 1]. + * y stands for y axis (top / bottom axis) + * + * @param[in] proj perspective projection matrix + * @param[out] top top + * @param[out] bottom bottom + */ +CGLM_INLINE +void +glms_persp_decomp_y_rh_zo(mat4s proj, + float * __restrict top, + float * __restrict bottom) { + glm_persp_decomp_y_rh_zo(proj.raw, top, bottom); +} + +/*! + * @brief decomposes near and far values of perspective projection + * with a right-hand coordinate system and a + * clip-space of [0, 1]. 
+ * z stands for z axis (near / far axis) + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + * @param[out] farZ far + */ +CGLM_INLINE +void +glms_persp_decomp_z_rh_zo(mat4s proj, + float * __restrict nearZ, + float * __restrict farZ) { + glm_persp_decomp_z_rh_zo(proj.raw, nearZ, farZ); +} + +/*! + * @brief decomposes far value of perspective projection + * with a right-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] farZ far + */ +CGLM_INLINE +void +glms_persp_decomp_far_rh_zo(mat4s proj, float * __restrict farZ) { + glm_persp_decomp_far_rh_zo(proj.raw, farZ); +} + +/*! + * @brief decomposes near value of perspective projection + * with a right-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] proj perspective projection matrix + * @param[out] nearZ near + */ +CGLM_INLINE +void +glms_persp_decomp_near_rh_zo(mat4s proj, float * __restrict nearZ) { + glm_persp_decomp_near_rh_zo(proj.raw, nearZ); +} + +/*! + * @brief returns field of view angle along the Y-axis (in radians) + * with a right-hand coordinate system and a + * clip-space of [0, 1]. + * + * if you need to degrees, use glm_deg to convert it or use this: + * fovy_deg = glm_deg(glm_persp_fovy(projMatrix)) + * + * @param[in] proj perspective projection matrix + */ +CGLM_INLINE +float +glms_persp_fovy_rh_zo(mat4s proj) { + return glm_persp_fovy_rh_zo(proj.raw); +} + +/*! + * @brief returns aspect ratio of perspective projection + * with a right-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] proj perspective projection matrix + */ +CGLM_INLINE +float +glms_persp_aspect_rh_zo(mat4s proj) { + return glm_persp_aspect_rh_zo(proj.raw); +} + +/*! + * @brief returns sizes of near and far planes of perspective projection + * with a right-hand coordinate system and a + * clip-space of [0, 1]. + * + * @param[in] proj perspective projection matrix + * @param[in] fovy fovy (see brief) + * @returns sizes as vector, sizes order: [Wnear, Hnear, Wfar, Hfar] + */ +CGLM_INLINE +vec4s +glms_persp_sizes_rh_zo(mat4s proj, float fovy) { + vec4s dest; + glm_persp_sizes_rh_zo(proj.raw, fovy, dest.raw); + return dest; +} + +#endif /* cglms_persp_rh_zo_h */ diff --git a/external/cglm/struct/clipspace/project_no.h b/external/cglm/struct/clipspace/project_no.h new file mode 100644 index 0000000..1a28d47 --- /dev/null +++ b/external/cglm/struct/clipspace/project_no.h @@ -0,0 +1,98 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE vec3s glms_unprojecti_no(vec3s pos, mat4s invMat, vec4s vp) + CGLM_INLINE vec3s glms_project_no(vec3s pos, mat4s m, vec4s vp) + CGLM_INLINE float glms_project_z_no(vec3s v, mat4s m) + */ + +#ifndef cglms_project_no_h +#define cglms_project_no_h + +#include "../../common.h" +#include "../../types-struct.h" +#include "../../plane.h" +#include "../../cam.h" +#include "../../clipspace/project_no.h" + +/*! + * @brief maps the specified viewport coordinates into specified space [1] + * the matrix should contain projection matrix. + * + * if you don't have ( and don't want to have ) an inverse matrix then use + * glm_unproject version. 
You may use existing inverse of matrix in somewhere + * else, this is why glm_unprojecti exists to save save inversion cost + * + * [1] space: + * 1- if m = invProj: View Space + * 2- if m = invViewProj: World Space + * 3- if m = invMVP: Object Space + * + * You probably want to map the coordinates into object space + * so use invMVP as m + * + * Computing viewProj: + * glm_mat4_mul(proj, view, viewProj); + * glm_mat4_mul(viewProj, model, MVP); + * glm_mat4_inv(viewProj, invMVP); + * + * @param[in] pos point/position in viewport coordinates + * @param[in] invMat matrix (see brief) + * @param[in] vp viewport as [x, y, width, height] + * + * @returns unprojected coordinates + */ +CGLM_INLINE +vec3s +glms_unprojecti_no(vec3s pos, mat4s invMat, vec4s vp) { + vec3s dest; + glm_unprojecti_no(pos.raw, invMat.raw, vp.raw, dest.raw); + return dest; +} + +/*! + * @brief map object coordinates to window coordinates + * + * Computing MVP: + * glm_mat4_mul(proj, view, viewProj); + * glm_mat4_mul(viewProj, model, MVP); + * + * @param[in] pos object coordinates + * @param[in] m MVP matrix + * @param[in] vp viewport as [x, y, width, height] + * + * @returns projected coordinates + */ +CGLM_INLINE +vec3s +glms_project_no(vec3s pos, mat4s m, vec4s vp) { + vec3s dest; + glm_project_no(pos.raw, m.raw, vp.raw, dest.raw); + return dest; +} + +/*! + * @brief map object's z coordinate to window coordinates + * + * Computing MVP: + * glm_mat4_mul(proj, view, viewProj); + * glm_mat4_mul(viewProj, model, MVP); + * + * @param[in] v object coordinates + * @param[in] m MVP matrix + * + * @returns projected z coordinate + */ +CGLM_INLINE +float +glms_project_z_no(vec3s v, mat4s m) { + return glm_project_z_no(v.raw, m.raw); +} + +#endif /* cglms_project_rh_no_h */ diff --git a/external/cglm/struct/clipspace/project_zo.h b/external/cglm/struct/clipspace/project_zo.h new file mode 100644 index 0000000..13065f1 --- /dev/null +++ b/external/cglm/struct/clipspace/project_zo.h @@ -0,0 +1,98 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE vec3s glms_unprojecti_no(vec3s pos, mat4s invMat, vec4s vp) + CGLM_INLINE vec3s glms_project_no(vec3s pos, mat4s m, vec4s vp) + CGLM_INLINE float glms_project_z_zo(vec3s v, mat4s m) + */ + +#ifndef cglms_project_zo_h +#define cglms_project_zo_h + +#include "../../common.h" +#include "../../types-struct.h" +#include "../../plane.h" +#include "../../cam.h" +#include "../../clipspace/project_zo.h" + +/*! + * @brief maps the specified viewport coordinates into specified space [1] + * the matrix should contain projection matrix. + * + * if you don't have ( and don't want to have ) an inverse matrix then use + * glm_unproject version. 
You may use existing inverse of matrix in somewhere + * else, this is why glm_unprojecti exists to save save inversion cost + * + * [1] space: + * 1- if m = invProj: View Space + * 2- if m = invViewProj: World Space + * 3- if m = invMVP: Object Space + * + * You probably want to map the coordinates into object space + * so use invMVP as m + * + * Computing viewProj: + * glm_mat4_mul(proj, view, viewProj); + * glm_mat4_mul(viewProj, model, MVP); + * glm_mat4_inv(viewProj, invMVP); + * + * @param[in] pos point/position in viewport coordinates + * @param[in] invMat matrix (see brief) + * @param[in] vp viewport as [x, y, width, height] + * + * @returns unprojected coordinates + */ +CGLM_INLINE +vec3s +glms_unprojecti_zo(vec3s pos, mat4s invMat, vec4s vp) { + vec3s dest; + glm_unprojecti_zo(pos.raw, invMat.raw, vp.raw, dest.raw); + return dest; +} + +/*! + * @brief map object coordinates to window coordinates + * + * Computing MVP: + * glm_mat4_mul(proj, view, viewProj); + * glm_mat4_mul(viewProj, model, MVP); + * + * @param[in] pos object coordinates + * @param[in] m MVP matrix + * @param[in] vp viewport as [x, y, width, height] + * + * @returns projected coordinates + */ +CGLM_INLINE +vec3s +glms_project_zo(vec3s pos, mat4s m, vec4s vp) { + vec3s dest; + glm_project_zo(pos.raw, m.raw, vp.raw, dest.raw); + return dest; +} + +/*! + * @brief map object's z coordinate to window coordinates + * + * Computing MVP: + * glm_mat4_mul(proj, view, viewProj); + * glm_mat4_mul(viewProj, model, MVP); + * + * @param[in] v object coordinates + * @param[in] m MVP matrix + * + * @returns projected z coordinate + */ +CGLM_INLINE +float +glms_project_z_zo(vec3s v, mat4s m) { + return glm_project_z_zo(v.raw, m.raw); +} + +#endif /* cglm_project_zo_h */ diff --git a/external/cglm/struct/clipspace/view_lh_no.h b/external/cglm/struct/clipspace/view_lh_no.h new file mode 100644 index 0000000..e4ca5ba --- /dev/null +++ b/external/cglm/struct/clipspace/view_lh_no.h @@ -0,0 +1,89 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE mat4s glms_lookat_lh_no(vec3s eye, vec3s center, vec3s up) + CGLM_INLINE mat4s glms_look_lh_no(vec3s eye, vec3s dir, vec3s up) + CGLM_INLINE mat4s glms_look_anyup_lh_no(vec3s eye, vec3s dir) + */ + +#ifndef cglms_view_lh_no_h +#define cglms_view_lh_no_h + +#include "../../common.h" +#include "../../types-struct.h" +#include "../../plane.h" +#include "../../cam.h" +#include "../../clipspace/view_lh_no.h" + +/*! + * @brief set up view matrix + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * NOTE: The UP vector must not be parallel to the line of sight from + * the eye point to the reference point + * + * @param[in] eye eye vector + * @param[in] center center vector + * @param[in] up up vector + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_lookat_lh_no(vec3s eye, vec3s center, vec3s up) { + mat4s dest; + glm_lookat_lh_no(eye.raw, center.raw, up.raw, dest.raw); + return dest; +} + +/*! + * @brief set up view matrix + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * convenient wrapper for lookat: if you only have direction not target self + * then this might be useful. Because you need to get target from direction. 
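+ *
+ * Editor's note (not part of cglm): with a normalized view direction the call
+ * is simply (variable names illustrative):
+ *   mat4s view = glms_look_lh_no(eye, dir, up);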
+ * + * NOTE: The UP vector must not be parallel to the line of sight from + * the eye point to the reference point + * + * @param[in] eye eye vector + * @param[in] dir direction vector + * @param[in] up up vector + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_look_lh_no(vec3s eye, vec3s dir, vec3s up) { + mat4s dest; + glm_look_lh_no(eye.raw, dir.raw, up.raw, dest.raw); + return dest; +} + +/*! + * @brief set up view matrix + * with a left-hand coordinate system and a + * clip-space of [-1, 1]. + * + * convenient wrapper for look: if you only have direction and if you don't + * care what UP vector is then this might be useful to create view matrix + * + * @param[in] eye eye vector + * @param[in] dir direction vector + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_look_anyup_lh_no(vec3s eye, vec3s dir) { + mat4s dest; + glm_look_anyup_lh_no(eye.raw, dir.raw, dest.raw); + return dest; +} + +#endif /* cglms_view_lh_no_h */ diff --git a/external/cglm/struct/clipspace/view_lh_zo.h b/external/cglm/struct/clipspace/view_lh_zo.h new file mode 100644 index 0000000..ac1ada9 --- /dev/null +++ b/external/cglm/struct/clipspace/view_lh_zo.h @@ -0,0 +1,89 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE mat4s glms_lookat_lh_zo(vec3s eye, vec3s center, vec3s up) + CGLM_INLINE mat4s glms_look_lh_zo(vec3s eye, vec3s dir, vec3s up) + CGLM_INLINE mat4s glms_look_anyup_lh_zo(vec3s eye, vec3s dir) + */ + +#ifndef cglms_view_lh_zo_h +#define cglms_view_lh_zo_h + +#include "../../common.h" +#include "../../types-struct.h" +#include "../../plane.h" +#include "../../cam.h" +#include "../../clipspace/view_lh_zo.h" + +/*! + * @brief set up view matrix + * with a left-hand coordinate system and a + * clip-space of [0, 1]. + * + * NOTE: The UP vector must not be parallel to the line of sight from + * the eye point to the reference point + * + * @param[in] eye eye vector + * @param[in] center center vector + * @param[in] up up vector + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_lookat_lh_zo(vec3s eye, vec3s center, vec3s up) { + mat4s dest; + glm_lookat_lh_zo(eye.raw, center.raw, up.raw, dest.raw); + return dest; +} + +/*! + * @brief set up view matrix + * with a left-hand coordinate system and a + * clip-space of [0, 1]. + * + * convenient wrapper for lookat: if you only have direction not target self + * then this might be useful. Because you need to get target from direction. + * + * NOTE: The UP vector must not be parallel to the line of sight from + * the eye point to the reference point + * + * @param[in] eye eye vector + * @param[in] dir direction vector + * @param[in] up up vector + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_look_lh_zo(vec3s eye, vec3s dir, vec3s up) { + mat4s dest; + glm_look_lh_zo(eye.raw, dir.raw, up.raw, dest.raw); + return dest; +} + +/*! + * @brief set up view matrix + * with a left-hand coordinate system and a + * clip-space of [0, 1]. 
+ * + * convenient wrapper for look: if you only have direction and if you don't + * care what UP vector is then this might be useful to create view matrix + * + * @param[in] eye eye vector + * @param[in] dir direction vector + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_look_anyup_lh_zo(vec3s eye, vec3s dir) { + mat4s dest; + glm_look_anyup_lh_zo(eye.raw, dir.raw, dest.raw); + return dest; +} + +#endif /* cglms_view_lh_zo_h */ diff --git a/external/cglm/struct/clipspace/view_rh_no.h b/external/cglm/struct/clipspace/view_rh_no.h new file mode 100644 index 0000000..99b03c3 --- /dev/null +++ b/external/cglm/struct/clipspace/view_rh_no.h @@ -0,0 +1,89 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE mat4s glms_lookat_rh_no(vec3s eye, vec3s center, vec3s up) + CGLM_INLINE mat4s glms_look_rh_no(vec3s eye, vec3s dir, vec3s up) + CGLM_INLINE mat4s glms_look_anyup_rh_no(vec3s eye, vec3s dir) + */ + +#ifndef cglms_view_rh_no_h +#define cglms_view_rh_no_h + +#include "../../common.h" +#include "../../types-struct.h" +#include "../../plane.h" +#include "../../cam.h" +#include "../../clipspace/view_rh_no.h" + +/*! + * @brief set up view matrix + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * NOTE: The UP vector must not be parallel to the line of sight from + * the eye point to the reference point + * + * @param[in] eye eye vector + * @param[in] center center vector + * @param[in] up up vector + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_lookat_rh_no(vec3s eye, vec3s center, vec3s up) { + mat4s dest; + glm_lookat_rh_no(eye.raw, center.raw, up.raw, dest.raw); + return dest; +} + +/*! + * @brief set up view matrix + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * convenient wrapper for lookat: if you only have direction not target self + * then this might be useful. Because you need to get target from direction. + * + * NOTE: The UP vector must not be parallel to the line of sight from + * the eye point to the reference point + * + * @param[in] eye eye vector + * @param[in] dir direction vector + * @param[in] up up vector + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_look_rh_no(vec3s eye, vec3s dir, vec3s up) { + mat4s dest; + glm_look_rh_no(eye.raw, dir.raw, up.raw, dest.raw); + return dest; +} + +/*! + * @brief set up view matrix + * with a right-hand coordinate system and a + * clip-space of [-1, 1]. + * + * convenient wrapper for look: if you only have direction and if you don't + * care what UP vector is then this might be useful to create view matrix + * + * @param[in] eye eye vector + * @param[in] dir direction vector + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_look_anyup_rh_no(vec3s eye, vec3s dir) { + mat4s dest; + glm_look_anyup_rh_no(eye.raw, dir.raw, dest.raw); + return dest; +} + +#endif /* cglms_view_rh_no_h */ diff --git a/external/cglm/struct/clipspace/view_rh_zo.h b/external/cglm/struct/clipspace/view_rh_zo.h new file mode 100644 index 0000000..14ffe32 --- /dev/null +++ b/external/cglm/struct/clipspace/view_rh_zo.h @@ -0,0 +1,89 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE mat4s glms_lookat_rh_zo(vec3s eye, vec3s center, vec3s up) + CGLM_INLINE mat4s glms_look_rh_zo(vec3s eye, vec3s dir, vec3s up) + CGLM_INLINE mat4s glms_look_anyup_rh_zo(vec3s eye, vec3s dir) + */ + +#ifndef cglms_view_rh_zo_h +#define cglms_view_rh_zo_h + +#include "../../common.h" +#include "../../types-struct.h" +#include "../../plane.h" +#include "../../cam.h" +#include "../../clipspace/view_rh_zo.h" + +/*! + * @brief set up view matrix + * with a right-hand coordinate system and a + * clip-space of [0, 1]. + * + * NOTE: The UP vector must not be parallel to the line of sight from + * the eye point to the reference point + * + * @param[in] eye eye vector + * @param[in] center center vector + * @param[in] up up vector + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_lookat_rh_zo(vec3s eye, vec3s center, vec3s up) { + mat4s dest; + glm_lookat_rh_zo(eye.raw, center.raw, up.raw, dest.raw); + return dest; +} + +/*! + * @brief set up view matrix + * with a right-hand coordinate system and a + * clip-space of [0, 1]. + * + * convenient wrapper for lookat: if you only have direction not target self + * then this might be useful. Because you need to get target from direction. + * + * NOTE: The UP vector must not be parallel to the line of sight from + * the eye point to the reference point + * + * @param[in] eye eye vector + * @param[in] dir direction vector + * @param[in] up up vector + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_look_rh_zo(vec3s eye, vec3s dir, vec3s up) { + mat4s dest; + glm_look_rh_zo(eye.raw, dir.raw, up.raw, dest.raw); + return dest; +} + +/*! + * @brief set up view matrix + * with a right-hand coordinate system and a + * clip-space of [0, 1]. + * + * convenient wrapper for look: if you only have direction and if you don't + * care what UP vector is then this might be useful to create view matrix + * + * @param[in] eye eye vector + * @param[in] dir direction vector + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_look_anyup_rh_zo(vec3s eye, vec3s dir) { + mat4s dest; + glm_look_anyup_rh_zo(eye.raw, dir.raw, dest.raw); + return dest; +} + +#endif /* cglms_view_rh_zo_h */ diff --git a/external/cglm/struct/color.h b/external/cglm/struct/color.h new file mode 100644 index 0000000..3ce78da --- /dev/null +++ b/external/cglm/struct/color.h @@ -0,0 +1,27 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglms_colors_h +#define cglms_colors_h + +#include "../common.h" +#include "../types-struct.h" +#include "../color.h" +#include "vec3.h" + +/*! + * @brief averages the color channels into one value + * + * @param[in] rgb RGB color + */ +CGLM_INLINE +float +glms_luminance(vec3s rgb) { + return glm_luminance(rgb.raw); +} + +#endif /* cglms_colors_h */ diff --git a/external/cglm/struct/curve.h b/external/cglm/struct/curve.h new file mode 100644 index 0000000..53ea359 --- /dev/null +++ b/external/cglm/struct/curve.h @@ -0,0 +1,40 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglms_curves_h +#define cglms_curves_h + +#include "../common.h" +#include "../types-struct.h" +#include "../curve.h" +#include "vec4.h" +#include "mat4.h" + +/*! 
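+ * Editor's note (not part of cglm): evaluating one component of a cubic
+ * Bezier segment with the struct API might look like the sketch below, where
+ * bezier is assumed to hold the Bezier basis matrix as a mat4s and
+ * p0, c0, c1, p1 are scalar control values:
+ *
+ *   float B = glms_smc(s, bezier, (vec4s){{p0, c0, c1, p1}});
+ */
+
+/*!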
+ * @brief helper function to calculate S*M*C multiplication for curves + * + * This function does not encourage you to use SMC, + * instead it is a helper if you use SMC. + * + * if you want to specify S as vector then use more generic glm_mat4_rmc() func. + * + * Example usage: + * B(s) = glm_smc(s, GLM_BEZIER_MAT, (vec4){p0, c0, c1, p1}) + * + * @param[in] s parameter between 0 and 1 (this will be [s3, s2, s, 1]) + * @param[in] m basis matrix + * @param[in] c position/control vector + * + * @return B(s) + */ +CGLM_INLINE +float +glms_smc(float s, mat4s m, vec4s c) { + return glm_smc(s, m.raw, c.raw); +} + +#endif /* cglms_curves_h */ diff --git a/external/cglm/struct/euler.h b/external/cglm/struct/euler.h new file mode 100644 index 0000000..19697f7 --- /dev/null +++ b/external/cglm/struct/euler.h @@ -0,0 +1,249 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + NOTE: + angles must be passed as [X-Angle, Y-Angle, Z-angle] order + For instance you don't pass angles as [Z-Angle, X-Angle, Y-angle] to + glm_euler_zxy function, All RELATED functions accept angles same order + which is [X, Y, Z]. + */ + +/* + Types: + enum glm_euler_seq + + Functions: + CGLM_INLINE vec3s glms_euler_angles(mat4s m) + CGLM_INLINE mat4s glms_euler_xyz(vec3s angles) + CGLM_INLINE mat4s glms_euler_xzy(vec3s angles) + CGLM_INLINE mat4s glms_euler_yxz(vec3s angles) + CGLM_INLINE mat4s glms_euler_yzx(vec3s angles) + CGLM_INLINE mat4s glms_euler_zxy(vec3s angles) + CGLM_INLINE mat4s glms_euler_zyx(vec3s angles) + CGLM_INLINE mat4s glms_euler_by_order(vec3s angles, glm_euler_seq ord) + CGLM_INLINE versors glms_euler_xyz_quat(vec3s angles) + CGLM_INLINE versors glms_euler_xzy_quat(vec3s angles) + CGLM_INLINE versors glms_euler_yxz_quat(vec3s angles) + CGLM_INLINE versors glms_euler_yzx_quat(vec3s angles) + CGLM_INLINE versors glms_euler_zxy_quat(vec3s angles) + CGLM_INLINE versors glms_euler_zyx_quat(vec3s angles) + */ + +#ifndef cglms_euler_h +#define cglms_euler_h + +#include "../common.h" +#include "../types-struct.h" +#include "../euler.h" + +/*! + * @brief extract euler angles (in radians) using xyz order + * + * @param[in] m affine transform + * @returns angles vector [x, y, z] + */ +CGLM_INLINE +vec3s +glms_euler_angles(mat4s m) { + vec3s dest; + glm_euler_angles(m.raw, dest.raw); + return dest; +} + +/*! + * @brief build rotation matrix from euler angles + * + * @param[in] angles angles as vector [Xangle, Yangle, Zangle] + * @returns rotation matrix + */ +CGLM_INLINE +mat4s +glms_euler_xyz(vec3s angles) { + mat4s dest; + glm_euler_xyz(angles.raw, dest.raw); + return dest; +} + +/*! + * @brief build rotation matrix from euler angles + * + * @param[in] angles angles as vector [Xangle, Yangle, Zangle] + * @returns rotation matrix + */ +CGLM_INLINE +mat4s +glms_euler_xzy(vec3s angles) { + mat4s dest; + glm_euler_xzy(angles.raw, dest.raw); + return dest; +} + + +/*! + * @brief build rotation matrix from euler angles + * + * @param[in] angles angles as vector [Xangle, Yangle, Zangle] + * @returns rotation matrix + */ +CGLM_INLINE +mat4s +glms_euler_yxz(vec3s angles) { + mat4s dest; + glm_euler_yxz(angles.raw, dest.raw); + return dest; +} + +/*! 
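A short sketch of the euler helpers above: build a rotation matrix from XYZ angles and read the angles back; the function name and angle values are illustrative.

#include <cglm/struct.h>

void
example_euler_roundtrip(void) {
  vec3s angles = {{0.1f, 0.5f, -0.2f}};   /* radians, [X, Y, Z] order */
  mat4s rot    = glms_euler_xyz(angles);  /* angles -> rotation matrix */
  vec3s back   = glms_euler_angles(rot);  /* rotation matrix -> angles */
  (void)back;
}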
+ * @brief build rotation matrix from euler angles + * + * @param[in] angles angles as vector [Xangle, Yangle, Zangle] + * @returns rotation matrix + */ +CGLM_INLINE +mat4s +glms_euler_yzx(vec3s angles) { + mat4s dest; + glm_euler_yzx(angles.raw, dest.raw); + return dest; +} + +/*! + * @brief build rotation matrix from euler angles + * + * @param[in] angles angles as vector [Xangle, Yangle, Zangle] + * @returns rotation matrix + */ +CGLM_INLINE +mat4s +glms_euler_zxy(vec3s angles) { + mat4s dest; + glm_euler_zxy(angles.raw, dest.raw); + return dest; +} + +/*! + * @brief build rotation matrix from euler angles + * + * @param[in] angles angles as vector [Xangle, Yangle, Zangle] + * @returns rotation matrix + */ +CGLM_INLINE +mat4s +glms_euler_zyx(vec3s angles) { + mat4s dest; + glm_euler_zyx(angles.raw, dest.raw); + return dest; +} + +/*! + * @brief build rotation matrix from euler angles + * + * @param[in] angles angles as vector [Xangle, Yangle, Zangle] + * @param[in] ord euler order + * @returns rotation matrix + */ +CGLM_INLINE +mat4s +glms_euler_by_order(vec3s angles, glm_euler_seq ord) { + mat4s dest; + glm_euler_by_order(angles.raw, ord, dest.raw); + return dest; +} + +/*! + * @brief creates NEW quaternion using rotation angles and does + * rotations in x y z order (roll pitch yaw) + * + * @param[in] angles angles x y z (radians) + * @returns quaternion + */ +CGLM_INLINE +versors +glms_euler_xyz_quat(vec3s angles) { + versors dest; + glm_euler_xyz_quat(angles.raw, dest.raw); + return dest; +} + +/*! + * @brief creates NEW quaternion using rotation angles and does + * rotations in x z y order (roll yaw pitch) + * + * @param[in] angles angles x y z (radians) + * @returns quaternion + */ +CGLM_INLINE +versors +glms_euler_xzy_quat(vec3s angles) { + versors dest; + glm_euler_xzy_quat(angles.raw, dest.raw); + return dest; +} + +/*! + * @brief creates NEW quaternion using rotation angles and does + * rotations in y x z order (pitch roll yaw) + * + * @param[in] angles angles x y z (radians) + * @returns quaternion + */ +CGLM_INLINE +versors +glms_euler_yxz_quat(vec3s angles) { + versors dest; + glm_euler_yxz_quat(angles.raw, dest.raw); + return dest; +} + +/*! + * @brief creates NEW quaternion using rotation angles and does + * rotations in y z x order (pitch yaw roll) + * + * @param[in] angles angles x y z (radians) + * @returns quaternion + */ +CGLM_INLINE +versors +glms_euler_yzx_quat(vec3s angles) { + versors dest; + glm_euler_yzx_quat(angles.raw, dest.raw); + return dest; +} + +/*! + * @brief creates NEW quaternion using rotation angles and does + * rotations in z x y order (yaw roll pitch) + * + * @param[in] angles angles x y z (radians) + * @returns quaternion + */ +CGLM_INLINE +versors +glms_euler_zxy_quat(vec3s angles) { + versors dest; + glm_euler_zxy_quat(angles.raw, dest.raw); + return dest; +} + +/*! + * @brief creates NEW quaternion using rotation angles and does + * rotations in z y x order (yaw pitch roll) + * + * @param[in] angles angles x y z (radians) + * @returns quaternion + */ +CGLM_INLINE +versors +glms_euler_zyx_quat(vec3s angles) { + versors dest; + glm_euler_zyx_quat(angles.raw, dest.raw); + return dest; +} + + +#endif /* cglms_euler_h */ diff --git a/external/cglm/struct/frustum.h b/external/cglm/struct/frustum.h new file mode 100644 index 0000000..81b5b7b --- /dev/null +++ b/external/cglm/struct/frustum.h @@ -0,0 +1,155 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglms_frustums_h +#define cglms_frustums_h + +#include "../common.h" +#include "../types-struct.h" +#include "../frustum.h" +#include "plane.h" +#include "vec3.h" +#include "vec4.h" +#include "mat4.h" + +/* you can override clip space coords + but you have to provide all with same name + e.g.: define GLM_CSCOORD_LBN {0.0f, 0.0f, 1.0f, 1.0f} */ +#ifndef GLM_CUSTOM_CLIPSPACE + +/* near */ +#define GLMS_CSCOORD_LBN {-1.0f, -1.0f, -1.0f, 1.0f} +#define GLMS_CSCOORD_LTN {-1.0f, 1.0f, -1.0f, 1.0f} +#define GLMS_CSCOORD_RTN { 1.0f, 1.0f, -1.0f, 1.0f} +#define GLMS_CSCOORD_RBN { 1.0f, -1.0f, -1.0f, 1.0f} + +/* far */ +#define GLMS_CSCOORD_LBF {-1.0f, -1.0f, 1.0f, 1.0f} +#define GLMS_CSCOORD_LTF {-1.0f, 1.0f, 1.0f, 1.0f} +#define GLMS_CSCOORD_RTF { 1.0f, 1.0f, 1.0f, 1.0f} +#define GLMS_CSCOORD_RBF { 1.0f, -1.0f, 1.0f, 1.0f} + +#endif + +/*! + * @brief extracts view frustum planes + * + * planes' space: + * 1- if m = proj: View Space + * 2- if m = viewProj: World Space + * 3- if m = MVP: Object Space + * + * You probably want to extract planes in world space so use viewProj as m + * Computing viewProj: + * glm_mat4_mul(proj, view, viewProj); + * + * Exracted planes order: [left, right, bottom, top, near, far] + * + * @param[in] m matrix (see brief) + * @param[out] dest extracted view frustum planes (see brief) + */ +CGLM_INLINE +void +glms_frustum_planes(mat4s m, vec4s dest[6]) { + vec4 rawDest[6]; + glm_frustum_planes(m.raw, rawDest); + glms_vec4_(pack)(dest, rawDest, 6); +} + +/*! + * @brief extracts view frustum corners using clip-space coordinates + * + * corners' space: + * 1- if m = invViewProj: World Space + * 2- if m = invMVP: Object Space + * + * You probably want to extract corners in world space so use invViewProj + * Computing invViewProj: + * glm_mat4_mul(proj, view, viewProj); + * ... + * glm_mat4_inv(viewProj, invViewProj); + * + * if you have a near coord at i index, you can get it's far coord by i + 4 + * + * Find center coordinates: + * for (j = 0; j < 4; j++) { + * glm_vec3_center(corners[i], corners[i + 4], centerCorners[i]); + * } + * + * @param[in] invMat matrix (see brief) + * @param[out] dest exracted view frustum corners (see brief) + */ +CGLM_INLINE +void +glms_frustum_corners(mat4s invMat, vec4s dest[8]) { + vec4 rawDest[8]; + glm_frustum_corners(invMat.raw, rawDest); + glms_vec4_(pack)(dest, rawDest, 8); +} + +/*! + * @brief finds center of view frustum + * + * @param[in] corners view frustum corners + * @returns view frustum center + */ +CGLM_INLINE +vec4s +glms_frustum_center(vec4s corners[8]) { + vec4 rawCorners[8]; + vec4s r; + + glms_vec4_(unpack)(rawCorners, corners, 8); + glm_frustum_center(rawCorners, r.raw); + return r; +} + +/*! + * @brief finds bounding box of frustum relative to given matrix e.g. view mat + * + * @param[in] corners view frustum corners + * @param[in] m matrix to convert existing conners + * @param[out] box bounding box as array [min, max] + */ +CGLM_INLINE +void +glms_frustum_box(vec4s corners[8], mat4s m, vec3s box[2]) { + vec4 rawCorners[8]; + vec3 rawBox[2]; + + glms_vec4_(unpack)(rawCorners, corners, 8); + glm_frustum_box(rawCorners, m.raw, rawBox); + glms_vec3_(pack)(box, rawBox, 2); +} + +/*! + * @brief finds planes corners which is between near and far planes (parallel) + * + * this will be helpful if you want to split a frustum e.g. CSM/PSSM. 
This will
+ * find planes' corners but you will need one more plane.
+ * Actually you have it, it is near, far or created previously with this func ;)
+ *
+ * @param[in]  corners      view frustum corners
+ * @param[in]  splitDist    split distance
+ * @param[in]  farDist      far distance (zFar)
+ * @param[out] planeCorners plane corners [LB, LT, RT, RB]
+ */
+CGLM_INLINE
+void
+glms_frustum_corners_at(vec4s corners[8],
+                        float splitDist,
+                        float farDist,
+                        vec4s planeCorners[4]) {
+  vec4 rawCorners[8];
+  vec4 rawPlaneCorners[4];
+
+  glms_vec4_(unpack)(rawCorners, corners, 8);
+  glm_frustum_corners_at(rawCorners, splitDist, farDist, rawPlaneCorners);
+  glms_vec4_(pack)(planeCorners, rawPlaneCorners, 4);
+}
+
+#endif /* cglms_frustums_h */
diff --git a/external/cglm/struct/handed/euler_to_quat_lh.h b/external/cglm/struct/handed/euler_to_quat_lh.h
new file mode 100644
index 0000000..3964e51
--- /dev/null
+++ b/external/cglm/struct/handed/euler_to_quat_lh.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c), Recep Aslantas.
+ *
+ * MIT License (MIT), http://opensource.org/licenses/MIT
+ * Full license can be found in the LICENSE file
+ */
+
+/*
+ Functions:
+   CGLM_INLINE void glms_euler_xyz_quat_lh(vec3 angles, versor dest);
+   CGLM_INLINE void glms_euler_xzy_quat_lh(vec3 angles, versor dest);
+   CGLM_INLINE void glms_euler_yxz_quat_lh(vec3 angles, versor dest);
+   CGLM_INLINE void glms_euler_yzx_quat_lh(vec3 angles, versor dest);
+   CGLM_INLINE void glms_euler_zxy_quat_lh(vec3 angles, versor dest);
+   CGLM_INLINE void glms_euler_zyx_quat_lh(vec3 angles, versor dest);
+ */
+
+#ifndef cglms_euler_to_quat_lh_h
+#define cglms_euler_to_quat_lh_h
+
+#include "../common.h"
+
+
+/*!
+ * @brief creates NEW quaternion using rotation angles and does
+ *        rotations in x y z order in left hand (roll pitch yaw)
+ *
+ * @param[in]  angles angles x y z (radians)
+ * @param[out] dest   quaternion
+ */
+CGLM_INLINE
+versors
+glms_euler_xyz_quat_lh(vec3s angles) {
+  versors dest;
+  glm_euler_xyz_quat_lh(angles.raw, dest.raw);
+  return dest;
+}
+
+/*!
+ * @brief creates NEW quaternion using rotation angles and does
+ *        rotations in x z y order in left hand (roll yaw pitch)
+ *
+ * @param[in]  angles angles x y z (radians)
+ * @param[out] dest   quaternion
+ */
+CGLM_INLINE
+versors
+glms_euler_xzy_quat_lh(vec3s angles) {
+  versors dest;
+  glm_euler_xzy_quat_lh(angles.raw, dest.raw);
+  return dest;
+}
+
+/*!
+ * @brief creates NEW quaternion using rotation angles and does
+ *        rotations in y x z order in left hand (pitch roll yaw)
+ *
+ * @param[in]  angles angles x y z (radians)
+ * @param[out] dest   quaternion
+ */
+CGLM_INLINE
+versors
+glms_euler_yxz_quat_lh(vec3s angles) {
+  versors dest;
+  glm_euler_yxz_quat_lh(angles.raw, dest.raw);
+  return dest;
+}
+
+/*!
+ * @brief creates NEW quaternion using rotation angles and does
+ *        rotations in y z x order in left hand (pitch yaw roll)
+ *
+ * @param[in]  angles angles x y z (radians)
+ * @param[out] dest   quaternion
+ */
+CGLM_INLINE
+versors
+glms_euler_yzx_quat_lh(vec3s angles) {
+  versors dest;
+  glm_euler_yzx_quat_lh(angles.raw, dest.raw);
+  return dest;
+}
+
+/*!
+ * @brief creates NEW quaternion using rotation angles and does
+ *        rotations in z x y order in left hand (yaw roll pitch)
+ *
+ * @param[in]  angles angles x y z (radians)
+ * @param[out] dest   quaternion
+ */
+CGLM_INLINE
+versors
+glms_euler_zxy_quat_lh(vec3s angles) {
+  versors dest;
+  glm_euler_zxy_quat_lh(angles.raw, dest.raw);
+  return dest;
+}
+
+/*!
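Stepping back to the frustum helpers above, a sketch of extracting world-space planes and the corners of one split plane (e.g. for cascaded shadow maps); glms_perspective, glms_lookat, glms_mat4_mul, glms_mat4_inv and glm_rad come from other cglm headers, and all camera values and split distances are made up.

#include <cglm/struct.h>

void
example_frustum_split(void) {
  mat4s proj = glms_perspective(glm_rad(60.0f), 16.0f / 9.0f, 0.1f, 100.0f);
  mat4s view = glms_lookat((vec3s){{0.0f, 2.0f, 6.0f}},
                           (vec3s){{0.0f, 0.0f, 0.0f}},
                           (vec3s){{0.0f, 1.0f, 0.0f}});
  mat4s viewProj = glms_mat4_mul(proj, view);

  vec4s planes[6];
  glms_frustum_planes(viewProj, planes);                   /* world-space planes */

  vec4s corners[8];
  glms_frustum_corners(glms_mat4_inv(viewProj), corners);  /* world-space corners */

  vec4s splitCorners[4];                                   /* one CSM split plane */
  glms_frustum_corners_at(corners, 25.0f, 100.0f, splitCorners);
  (void)planes; (void)splitCorners;
}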
+ * @brief creates NEW quaternion using rotation angles and does + * rotations in z y x order in left hand (yaw pitch roll) + * + * @param[in] angles angles x y z (radians) + * @param[out] dest quaternion + */ +CGLM_INLINE +versors +glms_euler_zyx_quat_lh(vec3s angles) { + versors dest; + glm_euler_zyx_quat_lh(angles.raw, dest.raw); + return dest; +} + + +#endif /* cglms_euler_to_quat_lh_h */ diff --git a/external/cglm/struct/handed/euler_to_quat_rh.h b/external/cglm/struct/handed/euler_to_quat_rh.h new file mode 100644 index 0000000..6c7f400 --- /dev/null +++ b/external/cglm/struct/handed/euler_to_quat_rh.h @@ -0,0 +1,115 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE void glms_euler_xyz_quat_rh(vec3 angles, versor dest); + CGLM_INLINE void glms_euler_xzy_quat_rh(vec3 angles, versor dest); + CGLM_INLINE void glms_euler_yxz_quat_rh(vec3 angles, versor dest); + CGLM_INLINE void glms_euler_yzx_quat_rh(vec3 angles, versor dest); + CGLM_INLINE void glms_euler_zxy_quat_rh(vec3 angles, versor dest); + CGLM_INLINE void glms_euler_zyx_quat_rh(vec3 angles, versor dest); + */ + +#ifndef cglms_euler_to_quat_rh_h +#define cglms_euler_to_quat_rh_h + +#include "../common.h" + + +/*! + * @brief creates NEW quaternion using rotation angles and does + * rotations in x y z order in right hand (roll pitch yaw) + * + * @param[in] angles angles x y z (radians) + * @param[out] dest quaternion + */ +CGLM_INLINE +versors +glms_euler_xyz_quat_rh(vec3s angles) { + versors dest; + glm_euler_xyz_quat_rh(angles.raw, dest.raw); + return dest; +} + +/*! + * @brief creates NEW quaternion using rotation angles and does + * rotations in x z y order in right hand (roll yaw pitch) + * + * @param[in] angles angles x y z (radians) + * @param[out] dest quaternion + */ +CGLM_INLINE +versors +glms_euler_xzy_quat_rh(vec3s angles) { + versors dest; + glm_euler_xzy_quat_rh(angles.raw, dest.raw); + return dest; +} + +/*! + * @brief creates NEW quaternion using rotation angles and does + * rotations in y x z order in right hand (pitch roll yaw) + * + * @param[in] angles angles x y z (radians) + * @param[out] dest quaternion + */ +CGLM_INLINE +versors +glms_euler_yxz_quat_rh(vec3s angles) { + versors dest; + glm_euler_yxz_quat_rh(angles.raw, dest.raw); + return dest; +} + +/*! + * @brief creates NEW quaternion using rotation angles and does + * rotations in y z x order in right hand (pitch yaw roll) + * + * @param[in] angles angles x y z (radians) + * @param[out] dest quaternion + */ +CGLM_INLINE +versors +glms_euler_yzx_quat_rh(vec3s angles) { + versors dest; + glm_euler_yzx_quat_rh(angles.raw, dest.raw); + return dest; +} + +/*! + * @brief creates NEW quaternion using rotation angles and does + * rotations in z x y order in right hand (yaw roll pitch) + * + * @param[in] angles angles x y z (radians) + * @param[out] dest quaternion + */ +CGLM_INLINE +versors +glms_euler_zxy_quat_rh(vec3s angles) { + versors dest; + glm_euler_zxy_quat_rh(angles.raw, dest.raw); + return dest; +} + +/*! 
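A sketch comparing the left- and right-handed euler-to-quaternion helpers above on the same angles; the function name and values are illustrative.

#include <cglm/struct.h>

void
example_euler_quat_handed(void) {
  vec3s angles = {{0.1f, 0.5f, -0.2f}};            /* radians, [X, Y, Z] */
  versors q_lh = glms_euler_xyz_quat_lh(angles);   /* left-handed convention */
  versors q_rh = glms_euler_xyz_quat_rh(angles);   /* right-handed convention */
  (void)q_lh; (void)q_rh;
}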
+ * @brief creates NEW quaternion using rotation angles and does + * rotations in z y x order in right hand (yaw pitch roll) + * + * @param[in] angles angles x y z (radians) + * @param[out] dest quaternion + */ +CGLM_INLINE +versors +glms_euler_zyx_quat_rh(vec3s angles) { + versors dest; + glm_euler_zyx_quat_rh(angles.raw, dest.raw); + return dest; +} + + +#endif /* cglms_euler_to_quat_rh_h */ diff --git a/external/cglm/struct/io.h b/external/cglm/struct/io.h new file mode 100644 index 0000000..900c2a8 --- /dev/null +++ b/external/cglm/struct/io.h @@ -0,0 +1,107 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE void glms_mat4_print(mat4s matrix, FILE *ostream); + CGLM_INLINE void glms_mat3_print(mat3s matrix, FILE *ostream); + CGLM_INLINE void glms_vec4_print(vec4s vec, FILE *ostream); + CGLM_INLINE void glms_ivec4_print(ivec3s vec, FILE *ostream); + CGLM_INLINE void glms_vec3_print(vec3s vec, FILE *ostream); + CGLM_INLINE void glms_ivec3_print(ivec3s vec, FILE *ostream); + CGLM_INLINE void glms_vec2_print(vec2s vec, FILE *ostream); + CGLM_INLINE void glms_ivec2_print(ivec3s vec, FILE *ostream); + CGLM_INLINE void glms_versor_print(versor vec, FILE *ostream); + CGLM_INLINE void glms_aabb_print(vec3s bbox[2], const char *tag, FILE *ostream); + */ + +#ifndef cglms_ios_h +#define cglms_ios_h + +#include "../common.h" +#include "../io.h" +#include "mat4.h" + +#include +#include + +CGLM_INLINE +void +glms_mat4_print(mat4s matrix, + FILE * __restrict ostream) { + + glm_mat4_print(matrix.raw, ostream); +} + +CGLM_INLINE +void +glms_mat3_print(mat3s matrix, + FILE * __restrict ostream) { + glm_mat3_print(matrix.raw, ostream); +} + +CGLM_INLINE +void +glms_vec4_print(vec4s vec, + FILE * __restrict ostream) { + glm_vec4_print(vec.raw, ostream); +} + +CGLM_INLINE +void +glms_ivec4_print(ivec4s vec, + FILE * __restrict ostream) { + glm_ivec4_print(vec.raw, ostream); +} + +CGLM_INLINE +void +glms_vec3_print(vec3s vec, + FILE * __restrict ostream) { + glm_vec3_print(vec.raw, ostream); +} + +CGLM_INLINE +void +glms_ivec3_print(ivec3s vec, + FILE * __restrict ostream) { + glm_ivec3_print(vec.raw, ostream); +} + +CGLM_INLINE +void +glms_vec2_print(vec2s vec, + FILE * __restrict ostream) { + glm_vec2_print(vec.raw, ostream); +} + +CGLM_INLINE +void +glms_ivec2_print(ivec2s vec, + FILE * __restrict ostream) { + glm_ivec2_print(vec.raw, ostream); +} + +CGLM_INLINE +void +glms_versor_print(versors vec, + FILE * __restrict ostream) { + glm_versor_print(vec.raw, ostream); +} + +CGLM_INLINE +void +glms_aabb_print(vec3s bbox[2], + const char * __restrict tag, + FILE * __restrict ostream) { + vec3 rawBbox[2]; + + glms_vec3_(unpack)(rawBbox, bbox, 2); + glm_aabb_print(rawBbox, tag, ostream); +} + +#endif /* cglms_ios_h */ diff --git a/external/cglm/struct/ivec2.h b/external/cglm/struct/ivec2.h new file mode 100644 index 0000000..d53c9f6 --- /dev/null +++ b/external/cglm/struct/ivec2.h @@ -0,0 +1,708 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLMS_IVEC2_ONE_INIT + GLMS_IVEC2_ZERO_INIT + GLMS_IVEC2_ONE + GLMS_IVEC2_ZERO + + Functions: + CGLM_INLINE ivec2s glms_ivec2(int * __restrict v) + CGLM_INLINE void glms_ivec2_pack(ivec2s dst[], ivec2s src[], size_t len) + CGLM_INLINE void glms_ivec2_unpack(ivec2 dst[], ivec2 src[], size_t len) + CGLM_INLINE ivec2s glms_ivec2_zero(ivec2s v) + CGLM_INLINE ivec2s glms_ivec2_one(ivec2s v) + CGLM_INLINE int glms_ivec2_dot(ivec2s a, ivec2s b) + CGLM_INLINE int glms_ivec2_cross(ivec2s a, ivec2s b) + CGLM_INLINE ivec2s glms_ivec2_add(ivec2s a, ivec2s b) + CGLM_INLINE ivec2s glms_ivec2_adds(ivec2s v, int s) + CGLM_INLINE ivec2s glms_ivec2_sub(ivec2s a, ivec2s b) + CGLM_INLINE ivec2s glms_ivec2_subs(ivec2s v, int s) + CGLM_INLINE ivec2s glms_ivec2_mul(ivec2s a, ivec2s b) + CGLM_INLINE ivec2s glms_ivec2_scale(ivec2s v, int s) + CGLM_INLINE ivec2s glms_ivec2_div(ivec2s a, ivec2s b) + CGLM_INLINE ivec2s glms_ivec2_divs(ivec2s v, int s) + CGLM_INLINE ivec2s glms_ivec2_mod(ivec2s a, ivec2s b) + CGLM_INLINE ivec2s glms_ivec2_addadd(ivec2s a, ivec2s b) + CGLM_INLINE ivec2s glms_ivec2_addadds(ivec2s a, int s) + CGLM_INLINE ivec2s glms_ivec2_subadd(ivec2s a, ivec2s b) + CGLM_INLINE ivec2s glms_ivec2_subadds(ivec2s a, int s) + CGLM_INLINE ivec2s glms_ivec2_muladd(ivec2s a, ivec2s b) + CGLM_INLINE ivec2s glms_ivec2_muladds(ivec2s a, int s) + CGLM_INLINE ivec2s glms_ivec2_maxadd(ivec2s a, ivec2s b) + CGLM_INLINE ivec2s glms_ivec2_minadd(ivec2s a, ivec2s b) + CGLM_INLINE ivec2s glms_ivec2_subsub(ivec2s a, ivec2s b) + CGLM_INLINE ivec2s glms_ivec2_subsubs(ivec2s a, int s) + CGLM_INLINE ivec2s glms_ivec2_addsub(ivec2s a, ivec2s b) + CGLM_INLINE ivec2s glms_ivec2_addsubs(ivec2s a, int s) + CGLM_INLINE ivec2s glms_ivec2_mulsub(ivec2s a, ivec2s b) + CGLM_INLINE ivec2s glms_ivec2_mulsubs(ivec2s a, int s) + CGLM_INLINE ivec2s glms_ivec2_maxsub(ivec2s a, ivec2s b) + CGLM_INLINE ivec2s glms_ivec2_minsub(ivec2s a, ivec2s b) + CGLM_INLINE int glms_ivec2_distance2(ivec2s a, ivec2s b) + CGLM_INLINE float glms_ivec2_distance(ivec2s a, ivec2s b) + CGLM_INLINE ivec2s glms_ivec2_fill(int val) + CGLM_INLINE bool glms_ivec2_eq(ivec2s v, int val); + CGLM_INLINE bool glms_ivec2_eqv(ivec2s a, ivec2s b); + CGLM_INLINE ivec2s glms_ivec2_maxv(ivec2s a, ivec2s b) + CGLM_INLINE ivec2s glms_ivec2_minv(ivec2s a, ivec2s b) + CGLM_INLINE ivec2s glms_ivec2_clamp(ivec2s v, int minVal, int maxVal) + CGLM_INLINE ivec2s glms_ivec2_abs(ivec2s v) + */ + +#ifndef cglms_ivec2_h +#define cglms_ivec2_h + +#include "../common.h" +#include "../types-struct.h" +#include "../ivec2.h" + +#define glms_ivec2_(NAME) CGLM_STRUCTAPI(ivec2, NAME) + +#define GLMS_IVEC2_ONE_INIT {GLM_IVEC2_ONE_INIT} +#define GLMS_IVEC2_ZERO_INIT {GLM_IVEC2_ZERO_INIT} + +#define GLMS_IVEC2_ONE ((ivec2s)GLMS_IVEC2_ONE_INIT) +#define GLMS_IVEC2_ZERO ((ivec2s)GLMS_IVEC2_ZERO_INIT) + +/*! + * @brief init ivec2 using ivec3 or ivec4 + * + * @param[in] v vector + * @returns destination + */ +CGLM_INLINE +ivec2s +glms_ivec2(int * __restrict v) { + ivec2s r; + glm_ivec2(v, r.raw); + return r; +} + +/*! + * @brief pack an array of ivec2 into an array of ivec2s + * + * @param[out] dst array of ivec2s + * @param[in] src array of ivec2 + * @param[in] len number of elements + */ +CGLM_INLINE +void +glms_ivec2_(pack)(ivec2s dst[], ivec2 src[], size_t len) { + size_t i; + + for (i = 0; i < len; i++) { + glm_ivec2_copy(src[i], dst[i].raw); + } +} + +/*! 
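A quick sketch of the io.h print wrappers a little further up; glms_mat4_identity comes from the mat4 struct header, and the output stream and values are arbitrary.

#include <stdio.h>
#include <cglm/struct.h>

void
example_debug_print(void) {
  mat4s m = glms_mat4_identity();
  vec3s v = {{1.0f, 2.0f, 3.0f}};

  glms_mat4_print(m, stderr);   /* pretty-prints the 4x4 matrix */
  glms_vec3_print(v, stderr);
}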
+ * @brief unpack an array of ivec2s into an array of ivec2 + * + * @param[out] dst array of ivec2 + * @param[in] src array of ivec2s + * @param[in] len number of elements + */ +CGLM_INLINE +void +glms_ivec2_(unpack)(ivec2 dst[], ivec2s src[], size_t len) { + size_t i; + + for (i = 0; i < len; i++) { + glm_ivec2_copy(src[i].raw, dst[i]); + } +} + +/*! + * @brief set all members of [v] to zero + * + * @returns vector + */ +CGLM_INLINE +ivec2s +glms_ivec2_(zero)(void) { + ivec2s r; + glm_ivec2_zero(r.raw); + return r; +} + +/*! + * @brief set all members of [v] to one + * + * @returns vector + */ +CGLM_INLINE +ivec2s +glms_ivec2_(one)(void) { + ivec2s r; + glm_ivec2_one(r.raw); + return r; +} + +/*! + * @brief ivec2 dot product + * + * @param[in] a vector1 + * @param[in] b vector2 + * + * @return dot product + */ +CGLM_INLINE +int +glms_ivec2_(dot)(ivec2s a, ivec2s b) { + return glm_ivec2_dot(a.raw, b.raw); +} + +/*! + * @brief ivec2 cross product + * + * REF: http://allenchou.net/2013/07/cross-product-of-2d-vectors/ + * + * @param[in] a vector1 + * @param[in] b vector2 + * + * @return Z component of cross product + */ +CGLM_INLINE +int +glms_ivec2_(cross)(ivec2s a, ivec2s b) { + return glm_ivec2_cross(a.raw, b.raw); +} + +/*! + * @brief add vector [a] to vector [b] and store result in [dest] + * + * @param[in] a first vector + * @param[in] b second vector + * @returns destination + */ +CGLM_INLINE +ivec2s +glms_ivec2_(add)(ivec2s a, ivec2s b) { + ivec2s r; + glm_ivec2_add(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief add scalar s to vector [v] and store result in [dest] + * + * @param[in] v vector + * @param[in] s scalar + * @returns destination + */ +CGLM_INLINE +ivec2s +glms_ivec2_(adds)(ivec2s v, int s) { + ivec2s r; + glm_ivec2_adds(v.raw, s, r.raw); + return r; +} + +/*! + * @brief subtract vector [b] from vector [a] and store result in [dest] + * + * @param[in] a first vector + * @param[in] b second vector + * @returns destination + */ +CGLM_INLINE +ivec2s +glms_ivec2_(sub)(ivec2s a, ivec2s b) { + ivec2s r; + glm_ivec2_sub(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief subtract scalar s from vector [v] and store result in [dest] + * + * @param[in] v vector + * @param[in] s scalar + * @returns destination + */ +CGLM_INLINE +ivec2s +glms_ivec2_(subs)(ivec2s v, int s) { + ivec2s r; + glm_ivec2_subs(v.raw, s, r.raw); + return r; +} + +/*! + * @brief multiply vector [a] with vector [b] and store result in [dest] + * + * @param[in] a first vector + * @param[in] b second vector + * @returns destination + */ +CGLM_INLINE +ivec2s +glms_ivec2_(mul)(ivec2s a, ivec2s b) { + ivec2s r; + glm_ivec2_mul(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief multiply vector [a] with scalar s and store result in [dest] + * + * @param[in] v vector + * @param[in] s scalar + * @returns destination + */ +CGLM_INLINE +ivec2s +glms_ivec2_(scale)(ivec2s v, int s) { + ivec2s r; + glm_ivec2_scale(v.raw, s, r.raw); + return r; +} + +/*! + * @brief div vector with another component-wise division: d = a / b + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns result = (a[0]/b[0], a[1]/b[1]) + */ +CGLM_INLINE +ivec2s +glms_ivec2_(div)(ivec2s a, ivec2s b) { + ivec2s r; + glm_ivec2_div(a.raw, b.raw, r.raw); + return r; +} + +/*! 
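A small sketch of the ivec2 arithmetic above, using made-up tile coordinates.

#include <cglm/struct.h>

void
example_tile_math(void) {
  ivec2s tile   = {{3, 4}};
  ivec2s offset = {{1, -2}};

  ivec2s moved  = glms_ivec2_add(tile, offset);   /* {4, 2} */
  ivec2s pixels = glms_ivec2_scale(moved, 16);    /* tile -> pixel origin */
  int    d      = glms_ivec2_dot(tile, offset);   /* 3*1 + 4*(-2) = -5 */
  (void)pixels; (void)d;
}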
+ * @brief div vector with scalar: d = v / s + * + * @param[in] v vector + * @param[in] s scalar + * @returns result = (a[0]/s, a[1]/s) + */ +CGLM_INLINE +ivec2s +glms_ivec2_(divs)(ivec2s v, int s) { + ivec2s r; + glm_ivec2_divs(v.raw, s, r.raw); + return r; +} + +/*! + * @brief mod vector with another component-wise modulo: d = a % b + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns result = (a[0]%b[0], a[1]%b[1]) + */ +CGLM_INLINE +ivec2s +glms_ivec2_(mod)(ivec2s a, ivec2s b) { + ivec2s r; + glm_ivec2_mod(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief add vector [a] with vector [b] and add result to vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[in] dest dest += (a + b) + * @returns dest + */ +CGLM_INLINE +ivec2s +glms_ivec2_(addadd)(ivec2s a, ivec2s b, ivec2s dest) { + glm_ivec2_addadd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief add scalar [s] onto vector [a] and add result to vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[in] dest dest += (a + s) + * @returns dest + */ +CGLM_INLINE +ivec2s +glms_ivec2_(addadds)(ivec2s a, int s, ivec2s dest) { + glm_ivec2_addadds(a.raw, s, dest.raw); + return dest; +} + +/*! + * @brief subtract vector [a] from vector [b] and add result to [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[in] dest dest += (a - b) + * @returns dest + */ +CGLM_INLINE +ivec2s +glms_ivec2_(subadd)(ivec2s a, ivec2s b, ivec2s dest) { + glm_ivec2_subadd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief subtract scalar [s] from vector [a] and add result to [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first + * @param[in] s scalar + * @param[in] dest dest += (a - s) + * @returns dest + */ +CGLM_INLINE +ivec2s +glms_ivec2_(subadds)(ivec2s a, int s, ivec2s dest) { + glm_ivec2_subadds(a.raw, s, dest.raw); + return dest; +} + +/*! + * @brief multiply vector [a] with vector [b] and add result to [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[in] dest dest += (a * b) + * @returns dest + */ +CGLM_INLINE +ivec2s +glms_ivec2_(muladd)(ivec2s a, ivec2s b, ivec2s dest) { + glm_ivec2_muladd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief multiply vector [a] with scalar [s] and add result to [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[in] dest dest += (a * s) + * @returns dest + */ +CGLM_INLINE +ivec2s +glms_ivec2_(muladds)(ivec2s a, int s, ivec2s dest) { + glm_ivec2_muladds(a.raw, s, dest.raw); + return dest; +} + +/*! + * @brief add maximum of vector [a] and vector [b] to vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[in] dest dest += max(a, b) + * @returns dest + */ +CGLM_INLINE +ivec2s +glms_ivec2_(maxadd)(ivec2s a, ivec2s b, ivec2s dest) { + glm_ivec2_maxadd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! 
+ * @brief add minimum of vector [a] and vector [b] to vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[in] dest dest += min(a, b) + * @returns dest + */ +CGLM_INLINE +ivec2s +glms_ivec2_(minadd)(ivec2s a, ivec2s b, ivec2s dest) { + glm_ivec2_minadd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief subtract vector [a] from vector [b] and subtract result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[in] dest dest -= (a - b) + * @returns dest + */ +CGLM_INLINE +ivec2s +glms_ivec2_(subsub)(ivec2s a, ivec2s b, ivec2s dest) { + glm_ivec2_subsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief subtract scalar [s] from vector [a] and subtract result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[in] dest dest -= (a - s) + * @returns dest + */ +CGLM_INLINE +ivec2s +glms_ivec2_(subsubs)(ivec2s a, int s, ivec2s dest) { + glm_ivec2_subsubs(a.raw, s, dest.raw); + return dest; +} + +/*! + * @brief add vector [a] to vector [b] and subtract the result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] b scalar + * @param[in] dest dest -= (a + b) + * @returns dest + */ +CGLM_INLINE +ivec2s +glms_ivec2_(addsub)(ivec2s a, ivec2s b, ivec2s dest) { + glm_ivec2_addsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief add scalar [s] to vector [a] and subtract the result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[in] dest dest -= (a + b) + * @returns dest + */ +CGLM_INLINE +ivec2s +glms_ivec2_(addsubs)(ivec2s a, int s, ivec2s dest) { + glm_ivec2_addsubs(a.raw, s, dest.raw); + return dest; +} + +/*! + * @brief multiply vector [a] and vector [b] and subtract the result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] b scalar + * @param[in] dest dest -= (a * b) + * @returns dest + */ +CGLM_INLINE +ivec2s +glms_ivec2_(mulsub)(ivec2s a, ivec2s b, ivec2s dest) { + glm_ivec2_mulsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief multiply vector [a] with scalar [s] and subtract the result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[in] dest dest -= (a * s) + * @returns dest + */ +CGLM_INLINE +ivec2s +glms_ivec2_(mulsubs)(ivec2s a, int s, ivec2s dest) { + glm_ivec2_mulsubs(a.raw, s, dest.raw); + return dest; +} + +/*! + * @brief subtract maximum of vector [a] and vector [b] from vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[in] dest dest -= max(a, b) + * @returns dest + */ +CGLM_INLINE +ivec2s +glms_ivec2_(maxsub)(ivec2s a, ivec2s b, ivec2s dest) { + glm_ivec2_maxsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! 
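The *add/*sub variants above fold an accumulation into the operation; a sketch with illustrative numbers:

#include <cglm/struct.h>

void
example_ivec2_accumulate(void) {
  ivec2s acc  = {{0, 0}};
  ivec2s step = {{2, 3}};

  acc = glms_ivec2_muladd(step, (ivec2s){{4, 4}}, acc);  /* acc += step * {4,4} -> {8, 12} */
  acc = glms_ivec2_subsub(step, (ivec2s){{1, 1}}, acc);  /* acc -= step - {1,1} -> {7, 10} */
  (void)acc;
}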
+ * @brief subtract minimum of vector [a] and vector [b] from vector [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[in] dest dest -= min(a, b) + * @returns dest + */ +CGLM_INLINE +ivec2s +glms_ivec2_(minsub)(ivec2s a, ivec2s b, ivec2s dest) { + glm_ivec2_minsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief squared distance between two vectors + * + * @param[in] a first vector + * @param[in] b second vector + * @return returns squared distance (distance * distance) + */ +CGLM_INLINE +int +glms_ivec2_(distance2)(ivec2s a, ivec2s b) { + return glm_ivec2_distance2(a.raw, b.raw); +} + +/*! + * @brief distance between two vectors + * + * @param[in] a first vector + * @param[in] b second vector + * @return returns distance + */ +CGLM_INLINE +float +glms_ivec2_(distance)(ivec2s a, ivec2s b) { + return glm_ivec2_distance(a.raw, b.raw); +} + +/*! + * @brief fill a vector with specified value + * + * @param[in] val value + * @returns dest + */ +CGLM_INLINE +ivec2s +glms_ivec2_(fill)(int val) { + ivec2s r; + glm_ivec2_fill(r.raw, val); + return r; +} + +/*! + * @brief check if vector is equal to value + * + * @param[in] v vector + * @param[in] val value + */ +CGLM_INLINE +bool +glms_ivec2_(eq)(ivec2s v, int val) { + return glm_ivec2_eq(v.raw, val); +} + +/*! + * @brief check if vector is equal to another + * + * @param[in] a vector + * @param[in] b vector + */ +CGLM_INLINE +bool +glms_ivec2_(eqv)(ivec2s a, ivec2s b) { + return glm_ivec2_eqv(a.raw, b.raw); +} + +/*! + * @brief set each member of dest to greater of vector a and b + * + * @param[in] a first vector + * @param[in] b second vector + * @returns destination + */ +CGLM_INLINE +ivec2s +glms_ivec2_(maxv)(ivec2s a, ivec2s b) { + ivec2s r; + glm_ivec2_maxv(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief set each member of dest to lesser of vector a and b + * + * @param[in] a first vector + * @param[in] b second vector + * @returns destination + */ +CGLM_INLINE +ivec2s +glms_ivec2_(minv)(ivec2s a, ivec2s b) { + ivec2s r; + glm_ivec2_minv(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief clamp each member of [v] between minVal and maxVal (inclusive) + * + * @param[in] v vector + * @param[in] minVal minimum value + * @param[in] maxVal maximum value + * @returns clamped vector + */ +CGLM_INLINE +ivec2s +glms_ivec2_(clamp)(ivec2s v, int minVal, int maxVal) { + glm_ivec2_clamp(v.raw, minVal, maxVal); + return v; +} + +/*! + * @brief absolute value of v + * + * @param[in] v vector + * @returns destination + */ +CGLM_INLINE +ivec2s +glms_ivec2_(abs)(ivec2s v) { + ivec2s r; + glm_ivec2_abs(v.raw, r.raw); + return r; +} + +#endif /* cglms_ivec2_h */ diff --git a/external/cglm/struct/ivec3.h b/external/cglm/struct/ivec3.h new file mode 100644 index 0000000..c2c5f3b --- /dev/null +++ b/external/cglm/struct/ivec3.h @@ -0,0 +1,725 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLMS_IVEC3_ONE_INIT + GLMS_IVEC3_ZERO_INIT + GLMS_IVEC3_ONE + GLMS_IVEC3_ZERO + + Functions: + CGLM_INLINE ivec3s glms_ivec3(ivec4s v4) + CGLM_INLINE void glms_ivec3_pack(ivec3s dst[], ivec3 src[], size_t len) + CGLM_INLINE void glms_ivec3_unpack(ivec3 dst[], ivec3s src[], size_t len) + CGLM_INLINE ivec3s glms_ivec3_zero(void) + CGLM_INLINE ivec3s glms_ivec3_one(void) + CGLM_INLINE int glms_ivec3_dot(ivec3s a, ivec3s b) + CGLM_INLINE int glms_ivec3_norm2(ivec3s v) + CGLM_INLINE int glms_ivec3_norm(ivec3s v) + CGLM_INLINE ivec3s glms_ivec3_add(ivec3s a, ivec3s b) + CGLM_INLINE ivec3s glms_ivec3_adds(ivec3s v, int s) + CGLM_INLINE ivec3s glms_ivec3_sub(ivec3s a, ivec3s b) + CGLM_INLINE ivec3s glms_ivec3_subs(ivec3s v, int s) + CGLM_INLINE ivec3s glms_ivec3_mul(ivec3s a, ivec3s b) + CGLM_INLINE ivec3s glms_ivec3_scale(ivec3s v, int s) + CGLM_INLINE ivec3s glms_ivec3_div(ivec3s a, ivec3s b) + CGLM_INLINE ivec3s glms_ivec3_divs(ivec3s v, int s) + CGLM_INLINE ivec3s glms_ivec3_mod(ivec3s a, ivec3s b) + CGLM_INLINE ivec3s glms_ivec3_addadd(ivec3s a, ivec3s b, ivec3s dest) + CGLM_INLINE ivec3s glms_ivec3_addadds(ivec3s a, int s, ivec3s dest) + CGLM_INLINE ivec3s glms_ivec3_subadd(ivec3s a, ivec3s b, ivec3s dest) + CGLM_INLINE ivec3s glms_ivec3_subadds(ivec3s a, int s, ivec3s dest) + CGLM_INLINE ivec3s glms_ivec3_muladd(ivec3s a, ivec3s b, ivec3s dest) + CGLM_INLINE ivec3s glms_ivec3_muladds(ivec3s a, int s, ivec3s dest) + CGLM_INLINE ivec3s glms_ivec3_minadd(ivec3s a, ivec3s b, ivec3s dest) + CGLM_INLINE ivec3s glms_ivec3_subsub(ivec3s a, ivec3s b, ivec3s dest) + CGLM_INLINE ivec3s glms_ivec3_subsubs(ivec3s a, int s, ivec3s dest) + CGLM_INLINE ivec3s glms_ivec3_addsub(ivec3s a, ivec3s b, ivec3s dest) + CGLM_INLINE ivec3s glms_ivec3_addsubs(ivec3s a, int s, ivec3s dest) + CGLM_INLINE ivec3s glms_ivec3_mulsub(ivec3s a, ivec3s b, ivec3s dest) + CGLM_INLINE ivec3s glms_ivec3_mulsubs(ivec3s a, int s, ivec3s dest) + CGLM_INLINE ivec3s glms_ivec3_maxsub(ivec3s a, ivec3s b, ivec3s dest) + CGLM_INLINE ivec3s glms_ivec3_minsub(ivec3s a, ivec3s b, ivec3s dest) + CGLM_INLINE int glms_ivec3_distance2(ivec3s a, ivec3s b) + CGLM_INLINE float glms_ivec3_distance(ivec3s a, ivec3s b) + CGLM_INLINE ivec3s glms_ivec3_fill(int val) + CGLM_INLINE bool glms_ivec3_eq(ivec3s v, int val) + CGLM_INLINE bool glms_ivec3_eqv(ivec3s a, ivec3s b) + CGLM_INLINE ivec3s glms_ivec3_maxv(ivec3s a, ivec3s b) + CGLM_INLINE ivec3s glms_ivec3_minv(ivec3s a, ivec3s b) + CGLM_INLINE ivec3s glms_ivec3_clamp(ivec3s v, int minVal, int maxVal) + CGLM_INLINE ivec3s glms_ivec3_abs(ivec3s v) + */ + +#ifndef cglms_ivec3_h +#define cglms_ivec3_h + +#include "../common.h" +#include "../types-struct.h" +#include "../ivec3.h" + +#define glms_ivec3_(NAME) CGLM_STRUCTAPI(ivec3, NAME) + +#define GLMS_IVEC3_ONE_INIT {GLM_IVEC3_ONE_INIT} +#define GLMS_IVEC3_ZERO_INIT {GLM_IVEC3_ZERO_INIT} + +#define GLMS_IVEC3_ONE ((ivec3s)GLMS_IVEC3_ONE_INIT) +#define GLMS_IVEC3_ZERO ((ivec3s)GLMS_IVEC3_ZERO_INIT) + +/*! + * @brief init ivec3 using ivec4 + * + * @param[in] v4 vector4 + * @returns destination + */ +CGLM_INLINE +ivec3s +glms_ivec3(ivec4s v4) { + ivec3s r; + glm_ivec3(v4.raw, r.raw); + return r; +} + +/*! 
+ * @brief pack an array of ivec3 into an array of ivec3s + * + * @param[out] dst array of ivec3s + * @param[in] src array of ivec3 + * @param[in] len number of elements + */ +CGLM_INLINE +void +glms_ivec3_(pack)(ivec3s dst[], ivec3 src[], size_t len) { + size_t i; + + for (i = 0; i < len; i++) { + glm_ivec3_copy(src[i], dst[i].raw); + } +} + +/*! + * @brief unpack an array of ivec3s into an array of ivec3 + * + * @param[out] dst array of ivec3 + * @param[in] src array of ivec3s + * @param[in] len number of elements + */ +CGLM_INLINE +void +glms_ivec3_(unpack)(ivec3 dst[], ivec3s src[], size_t len) { + size_t i; + + for (i = 0; i < len; i++) { + glm_ivec3_copy(src[i].raw, dst[i]); + } +} + +/*! + * @brief set all members of [v] to zero + * + * @returns vector + */ +CGLM_INLINE +ivec3s +glms_ivec3_(zero)(void) { + ivec3s r; + glm_ivec3_zero(r.raw); + return r; +} + +/*! + * @brief set all members of [v] to one + * + * @returns vector + */ +CGLM_INLINE +ivec3s +glms_ivec3_(one)(void) { + ivec3s r; + glm_ivec3_one(r.raw); + return r; +} + +/*! + * @brief ivec3 dot product + * + * @param[in] a vector1 + * @param[in] b vector2 + * + * @return dot product + */ +CGLM_INLINE +int +glms_ivec3_(dot)(ivec3s a, ivec3s b) { + return glm_ivec3_dot(a.raw, b.raw); +} + +/*! + * @brief norm * norm (magnitude) of vec + * + * we can use this func instead of calling norm * norm, because it would call + * sqrtf function twice but with this func we can avoid func call, maybe this is + * not good name for this func + * + * @param[in] v vector + * + * @return norm * norm + */ +CGLM_INLINE +int +glms_ivec3_(norm2)(ivec3s v) { + return glm_ivec3_norm2(v.raw); +} + +/*! + * @brief euclidean norm (magnitude), also called L2 norm + * this will give magnitude of vector in euclidean space + * + * @param[in] v vector + * + * @return norm + */ +CGLM_INLINE +int +glms_ivec3_(norm)(ivec3s v) { + return glm_ivec3_norm(v.raw); +} + +/*! + * @brief add vector [a] to vector [b] and store result in [dest] + * + * @param[in] a first vector + * @param[in] b second vector + * @returns destination + */ +CGLM_INLINE +ivec3s +glms_ivec3_(add)(ivec3s a, ivec3s b) { + ivec3s r; + glm_ivec3_add(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief add scalar s to vector [v] and store result in [dest] + * + * @param[in] v vector + * @param[in] s scalar + * @returns destination + */ +CGLM_INLINE +ivec3s +glms_ivec3_(adds)(ivec3s v, int s) { + ivec3s r; + glm_ivec3_adds(v.raw, s, r.raw); + return r; +} + +/*! + * @brief subtract vector [b] from vector [a] and store result in [dest] + * + * @param[in] a first vector + * @param[in] b second vector + * @returns destination + */ +CGLM_INLINE +ivec3s +glms_ivec3_(sub)(ivec3s a, ivec3s b) { + ivec3s r; + glm_ivec3_sub(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief subtract scalar s from vector [v] and store result in [dest] + * + * @param[in] v vector + * @param[in] s scalar + * @returns destination + */ +CGLM_INLINE +ivec3s +glms_ivec3_(subs)(ivec3s v, int s) { + ivec3s r; + glm_ivec3_subs(v.raw, s, r.raw); + return r; +} + +/*! + * @brief multiply vector [a] with vector [b] and store result in [dest] + * + * @param[in] a first vector + * @param[in] b second vector + * @returns destination + */ +CGLM_INLINE +ivec3s +glms_ivec3_(mul)(ivec3s a, ivec3s b) { + ivec3s r; + glm_ivec3_mul(a.raw, b.raw, r.raw); + return r; +} + +/*! 
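A sketch of bridging the array API (ivec3) and the struct API (ivec3s) with the pack/unpack helpers above; counts and values are illustrative.

#include <cglm/struct.h>

void
example_ivec3_bridge(void) {
  ivec3  raw[2] = {{1, 2, 3}, {4, 5, 6}};      /* plain array-API vectors */
  ivec3s cells[2];

  glms_ivec3_pack(cells, raw, 2);              /* array API -> struct API */
  int d = glms_ivec3_dot(cells[0], cells[1]);  /* 1*4 + 2*5 + 3*6 = 32 */

  glms_ivec3_unpack(raw, cells, 2);            /* and back again */
  (void)d;
}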
+ * @brief multiply vector [a] with scalar s and store result in [dest] + * + * @param[in] v vector + * @param[in] s scalar + * @returns destination + */ +CGLM_INLINE +ivec3s +glms_ivec3_(scale)(ivec3s v, int s) { + ivec3s r; + glm_ivec3_scale(v.raw, s, r.raw); + return r; +} + +/*! + * @brief div vector with another component-wise division: d = a / b + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns result = (a[0]/b[0], a[1]/b[1], a[2]/b[2]) + */ +CGLM_INLINE +ivec3s +glms_ivec3_(div)(ivec3s a, ivec3s b) { + ivec3s r; + glm_ivec3_div(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief div vector with scalar: d = v / s + * + * @param[in] v vector + * @param[in] s scalar + * @returns result = (a[0]/s, a[1]/s, a[2]/s) + */ +CGLM_INLINE +ivec3s +glms_ivec3_(divs)(ivec3s v, int s) { + ivec3s r; + glm_ivec3_divs(v.raw, s, r.raw); + return r; +} + +/*! + * @brief Element-wise modulo operation on ivec3 vectors: dest = a % b + * + * Performs element-wise modulo on each component of vectors `a` and `b`. + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns result = (a[0]%b[0], a[1]%b[1], a[2]%b[2]) + */ +CGLM_INLINE +ivec3s +glms_ivec3_(mod)(ivec3s a, ivec3s b) { + ivec3s r; + glm_ivec3_mod(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief add vector [a] with vector [b] and add result to vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[in] dest dest += (a + b) + * @returns dest + */ +CGLM_INLINE +ivec3s +glms_ivec3_(addadd)(ivec3s a, ivec3s b, ivec3s dest) { + glm_ivec3_addadd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief add scalar [s] onto vector [a] and add result to vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[in] dest dest += (a + s) + * @returns dest + */ +CGLM_INLINE +ivec3s +glms_ivec3_(addadds)(ivec3s a, int s, ivec3s dest) { + glm_ivec3_addadds(a.raw, s, dest.raw); + return dest; +} + +/*! + * @brief subtract vector [a] from vector [b] and add result to [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[in] dest dest += (a - b) + * @returns dest + */ +CGLM_INLINE +ivec3s +glms_ivec3_(subadd)(ivec3s a, ivec3s b, ivec3s dest) { + glm_ivec3_subadd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief subtract scalar [s] from vector [a] and add result to [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first + * @param[in] s scalar + * @param[in] dest dest += (a - s) + * @returns dest + */ +CGLM_INLINE +ivec3s +glms_ivec3_(subadds)(ivec3s a, int s, ivec3s dest) { + glm_ivec3_subadds(a.raw, s, dest.raw); + return dest; +} + +/*! + * @brief multiply vector [a] with vector [b] and add result to [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[in] dest dest += (a * b) + * @returns dest + */ +CGLM_INLINE +ivec3s +glms_ivec3_(muladd)(ivec3s a, ivec3s b, ivec3s dest) { + glm_ivec3_muladd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! 
+ * @brief multiply vector [a] with scalar [s] and add result to [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[in] dest dest += (a * s) + * @returns dest + */ +CGLM_INLINE +ivec3s +glms_ivec3_(muladds)(ivec3s a, int s, ivec3s dest) { + glm_ivec3_muladds(a.raw, s, dest.raw); + return dest; +} + +/*! + * @brief add maximum of vector [a] and vector [b] to vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[in] dest dest += max(a, b) + * @returns dest + */ +CGLM_INLINE +ivec3s +glms_ivec3_(maxadd)(ivec3s a, ivec3s b, ivec3s dest) { + glm_ivec3_maxadd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief add minimum of vector [a] and vector [b] to vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[in] dest dest += min(a, b) + * @returns dest + */ +CGLM_INLINE +ivec3s +glms_ivec3_(minadd)(ivec3s a, ivec3s b, ivec3s dest) { + glm_ivec3_minadd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief subtract vector [a] from vector [b] and subtract result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[in] dest dest -= (a - b) + * @returns dest + */ +CGLM_INLINE +ivec3s +glms_ivec3_(subsub)(ivec3s a, ivec3s b, ivec3s dest) { + glm_ivec3_subsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief subtract scalar [s] from vector [a] and subtract result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[in] dest dest -= (a - s) + * @returns dest + */ +CGLM_INLINE +ivec3s +glms_ivec3_(subsubs)(ivec3s a, int s, ivec3s dest) { + glm_ivec3_subsubs(a.raw, s, dest.raw); + return dest; +} + +/*! + * @brief add vector [a] to vector [b] and subtract the result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] b scalar + * @param[in] dest dest -= (a + b) + * @returns dest + */ +CGLM_INLINE +ivec3s +glms_ivec3_(addsub)(ivec3s a, ivec3s b, ivec3s dest) { + glm_ivec3_addsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief add scalar [s] to vector [a] and subtract the result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[in] dest dest -= (a + b) + * @returns dest + */ +CGLM_INLINE +ivec3s +glms_ivec3_(addsubs)(ivec3s a, int s, ivec3s dest) { + glm_ivec3_addsubs(a.raw, s, dest.raw); + return dest; +} + +/*! + * @brief multiply vector [a] and vector [b] and subtract the result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] b scalar + * @param[in] dest dest -= (a * b) + * @returns dest + */ +CGLM_INLINE +ivec3s +glms_ivec3_(mulsub)(ivec3s a, ivec3s b, ivec3s dest) { + glm_ivec3_mulsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief multiply vector [a] with scalar [s] and subtract the result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[in] dest dest -= (a * s) + * @returns dest + */ +CGLM_INLINE +ivec3s +glms_ivec3_(mulsubs)(ivec3s a, int s, ivec3s dest) { + glm_ivec3_mulsubs(a.raw, s, dest.raw); + return dest; +} + +/*! 
+ * @brief subtract maximum of vector [a] and vector [b] from vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[in] dest dest -= max(a, b) + * @returns dest + */ +CGLM_INLINE +ivec3s +glms_ivec3_(maxsub)(ivec3s a, ivec3s b, ivec3s dest) { + glm_ivec3_maxsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief subtract minimum of vector [a] and vector [b] from vector [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[in] dest dest -= min(a, b) + * @returns dest + */ +CGLM_INLINE +ivec3s +glms_ivec3_(minsub)(ivec3s a, ivec3s b, ivec3s dest) { + glm_ivec3_minsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief squared distance between two vectors + * + * @param[in] a first vector + * @param[in] b second vector + * @return returns squared distance (distance * distance) + */ +CGLM_INLINE +int +glms_ivec3_(distance2)(ivec3s a, ivec3s b) { + return glm_ivec3_distance2(a.raw, b.raw); +} + +/*! + * @brief distance between two vectors + * + * @param[in] a first vector + * @param[in] b second vector + * @return returns distance + */ +CGLM_INLINE +float +glms_ivec3_(distance)(ivec3s a, ivec3s b) { + return glm_ivec3_distance(a.raw, b.raw); +} + +/*! + * @brief fill a vector with specified value + * + * @param[in] val value + * @returns destination + */ +CGLM_INLINE +ivec3s +glms_ivec3_(fill)(int val) { + ivec3s r; + glm_ivec3_fill(r.raw, val); + return r; +} + +/*! + * @brief check if vector is equal to value + * + * @param[in] v vector + * @param[in] val value + */ +CGLM_INLINE +bool +glms_ivec3_(eq)(ivec3s v, int val) { + return glm_ivec3_eq(v.raw, val); +} + +/*! + * @brief check if vector is equal to another + * + * @param[in] a vector + * @param[in] b vector + */ +CGLM_INLINE +bool +glms_ivec3_(eqv)(ivec3s a, ivec3s b) { + return glm_ivec3_eqv(a.raw, b.raw); +} + +/*! + * @brief set each member of dest to greater of vector a and b + * + * @param[in] a first vector + * @param[in] b second vector + * @returns destination + */ +CGLM_INLINE +ivec3s +glms_ivec3_(maxv)(ivec3s a, ivec3s b) { + ivec3s r; + glm_ivec3_maxv(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief set each member of dest to lesser of vector a and b + * + * @param[in] a first vector + * @param[in] b second vector + * @returns destination + */ +CGLM_INLINE +ivec3s +glms_ivec3_(minv)(ivec3s a, ivec3s b) { + ivec3s r; + glm_ivec3_minv(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief clamp each member of [v] between minVal and maxVal (inclusive) + * + * @param[in] v vector + * @param[in] minVal minimum value + * @param[in] maxVal maximum value + * @returns clamped vector + */ +CGLM_INLINE +ivec3s +glms_ivec3_(clamp)(ivec3s v, int minVal, int maxVal) { + glm_ivec3_clamp(v.raw, minVal, maxVal); + return v; +} + +/*! + * @brief absolute value of v + * + * @param[in] v vector + * @returns destination + */ +CGLM_INLINE +ivec3s +glms_ivec3_(abs)(ivec3s v) { + ivec3s r; + glm_ivec3_abs(v.raw, r.raw); + return r; +} + +#endif /* cglms_ivec3_h */ diff --git a/external/cglm/struct/ivec4.h b/external/cglm/struct/ivec4.h new file mode 100644 index 0000000..103e887 --- /dev/null +++ b/external/cglm/struct/ivec4.h @@ -0,0 +1,588 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLMS_IVEC4_ONE_INIT + GLMS_IVEC4_ZERO_INIT + GLMS_IVEC4_ONE + GLMS_IVEC4_ZERO + + Functions: + CGLM_INLINE ivec4s glms_ivec4(ivec3s v3, int last) + CGLM_INLINE void glms_ivec4_pack(ivec4s dst[], ivec4 src[], size_t len) + CGLM_INLINE void glms_ivec4_unpack(ivec4 dst[], ivec4s src[], size_t len) + CGLM_INLINE ivec4s glms_ivec4_zero(void) + CGLM_INLINE ivec4s glms_ivec4_one(void) + CGLM_INLINE ivec4s glms_ivec4_add(ivec4s a, ivec4s b) + CGLM_INLINE ivec4s glms_ivec4_adds(ivec4s v, int s) + CGLM_INLINE ivec4s glms_ivec4_sub(ivec4s a, ivec4s b) + CGLM_INLINE ivec4s glms_ivec4_subs(ivec4s v, int s) + CGLM_INLINE ivec4s glms_ivec4_mul(ivec4s a, ivec4s b) + CGLM_INLINE ivec4s glms_ivec4_scale(ivec4s v, int s) + CGLM_INLINE ivec4s glms_ivec4_addadd(ivec4s a, ivec4s b, ivec4s dest) + CGLM_INLINE ivec4s glms_ivec4_addadds(ivec4s a, int s, ivec4s dest) + CGLM_INLINE ivec4s glms_ivec4_subadd(ivec4s a, ivec4s b, ivec4s dest) + CGLM_INLINE ivec4s glms_ivec4_subadds(ivec4s a, int s, ivec4s dest) + CGLM_INLINE ivec4s glms_ivec4_muladd(ivec4s a, ivec4s b, ivec4s dest) + CGLM_INLINE ivec4s glms_ivec4_muladds(ivec4s a, int s, ivec4s dest) + CGLM_INLINE ivec4s glms_ivec4_maxadd(ivec4s a, ivec4s b, ivec4s dest) + CGLM_INLINE ivec4s glms_ivec4_minadd(ivec4s a, ivec4s b, ivec4s dest) + CGLM_INLINE ivec4s glms_ivec4_subsub(ivec4s a, ivec4s b, ivec4s dest) + CGLM_INLINE ivec4s glms_ivec4_subsubs(ivec4s a, int s, ivec4s dest) + CGLM_INLINE ivec4s glms_ivec4_addsub(ivec4s a, ivec4s b, ivec4s dest) + CGLM_INLINE ivec4s glms_ivec4_addsubs(ivec4s a, int s, ivec4s dest) + CGLM_INLINE ivec4s glms_ivec4_mulsub(ivec4s a, ivec4s b, ivec4s dest) + CGLM_INLINE ivec4s glms_ivec4_mulsubs(ivec4s a, int s, ivec4s dest) + CGLM_INLINE ivec4s glms_ivec4_maxsub(ivec4s a, ivec4s b, ivec4s dest) + CGLM_INLINE ivec4s glms_ivec4_minsub(ivec4s a, ivec4s b, ivec4s dest) + CGLM_INLINE int glms_ivec4_distance2(ivec4s a, ivec4s b) + CGLM_INLINE float glms_ivec4_distance(ivec4s a, ivec4s b) + CGLM_INLINE ivec4s glms_ivec4_maxv(ivec4s a, ivec4s b) + CGLM_INLINE ivec4s glms_ivec4_minv(ivec4s a, ivec4s b) + CGLM_INLINE ivec4s glms_ivec4_clamp(ivec4s v, int minVal, int maxVal) + CGLM_INLINE ivec4s glms_ivec4_abs(ivec4s v) + */ + +#ifndef cglms_ivec4_h +#define cglms_ivec4_h + +#include "../common.h" +#include "../types-struct.h" +#include "../ivec4.h" + +#define glms_ivec4_(NAME) CGLM_STRUCTAPI(ivec4, NAME) + +#define GLMS_IVEC4_ONE_INIT {GLM_IVEC4_ONE_INIT} +#define GLMS_IVEC4_ZERO_INIT {GLM_IVEC4_ZERO_INIT} + +#define GLMS_IVEC4_ONE ((ivec4s)GLMS_IVEC4_ONE_INIT) +#define GLMS_IVEC4_ZERO ((ivec4s)GLMS_IVEC4_ZERO_INIT) + +/*! + * @brief init ivec4 using ivec3 + * + * @param[in] v3 vector3 + * @param[in] last last item + * @returns destination + */ +CGLM_INLINE +ivec4s +glms_ivec4(ivec3s v3, int last) { + ivec4s r; + glm_ivec4(v3.raw, last, r.raw); + return r; +} + +/*! + * @brief pack an array of ivec4 into an array of ivec4s + * + * @param[out] dst array of ivec4s + * @param[in] src array of ivec4 + * @param[in] len number of elements + */ +CGLM_INLINE +void +glms_ivec4_(pack)(ivec4s dst[], ivec4 src[], size_t len) { + size_t i; + + for (i = 0; i < len; i++) { + glm_ivec4_copy(src[i], dst[i].raw); + } +} + +/*! 
+ * @brief unpack an array of ivec4s into an array of ivec4 + * + * @param[out] dst array of ivec4 + * @param[in] src array of ivec4s + * @param[in] len number of elements + */ +CGLM_INLINE +void +glms_ivec4_(unpack)(ivec4 dst[], ivec4s src[], size_t len) { + size_t i; + + for (i = 0; i < len; i++) { + glm_ivec4_copy(src[i].raw, dst[i]); + } +} + +/*! + * @brief set all members of [v] to zero + * + * @returns vector + */ +CGLM_INLINE +ivec4s +glms_ivec4_(zero)(void) { + ivec4s r; + glm_ivec4_zero(r.raw); + return r; +} + +/*! + * @brief set all members of [v] to one + * + * @returns vector + */ +CGLM_INLINE +ivec4s +glms_ivec4_(one)(void) { + ivec4s r; + glm_ivec4_one(r.raw); + return r; +} + +/*! + * @brief add vector [a] to vector [b] and store result in [dest] + * + * @param[in] a first vector + * @param[in] b second vector + * @returns destination + */ +CGLM_INLINE +ivec4s +glms_ivec4_(add)(ivec4s a, ivec4s b) { + ivec4s r; + glm_ivec4_add(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief add scalar s to vector [v] and store result in [dest] + * + * @param[in] v vector + * @param[in] s scalar + * @returns destination + */ +CGLM_INLINE +ivec4s +glms_ivec4_(adds)(ivec4s v, int s) { + ivec4s r; + glm_ivec4_adds(v.raw, s, r.raw); + return r; +} + +/*! + * @brief subtract vector [b] from vector [a] and store result in [dest] + * + * @param[in] a first vector + * @param[in] b second vector + * @returns destination + */ +CGLM_INLINE +ivec4s +glms_ivec4_(sub)(ivec4s a, ivec4s b) { + ivec4s r; + glm_ivec4_sub(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief subtract scalar s from vector [v] and store result in [dest] + * + * @param[in] v vector + * @param[in] s scalar + * @returns destination + */ +CGLM_INLINE +ivec4s +glms_ivec4_(subs)(ivec4s v, int s) { + ivec4s r; + glm_ivec4_subs(v.raw, s, r.raw); + return r; +} + +/*! + * @brief multiply vector [a] with vector [b] and store result in [dest] + * + * @param[in] a first vector + * @param[in] b second vector + * @returns destination + */ +CGLM_INLINE +ivec4s +glms_ivec4_(mul)(ivec4s a, ivec4s b) { + ivec4s r; + glm_ivec4_mul(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief multiply vector [a] with scalar s and store result in [dest] + * + * @param[in] v vector + * @param[in] s scalar + * @returns destination + */ +CGLM_INLINE +ivec4s +glms_ivec4_(scale)(ivec4s v, int s) { + ivec4s r; + glm_ivec4_scale(v.raw, s, r.raw); + return r; +} + +/*! + * @brief add vector [a] with vector [b] and add result to vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[in] dest dest += (a + b) + * @returns dest + */ +CGLM_INLINE +ivec4s +glms_ivec4_(addadd)(ivec4s a, ivec4s b, ivec4s dest) { + glm_ivec4_addadd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief add scalar [s] onto vector [a] and add result to vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[in] dest dest += (a + s) + * @returns dest + */ +CGLM_INLINE +ivec4s +glms_ivec4_(addadds)(ivec4s a, int s, ivec4s dest) { + glm_ivec4_addadds(a.raw, s, dest.raw); + return dest; +} + +/*! 
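A sketch of promoting an ivec3s to an ivec4s with glms_ivec4 and applying the component-wise helpers above; values are illustrative.

#include <cglm/struct.h>

void
example_ivec4_promote(void) {
  ivec3s v3 = {{7, 8, 9}};
  ivec4s v4 = glms_ivec4(v3, 1);              /* {7, 8, 9, 1} */

  ivec4s twice = glms_ivec4_scale(v4, 2);     /* {14, 16, 18, 2} */
  ivec4s sum   = glms_ivec4_add(v4, twice);   /* {21, 24, 27, 3} */
  (void)sum;
}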
+ * @brief subtract vector [a] from vector [b] and add result to [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[in] dest dest += (a - b) + * @returns dest + */ +CGLM_INLINE +ivec4s +glms_ivec4_(subadd)(ivec4s a, ivec4s b, ivec4s dest) { + glm_ivec4_subadd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief subtract scalar [s] from vector [a] and add result to [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first + * @param[in] s scalar + * @param[in] dest dest += (a - s) + * @returns dest + */ +CGLM_INLINE +ivec4s +glms_ivec4_(subadds)(ivec4s a, int s, ivec4s dest) { + glm_ivec4_subadds(a.raw, s, dest.raw); + return dest; +} + +/*! + * @brief multiply vector [a] with vector [b] and add result to [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[in] dest dest += (a * b) + * @returns dest + */ +CGLM_INLINE +ivec4s +glms_ivec4_(muladd)(ivec4s a, ivec4s b, ivec4s dest) { + glm_ivec4_muladd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief multiply vector [a] with scalar [s] and add result to [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[in] dest dest += (a * s) + * @returns dest + */ +CGLM_INLINE +ivec4s +glms_ivec4_(muladds)(ivec4s a, int s, ivec4s dest) { + glm_ivec4_muladds(a.raw, s, dest.raw); + return dest; +} + +/*! + * @brief add maximum of vector [a] and vector [b] to vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[in] dest dest += max(a, b) + * @returns dest + */ +CGLM_INLINE +ivec4s +glms_ivec4_(maxadd)(ivec4s a, ivec4s b, ivec4s dest) { + glm_ivec4_maxadd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief add minimum of vector [a] and vector [b] to vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[in] dest dest += min(a, b) + * @returns dest + */ +CGLM_INLINE +ivec4s +glms_ivec4_(minadd)(ivec4s a, ivec4s b, ivec4s dest) { + glm_ivec4_minadd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief subtract vector [a] from vector [b] and subtract result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[in] dest dest -= (a - b) + * @returns dest + */ +CGLM_INLINE +ivec4s +glms_ivec4_(subsub)(ivec4s a, ivec4s b, ivec4s dest) { + glm_ivec4_subsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief subtract scalar [s] from vector [a] and subtract result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[in] dest dest -= (a - s) + * @returns dest + */ +CGLM_INLINE +ivec4s +glms_ivec4_(subsubs)(ivec4s a, int s, ivec4s dest) { + glm_ivec4_subsubs(a.raw, s, dest.raw); + return dest; +} + +/*! + * @brief add vector [a] to vector [b] and subtract the result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] b scalar + * @param[in] dest dest -= (a + b) + * @returns dest + */ +CGLM_INLINE +ivec4s +glms_ivec4_(addsub)(ivec4s a, ivec4s b, ivec4s dest) { + glm_ivec4_addsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! 
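+ * The *add helpers above accumulate into an already initialized destination;
+ * an illustrative sketch (values are placeholders):
+ *
+ * @code
+ * ivec4s a   = {{1, 2, 3, 4}};
+ * ivec4s b   = {{2, 2, 2, 2}};
+ * ivec4s acc = GLMS_IVEC4_ZERO;
+ *
+ * acc = glms_ivec4_muladd(a, b, acc);  // acc += a * b     -> {2, 4, 6, 8}
+ * acc = glms_ivec4_maxadd(a, b, acc);  // acc += max(a, b) -> {4, 6, 9, 12}
+ * @endcode
+ *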
+ * @brief add scalar [s] to vector [a] and subtract the result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[in] dest dest -= (a + b) + * @returns dest + */ +CGLM_INLINE +ivec4s +glms_ivec4_(addsubs)(ivec4s a, int s, ivec4s dest) { + glm_ivec4_addsubs(a.raw, s, dest.raw); + return dest; +} + +/*! + * @brief multiply vector [a] and vector [b] and subtract the result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] b scalar + * @param[in] dest dest -= (a * b) + * @returns dest + */ +CGLM_INLINE +ivec4s +glms_ivec4_(mulsub)(ivec4s a, ivec4s b, ivec4s dest) { + glm_ivec4_mulsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief multiply vector [a] with scalar [s] and subtract the result from [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[in] dest dest -= (a * s) + * @returns dest + */ +CGLM_INLINE +ivec4s +glms_ivec4_(mulsubs)(ivec4s a, int s, ivec4s dest) { + glm_ivec4_mulsubs(a.raw, s, dest.raw); + return dest; +} + +/*! + * @brief subtract maximum of vector [a] and vector [b] from vector [dest] + * + * applies += operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[in] dest dest -= max(a, b) + * @returns dest + */ +CGLM_INLINE +ivec4s +glms_ivec4_(maxsub)(ivec4s a, ivec4s b, ivec4s dest) { + glm_ivec4_maxsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief subtract minimum of vector [a] and vector [b] from vector [dest] + * + * applies -= operator so dest must be initialized + * + * @param[in] a first vector + * @param[in] b second vector + * @param[in] dest dest -= min(a, b) + * @returns dest + */ +CGLM_INLINE +ivec4s +glms_ivec4_(minsub)(ivec4s a, ivec4s b, ivec4s dest) { + glm_ivec4_minsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief squared distance between two vectors + * + * @param[in] a first vector + * @param[in] b second vector + * @return returns squared distance (distance * distance) + */ +CGLM_INLINE +int +glms_ivec4_(distance2)(ivec4s a, ivec4s b) { + return glm_ivec4_distance2(a.raw, b.raw); +} + +/*! + * @brief distance between two vectors + * + * @param[in] a first vector + * @param[in] b second vector + * @return returns distance + */ +CGLM_INLINE +float +glms_ivec4_(distance)(ivec4s a, ivec4s b) { + return glm_ivec4_distance(a.raw, b.raw); +} + +/*! + * @brief set each member of dest to greater of vector a and b + * + * @param[in] a first vector + * @param[in] b second vector + * @returns destination + */ +CGLM_INLINE +ivec4s +glms_ivec4_(maxv)(ivec4s a, ivec4s b) { + ivec4s r; + glm_ivec4_maxv(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief set each member of dest to lesser of vector a and b + * + * @param[in] a first vector + * @param[in] b second vector + * @returns destination + */ +CGLM_INLINE +ivec4s +glms_ivec4_(minv)(ivec4s a, ivec4s b) { + ivec4s r; + glm_ivec4_minv(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief clamp each member of [v] between minVal and maxVal (inclusive) + * + * @param[in] v vector + * @param[in] minVal minimum value + * @param[in] maxVal maximum value + * @returns clamped vector + */ +CGLM_INLINE +ivec4s +glms_ivec4_(clamp)(ivec4s v, int minVal, int maxVal) { + glm_ivec4_clamp(v.raw, minVal, maxVal); + return v; +} + +/*! 
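+ * For instance, clamping a vector and measuring how far it moved (values are
+ * made up; the squared distance stays exact in integers):
+ *
+ * @code
+ * ivec4s v  = {{-3, 0, 7, 12}};
+ * ivec4s c  = glms_ivec4_clamp(v, 0, 10);   // {0, 0, 7, 10}
+ * int    d2 = glms_ivec4_distance2(v, c);   // 9 + 0 + 0 + 4 = 13
+ * @endcode
+ *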
+ * @brief absolute value of v + * + * @param[in] v vector + * @returns destination + */ +CGLM_INLINE +ivec4s +glms_ivec4_(abs)(ivec4s v) { + ivec4s r; + glm_ivec4_abs(v.raw, r.raw); + return r; +} + +#endif /* cglms_ivec4_h */ diff --git a/external/cglm/struct/mat2.h b/external/cglm/struct/mat2.h new file mode 100644 index 0000000..915c1be --- /dev/null +++ b/external/cglm/struct/mat2.h @@ -0,0 +1,274 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLM_MAT2_IDENTITY_INIT + GLM_MAT2_ZERO_INIT + GLM_MAT2_IDENTITY + GLM_MAT2_ZERO + + Functions: + CGLM_INLINE mat2s glms_mat2_make(const float * __restrict src); + CGLM_INLINE mat2s glms_mat2_identity(void) + CGLM_INLINE void glms_mat2_identity_array(mat2 * restrict mats, size_t count) + CGLM_INLINE mat2s glms_mat2_zero(void) + CGLM_INLINE mat2s glms_mat2_mul(mat2 m1, mat2 m2) + CGLM_INLINE vec2s glms_mat2_mulv(mat2 m, vec2 v) + CGLM_INLINE mat2s glms_mat2_transpose(mat2 m) + CGLM_INLINE mat2s glms_mat2_scale(mat2 m, float s) + CGLM_INLINE mat2s glms_mat2_inv(mat2 m) + CGLM_INLINE mat2s glms_mat2_swap_col(mat2 mat, int col1, int col2) + CGLM_INLINE mat2s glms_mat2_swap_row(mat2 mat, int row1, int row2) + CGLM_INLINE float glms_mat2_det(mat2 m) + CGLM_INLINE float glms_mat2_trace(mat2 m) + CGLM_INLINE float glms_mat2_rmc(vec2 r, mat2 m, vec2 c) + */ + +#ifndef cglms_mat2_h +#define cglms_mat2_h + +#include "../common.h" +#include "../types-struct.h" +#include "../mat2.h" + +/* api definition */ +#define glms_mat2_(NAME) CGLM_STRUCTAPI(mat2, NAME) + +#define GLMS_MAT2_IDENTITY_INIT {GLM_MAT2_IDENTITY_INIT} +#define GLMS_MAT2_ZERO_INIT {GLM_MAT2_ZERO_INIT} + +/* for C only */ +#define GLMS_MAT2_IDENTITY ((mat2s)GLMS_MAT2_IDENTITY_INIT) +#define GLMS_MAT2_ZERO ((mat2s)GLMS_MAT2_ZERO_INIT) + +/*! + * @brief Returns mat2s (r) from pointer (src). + * + * @param[in] src pointer to an array of floats + * @return[out] r constructed mat2s from raw pointer + */ +CGLM_INLINE +mat2s +glms_mat2_(make)(const float * __restrict src) { + mat2s r; + glm_mat2_make(src, r.raw); + return r; +} + +/*! + * @brief Return a identity mat2s (r). + * + * The same thing may be achieved with either of bellow methods, + * but it is more easy to do that with this func especially for members + * e.g. glm_mat2_identity(aStruct->aMatrix); + * + * @code + * glm_mat2_copy(GLM_MAT2_IDENTITY, mat); // C only + * + * // or + * mat2 mat = GLM_MAT2_IDENTITY_INIT; + * @endcode + * + * @return[out] r constructed mat2s from raw pointer + */ +CGLM_INLINE +mat2s +glms_mat2_(identity)(void) { + mat2s r; + glm_mat2_identity(r.raw); + return r; +} + +/*! + * @brief Given an array of mat2s’s (mats) make each matrix an identity matrix. + * + * @param[in, out] mats Array of mat2s’s (must be aligned (16/32) if alignment is not disabled) + * @param[in] count Array size of mats or number of matrices + */ +CGLM_INLINE +void +glms_mat2_(identity_array)(mat2s * __restrict mats, size_t count) { + CGLM_ALIGN_MAT mat2s t = GLMS_MAT2_IDENTITY_INIT; + size_t i; + + for (i = 0; i < count; i++) { + glm_mat2_copy(t.raw, mats[i].raw); + } +} + +/*! + * @brief Return zero'd out mat2 (r). + * + * @return[out] r constructed mat2s from raw pointer + */ +CGLM_INLINE +mat2s +glms_mat2_(zero)(void) { + mat2s r; + glm_mat2_zero(r.raw); + return r; +} + +/*! 
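+ * Typical struct-api use of the constructors above (the float values are an
+ * arbitrary example; as elsewhere in cglm the array fills the matrix column
+ * by column):
+ *
+ * @code
+ * float src[4] = {1.0f, 2.0f, 3.0f, 4.0f};
+ *
+ * mat2s m = glms_mat2_make(src);       // column 0 = (1, 2), column 1 = (3, 4)
+ * mat2s i = glms_mat2_identity();
+ * @endcode
+ *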
+ * @brief Multiply mat2 (m1) by mat2 (m2) and return in mat2s (r) + * + * m1 and m2 matrices can be the same matrix, it is possible to write this: + * + * @code + * mat2 m = GLM_MAT2_IDENTITY_INIT; + * mat2s r = glms_mat2_mul(m, m); + * @endcode + * + * @param[in] m1 mat2s (left) + * @param[in] m2 mat2s (right) + * @return[out] r constructed mat2s from raw pointers + */ +CGLM_INLINE +mat2s +glms_mat2_(mul)(mat2s m1, mat2s m2) { + mat2s r; + glm_mat2_mul(m1.raw, m2.raw, r.raw); + return r; +} + +/* + * @brief Multiply mat2s (m) by vec2s (v) and return in vec2s (r). + * + * @param[in] m mat2s (left) + * @param[in] v vec2s (right, column vector) + * @return[out] r constructed vec2s from raw pointers + */ +CGLM_INLINE +vec2s +glms_mat2_(mulv)(mat2s m, vec2s v) { + vec2s r; + glm_mat2_mulv(m.raw, v.raw, r.raw); + return r; +} + +/*! + * @brief Transpose mat2s (m) and store result in the same matrix. + * + * @param[in] m mat2s (src) + * @return[out] m constructed mat2s from raw pointers + */ +CGLM_INLINE +mat2s +glms_mat2_(transpose)(mat2s m) { + glm_mat2_transpose(m.raw); + return m; +} + +/*! + * @brief Multiply mat2s (m) by scalar constant (s) + * + * @param[in] m mat2s (src) + * @param[in] s scalar value + * @return[out] m constructed mat2s from raw pointers + */ +CGLM_INLINE +mat2s +glms_mat2_(scale)(mat2s m, float s) { + glm_mat2_scale(m.raw, s); + return m; +} + +/*! + * @brief Inverse mat2s (m) and return in mat2s (r). + * + * @param[in] m mat2s (left, src) + * @return[out] r constructed mat2s from raw pointers + */ +CGLM_INLINE +mat2s +glms_mat2_(inv)(mat2s m) { + mat2s r; + glm_mat2_inv(m.raw, r.raw); + return r; +} + +/*! + * @brief Swap two columns in mat2s (mat) and store in same matrix. + * + * @param[in] mat mat2s + * @param[in] col1 column 1 array index + * @param[in] col2 column 2 array index + * @return[out] mat constructed mat2s from raw pointers columns swapped + */ +CGLM_INLINE +mat2s +glms_mat2_(swap_col)(mat2s mat, int col1, int col2) { + glm_mat2_swap_col(mat.raw, col1, col2); + return mat; +} + +/*! + * @brief Swap two rows in mat2s (mat) and store in same matrix. + * + * @param[in] mat mat2s + * @param[in] row1 row 1 array index + * @param[in] row2 row 2 array index + * @return[out] mat constructed mat2s from raw pointers rows swapped + */ +CGLM_INLINE +mat2s +glms_mat2_(swap_row)(mat2s mat, int row1, int row2) { + glm_mat2_swap_row(mat.raw, row1, row2); + return mat; +} + +/*! + * @brief Returns mat2 determinant. + * + * @param[in] m mat2 (src) + * + * @return[out] mat2s raw pointers determinant (float) + */ +CGLM_INLINE +float +glms_mat2_(det)(mat2s m) { + return glm_mat2_det(m.raw); +} + +/*! + * @brief Returns trace of matrix. Which is: + * + * The sum of the elements on the main diagonal from + * upper left corner to the bottom right corner. + * + * @param[in] m mat2 (m) + * + * @return[out] mat2s raw pointers trace (float) + */ +CGLM_INLINE +float +glms_mat2_(trace)(mat2s m) { + return glm_mat2_trace(m.raw); +} + +/*! + * @brief Helper for R (row vector) * M (matrix) * C (column vector) + * + * rmc stands for Row * Matrix * Column + * + * the result is scalar because M * C = ResC (1x2, column vector), + * then if you take the dot_product(R (2x1), ResC (1x2)) = scalar value. 
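+ *
+ * As a concrete (illustrative) check: with r = (1, 0), m = identity and
+ * c = (0, 1), M * C is (0, 1) and dot(r, M * C) = 0.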
+ * + * @param[in] r vec2s (2x1, row vector) + * @param[in] m mat2s (2x2, matrix) + * @param[in] c vec2s (1x2, column vector) + * + * @return[out] Scalar value (float, 1x1) + */ +CGLM_INLINE +float +glms_mat2_(rmc)(vec2s r, mat2s m, vec2s c) { + return glm_mat2_rmc(r.raw, m.raw, c.raw); +} + +#endif /* cglms_mat2_h */ diff --git a/external/cglm/struct/mat2x3.h b/external/cglm/struct/mat2x3.h new file mode 100644 index 0000000..5b061ba --- /dev/null +++ b/external/cglm/struct/mat2x3.h @@ -0,0 +1,125 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLMS_MAT2X3_ZERO_INIT + GLMS_MAT2X3_ZERO + + Functions: + CGLM_INLINE mat2x3s glms_mat2x3_zero(void); + CGLM_INLINE mat2x3s glms_mat2x3_make(const float * __restrict src); + CGLM_INLINE mat2s glms_mat2x3_mul(mat2x3s m1, mat3x2s m2); + CGLM_INLINE vec3s glms_mat2x3_mulv(mat2x3s m, vec2s v); + CGLM_INLINE mat3x2s glms_mat2x3_transpose(mat2x3s m); + CGLM_INLINE mat2x3s glms_mat2x3_scale(mat2x3s m, float s); + */ + +#ifndef cglms_mat2x3_h +#define cglms_mat2x3_h + +#include "../common.h" +#include "../types-struct.h" +#include "../mat2x3.h" + +/* api definition */ +#define glms_mat2x3_(NAME) CGLM_STRUCTAPI(mat2x3, NAME) + +#define GLMS_MAT2X3_ZERO_INIT {GLM_MAT2X3_ZERO_INIT} + +/* for C only */ +#define GLMS_MAT2X3_ZERO ((mat2x3s)GLMS_MAT2X3_ZERO_INIT) + +/*! + * @brief Zero out the mat2x3s (dest). + * + * @return[out] dest constructed mat2x3s from raw pointer + */ +CGLM_INLINE +mat2x3s +glms_mat2x3_(zero)(void) { + mat2x3s dest; + glm_mat2x3_zero(dest.raw); + return dest; +} + +/*! + * @brief Create mat2x3s (dest) from pointer (src). + * + * @param[in] src pointer to an array of floats + * @return[out] dest constructed mat2x3s from raw pointer + */ +CGLM_INLINE +mat2x3s +glms_mat2x3_(make)(const float * __restrict src) { + mat2x3s dest; + glm_mat2x3_make(src, dest.raw); + return dest; +} + +/*! + * @brief Multiply mat2x3s (m1) by mat3x2s (m2) and store in mat3s (dest). + * + * @code + * r = glms_mat2x3_mul(mat2x3s, mat3x2s); + * @endcode + * + * @param[in] m1 mat2x3s (left) + * @param[in] m2 mat3x2s (right) + * @return[out] dest constructed mat3s from raw pointers + */ +CGLM_INLINE +mat3s +glms_mat2x3_(mul)(mat2x3s m1, mat3x2s m2) { + mat3s dest; + glm_mat2x3_mul(m1.raw, m2.raw, dest.raw); + return dest; +} + +/*! + * @brief Multiply mat2x3s (m) by vec2s (v) and store in vec3s (dest). + * + * @param[in] m mat2x3s (left) + * @param[in] v vec2s (right, column vector) + * @return[out] dest constructed vec3s from raw pointers + */ +CGLM_INLINE +vec3s +glms_mat2x3_(mulv)(mat2x3s m, vec2s v) { + vec3s dest; + glm_mat2x3_mulv(m.raw, v.raw, dest.raw); + return dest; +} + +/*! + * @brief Transpose mat2x3s (m) and store in mat3x2s (dest). + * + * @param[in] m mat2x3s (left) + * @return[out] dest constructed mat3x2s from raw pointers + */ +CGLM_INLINE +mat3x2s +glms_mat2x3_(transpose)(mat2x3s m) { + mat3x2s dest; + glm_mat2x3_transpose(m.raw, dest.raw); + return dest; +} + +/*! + * @brief Multiply mat2x3s (m) by scalar constant (s). 
+ * + * @param[in, out] m mat2x3 (src, dest) + * @param[in] s float (scalar) + */ +CGLM_INLINE +mat2x3s +glms_mat2x3_(scale)(mat2x3s m, float s) { + glm_mat2x3_scale(m.raw, s); + return m; +} + +#endif /* cglms_mat2x3_h */ diff --git a/external/cglm/struct/mat2x4.h b/external/cglm/struct/mat2x4.h new file mode 100644 index 0000000..7e3e75a --- /dev/null +++ b/external/cglm/struct/mat2x4.h @@ -0,0 +1,125 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLMS_MAT2X4_ZERO_INIT + GLMS_MAT2X4_ZERO + + Functions: + CGLM_INLINE mat2x4s glms_mat2x4_zero(void); + CGLM_INLINE mat2x4s glms_mat2x4_make(const float * __restrict src); + CGLM_INLINE mat2s glms_mat2x4_mul(mat2x4s m1, mat4x2s m2); + CGLM_INLINE vec4s glms_mat2x4_mulv(mat2x4s m, vec2s v); + CGLM_INLINE mat4x2s glms_mat2x4_transpose(mat2x4s m); + CGLM_INLINE mat2x4s glms_mat2x4_scale(mat2x4s m, float s); + */ + +#ifndef cglms_mat2x4_h +#define cglms_mat2x4_h + +#include "../common.h" +#include "../types-struct.h" +#include "../mat2x4.h" + +/* api definition */ +#define glms_mat2x4_(NAME) CGLM_STRUCTAPI(mat2x4, NAME) + +#define GLMS_MAT2X4_ZERO_INIT {GLM_MAT2X4_ZERO_INIT} + +/* for C only */ +#define GLMS_MAT2X4_ZERO ((mat2x4s)GLMS_MAT2X4_ZERO_INIT) + +/*! + * @brief Zero out the mat2x4s (dest). + * + * @return[out] dest constructed mat2x4s from raw pointer + */ +CGLM_INLINE +mat2x4s +glms_mat2x4_(zero)(void) { + mat2x4s dest; + glm_mat2x4_zero(dest.raw); + return dest; +} + +/*! + * @brief Create mat2x4s (dest) from pointer (src). + * + * @param[in] src pointer to an array of floats + * @return[out] dest constructed mat2x4s from raw pointer + */ +CGLM_INLINE +mat2x4s +glms_mat2x4_(make)(const float * __restrict src) { + mat2x4s dest; + glm_mat2x4_make(src, dest.raw); + return dest; +} + +/*! + * @brief Multiply mat2x4s (m1) by mat4x2s (m2) and store in mat4s (dest). + * + * @code + * r = glms_mat2x4_mul(mat2x4s, mat4x2s); + * @endcode + * + * @param[in] m1 mat2x4s (left) + * @param[in] m2 mat4x2s (right) + * @return[out] dest constructed mat4s from raw pointers + */ +CGLM_INLINE +mat4s +glms_mat2x4_(mul)(mat2x4s m1, mat4x2s m2) { + mat4s dest; + glm_mat2x4_mul(m1.raw, m2.raw, dest.raw); + return dest; +} + +/*! + * @brief Multiply mat2x4s (m) by vec2s (v) and store in vec4s (dest). + * + * @param[in] m mat2x4s (left) + * @param[in] v vec2s (right, column vector) + * @return[out] dest constructed vec4s from raw pointers + */ +CGLM_INLINE +vec4s +glms_mat2x4_(mulv)(mat2x4s m, vec2s v) { + vec4s dest; + glm_mat2x4_mulv(m.raw, v.raw, dest.raw); + return dest; +} + +/*! + * @brief Transpose mat2x4s (m) and store in mat4x2s (dest). + * + * @param[in] m mat2x4s (left) + * @return[out] dest constructed mat4x2s from raw pointers + */ +CGLM_INLINE +mat4x2s +glms_mat2x4_(transpose)(mat2x4s m) { + mat4x2s dest; + glm_mat2x4_transpose(m.raw, dest.raw); + return dest; +} + +/*! + * @brief Multiply mat2x4s (m) by scalar constant (s). + * + * @param[in, out] m mat2x4s (src, dest) + * @param[in] s float (scalar) + */ +CGLM_INLINE +mat2x4s +glms_mat2x4_(scale)(mat2x4s m, float s) { + glm_mat2x4_scale(m.raw, s); + return m; +} + +#endif /* cglms_mat2x4_h */ diff --git a/external/cglm/struct/mat3.h b/external/cglm/struct/mat3.h new file mode 100644 index 0000000..2fae073 --- /dev/null +++ b/external/cglm/struct/mat3.h @@ -0,0 +1,322 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLMS_MAT3_IDENTITY_INIT + GLMS_MAT3_ZERO_INIT + GLMS_MAT3_IDENTITY + GLMS_MAT3_ZERO + + Functions: + CGLM_INLINE mat3s glms_mat3_copy(mat3s mat); + CGLM_INLINE mat3s glms_mat3_identity(void); + CGLM_INLINE void glms_mat3_identity_array(mat3s * __restrict mat, size_t count); + CGLM_INLINE mat3s glms_mat3_zero(void); + CGLM_INLINE mat3s glms_mat3_mul(mat3s m1, mat3s m2); + CGLM_INLINE ma3s glms_mat3_transpose(mat3s m); + CGLM_INLINE vec3s glms_mat3_mulv(mat3s m, vec3s v); + CGLM_INLINE float glms_mat3_trace(mat3s m); + CGLM_INLINE versor glms_mat3_quat(mat3s m); + CGLM_INLINE mat3s glms_mat3_scale(mat3s m, float s); + CGLM_INLINE float glms_mat3_det(mat3s mat); + CGLM_INLINE mat3s glms_mat3_inv(mat3s mat); + CGLM_INLINE mat3s glms_mat3_swap_col(mat3s mat, int col1, int col2); + CGLM_INLINE mat3s glms_mat3_swap_row(mat3s mat, int row1, int row2); + CGLM_INLINE float glms_mat3_rmc(vec3s r, mat3s m, vec3s c); + CGLM_INLINE mat3s glms_mat3_make(const float * __restrict src); + CGLM_INLINE mat3s glms_mat3_textrans(float sx, float sy, float rot, float tx, float ty); + */ + +#ifndef cglms_mat3s_h +#define cglms_mat3s_h + +#include "../common.h" +#include "../types-struct.h" +#include "../mat3.h" +#include "vec3.h" + +/* api definition */ +#define glms_mat3_(NAME) CGLM_STRUCTAPI(mat3, NAME) + +#define GLMS_MAT3_IDENTITY_INIT {GLM_MAT3_IDENTITY_INIT} +#define GLMS_MAT3_ZERO_INIT {GLM_MAT3_ZERO_INIT} + +/* for C only */ +#define GLMS_MAT3_IDENTITY ((mat3s)GLMS_MAT3_IDENTITY_INIT) +#define GLMS_MAT3_ZERO ((mat3s)GLMS_MAT3_ZERO_INIT) + +/*! + * @brief copy all members of [mat] to [dest] + * + * @param[in] mat source + * @returns destination + */ +CGLM_INLINE +mat3s +glms_mat3_(copy)(mat3s mat) { + mat3s r; + glm_mat3_copy(mat.raw, r.raw); + return r; +} + +/*! + * @brief make given matrix identity. It is identical with below, + * but it is more easy to do that with this func especially for members + * e.g. glm_mat3_identity(aStruct->aMatrix); + * + * @code + * glm_mat3_copy(GLM_MAT3_IDENTITY, mat); // C only + * + * // or + * mat3 mat = GLM_MAT3_IDENTITY_INIT; + * @endcode + * + * @returns destination + */ +CGLM_INLINE +mat3s +glms_mat3_(identity)(void) { + mat3s r; + glm_mat3_identity(r.raw); + return r; +} + +/*! + * @brief make given matrix array's each element identity matrix + * + * @param[in, out] mat matrix array (must be aligned (16/32) + * if alignment is not disabled) + * + * @param[in] count count of matrices + */ +CGLM_INLINE +void +glms_mat3_(identity_array)(mat3s * __restrict mat, size_t count) { + CGLM_ALIGN_MAT mat3s t = GLMS_MAT3_IDENTITY_INIT; + size_t i; + + for (i = 0; i < count; i++) { + glm_mat3_copy(t.raw, mat[i].raw); + } +} + +/*! + * @brief make given matrix zero. + * + * @returns matrix + */ +CGLM_INLINE +mat3s +glms_mat3_(zero)(void) { + mat3s r; + glm_mat3_zero(r.raw); + return r; +} + +/*! + * @brief multiply m1 and m2 to dest + * + * m1, m2 and dest matrices can be same matrix, it is possible to write this: + * + * @code + * mat3 m = GLM_MAT3_IDENTITY_INIT; + * r = glms_mat3_mul(m, m); + * @endcode + * + * @param[in] m1 left matrix + * @param[in] m2 right matrix + * @returns destination matrix + */ +CGLM_INLINE +mat3s +glms_mat3_(mul)(mat3s m1, mat3s m2) { + mat3s r; + glm_mat3_mul(m1.raw, m2.raw, r.raw); + return r; +} + +/*! 
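+ * A short illustrative composition using the helpers above (identity values
+ * keep the example trivial; any mat3s works the same way):
+ *
+ * @code
+ * mat3s a = GLMS_MAT3_IDENTITY;
+ * mat3s b = GLMS_MAT3_IDENTITY;
+ * mat3s c = glms_mat3_mul(a, b);       // c = a * b
+ *
+ * CGLM_ALIGN_MAT mat3s mats[4];        // aligned, as identity_array expects
+ * glms_mat3_identity_array(mats, 4);   // reset a whole array
+ * @endcode
+ *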
+ * @brief transpose mat3 and store result in same matrix + * + * @param[in, out] m source and dest + */ +CGLM_INLINE +mat3s +glms_mat3_(transpose)(mat3s m) { + glm_mat3_transpose(m.raw); + return m; +} + +/*! + * @brief multiply mat3 with vec3 (column vector) and store in dest vector + * + * @param[in] m mat3 (left) + * @param[in] v vec3 (right, column vector) + * @returns vec3 (result, column vector) + */ +CGLM_INLINE +vec3s +glms_mat3_(mulv)(mat3s m, vec3s v) { + vec3s r; + glm_mat3_mulv(m.raw, v.raw, r.raw); + return r; +} + +/*! + * @brief trace of matrix + * + * sum of the elements on the main diagonal from upper left to the lower right + * + * @param[in] m matrix + */ +CGLM_INLINE +float +glms_mat3_(trace)(mat3s m) { + return glm_mat3_trace(m.raw); +} + +/*! + * @brief convert mat3 to quaternion + * + * @param[in] m rotation matrix + * @returns destination quaternion + */ +CGLM_INLINE +versors +glms_mat3_(quat)(mat3s m) { + versors r; + glm_mat3_quat(m.raw, r.raw); + return r; +} + +/*! + * @brief scale (multiply with scalar) matrix + * + * multiply matrix with scalar + * + * @param[in] m matrix + * @param[in] s scalar + * @returns scaled matrix + */ +CGLM_INLINE +mat3s +glms_mat3_(scale)(mat3s m, float s) { + glm_mat3_scale(m.raw, s); + return m; +} + +/*! + * @brief mat3 determinant + * + * @param[in] mat matrix + * + * @return determinant + */ +CGLM_INLINE +float +glms_mat3_(det)(mat3s mat) { + return glm_mat3_det(mat.raw); +} + +/*! + * @brief inverse mat3 and store in dest + * + * @param[in] mat matrix + * @returns inverse matrix + */ +CGLM_INLINE +mat3s +glms_mat3_(inv)(mat3s mat) { + mat3s r; + glm_mat3_inv(mat.raw, r.raw); + return r; +} + +/*! + * @brief swap two matrix columns + * + * @param[in] mat matrix + * @param[in] col1 col1 + * @param[in] col2 col2 + * @returns matrix + */ +CGLM_INLINE +mat3s +glms_mat3_(swap_col)(mat3s mat, int col1, int col2) { + glm_mat3_swap_col(mat.raw, col1, col2); + return mat; +} + +/*! + * @brief swap two matrix rows + * + * @param[in] mat matrix + * @param[in] row1 row1 + * @param[in] row2 row2 + * @returns matrix + */ +CGLM_INLINE +mat3s +glms_mat3_(swap_row)(mat3s mat, int row1, int row2) { + glm_mat3_swap_row(mat.raw, row1, row2); + return mat; +} + +/*! + * @brief helper for R (row vector) * M (matrix) * C (column vector) + * + * rmc stands for Row * Matrix * Column + * + * the result is scalar because R * M = Matrix1x3 (row vector), + * then Matrix1x3 * Vec3 (column vector) = Matrix1x1 (Scalar) + * + * @param[in] r row vector or matrix1x3 + * @param[in] m matrix3x3 + * @param[in] c column vector or matrix3x1 + * + * @return scalar value e.g. Matrix1x1 + */ +CGLM_INLINE +float +glms_mat3_(rmc)(vec3s r, mat3s m, vec3s c) { + return glm_mat3_rmc(r.raw, m.raw, c.raw); +} + +/*! + * @brief Create mat3 matrix from pointer + * + * @param[in] src pointer to an array of floats + * @return constructed matrix from raw pointer + */ +CGLM_INLINE +mat3s +glms_mat3_(make)(const float * __restrict src) { + mat3s r; + glm_mat3_make(src, r.raw); + return r; +} + +/*! 
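+ * For example, building a matrix from 9 contiguous floats and applying it to
+ * a column vector (the data is a made-up uniform scale by 2):
+ *
+ * @code
+ * float data[9] = {2, 0, 0,  0, 2, 0,  0, 0, 2};
+ *
+ * mat3s m = glms_mat3_make(data);
+ * vec3s v = {{1.0f, 2.0f, 3.0f}};
+ * vec3s r = glms_mat3_mulv(m, v);      // (2, 4, 6)
+ * @endcode
+ *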
+ * @brief Create mat3 matrix from texture transform parameters + * + * @param[in] sx scale x + * @param[in] sy scale y + * @param[in] rot rotation in radians CCW/RH + * @param[in] tx translate x + * @param[in] ty translate y + * @return texture transform matrix + */ +CGLM_INLINE +mat3s +glms_mat3_(textrans)(float sx, float sy, float rot, float tx, float ty) { + mat3s r; + glm_mat3_textrans(sx, sy, rot, tx, ty, r.raw); + return r; +} + +#endif /* cglms_mat3s_h */ diff --git a/external/cglm/struct/mat3x2.h b/external/cglm/struct/mat3x2.h new file mode 100644 index 0000000..ab2d388 --- /dev/null +++ b/external/cglm/struct/mat3x2.h @@ -0,0 +1,125 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLMS_MAT3X2_ZERO_INIT + GLMS_MAT3X2_ZERO + + Functions: + CGLM_INLINE mat3x2s glms_mat3x2_zero(void); + CGLM_INLINE mat3x2s glms_mat3x2_make(const float * __restrict src); + CGLM_INLINE mat2s glms_mat3x2_mul(mat3x2s m1, mat2x3s m2); + CGLM_INLINE vec2s glms_mat3x2_mulv(mat3x2s m, vec3s v); + CGLM_INLINE mat2x3s glms_mat3x2_transpose(mat3x2s m); + CGLM_INLINE mat3x2s glms_mat3x2_scale(mat3x2s m, float s); + */ + +#ifndef cglms_mat3x2_h +#define cglms_mat3x2_h + +#include "../common.h" +#include "../types-struct.h" +#include "../mat3x2.h" + +/* api definition */ +#define glms_mat3x2_(NAME) CGLM_STRUCTAPI(mat3x2, NAME) + +#define GLMS_MAT3X2_ZERO_INIT {GLM_MAT3X2_ZERO_INIT} + +/* for C only */ +#define GLMS_MAT3X2_ZERO ((mat3x2s)GLMS_MAT3X2_ZERO_INIT) + +/*! + * @brief Zero out the mat3x2s (dest). + * + * @return[out] dest constructed mat3x2s from raw pointer + */ +CGLM_INLINE +mat3x2s +glms_mat3x2_(zero)(void) { + mat3x2s dest; + glm_mat3x2_zero(dest.raw); + return dest; +} + +/*! + * @brief Create mat3x2s (dest) from pointer (src). + * + * @param[in] src pointer to an array of floats + * @return[out] dest constructed mat3x2s from raw pointer + */ +CGLM_INLINE +mat3x2s +glms_mat3x2_(make)(const float * __restrict src) { + mat3x2s dest; + glm_mat3x2_make(src, dest.raw); + return dest; +} + +/*! + * @brief Multiply mat3x2s (m1) by mat2x3s (m2) and store in mat2s (dest). + * + * @code + * r = glms_mat3x2_mul(mat3x2s, mat2x3s); + * @endcode + * + * @param[in] m1 mat3x2s (left) + * @param[in] m2 mat2x3s (right) + * @return[out] dest constructed mat2s from raw pointers + */ +CGLM_INLINE +mat2s +glms_mat3x2_(mul)(mat3x2s m1, mat2x3s m2) { + mat2s dest; + glm_mat3x2_mul(m1.raw, m2.raw, dest.raw); + return dest; +} + +/*! + * @brief Multiply mat3x2s (m) by vec3s (v) and store in vec2s (dest). + * + * @param[in] m mat3x2s (left) + * @param[in] v vec3s (right, column vector) + * @return[out] dest constructed vec2s from raw pointers + */ +CGLM_INLINE +vec2s +glms_mat3x2_(mulv)(mat3x2s m, vec3s v) { + vec2s dest; + glm_mat3x2_mulv(m.raw, v.raw, dest.raw); + return dest; +} + +/*! + * @brief Transpose mat3x2s (m) and store in mat2x3s (dest). + * + * @param[in] m mat3x2s (left) + * @return[out] dest constructed mat2x3s from raw pointers + */ +CGLM_INLINE +mat2x3s +glms_mat3x2_(transpose)(mat3x2s m) { + mat2x3s dest; + glm_mat3x2_transpose(m.raw, dest.raw); + return dest; +} + +/*! + * @brief Multiply mat3x2s (m) by scalar constant (s). 
+ * + * @param[in, out] m mat3x2s (src, dest) + * @param[in] s float (scalar) + */ +CGLM_INLINE +mat3x2s +glms_mat3x2_(scale)(mat3x2s m, float s) { + glm_mat3x2_scale(m.raw, s); + return m; +} + +#endif /* cglms_mat3x2_h */ diff --git a/external/cglm/struct/mat3x4.h b/external/cglm/struct/mat3x4.h new file mode 100644 index 0000000..436b36c --- /dev/null +++ b/external/cglm/struct/mat3x4.h @@ -0,0 +1,125 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLMS_MAT3X4_ZERO_INIT + GLMS_MAT3X4_ZERO + + Functions: + CGLM_INLINE mat3x4s glms_mat3x4_zero(void); + CGLM_INLINE mat3x4s glms_mat3x4_make(const float * __restrict src); + CGLM_INLINE mat4s glms_mat3x4_mul(mat3x4s m1, mat4x3s m2); + CGLM_INLINE vec4s glms_mat3x4_mulv(mat3x4s m, vec3s v); + CGLM_INLINE mat4x3s glms_mat3x4_transpose(mat3x4s m); + CGLM_INLINE mat3x4s glms_mat3x4_scale(mat3x4s m, float s); + */ + +#ifndef cglms_mat3x4_h +#define cglms_mat3x4_h + +#include "../common.h" +#include "../types-struct.h" +#include "../mat3x4.h" + +/* api definition */ +#define glms_mat3x4_(NAME) CGLM_STRUCTAPI(mat3x4, NAME) + +#define GLMS_MAT3X4_ZERO_INIT {GLM_MAT3X4_ZERO_INIT} + +/* for C only */ +#define GLMS_MAT3X4_ZERO ((mat3x4s)GLMS_MAT3X4_ZERO_INIT) + +/*! + * @brief Zero out the mat3x4s (dest). + * + * @return[out] dest constructed mat3x4s from raw pointer + */ +CGLM_INLINE +mat3x4s +glms_mat3x4_(zero)(void) { + mat3x4s dest; + glm_mat3x4_zero(dest.raw); + return dest; +} + +/*! + * @brief Create mat3x4s (dest) from pointer (src). + * + * @param[in] src pointer to an array of floats + * @return[out] dest constructed mat3x4s from raw pointer + */ +CGLM_INLINE +mat3x4s +glms_mat3x4_(make)(const float * __restrict src) { + mat3x4s dest; + glm_mat3x4_make(src, dest.raw); + return dest; +} + +/*! + * @brief Multiply mat3x4s (m1) by mat4x3s (m2) and store in mat4s (dest). + * + * @code + * r = glms_mat3x4_mul(mat3x4s, mat4x3s); + * @endcode + * + * @param[in] m1 mat3x4s (left) + * @param[in] m2 mat4x3s (right) + * @return[out] dest constructed mat4s from raw pointers + */ +CGLM_INLINE +mat4s +glms_mat3x4_(mul)(mat3x4s m1, mat4x3s m2) { + mat4s dest; + glm_mat3x4_mul(m1.raw, m2.raw, dest.raw); + return dest; +} + +/*! + * @brief Multiply mat3x4s (m) by vec3s (v) and store in vec4s (dest). + * + * @param[in] m mat3x4s (left) + * @param[in] v vec3s (right, column vector) + * @return[out] dest constructed vec4s from raw pointers + */ +CGLM_INLINE +vec4s +glms_mat3x4_(mulv)(mat3x4s m, vec3s v) { + vec4s dest; + glm_mat3x4_mulv(m.raw, v.raw, dest.raw); + return dest; +} + +/*! + * @brief Transpose mat3x4s (m) and store in mat4x3s (dest). + * + * @param[in] m mat3x4s (left) + * @return[out] dest constructed mat4x3s from raw pointers + */ +CGLM_INLINE +mat4x3s +glms_mat3x4_(transpose)(mat3x4s m) { + mat4x3s dest; + glm_mat3x4_transpose(m.raw, dest.raw); + return dest; +} + +/*! + * @brief Multiply mat3x4s (m) by scalar constant (s). + * + * @param[in, out] m mat3x4s (src, dest) + * @param[in] s float (scalar) + */ +CGLM_INLINE +mat3x4s +glms_mat3x4_(scale)(mat3x4s m, float s) { + glm_mat3x4_scale(m.raw, s); + return m; +} + +#endif /* cglms_mat3x4_h */ diff --git a/external/cglm/struct/mat4.h b/external/cglm/struct/mat4.h new file mode 100644 index 0000000..663a5fd --- /dev/null +++ b/external/cglm/struct/mat4.h @@ -0,0 +1,496 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/*! + * Most of functions in this header are optimized manually with SIMD + * if available. You dont need to call/incude SIMD headers manually + */ + +/* + Macros: + GLMS_MAT4_IDENTITY_INIT + GLMS_MAT4_ZERO_INIT + GLMS_MAT4_IDENTITY + GLMS_MAT4_ZERO + + Functions: + CGLM_INLINE mat4s glms_mat4_ucopy(mat4s mat); + CGLM_INLINE mat4s glms_mat4_copy(mat4s mat); + CGLM_INLINE mat4s glms_mat4_identity(void); + CGLM_INLINE void glms_mat4_identity_array(mat4s * __restrict mat, size_t count); + CGLM_INLINE mat4s glms_mat4_zero(void); + CGLM_INLINE mat3s glms_mat4_pick3(mat4s mat); + CGLM_INLINE mat3s glms_mat4_pick3t(mat4s mat); + CGLM_INLINE mat4s glms_mat4_ins3(mat3s mat, mat4s dest); + CGLM_INLINE mat4s glms_mat4_mul(mat4s m1, mat4s m2); + CGLM_INLINE mat4s glms_mat4_mulN(mat4s * __restrict matrices[], uint32_t len); + CGLM_INLINE vec4s glms_mat4_mulv(mat4s m, vec4s v); + CGLM_INLINE float glms_mat4_trace(mat4s m); + CGLM_INLINE float glms_mat4_trace3(mat4s m); + CGLM_INLINE versors glms_mat4_quat(mat4s m); + CGLM_INLINE vec3s glms_mat4_mulv3(mat4s m, vec3s v, float last); + CGLM_INLINE mat4s glms_mat4_transpose(mat4s m); + CGLM_INLINE mat4s glms_mat4_scale_p(mat4s m, float s); + CGLM_INLINE mat4s glms_mat4_scale(mat4s m, float s); + CGLM_INLINE float glms_mat4_det(mat4s mat); + CGLM_INLINE mat4s glms_mat4_inv(mat4s mat); + CGLM_INLINE mat4s glms_mat4_inv_fast(mat4s mat); + CGLM_INLINE mat4s glms_mat4_swap_col(mat4s mat, int col1, int col2); + CGLM_INLINE mat4s glms_mat4_swap_row(mat4s mat, int row1, int row2); + CGLM_INLINE float glms_mat4_rmc(vec4s r, mat4s m, vec4s c); + CGLM_INLINE mat4s glms_mat4_make(const float * __restrict src); + CGLM_INLINE mat4s glms_mat4_textrans(float sx, float sy, float rot, float tx, float ty); + */ + +#ifndef cglms_mat4s_h +#define cglms_mat4s_h + +#include "../common.h" +#include "../types-struct.h" +#include "../mat4.h" +#include "vec4.h" +#include "vec3.h" + +/* api definition */ +#define glms_mat4_(NAME) CGLM_STRUCTAPI(mat4, NAME) + +#define GLMS_MAT4_IDENTITY_INIT {GLM_MAT4_IDENTITY_INIT} +#define GLMS_MAT4_ZERO_INIT {GLM_MAT4_ZERO_INIT} + +/* for C only */ +#define GLMS_MAT4_IDENTITY ((mat4s)GLMS_MAT4_IDENTITY_INIT) +#define GLMS_MAT4_ZERO ((mat4s)GLMS_MAT4_ZERO_INIT) + +/*! + * @brief copy all members of [mat] to [dest] + * + * matrix may not be aligned, u stands for unaligned, this may be useful when + * copying a matrix from external source e.g. asset importer... + * + * @param[in] mat source + * @returns destination + */ +CGLM_INLINE +mat4s +glms_mat4_(ucopy)(mat4s mat) { + mat4s r; + glm_mat4_ucopy(mat.raw, r.raw); + return r; +} + +/*! + * @brief copy all members of [mat] to [dest] + * + * @param[in] mat source + * @returns destination + */ +CGLM_INLINE +mat4s +glms_mat4_(copy)(mat4s mat) { + mat4s r; + glm_mat4_copy(mat.raw, r.raw); + return r; +} + +/*! + * @brief make given matrix identity. It is identical with below, + * but it is more easy to do that with this func especially for members + * e.g. glm_mat4_identity(aStruct->aMatrix); + * + * @code + * glm_mat4_copy(GLM_MAT4_IDENTITY, mat); // C only + * + * // or + * mat4 mat = GLM_MAT4_IDENTITY_INIT; + * @endcode + * + * @returns destination + */ +CGLM_INLINE +mat4s +glms_mat4_(identity)(void) { + mat4s r; + glm_mat4_identity(r.raw); + return r; +} + +/*! 
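+ * A typical struct-api pattern with the helpers above (variable names are
+ * illustrative only):
+ *
+ * @code
+ * mat4s model = GLMS_MAT4_IDENTITY;    // or glms_mat4_identity()
+ * mat4s snap  = glms_mat4_copy(model);
+ * @endcode
+ *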
+ * @brief make given matrix array's each element identity matrix + * + * @param[in, out] mat matrix array (must be aligned (16/32) + * if alignment is not disabled) + * + * @param[in] count count of matrices + */ +CGLM_INLINE +void +glms_mat4_(identity_array)(mat4s * __restrict mat, size_t count) { + CGLM_ALIGN_MAT mat4s t = GLMS_MAT4_IDENTITY_INIT; + size_t i; + + for (i = 0; i < count; i++) { + glm_mat4_copy(t.raw, mat[i].raw); + } +} + +/*! + * @brief make given matrix zero. + * + * @returns matrix + */ +CGLM_INLINE +mat4s +glms_mat4_(zero)(void) { + mat4s r; + glm_mat4_zero(r.raw); + return r; +} + +/*! + * @brief copy upper-left of mat4 to mat3 + * + * @param[in] mat source + * @returns destination + */ +CGLM_INLINE +mat3s +glms_mat4_(pick3)(mat4s mat) { + mat3s r; + glm_mat4_pick3(mat.raw, r.raw); + return r; +} + +/*! + * @brief copy upper-left of mat4 to mat3 (transposed) + * + * the postfix t stands for transpose + * + * @param[in] mat source + * @returns destination + */ +CGLM_INLINE +mat3s +glms_mat4_(pick3t)(mat4s mat) { + mat3s r; + glm_mat4_pick3t(mat.raw, r.raw); + return r; +} + +/*! + * @brief copy mat3 to mat4's upper-left + * + * @param[in] mat source + * @param[in] dest destination + * @returns destination + */ +CGLM_INLINE +mat4s +glms_mat4_(ins3)(mat3s mat, mat4s dest) { + glm_mat4_ins3(mat.raw, dest.raw); + return dest; +} + +/*! + * @brief multiply m1 and m2 to dest + * + * m1, m2 and dest matrices can be same matrix, it is possible to write this: + * + * @code + * mat4 m = GLM_MAT4_IDENTITY_INIT; + * r = glms_mat4_mul(m, m); + * @endcode + * + * @param[in] m1 left matrix + * @param[in] m2 right matrix + * @returns destination matrix + */ +CGLM_INLINE +mat4s +glms_mat4_(mul)(mat4s m1, mat4s m2) { + mat4s r; + glm_mat4_mul(m1.raw, m2.raw, r.raw); + return r; +} + +/*! + * @brief mupliply N mat4 matrices and store result in dest + * + * this function lets you multiply multiple (more than two or more...) matrices + *
multiplication will be done in loop, this may reduce instructions + * size but if len is too small then compiler may unroll whole loop, + * usage: + * @code + * mat4 m1, m2, m3, m4, res; + * + * res = glm_mat4_mulN((mat4 *[]){&m1, &m2, &m3, &m4}, 4); + * @endcode + * + * @warning matrices parameter is pointer array not mat4 array! + * + * @param[in] matrices mat4 * array + * @param[in] len matrices count + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_mat4_(mulN)(mat4s * __restrict matrices[], uint32_t len) { + CGLM_ALIGN_MAT mat4s r = GLMS_MAT4_IDENTITY_INIT; + size_t i; + + for (i = 0; i < len; i++) { + r = glms_mat4_(mul)(r, *matrices[i]); + } + + return r; +} + +/*! + * @brief multiply mat4 with vec4 (column vector) and store in dest vector + * + * @param[in] m mat4 (left) + * @param[in] v vec4 (right, column vector) + * @returns vec4 (result, column vector) + */ +CGLM_INLINE +vec4s +glms_mat4_(mulv)(mat4s m, vec4s v) { + vec4s r; + glm_mat4_mulv(m.raw, v.raw, r.raw); + return r; +} + +/*! + * @brief trace of matrix + * + * sum of the elements on the main diagonal from upper left to the lower right + * + * @param[in] m matrix + */ +CGLM_INLINE +float +glms_mat4_(trace)(mat4s m) { + return glm_mat4_trace(m.raw); +} + +/*! + * @brief trace of matrix (rotation part) + * + * sum of the elements on the main diagonal from upper left to the lower right + * + * @param[in] m matrix + */ +CGLM_INLINE +float +glms_mat4_(trace3)(mat4s m) { + return glm_mat4_trace3(m.raw); +} + +/*! + * @brief convert mat4's rotation part to quaternion + * + * @param[in] m affine matrix + * @returns destination quaternion + */ +CGLM_INLINE +versors +glms_mat4_(quat)(mat4s m) { + versors r; + glm_mat4_quat(m.raw, r.raw); + return r; +} + +/*! + * @brief multiply vector with mat4 + * + * @param[in] m mat4(affine transform) + * @param[in] v vec3 + * @param[in] last 4th item to make it vec4 + * @returns result vector (vec3) + */ +CGLM_INLINE +vec3s +glms_mat4_(mulv3)(mat4s m, vec3s v, float last) { + vec3s r; + glm_mat4_mulv3(m.raw, v.raw, last, r.raw); + return r; +} + +/*! + * @brief transpose mat4 and store result in same matrix + * + * @param[in] m source + * @returns result + */ +CGLM_INLINE +mat4s +glms_mat4_(transpose)(mat4s m) { + glm_mat4_transpose(m.raw); + return m; +} + +/*! + * @brief scale (multiply with scalar) matrix without simd optimization + * + * multiply matrix with scalar + * + * @param[in] m matrix + * @param[in] s scalar + * @returns matrix + */ +CGLM_INLINE +mat4s +glms_mat4_(scale_p)(mat4s m, float s) { + glm_mat4_scale_p(m.raw, s); + return m; +} + +/*! + * @brief scale (multiply with scalar) matrix + * + * multiply matrix with scalar + * + * @param[in] m matrix + * @param[in] s scalar + * @returns matrix + */ +CGLM_INLINE +mat4s +glms_mat4_(scale)(mat4s m, float s) { + glm_mat4_scale(m.raw, s); + return m; +} + +/*! + * @brief mat4 determinant + * + * @param[in] mat matrix + * + * @return determinant + */ +CGLM_INLINE +float +glms_mat4_(det)(mat4s mat) { + return glm_mat4_det(mat.raw); +} + +/*! + * @brief inverse mat4 and store in dest + * + * @param[in] mat matrix + * @returns inverse matrix + */ +CGLM_INLINE +mat4s +glms_mat4_(inv)(mat4s mat) { + mat4s r; + glm_mat4_inv(mat.raw, r.raw); + return r; +} + +/*! + * @brief inverse mat4 and store in dest + * + * this func uses reciprocal approximation without extra corrections + * e.g Newton-Raphson. this should work faster than normal, + * to get more precise use glm_mat4_inv version. 
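+ *
+ * An illustrative pattern (m stands for any invertible mat4s): use the fast
+ * variant in hot paths where a small error is acceptable, and the exact one
+ * elsewhere:
+ *
+ * @code
+ * mat4s exact  = glms_mat4_inv(m);        // precise
+ * mat4s approx = glms_mat4_inv_fast(m);   // faster, approximate
+ * @endcode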
+ * + * NOTE: You will lose precision, glm_mat4_inv is more accurate + * + * @param[in] mat matrix + * @returns inverse matrix + */ +CGLM_INLINE +mat4s +glms_mat4_(inv_fast)(mat4s mat) { + mat4s r; + glm_mat4_inv_fast(mat.raw, r.raw); + return r; +} + +/*! + * @brief swap two matrix columns + * + * @param[in] mat matrix + * @param[in] col1 col1 + * @param[in] col2 col2 + * @returns matrix + */ +CGLM_INLINE +mat4s +glms_mat4_(swap_col)(mat4s mat, int col1, int col2) { + glm_mat4_swap_col(mat.raw, col1, col2); + return mat; +} + +/*! + * @brief swap two matrix rows + * + * @param[in] mat matrix + * @param[in] row1 row1 + * @param[in] row2 row2 + * @returns matrix + */ +CGLM_INLINE +mat4s +glms_mat4_(swap_row)(mat4s mat, int row1, int row2) { + glm_mat4_swap_row(mat.raw, row1, row2); + return mat; +} + +/*! + * @brief helper for R (row vector) * M (matrix) * C (column vector) + * + * rmc stands for Row * Matrix * Column + * + * the result is scalar because R * M = Matrix1x4 (row vector), + * then Matrix1x4 * Vec4 (column vector) = Matrix1x1 (Scalar) + * + * @param[in] r row vector or matrix1x4 + * @param[in] m matrix4x4 + * @param[in] c column vector or matrix4x1 + * + * @return scalar value e.g. B(s) + */ +CGLM_INLINE +float +glms_mat4_(rmc)(vec4s r, mat4s m, vec4s c) { + return glm_mat4_rmc(r.raw, m.raw, c.raw); +} + +/*! + * @brief Create mat4 matrix from pointer + * + * @param[in] src pointer to an array of floats + * @return constructed matrix from raw pointer + */ +CGLM_INLINE +mat4s +glms_mat4_(make)(const float * __restrict src) { + mat4s r; + glm_mat4_make(src, r.raw); + return r; +} + +/*! + * @brief Create mat4 matrix from texture transform parameters + * + * @param[in] sx scale x + * @param[in] sy scale y + * @param[in] rot rotation in radians CCW/RH + * @param[in] tx translate x + * @param[in] ty translate y + * @return texture transform matrix + */ +CGLM_INLINE +mat4s +glms_mat4_(textrans)(float sx, float sy, float rot, float tx, float ty) { + mat4s r; + glm_mat4_textrans(sx, sy, rot, tx, ty, r.raw); + return r; +} + +#endif /* cglms_mat4s_h */ diff --git a/external/cglm/struct/mat4x2.h b/external/cglm/struct/mat4x2.h new file mode 100644 index 0000000..6c68abe --- /dev/null +++ b/external/cglm/struct/mat4x2.h @@ -0,0 +1,126 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLMS_MAT4X2_ZERO_INIT + GLMS_MAT4X2_ZERO + + Functions: + CGLM_INLINE mat4x2s glms_mat4x2_zero(void); + CGLM_INLINE mat4x2s glms_mat4x2_make(const float * __restrict src); + CGLM_INLINE mat2s glms_mat4x2_mul(mat4x2s m1, mat2x4s m2); + CGLM_INLINE vec2s glms_mat4x2_mulv(mat4x2s m, vec4s v); + CGLM_INLINE mat2x4s glms_mat4x2_transpose(mat4x2s m); + CGLM_INLINE mat4x2s glms_mat4x2_scale(mat4x2s m, float s); + */ + +#ifndef cglms_mat4x2_h +#define cglms_mat4x2_h + +#include "../common.h" +#include "../types-struct.h" +#include "../mat4x2.h" + +/* api definition */ +#define glms_mat4x2_(NAME) CGLM_STRUCTAPI(mat4x2, NAME) + +#define GLMS_MAT4X2_ZERO_INIT {GLM_MAT4X2_ZERO_INIT} + +/* for C only */ +#define GLMS_MAT4X2_ZERO ((mat4x2s)GLMS_MAT4X2_ZERO_INIT) + + +/*! + * @brief Zero out the mat4x2s (dest). + * + * @return[out] dest constructed mat4x2s from raw pointer + */ +CGLM_INLINE +mat4x2s +glms_mat4x2_(zero)(void) { + mat4x2s dest; + glm_mat4x2_zero(dest.raw); + return dest; +} + +/*! + * @brief Create mat4x2s (dest) from pointer (src). 
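+ *
+ * An illustrative call (the array contents are arbitrary; the eight floats
+ * are consumed in the order they appear in memory):
+ *
+ * @code
+ * float src[8] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
+ * mat4x2s m = glms_mat4x2_make(src);
+ * @endcode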
+ * + * @param[in] src pointer to an array of floats + * @return[out] dest constructed mat4x2s from raw pointer + */ +CGLM_INLINE +mat4x2s +glms_mat4x2_(make)(const float * __restrict src) { + mat4x2s dest; + glm_mat4x2_make(src, dest.raw); + return dest; +} + +/*! + * @brief Multiply mat4x2s (m1) by mat2x4s (m2) and store in mat2s (dest). + * + * @code + * r = glms_mat4x2_mul(mat4x2s, mat2x4s); + * @endcode + * + * @param[in] m1 mat4x2s (left) + * @param[in] m2 mat2x4s (right) + * @return[out] dest constructed mat2s from raw pointers + */ +CGLM_INLINE +mat2s +glms_mat4x2_(mul)(mat4x2s m1, mat2x4s m2) { + mat2s dest; + glm_mat4x2_mul(m1.raw, m2.raw, dest.raw); + return dest; +} + +/*! + * @brief Multiply mat4x2s (m) by vec4s (v) and store in vec2s (dest). + * + * @param[in] m mat4x2s (left) + * @param[in] v vec4s (right, column vector) + * @return[out] dest constructed vec2s from raw pointers + */ +CGLM_INLINE +vec2s +glms_mat4x2_(mulv)(mat4x2s m, vec4s v) { + vec2s dest; + glm_mat4x2_mulv(m.raw, v.raw, dest.raw); + return dest; +} + +/*! + * @brief Transpose mat4x2s (m) and store in mat2x4s (dest). + * + * @param[in] m mat4x2s (left) + * @return[out] dest constructed mat2x4s from raw pointers + */ +CGLM_INLINE +mat2x4s +glms_mat4x2_(transpose)(mat4x2s m) { + mat2x4s dest; + glm_mat4x2_transpose(m.raw, dest.raw); + return dest; +} + +/*! + * @brief Multiply mat4x2s (m) by scalar constant (s). + * + * @param[in, out] m mat4x2s (src, dest) + * @param[in] s float (scalar) + */ +CGLM_INLINE +mat4x2s +glms_mat4x2_(scale)(mat4x2s m, float s) { + glm_mat4x2_scale(m.raw, s); + return m; +} + +#endif /* cglms_mat4x2_h */ diff --git a/external/cglm/struct/mat4x3.h b/external/cglm/struct/mat4x3.h new file mode 100644 index 0000000..b398f98 --- /dev/null +++ b/external/cglm/struct/mat4x3.h @@ -0,0 +1,125 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLMS_MAT4X3_ZERO_INIT + GLMS_MAT4X3_ZERO + + Functions: + CGLM_INLINE mat4x3s glms_mat4x3_zero(void); + CGLM_INLINE mat4x3s glms_mat4x3_make(const float * __restrict src); + CGLM_INLINE mat3s glms_mat4x3_mul(mat4x3s m1, mat3x4s m2); + CGLM_INLINE vec3s glms_mat4x3_mulv(mat4x3s m, vec4s v); + CGLM_INLINE mat3x4s glms_mat4x3_transpose(mat4x3s m); + CGLM_INLINE mat4x3s glms_mat4x3_scale(mat4x3s m, float s); + */ + +#ifndef cglms_mat4x3_h +#define cglms_mat4x3_h + +#include "../common.h" +#include "../types-struct.h" +#include "../mat4x3.h" + +/* api definition */ +#define glms_mat4x3_(NAME) CGLM_STRUCTAPI(mat4x3, NAME) + +#define GLMS_MAT4X3_ZERO_INIT {GLM_MAT4X3_ZERO_INIT} + +/* for C only */ +#define GLMS_MAT4X3_ZERO ((mat4x3s)GLMS_MAT4X3_ZERO_INIT) + +/*! + * @brief Zero out the mat4x3s (dest). + * + * @return[out] dest constructed mat4x3s from raw pointer + */ +CGLM_INLINE +mat4x3s +glms_mat4x3_(zero)(void) { + mat4x3s dest; + glm_mat4x3_zero(dest.raw); + return dest; +} + +/*! + * @brief Create mat4x3s (dest) from pointer (src). + * + * @param[in] src pointer to an array of floats + * @return[out] dest constructed mat4x3s from raw pointer + */ +CGLM_INLINE +mat4x3s +glms_mat4x3_(make)(const float * __restrict src) { + mat4x3s dest; + glm_mat4x3_make(src, dest.raw); + return dest; +} + +/*! + * @brief Multiply mat4x3s (m1) by mat3x4s (m2) and store in mat3s (dest). 
+ * + * @code + * r = glms_mat4x3_mul(mat4x3s, mat3x4s); + * @endcode + * + * @param[in] m1 mat4x3s (left) + * @param[in] m2 mat3x4s (right) + * @return[out] dest constructed mat3s from raw pointers + */ +CGLM_INLINE +mat3s +glms_mat4x3_(mul)(mat4x3s m1, mat3x4s m2) { + mat3s dest; + glm_mat4x3_mul(m1.raw, m2.raw, dest.raw); + return dest; +} + +/*! + * @brief Multiply mat4x3s (m) by vec4s (v) and store in vec3s (dest). + * + * @param[in] m mat4x3s (left) + * @param[in] v vec4s (right, column vector) + * @return[out] dest constructed vec3s from raw pointers + */ +CGLM_INLINE +vec3s +glms_mat4x3_(mulv)(mat4x3s m, vec4s v) { + vec3s dest; + glm_mat4x3_mulv(m.raw, v.raw, dest.raw); + return dest; +} + +/*! + * @brief Transpose mat4x3s (m) and store in mat3x4s (dest). + * + * @param[in] m mat4x3s (left) + * @return[out] dest constructed mat3x4s from raw pointers + */ +CGLM_INLINE +mat3x4s +glms_mat4x3_(transpose)(mat4x3s m) { + mat3x4s dest; + glm_mat4x3_transpose(m.raw, dest.raw); + return dest; +} + +/*! + * @brief Multiply mat4x3s (m) by scalar constant (s). + * + * @param[in, out] m mat4x3s (src, dest) + * @param[in] s float (scalar) + */ +CGLM_INLINE +mat4x3s +glms_mat4x3_(scale)(mat4x3s m, float s) { + glm_mat4x3_scale(m.raw, s); + return m; +} + +#endif /* cglms_mat4x3_h */ diff --git a/external/cglm/struct/noise.h b/external/cglm/struct/noise.h new file mode 100644 index 0000000..3fd7d2e --- /dev/null +++ b/external/cglm/struct/noise.h @@ -0,0 +1,57 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglms_noises_h +#define cglms_noises_h + +#include "../common.h" +#include "../types-struct.h" +#include "../noise.h" +#include "vec4.h" + +/* + Functions: + CGLM_INLINE float glms_perlin_vec4(vec4s point); + */ + +/*! + * @brief Classic perlin noise + * + * @param[in] point 4D vector + * @returns perlin noise value + */ +CGLM_INLINE +float +glms_perlin_vec4(vec4s point) { + return glm_perlin_vec4(point.raw); +} + +/*! + * @brief Classic perlin noise + * + * @param[in] point 3D vector + * @returns perlin noise value + */ +CGLM_INLINE +float +glms_perlin_vec3(vec3s point) { + return glm_perlin_vec3(point.raw); +} + +/*! + * @brief Classic perlin noise + * + * @param[in] point 2D vector + * @returns perlin noise value + */ +CGLM_INLINE +float +glms_perlin_vec2(vec2s point) { + return glm_perlin_vec2(point.raw); +} + +#endif /* cglms_noises_h */ diff --git a/external/cglm/struct/plane.h b/external/cglm/struct/plane.h new file mode 100644 index 0000000..6a84ac7 --- /dev/null +++ b/external/cglm/struct/plane.h @@ -0,0 +1,40 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglms_planes_h +#define cglms_planes_h + +#include "../common.h" +#include "../types-struct.h" +#include "../plane.h" +#include "vec4.h" + +/* + Plane equation: Ax + By + Cz + D = 0; + + It stored in vec4 as [A, B, C, D]. (A, B, C) is normal and D is distance +*/ + +/* + Functions: + CGLM_INLINE vec4s glms_plane_normalize(vec4s plane); + */ + +/*! 
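+ * For example, the plane x = 2 packs as [A, B, C, D] = [1, 0, 0, -2]; the
+ * call below returns the normalized copy (already unit length here, shown
+ * only to illustrate the packing):
+ *
+ * @code
+ * vec4s plane = {{1.0f, 0.0f, 0.0f, -2.0f}};
+ * plane = glms_plane_normalize(plane);
+ * @endcode
+ *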
+ * @brief normalizes a plane + * + * @param[in] plane plane to normalize + * @returns normalized plane + */ +CGLM_INLINE +vec4s +glms_plane_normalize(vec4s plane) { + glm_plane_normalize(plane.raw); + return plane; +} + +#endif /* cglms_planes_h */ diff --git a/external/cglm/struct/project.h b/external/cglm/struct/project.h new file mode 100644 index 0000000..8383c77 --- /dev/null +++ b/external/cglm/struct/project.h @@ -0,0 +1,162 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglms_projects_h +#define cglms_projects_h + +#include "../common.h" +#include "../types-struct.h" +#include "../project.h" +#include "vec3.h" +#include "vec4.h" +#include "mat4.h" + +#ifndef CGLM_CLIPSPACE_INCLUDE_ALL +# if CGLM_CONFIG_CLIP_CONTROL & CGLM_CLIP_CONTROL_ZO_BIT +# include "clipspace/project_zo.h" +# elif CGLM_CONFIG_CLIP_CONTROL & CGLM_CLIP_CONTROL_NO_BIT +# include "clipspace/project_no.h" +# endif +#else +# include "clipspace/project_zo.h" +# include "clipspace/project_no.h" +#endif + +/*! + * @brief maps the specified viewport coordinates into specified space [1] + * the matrix should contain projection matrix. + * + * if you don't have ( and don't want to have ) an inverse matrix then use + * glm_unproject version. You may use existing inverse of matrix in somewhere + * else, this is why glm_unprojecti exists to save save inversion cost + * + * [1] space: + * 1- if m = invProj: View Space + * 2- if m = invViewProj: World Space + * 3- if m = invMVP: Object Space + * + * You probably want to map the coordinates into object space + * so use invMVP as m + * + * Computing viewProj: + * glm_mat4_mul(proj, view, viewProj); + * glm_mat4_mul(viewProj, model, MVP); + * glm_mat4_inv(viewProj, invMVP); + * + * @param[in] pos point/position in viewport coordinates + * @param[in] invMat matrix (see brief) + * @param[in] vp viewport as [x, y, width, height] + * @returns unprojected coordinates + */ +CGLM_INLINE +vec3s +glms_unprojecti(vec3s pos, mat4s invMat, vec4s vp) { + vec3s r; + glm_unprojecti(pos.raw, invMat.raw, vp.raw, r.raw); + return r; +} + +/*! + * @brief maps the specified viewport coordinates into specified space [1] + * the matrix should contain projection matrix. + * + * this is same as glm_unprojecti except this function get inverse matrix for + * you. + * + * [1] space: + * 1- if m = proj: View Space + * 2- if m = viewProj: World Space + * 3- if m = MVP: Object Space + * + * You probably want to map the coordinates into object space + * so use MVP as m + * + * Computing viewProj and MVP: + * glm_mat4_mul(proj, view, viewProj); + * glm_mat4_mul(viewProj, model, MVP); + * + * or in struct api: + * MVP = mat4_mul(mat4_mul(proj, view), model) + * + * @param[in] pos point/position in viewport coordinates + * @param[in] m matrix (see brief) + * @param[in] vp viewport as [x, y, width, height] + * @returns unprojected coordinates + */ +CGLM_INLINE +vec3s +glms_unproject(vec3s pos, mat4s m, vec4s vp) { + vec3s r; + glm_unproject(pos.raw, m.raw, vp.raw, r.raw); + return r; +} + +/*! 
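+ * Following the comments above, a struct-api sketch of building the matrix
+ * and unprojecting a window-space point (proj, view, model and win_pos are
+ * assumed to exist; the viewport size is arbitrary):
+ *
+ * @code
+ * mat4s mvp    = glms_mat4_mul(glms_mat4_mul(proj, view), model);
+ * vec4s vp     = {{0.0f, 0.0f, 1280.0f, 720.0f}};
+ * vec3s object = glms_unproject(win_pos, mvp, vp);
+ * @endcode
+ *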
+ * @brief map object coordinates to window coordinates + * + * Computing MVP: + * glm_mat4_mul(proj, view, viewProj); + * glm_mat4_mul(viewProj, model, MVP); + * + * or in struct api: + * MVP = mat4_mul(mat4_mul(proj, view), model) + * + * @param[in] pos object coordinates + * @param[in] m MVP matrix + * @param[in] vp viewport as [x, y, width, height] + * @returns projected coordinates + */ +CGLM_INLINE +vec3s +glms_project(vec3s pos, mat4s m, vec4s vp) { + vec3s r; + glm_project(pos.raw, m.raw, vp.raw, r.raw); + return r; +} + +/*! + * @brief map object's z coordinate to window coordinates + * + * Computing MVP: + * glm_mat4_mul(proj, view, viewProj); + * glm_mat4_mul(viewProj, model, MVP); + * + * or in struct api: + * MVP = mat4_mul(mat4_mul(proj, view), model) + * + * @param[in] v object coordinates + * @param[in] m MVP matrix + * + * @returns projected z coordinate + */ +CGLM_INLINE +float +glms_project_z(vec3s v, mat4s m) { +#if CGLM_CONFIG_CLIP_CONTROL & CGLM_CLIP_CONTROL_ZO_BIT + return glms_project_z_zo(v, m); +#elif CGLM_CONFIG_CLIP_CONTROL & CGLM_CLIP_CONTROL_NO_BIT + return glms_project_z_no(v, m); +#endif +} + +/*! + * @brief define a picking region + * + * @param[in] center center [x, y] of a picking region in window coordinates + * @param[in] size size [width, height] of the picking region in window coordinates + * @param[in] vp viewport as [x, y, width, height] + * @returns projected coordinates + */ +CGLM_INLINE +mat4s +glms_pickmatrix(vec2s center, vec2s size, vec4s vp) { + mat4s res; + glm_pickmatrix(center.raw, size.raw, vp.raw, res.raw); + return res; +} + +#endif /* cglms_projects_h */ diff --git a/external/cglm/struct/quat.h b/external/cglm/struct/quat.h new file mode 100644 index 0000000..d6789e4 --- /dev/null +++ b/external/cglm/struct/quat.h @@ -0,0 +1,601 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLMS_QUAT_IDENTITY_INIT + GLMS_QUAT_IDENTITY + + Functions: + CGLM_INLINE versors glms_quat_identity(void) + CGLM_INLINE void glms_quat_identity_array(versor *q, size_t count) + CGLM_INLINE versors glms_quat_init(float x, float y, float z, float w) + CGLM_INLINE versors glms_quatv(float angle, vec3s axis) + CGLM_INLINE versors glms_quat(float angle, float x, float y, float z) + CGLM_INLINE versors glms_quat_from_vecs(vec3s a, vec3s b) + CGLM_INLINE float glms_quat_norm(versors q) + CGLM_INLINE versors glms_quat_normalize(versors q) + CGLM_INLINE float glms_quat_dot(versors p, versors q) + CGLM_INLINE versors glms_quat_conjugate(versors q) + CGLM_INLINE versors glms_quat_inv(versors q) + CGLM_INLINE versors glms_quat_add(versors p, versors q) + CGLM_INLINE versors glms_quat_sub(versors p, versors q) + CGLM_INLINE vec3s glms_quat_imagn(versors q) + CGLM_INLINE float glms_quat_imaglen(versors q) + CGLM_INLINE float glms_quat_angle(versors q) + CGLM_INLINE vec3s glms_quat_axis(versors q) + CGLM_INLINE versors glms_quat_mul(versors p, versors q) + CGLM_INLINE mat4s glms_quat_mat4(versors q) + CGLM_INLINE mat4s glms_quat_mat4t(versors q) + CGLM_INLINE mat3s glms_quat_mat3(versors q) + CGLM_INLINE mat3s glms_quat_mat3t(versors q) + CGLM_INLINE versors glms_quat_lerp(versors from, versors to, float t) + CGLM_INLINE versors glms_quat_lerpc(versors from, versors to, float t) + CGLM_INLINE versors glms_quat_nlerp(versors from, versors to, float t) + CGLM_INLINE versors glms_quat_slerp(versors from, versors to, float t) + CGLM_INLINE versors glms_quat_slerp_longest(versors from, versors to, float t) + CGLM_INLINE mat4s. glms_quat_look(vec3s eye, versors ori) + CGLM_INLINE versors glms_quat_for(vec3s dir, vec3s fwd, vec3s up) + CGLM_INLINE versors glms_quat_forp(vec3s from, vec3s to, vec3s fwd, vec3s up) + CGLM_INLINE vec3s glms_quat_rotatev(versors q, vec3s v) + CGLM_INLINE mat4s glms_quat_rotate(mat4s m, versors q) + CGLM_INLINE mat4s glms_quat_rotate_at(mat4s m, versors q, vec3s pivot) + CGLM_INLINE mat4s glms_quat_rotate_atm(versors q, vec3s pivot) + CGLM_INLINE versors glms_quat_make(float * restrict src) + */ + +#ifndef cglms_quat_h +#define cglms_quat_h + +#include "../common.h" +#include "../types-struct.h" +#include "../plane.h" +#include "../quat.h" + +/* api definition */ +#define glms_quat_(NAME) CGLM_STRUCTAPI(quat, NAME) + +/* + * IMPORTANT: + * ---------------------------------------------------------------------------- + * cglm stores quat as [x, y, z, w] since v0.3.6 + * + * it was [w, x, y, z] before v0.3.6 it has been changed to [x, y, z, w] + * with v0.3.6 version. + * ---------------------------------------------------------------------------- + */ + +#define GLMS_QUAT_IDENTITY_INIT {GLM_QUAT_IDENTITY_INIT} +#define GLMS_QUAT_IDENTITY ((versors)GLMS_QUAT_IDENTITY_INIT) + +/*! + * @brief makes given quat to identity + * + * @returns identity quaternion + */ +CGLM_INLINE +versors +glms_quat_(identity)(void) { + versors dest; + glm_quat_identity(dest.raw); + return dest; +} + +/*! 
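When interoperating with other math code, the component order above matters: a versors holds [x, y, z, w]. A minimal sketch (illustrative only):

    versors q = glms_quat_identity();   /* (x, y, z, w) = (0, 0, 0, 1) */
    float   w = q.raw[3];               /* w (the real part) is the last component */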
+ * @brief make given quaternion array's each element identity quaternion + * + * @param[in, out] q quat array (must be aligned (16) + * if alignment is not disabled) + * + * @param[in] count count of quaternions + */ +CGLM_INLINE +void +glms_quat_(identity_array)(versors * __restrict q, size_t count) { + CGLM_ALIGN(16) versor v = GLM_QUAT_IDENTITY_INIT; + size_t i; + + for (i = 0; i < count; i++) { + glm_vec4_copy(v, q[i].raw); + } +} + +/*! + * @brief inits quaternion with raw values + * + * @param[in] x x + * @param[in] y y + * @param[in] z z + * @param[in] w w (real part) + * @returns quaternion + */ +CGLM_INLINE +versors +glms_quat_(init)(float x, float y, float z, float w) { + versors dest; + glm_quat_init(dest.raw, x, y, z, w); + return dest; +} + +/*! + * @brief creates NEW quaternion with axis vector + * + * @param[in] angle angle (radians) + * @param[in] axis axis + * @returns quaternion + */ +CGLM_INLINE +versors +glms_quatv(float angle, vec3s axis) { + versors dest; + glm_quatv(dest.raw, angle, axis.raw); + return dest; +} + +/*! + * @brief creates NEW quaternion with individual axis components + * + * @param[in] angle angle (radians) + * @param[in] x axis.x + * @param[in] y axis.y + * @param[in] z axis.z + * @returns quaternion + */ +CGLM_INLINE +versors +glms_quat(float angle, float x, float y, float z) { + versors dest; + glm_quat(dest.raw, angle, x, y, z); + return dest; +} + +/*! + * @brief compute quaternion rotating vector A to vector B + * + * @param[in] a vec3 (must have unit length) + * @param[in] b vec3 (must have unit length) + * @returns quaternion (of unit length) + */ +CGLM_INLINE +versors +glms_quat_(from_vecs)(vec3s a, vec3s b) { + versors dest; + glm_quat_from_vecs(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief returns norm (magnitude) of quaternion + * + * @param[in] q quaternion + */ +CGLM_INLINE +float +glms_quat_(norm)(versors q) { + return glm_quat_norm(q.raw); +} + +/*! + * @brief normalize quaternion + * + * @param[in] q quaternion + * @returns quaternion + */ +CGLM_INLINE +versors +glms_quat_(normalize)(versors q) { + versors dest; + glm_quat_normalize_to(q.raw, dest.raw); + return dest; +} + +/*! + * @brief dot product of two quaternion + * + * @param[in] p quaternion 1 + * @param[in] q quaternion 2 + * @returns dot product + */ +CGLM_INLINE +float +glms_quat_(dot)(versors p, versors q) { + return glm_quat_dot(p.raw, q.raw); +} + +/*! + * @brief conjugate of quaternion + * + * @param[in] q quaternion + * @returns conjugate + */ +CGLM_INLINE +versors +glms_quat_(conjugate)(versors q) { + versors dest; + glm_quat_conjugate(q.raw, dest.raw); + return dest; +} + +/*! + * @brief inverse of non-zero quaternion + * + * @param[in] q quaternion + * @returns inverse quaternion + */ +CGLM_INLINE +versors +glms_quat_(inv)(versors q) { + versors dest; + glm_quat_inv(q.raw, dest.raw); + return dest; +} + +/*! + * @brief add (componentwise) two quaternions and store result in dest + * + * @param[in] p quaternion 1 + * @param[in] q quaternion 2 + * @returns result quaternion + */ +CGLM_INLINE +versors +glms_quat_(add)(versors p, versors q) { + versors dest; + glm_quat_add(p.raw, q.raw, dest.raw); + return dest; +} + +/*! + * @brief subtract (componentwise) two quaternions and store result in dest + * + * @param[in] p quaternion 1 + * @param[in] q quaternion 2 + * @returns result quaternion + */ +CGLM_INLINE +versors +glms_quat_(sub)(versors p, versors q) { + versors dest; + glm_quat_sub(p.raw, q.raw, dest.raw); + return dest; +} + +/*! 
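For instance (a sketch under assumed inputs), building a unit rotation about +Y and renormalizing after accumulated floating-point error:

    versors q = glms_quatv(glm_rad(90.0f), (vec3s){{0.0f, 1.0f, 0.0f}});  /* 90 deg about +Y */
    float   n = glms_quat_norm(q);        /* ~1.0 for a quaternion built from axis-angle */
    versors u = glms_quat_normalize(q);   /* renormalize if error has crept in */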
+ * @brief returns normalized imaginary part of quaternion + * + * @param[in] q quaternion + */ +CGLM_INLINE +vec3s +glms_quat_(imagn)(versors q) { + vec3s dest; + glm_normalize_to(q.raw, dest.raw); + return dest; +} + +/*! + * @brief returns length of imaginary part of quaternion + * + * @param[in] q quaternion + */ +CGLM_INLINE +float +glms_quat_(imaglen)(versors q) { + return glm_quat_imaglen(q.raw); +} + +/*! + * @brief returns angle of quaternion + * + * @param[in] q quaternion + */ +CGLM_INLINE +float +glms_quat_(angle)(versors q) { + return glm_quat_angle(q.raw); +} + +/*! + * @brief axis of quaternion + * + * @param[in] q quaternion + * @returns axis of quaternion + */ +CGLM_INLINE +vec3s +glms_quat_(axis)(versors q) { + vec3s dest; + glm_quat_axis(q.raw, dest.raw); + return dest; +} + +/*! + * @brief multiplies two quaternion and stores result in dest + * this is also called Hamilton Product + * + * According to WikiPedia: + * The product of two rotation quaternions [clarification needed] will be + * equivalent to the rotation q followed by the rotation p + * + * @param[in] p quaternion 1 + * @param[in] q quaternion 2 + * @returns result quaternion + */ +CGLM_INLINE +versors +glms_quat_(mul)(versors p, versors q) { + versors dest; + glm_quat_mul(p.raw, q.raw, dest.raw); + return dest; +} + +/*! + * @brief convert quaternion to mat4 + * + * @param[in] q quaternion + * @returns result matrix + */ +CGLM_INLINE +mat4s +glms_quat_(mat4)(versors q) { + mat4s dest; + glm_quat_mat4(q.raw, dest.raw); + return dest; +} + +/*! + * @brief convert quaternion to mat4 (transposed) + * + * @param[in] q quaternion + * @returns result matrix as transposed + */ +CGLM_INLINE +mat4s +glms_quat_(mat4t)(versors q) { + mat4s dest; + glm_quat_mat4t(q.raw, dest.raw); + return dest; +} + +/*! + * @brief convert quaternion to mat3 + * + * @param[in] q quaternion + * @returns result matrix + */ +CGLM_INLINE +mat3s +glms_quat_(mat3)(versors q) { + mat3s dest; + glm_quat_mat3(q.raw, dest.raw); + return dest; +} + +/*! + * @brief convert quaternion to mat3 (transposed) + * + * @param[in] q quaternion + * @returns result matrix + */ +CGLM_INLINE +mat3s +glms_quat_(mat3t)(versors q) { + mat3s dest; + glm_quat_mat3t(q.raw, dest.raw); + return dest; +} + +/*! + * @brief interpolates between two quaternions + * using linear interpolation (LERP) + * + * @param[in] from from + * @param[in] to to + * @param[in] t interpolant (amount) + * @returns result quaternion + */ +CGLM_INLINE +versors +glms_quat_(lerp)(versors from, versors to, float t) { + versors dest; + glm_quat_lerp(from.raw, to.raw, t, dest.raw); + return dest; +} + +/*! + * @brief interpolates between two quaternions + * using linear interpolation (LERP) + * + * @param[in] from from + * @param[in] to to + * @param[in] t interpolant (amount) clamped between 0 and 1 + * @returns result quaternion + */ +CGLM_INLINE +versors +glms_quat_(lerpc)(versors from, versors to, float t) { + versors dest; + glm_quat_lerpc(from.raw, to.raw, t, dest.raw); + return dest; +} + +/*! + * @brief interpolates between two quaternions + * taking the shortest rotation path using + * normalized linear interpolation (NLERP) + * + * @param[in] from from + * @param[in] to to + * @param[in] t interpolant (amount) + * @returns result quaternion + */ +CGLM_INLINE +versors +glms_quat_(nlerp)(versors from, versors to, float t) { + versors dest; + glm_quat_nlerp(from.raw, to.raw, t, dest.raw); + return dest; +} + +/*! 
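A short composition sketch (illustrative, names assumed): combining two rotations with the Hamilton product and turning the result into a transform matrix.

    versors pitch = glms_quatv(glm_rad(30.0f), (vec3s){{1.0f, 0.0f, 0.0f}});
    versors yaw   = glms_quatv(glm_rad(45.0f), (vec3s){{0.0f, 1.0f, 0.0f}});
    versors r     = glms_quat_mul(pitch, yaw);   /* applies yaw first, then pitch */
    mat4s   model = glms_quat_mat4(r);           /* rotation part of a model matrix */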
+ * @brief interpolates between two quaternions + * using spherical linear interpolation (SLERP) + * + * @param[in] from from + * @param[in] to to + * @param[in] t amount + * @returns result quaternion + */ +CGLM_INLINE +versors +glms_quat_(slerp)(versors from, versors to, float t) { + versors dest; + glm_quat_slerp(from.raw, to.raw, t, dest.raw); + return dest; +} + +/*! + * @brief interpolates between two quaternions + * using spherical linear interpolation (SLERP) and always takes the longest path + * + * @param[in] from from + * @param[in] to to + * @param[in] t amount + * @returns result quaternion + */ +CGLM_INLINE +versors +glms_quat_(slerp_longest)(versors from, versors to, float t) { + versors dest; + glm_quat_slerp_longest(from.raw, to.raw, t, dest.raw); + return dest; +} + +/*! + * @brief creates view matrix using quaternion as camera orientation + * + * @param[in] eye eye + * @param[in] ori orientation in world space as quaternion + * @returns view matrix + */ +CGLM_INLINE +mat4s +glms_quat_(look)(vec3s eye, versors ori) { + mat4s dest; + glm_quat_look(eye.raw, ori.raw, dest.raw); + return dest; +} + +/*! + * @brief creates look rotation quaternion + * + * @param[in] dir direction to look + * @param[in] up up vector + * @returns destination quaternion + */ +CGLM_INLINE +versors +glms_quat_(for)(vec3s dir, vec3s up) { + versors dest; + glm_quat_for(dir.raw, up.raw, dest.raw); + return dest; +} + +/*! + * @brief creates look rotation quaternion using source and + * destination positions p suffix stands for position + * + * @param[in] from source point + * @param[in] to destination point + * @param[in] up up vector + * @returns destination quaternion + */ +CGLM_INLINE +versors +glms_quat_(forp)(vec3s from, vec3s to, vec3s up) { + versors dest; + glm_quat_forp(from.raw, to.raw, up.raw, dest.raw); + return dest; +} + +/*! + * @brief rotate vector using using quaternion + * + * @param[in] q quaternion + * @param[in] v vector to rotate + * @returns rotated vector + */ +CGLM_INLINE +vec3s +glms_quat_(rotatev)(versors q, vec3s v) { + vec3s dest; + glm_quat_rotatev(q.raw, v.raw, dest.raw); + return dest; +} + +/*! + * @brief rotate existing transform matrix using quaternion + * + * @param[in] m existing transform matrix + * @param[in] q quaternion + * @returns rotated matrix/transform + */ +CGLM_INLINE +mat4s +glms_quat_(rotate)(mat4s m, versors q) { + glm_quat_rotate(m.raw, q.raw, m.raw); + return m; +} + +/*! + * @brief rotate existing transform matrix using quaternion at pivot point + * + * @param[in, out] m existing transform matrix + * @param[in] q quaternion + * @returns pivot + */ +CGLM_INLINE +mat4s +glms_quat_(rotate_at)(mat4s m, versors q, vec3s pivot) { + glm_quat_rotate_at(m.raw, q.raw, pivot.raw); + return m; +} + +/*! + * @brief rotate NEW transform matrix using quaternion at pivot point + * + * this creates rotation matrix, it assumes you don't have a matrix + * + * this should work faster than glm_quat_rotate_at because it reduces + * one glm_translate. + * + * @param[in] q quaternion + * @returns pivot + */ +CGLM_INLINE +mat4s +glms_quat_(rotate_atm)(versors q, vec3s pivot) { + mat4s dest; + glm_quat_rotate_atm(dest.raw, q.raw, pivot.raw); + return dest; +} + +/*! 
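A possible camera use (sketch only; prev_ori, next_ori, t and eye are assumed state):

    versors ori  = glms_quat_slerp(prev_ori, next_ori, t);  /* smooth orientation blend */
    mat4s   view = glms_quat_look(eye, ori);                 /* view matrix from position + orientation */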
+ * @brief Create CGLM quaternion from pointer + * + * @param[in] src pointer to an array of floats + * @returns constructed quaternion from raw pointer + */ +CGLM_INLINE +versors +glms_quat_(make)(const float * __restrict src) { + versors dest; + glm_quat_make(src, dest.raw); + return dest; +} + +#endif /* cglms_quat_h */ diff --git a/external/cglm/struct/ray.h b/external/cglm/struct/ray.h new file mode 100644 index 0000000..10609b9 --- /dev/null +++ b/external/cglm/struct/ray.h @@ -0,0 +1,86 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglms_ray_h +#define cglms_ray_h + +#include "../common.h" +#include "../types-struct.h" +#include "../ray.h" + +/* api definition */ +#define glms_ray_(NAME) CGLM_STRUCTAPI(ray, NAME) + +/*! + * @brief Möller–Trumbore ray-triangle intersection algorithm + * + * @param[in] origin origin of ray + * @param[in] direction direction of ray + * @param[in] v0 first vertex of triangle + * @param[in] v1 second vertex of triangle + * @param[in] v2 third vertex of triangle + * @param[in, out] d distance to intersection + * @return whether there is intersection + */ +CGLM_INLINE +bool +glms_ray_(triangle)(vec3s origin, + vec3s direction, + vec3s v0, + vec3s v1, + vec3s v2, + float *d) { + return glm_ray_triangle(origin.raw, direction.raw, v0.raw, v1.raw, v2.raw, d); +} + +/*! + * @brief ray sphere intersection + * + * returns false if there is no intersection if true: + * + * - t1 > 0, t2 > 0: ray intersects the sphere at t1 and t2 both ahead of the origin + * - t1 < 0, t2 > 0: ray starts inside the sphere, exits at t2 + * - t1 < 0, t2 < 0: no intersection ahead of the ray ( returns false ) + * - the caller can check if the intersection points (t1 and t2) fall within a + * specific range (for example, tmin < t1, t2 < tmax) to determine if the + * intersections are within a desired segment of the ray + * + * @param[in] origin ray origin + * @param[out] dir normalized ray direction + * @param[in] s sphere [center.x, center.y, center.z, radii] + * @param[in] t1 near point1 (closer to origin) + * @param[in] t2 far point2 (farther from origin) + * + * @returns whether there is intersection + */ +CGLM_INLINE +bool +glms_ray_(sphere)(vec3s origin, + vec3s dir, + vec4s s, + float * __restrict t1, + float * __restrict t2) { + return glm_ray_sphere(origin.raw, dir.raw, s.raw, t1, t2); +} + +/*! + * @brief point using t by 𝐏(𝑡)=𝐀+𝑡𝐛 + * + * @param[in] orig origin of ray + * @param[in] dir direction of ray + * @param[in] t parameter + * @returns point point at t + */ +CGLM_INLINE +vec3s +glms_ray_(at)(vec3s orig, vec3s dir, float t) { + vec3s r; + glm_ray_at(orig.raw, dir.raw, t, r.raw); + return r; +} + +#endif /* cglms_ray_h */ diff --git a/external/cglm/struct/sphere.h b/external/cglm/struct/sphere.h new file mode 100644 index 0000000..9859c72 --- /dev/null +++ b/external/cglm/struct/sphere.h @@ -0,0 +1,93 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglms_spheres_h +#define cglms_spheres_h + +#include "../common.h" +#include "../types-struct.h" +#include "../sphere.h" +#include "mat4.h" + +/* + Sphere Representation in cglm: [center.x, center.y, center.z, radii] + + You could use this representation or you can convert it to vec4 before call + any function + */ + +/*! 
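Typical picking-style use of the ray helpers above (sketch, constants assumed):

    vec3s ro  = {{0.0f, 0.0f, 5.0f}};
    vec3s rd  = {{0.0f, 0.0f, -1.0f}};        /* must be normalized */
    vec4s sph = {{0.0f, 0.0f, 0.0f, 1.0f}};   /* center.xyz + radius */
    float t1, t2;
    if (glms_ray_sphere(ro, rd, sph, &t1, &t2)) {
      float t   = (t1 > 0.0f) ? t1 : t2;      /* t1 < 0 means the ray starts inside the sphere */
      vec3s hit = glms_ray_at(ro, rd, t);
    }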
+ * @brief helper for getting sphere radius + * + * @param[in] s sphere + * + * @return returns radii + */ +CGLM_INLINE +float +glms_sphere_radii(vec4s s) { + return glm_sphere_radii(s.raw); +} + +/*! + * @brief apply transform to sphere, it is just wrapper for glm_mat4_mulv3 + * + * @param[in] s sphere + * @param[in] m transform matrix + * @returns transformed sphere + */ +CGLM_INLINE +vec4s +glms_sphere_transform(vec4s s, mat4s m) { + vec4s r; + glm_sphere_transform(s.raw, m.raw, r.raw); + return r; +} + +/*! + * @brief merges two spheres and creates a new one + * + * two sphere must be in same space, for instance if one in world space then + * the other must be in world space too, not in local space. + * + * @param[in] s1 sphere 1 + * @param[in] s2 sphere 2 + * returns merged/extended sphere + */ +CGLM_INLINE +vec4s +glms_sphere_merge(vec4s s1, vec4s s2) { + vec4s r; + glm_sphere_merge(s1.raw, s2.raw, r.raw); + return r; +} + +/*! + * @brief check if two sphere intersects + * + * @param[in] s1 sphere + * @param[in] s2 other sphere + */ +CGLM_INLINE +bool +glms_sphere_sphere(vec4s s1, vec4s s2) { + return glm_sphere_sphere(s1.raw, s2.raw); +} + +/*! + * @brief check if sphere intersects with point + * + * @param[in] s sphere + * @param[in] point point + */ +CGLM_INLINE +bool +glms_sphere_point(vec4s s, vec3s point) { + return glm_sphere_point(s.raw, point.raw); +} + +#endif /* cglms_spheres_h */ diff --git a/external/cglm/struct/vec2-ext.h b/external/cglm/struct/vec2-ext.h new file mode 100644 index 0000000..246132f --- /dev/null +++ b/external/cglm/struct/vec2-ext.h @@ -0,0 +1,337 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/*! + * @brief SIMD like functions + */ + +/* + Functions: + CGLM_INLINE vec2s glms_vec2_fill(float val) + CGLM_INLINE bool glms_vec2_eq(vec2s v, float val) + CGLM_INLINE bool glms_vec2_eq_eps(vec2s v, float val) + CGLM_INLINE bool glms_vec2_eq_all(vec2s v) + CGLM_INLINE bool glms_vec2_eqv(vec2s a, vec2s b) + CGLM_INLINE bool glms_vec2_eqv_eps(vec2s a, vec2s b) + CGLM_INLINE float glms_vec2_max(vec2s v) + CGLM_INLINE float glms_vec2_min(vec2s v) + CGLM_INLINE bool glms_vec2_isnan(vec2s v) + CGLM_INLINE bool glms_vec2_isinf(vec2s v) + CGLM_INLINE bool glms_vec2_isvalid(vec2s v) + CGLM_INLINE vec2s glms_vec2_sign(vec2s v) + CGLM_INLINE vec2s glms_vec2_abs(vec2s v) + CGLM_INLINE vec2s glms_vec2_fract(vec2s v) + CGLM_INLINE vec2s glms_vec2_floor(vec2s v) + CGLM_INLINE vec2s glms_vec2_mods(vec2s v, float s) + CGLM_INLINE vec2s glms_vec2_steps(float edge, vec2s v) + CGLM_INLINE vec2s glms_vec2_stepr(vec2s edge, float v) + CGLM_INLINE vec2s glms_vec2_sqrt(vec2s v) + */ + +#ifndef cglms_vec2s_ext_h +#define cglms_vec2s_ext_h + +#include "../common.h" +#include "../types-struct.h" +#include "../util.h" +#include "../vec2-ext.h" + +/* api definition */ +#define glms_vec2_(NAME) CGLM_STRUCTAPI(vec2, NAME) + +/*! + * @brief fill a vector with specified value + * + * @param[in] val value + * @returns dest + */ +CGLM_INLINE +vec2s +glms_vec2_(fill)(float val) { + vec2s r; + glm_vec2_fill(r.raw, val); + return r; +} + +/*! + * @brief check if vector is equal to value (without epsilon) + * + * @param[in] v vector + * @param[in] val value + */ +CGLM_INLINE +bool +glms_vec2_(eq)(vec2s v, float val) { + return glm_vec2_eq(v.raw, val); +} + +/*! 
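Picking up the sphere helpers above, a small bounding-volume sketch (illustrative; both spheres are assumed to be in world space):

    vec4s a      = {{0.0f, 0.0f, 0.0f, 1.0f}};
    vec4s b      = {{3.0f, 0.0f, 0.0f, 1.0f}};
    vec4s merged = glms_sphere_merge(a, b);    /* one sphere enclosing both */
    bool  inside = glms_sphere_point(merged, (vec3s){{1.0f, 0.0f, 0.0f}});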
+ * @brief check if vector is equal to value (with epsilon) + * + * @param[in] v vector + * @param[in] val value + */ +CGLM_INLINE +bool +glms_vec2_(eq_eps)(vec2s v, float val) { + return glm_vec2_eq_eps(v.raw, val); +} + +/*! + * @brief check if vector members are equal (without epsilon) + * + * @param[in] v vector + */ +CGLM_INLINE +bool +glms_vec2_(eq_all)(vec2s v) { + return glm_vec2_eq_all(v.raw); +} + +/*! + * @brief check if vector is equal to another (without epsilon) + * + * @param[in] a vector + * @param[in] b vector + */ +CGLM_INLINE +bool +glms_vec2_(eqv)(vec2s a, vec2s b) { + return glm_vec2_eqv(a.raw, b.raw); +} + +/*! + * @brief check if vector is equal to another (with epsilon) + * + * @param[in] a vector + * @param[in] b vector + */ +CGLM_INLINE +bool +glms_vec2_(eqv_eps)(vec2s a, vec2s b) { + return glm_vec2_eqv_eps(a.raw, b.raw); +} + +/*! + * @brief max value of vector + * + * @param[in] v vector + */ +CGLM_INLINE +float +glms_vec2_(max)(vec2s v) { + return glm_vec2_max(v.raw); +} + +/*! + * @brief min value of vector + * + * @param[in] v vector + */ +CGLM_INLINE +float +glms_vec2_min(vec2s v) { + return glm_vec2_min(v.raw); +} + +/*! + * @brief check if one of items is NaN (not a number) + * you should only use this in DEBUG mode or very critical asserts + * + * @param[in] v vector + */ +CGLM_INLINE +bool +glms_vec2_(isnan)(vec2s v) { + return glm_vec2_isnan(v.raw); +} + +/*! + * @brief check if one of items is INFINITY + * you should only use this in DEBUG mode or very critical asserts + * + * @param[in] v vector + */ +CGLM_INLINE +bool +glms_vec2_(isinf)(vec2s v) { + return glm_vec2_isinf(v.raw); +} + +/*! + * @brief check if all items are valid number + * you should only use this in DEBUG mode or very critical asserts + * + * @param[in] v vector + */ +CGLM_INLINE +bool +glms_vec2_isvalid(vec2s v) { + return glm_vec2_isvalid(v.raw); +} + +/*! + * @brief get sign of 32 bit float as +1, -1, 0 + * + * Important: It returns 0 for zero/NaN input + * + * @param v vector + * @returns sign vector + */ +CGLM_INLINE +vec2s +glms_vec2_(sign)(vec2s v) { + vec2s r; + glm_vec2_sign(v.raw, r.raw); + return r; +} + +/*! + * @brief fractional part of each vector item + * + * @param v vector + * @returns abs vector + */ +CGLM_INLINE +vec2s +glms_vec2_(abs)(vec2s v) { + vec2s r; + glm_vec2_abs(v.raw, r.raw); + return r; +} + +/*! + * @brief fractional part of each vector item + * + * @param[in] v vector + * @returns destination vector + */ +CGLM_INLINE +vec2s +glms_vec2_(fract)(vec2s v) { + vec2s r; + glm_vec2_fract(v.raw, r.raw); + return r; +} + +/*! + * @brief floor of each vector item + * + * @param[in] v vector + * @returns destination vector + */ +CGLM_INLINE +vec2s +glms_vec2_(floor)(vec2s v) { + vec2s r; + glm_vec2_floor(v.raw, r.raw); + return r; +} + +/*! + * @brief mod of each vector item by scalar + * + * @param[in] v vector + * @param[in] s scalar + * @returns destination vector + */ +CGLM_INLINE +vec2s +glms_vec2_(mods)(vec2s v, float s) { + vec2s r; + glm_vec2_mods(v.raw, s, r.raw); + return r; +} + +/*! + * @brief threshold each vector item with scalar + * condition is: (x[i] < edge) ? 0.0 : 1.0 + * + * @param[in] edge threshold + * @param[in] x vector to test against threshold + * @returns destination + */ +CGLM_INLINE +vec2s +glms_vec2_(steps)(float edge, vec2s x) { + vec2s r; + glm_vec2_steps(edge, x.raw, r.raw); + return r; +} + +/*! + * @brief threshold a value with *vector* as the threshold + * condition is: (x < edge[i]) ? 
0.0 : 1.0 + * + * @param[in] edge threshold vector + * @param[in] x value to test against threshold + * @returns destination + */ +CGLM_INLINE +vec2s +glms_vec2_(stepr)(vec2s edge, float x) { + vec2s r; + glm_vec2_stepr(edge.raw, x, r.raw); + return r; +} + +/*! + * @brief square root of each vector item + * + * @param[in] v vector + * @returns destination vector + */ +CGLM_INLINE +vec2s +glms_vec2_(sqrt)(vec2s v) { + vec2s r; + glm_vec2_sqrt(v.raw, r.raw); + return r; +} + +/*! + * @brief treat vectors as complex numbers and multiply them as such. + * + * @param[in] a left number + * @param[in] b right number + * @param[out] dest destination number + */ +CGLM_INLINE +vec2s +glms_vec2_(complex_mul)(vec2s a, vec2s b, vec2s dest) { + glm_vec2_complex_mul(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief treat vectors as complex numbers and divide them as such. + * + * @param[in] a left number (numerator) + * @param[in] b right number (denominator) + * @param[out] dest destination number + */ +CGLM_INLINE +vec2s +glms_vec2_(complex_div)(vec2s a, vec2s b, vec2s dest) { + glm_vec2_complex_div(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief treat the vector as a complex number and conjugate it as such. + * + * @param[in] a the number + * @param[out] dest destination number + */ +CGLM_INLINE +vec2s +glms_vec2_(complex_conjugate)(vec2s a, vec2s dest) { + glm_vec2_complex_conjugate(a.raw, dest.raw); + return dest; +} + +#endif /* cglms_vec2s_ext_h */ diff --git a/external/cglm/struct/vec2.h b/external/cglm/struct/vec2.h new file mode 100644 index 0000000..40ed659 --- /dev/null +++ b/external/cglm/struct/vec2.h @@ -0,0 +1,747 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLMS_VEC2_ONE_INIT + GLMS_VEC2_ZERO_INIT + GLMS_VEC2_ONE + GLMS_VEC2_ZERO + + Functions: + CGLM_INLINE vec2s glms_vec2(vec3s v3) + CGLM_INLINE void glms_vec2_pack(vec2s dst[], vec2 src[], size_t len) + CGLM_INLINE void glms_vec2_unpack(vec2 dst[], vec2s src[], size_t len) + CGLM_INLINE vec2s glms_vec2_zero(void) + CGLM_INLINE vec2s glms_vec2_one(void) + CGLM_INLINE float glms_vec2_dot(vec2s a, vec2s b) + CGLM_INLINE float glms_vec2_cross(vec2s a, vec2s b) + CGLM_INLINE float glms_vec2_norm2(vec2s v) + CGLM_INLINE float glms_vec2_norm(vec2s v) + CGLM_INLINE vec2s glms_vec2_add(vec2s a, vec2s b) + CGLM_INLINE vec2s glms_vec2_adds(vec2s a, float s) + CGLM_INLINE vec2s glms_vec2_sub(vec2s a, vec2s b) + CGLM_INLINE vec2s glms_vec2_subs(vec2s a, float s) + CGLM_INLINE vec2s glms_vec2_mul(vec2s a, vec2s b) + CGLM_INLINE vec2s glms_vec2_scale(vec2s v, float s) + CGLM_INLINE vec2s glms_vec2_scale_as(vec2s v, float s) + CGLM_INLINE vec2s glms_vec2_div(vec2s a, vec2s b) + CGLM_INLINE vec2s glms_vec2_divs(vec2s a, float s) + CGLM_INLINE vec2s glms_vec2_addadd(vec2s a, vec2s b, vec2s dest) + CGLM_INLINE vec2s glms_vec2_subadd(vec2s a, vec2s b, vec2s dest) + CGLM_INLINE vec2s glms_vec2_muladd(vec2s a, vec2s b, vec2s dest) + CGLM_INLINE vec2s glms_vec2_muladds(vec2s a, float s, vec2s dest) + CGLM_INLINE vec2s glms_vec2_maxadd(vec2s a, vec2s b, vec2s dest) + CGLM_INLINE vec2s glms_vec2_minadd(vec2s a, vec2s b, vec2s dest) + CGLM_INLINE vec2s glms_vec2_subsub(vec2s a, vec2s b, vec2s dest) + CGLM_INLINE vec2s glms_vec2_addsub(vec2s a, vec2s b, vec2s dest) + CGLM_INLINE vec2s glms_vec2_mulsub(vec2s a, vec2s b, vec2s dest) + CGLM_INLINE vec2s glms_vec2_mulsubs(vec2s a, float s, vec2s dest) + 
CGLM_INLINE vec2s glms_vec2_maxsub(vec2s a, vec2s b, vec2s dest) + CGLM_INLINE vec2s glms_vec2_minsub(vec2s a, vec2s b, vec2s dest) + CGLM_INLINE vec2s glms_vec2_negate(vec2s v) + CGLM_INLINE vec2s glms_vec2_normalize(vec2s v) + CGLM_INLINE vec2s glms_vec2_rotate(vec2s v, float angle, vec2s axis) + CGLM_INLINE vec2s glms_vec2_center(vec2s a, vec2s b) + CGLM_INLINE float glms_vec2_distance(vec2s a, vec2s b) + CGLM_INLINE float glms_vec2_distance2(vec2s a, vec2s b) + CGLM_INLINE vec2s glms_vec2_maxv(vec2s a, vec2s b) + CGLM_INLINE vec2s glms_vec2_minv(vec2s a, vec2s b) + CGLM_INLINE vec2s glms_vec2_clamp(vec2s v, float minVal, float maxVal) + CGLM_INLINE vec2s glms_vec2_lerp(vec2s from, vec2s to, float t) + CGLM_INLINE vec2s glms_vec2_step(vec2s edge, vec2s x) + CGLM_INLINE vec2s glms_vec2_make(float * restrict src) + CGLM_INLINE vec2s glms_vec2_reflect(vec2s v, vec2s n) + CGLM_INLINE bool glms_vec2_refract(vec2s v, vec2s n, float eta, vec2s *dest) + */ + +#ifndef cglms_vec2s_h +#define cglms_vec2s_h + +#include "../common.h" +#include "../types-struct.h" +#include "../util.h" +#include "../vec2.h" +#include "vec2-ext.h" + +#define GLMS_VEC2_ONE_INIT {GLM_VEC2_ONE_INIT} +#define GLMS_VEC2_ZERO_INIT {GLM_VEC2_ZERO_INIT} + +#define GLMS_VEC2_ONE ((vec2s)GLMS_VEC2_ONE_INIT) +#define GLMS_VEC2_ZERO ((vec2s)GLMS_VEC2_ZERO_INIT) + +/*! + * @brief init vec2 using vec2 + * + * @param[in] v3 vector3 + * @returns destination + */ +CGLM_INLINE +vec2s +glms_vec2(vec3s v3) { + vec2s r; + glm_vec2(v3.raw, r.raw); + return r; +} + +/*! + * @brief pack an array of vec2 into an array of vec2s + * + * @param[out] dst array of vec2 + * @param[in] src array of vec2s + * @param[in] len number of elements + */ +CGLM_INLINE +void +glms_vec2_(pack)(vec2s dst[], vec2 src[], size_t len) { + size_t i; + + for (i = 0; i < len; i++) { + glm_vec2_copy(src[i], dst[i].raw); + } +} + +/*! + * @brief unpack an array of vec2s into an array of vec2 + * + * @param[out] dst array of vec2s + * @param[in] src array of vec2 + * @param[in] len number of elements + */ +CGLM_INLINE +void +glms_vec2_(unpack)(vec2 dst[], vec2s src[], size_t len) { + size_t i; + + for (i = 0; i < len; i++) { + glm_vec2_copy(src[i].raw, dst[i]); + } +} + +/*! + * @brief make vector zero + * + * @returns zero vector + */ +CGLM_INLINE +vec2s +glms_vec2_(zero)(void) { + vec2s r; + glm_vec2_zero(r.raw); + return r; +} + +/*! + * @brief make vector one + * + * @returns one vector + */ +CGLM_INLINE +vec2s +glms_vec2_(one)(void) { + vec2s r; + glm_vec2_one(r.raw); + return r; +} + +/*! + * @brief vec2 dot product + * + * @param[in] a vector1 + * @param[in] b vector2 + * + * @return dot product + */ +CGLM_INLINE +float +glms_vec2_(dot)(vec2s a, vec2s b) { + return glm_vec2_dot(a.raw, b.raw); +} + +/*! + * @brief vec2 cross product + * + * REF: http://allenchou.net/2013/07/cross-product-of-2d-vectors/ + * + * @param[in] a vector1 + * @param[in] b vector2 + * + * @return Z component of cross product + */ +CGLM_INLINE +float +glms_vec2_(cross)(vec2s a, vec2s b) { + return glm_vec2_cross(a.raw, b.raw); +} + +/*! + * @brief norm * norm (magnitude) of vec + * + * we can use this func instead of calling norm * norm, because it would call + * sqrtf function twice but with this func we can avoid func call, maybe this is + * not good name for this func + * + * @param[in] v vector + * + * @return norm * norm + */ +CGLM_INLINE +float +glms_vec2_(norm2)(vec2s v) { + return glm_vec2_norm2(v.raw); +} + +/*! 
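The scalar 2D cross product is handy for winding/turn tests; a tiny sketch:

    vec2s a = {{1.0f, 0.0f}};
    vec2s b = {{0.0f, 1.0f}};
    float z = glms_vec2_cross(a, b);   /* +1.0 here: b is a counter-clockwise turn from a */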
+ * @brief norm (magnitude) of vec2 + * + * @param[in] v vector + * + * @return norm + */ +CGLM_INLINE +float +glms_vec2_(norm)(vec2s v) { + return glm_vec2_norm(v.raw); +} + +/*! + * @brief add a vector to b vector store result in dest + * + * @param[in] a vector1 + * @param[in] b vector2 + * @returns destination vector + */ +CGLM_INLINE +vec2s +glms_vec2_(add)(vec2s a, vec2s b) { + vec2s r; + glm_vec2_add(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief add scalar to v vector store result in dest (d = v + s) + * + * @param[in] a vector + * @param[in] s scalar + * @returns destination vector + */ +CGLM_INLINE +vec2s +glms_vec2_(adds)(vec2s a, float s) { + vec2s r; + glm_vec2_adds(a.raw, s, r.raw); + return r; +} + +/*! + * @brief subtract b vector from a vector store result in dest + * + * @param[in] a vector1 + * @param[in] b vector2 + * @returns destination vector + */ +CGLM_INLINE +vec2s +glms_vec2_(sub)(vec2s a, vec2s b) { + vec2s r; + glm_vec2_sub(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief subtract scalar from v vector store result in dest (d = v - s) + * + * @param[in] a vector + * @param[in] s scalar + * @returns destination vector + */ +CGLM_INLINE +vec2s +glms_vec2_(subs)(vec2s a, float s) { + vec2s r; + glm_vec2_subs(a.raw, s, r.raw); + return r; +} + +/*! + * @brief multiply two vectors (component-wise multiplication) + * + * @param a vector1 + * @param b vector2 + * @returns result = (a[0] * b[0], a[1] * b[1]) + */ +CGLM_INLINE +vec2s +glms_vec2_(mul)(vec2s a, vec2s b) { + vec2s r; + glm_vec2_mul(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief multiply/scale vec2 vector with scalar: result = v * s + * + * @param[in] v vector + * @param[in] s scalar + * @returns destination vector + */ +CGLM_INLINE +vec2s +glms_vec2_(scale)(vec2s v, float s) { + vec2s r; + glm_vec2_scale(v.raw, s, r.raw); + return r; +} + +/*! + * @brief make vec2 vector scale as specified: result = unit(v) * s + * + * @param[in] v vector + * @param[in] s scalar + * @returns destination vector + */ +CGLM_INLINE +vec2s +glms_vec2_(scale_as)(vec2s v, float s) { + vec2s r; + glm_vec2_scale_as(v.raw, s, r.raw); + return r; +} + +/*! + * @brief div vector with another component-wise division: d = a / b + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns result = (a[0]/b[0], a[1]/b[1]) + */ +CGLM_INLINE +vec2s +glms_vec2_(div)(vec2s a, vec2s b) { + vec2s r; + glm_vec2_div(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief div vector with scalar: d = v / s + * + * @param[in] a vector + * @param[in] s scalar + * @returns result = (a[0]/s, a[1]/s) + */ +CGLM_INLINE +vec2s +glms_vec2_(divs)(vec2s a, float s) { + vec2s r; + glm_vec2_divs(a.raw, s, r.raw); + return r; +} + +/*! + * @brief add two vectors and add result to sum + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest += (a + b) + */ +CGLM_INLINE +vec2s +glms_vec2_(addadd)(vec2s a, vec2s b, vec2s dest) { + glm_vec2_addadd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief sub two vectors and add result to dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest += (a + b) + */ +CGLM_INLINE +vec2s +glms_vec2_(subadd)(vec2s a, vec2s b, vec2s dest) { + glm_vec2_subadd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! 
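Note the pattern of the *add helpers: they accumulate with +=, so the destination must be initialized first. Sketch (a and b assumed):

    vec2s accum = glms_vec2_zero();
    accum = glms_vec2_addadd(a, b, accum);   /* accum += (a + b) */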
+ * @brief mul two vectors and add result to dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest += (a * b) + */ +CGLM_INLINE +vec2s +glms_vec2_(muladd)(vec2s a, vec2s b, vec2s dest) { + glm_vec2_muladd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief mul vector with scalar and add result to sum + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @returns dest += (a * b) + */ +CGLM_INLINE +vec2s +glms_vec2_(muladds)(vec2s a, float s, vec2s dest) { + glm_vec2_muladds(a.raw, s, dest.raw); + return dest; +} + +/*! + * @brief add max of two vectors to result/dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest += max(a, b) + */ +CGLM_INLINE +vec2s +glms_vec2_(maxadd)(vec2s a, vec2s b, vec2s dest) { + glm_vec2_maxadd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief add min of two vectors to result/dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest += min(a, b) + */ +CGLM_INLINE +vec2s +glms_vec2_(minadd)(vec2s a, vec2s b, vec2s dest) { + glm_vec2_minadd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief sub two vectors and sub result to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest -= (a - b) + */ +CGLM_INLINE +vec2s +glms_vec2_(subsub)(vec2s a, vec2s b, vec2s dest) { + glm_vec2_subsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief add two vectors and sub result to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest -= (a + b) + */ +CGLM_INLINE +vec2s +glms_vec2_(addsub)(vec2s a, vec2s b, vec2s dest) { + glm_vec2_addsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief mul two vectors and sub result to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest -= (a * b) + */ +CGLM_INLINE +vec2s +glms_vec2_(mulsub)(vec2s a, vec2s b, vec2s dest) { + glm_vec2_mulsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief mul vector with scalar and sub result to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @returns dest -= (a * b) + */ +CGLM_INLINE +vec2s +glms_vec2_(mulsubs)(vec2s a, float s, vec2s dest) { + glm_vec2_mulsubs(a.raw, s, dest.raw); + return dest; +} + +/*! + * @brief sub max of two vectors to dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest -= max(a, b) + */ +CGLM_INLINE +vec2s +glms_vec2_(maxsub)(vec2s a, vec2s b, vec2s dest) { + glm_vec2_maxsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief sub min of two vectors to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest -= min(a, b) + */ +CGLM_INLINE +vec2s +glms_vec2_(minsub)(vec2s a, vec2s b, vec2s dest) { + glm_vec2_minsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! 
+ * @brief negate vector components + * + * @param[in] v vector + * @returns negated vector + */ +CGLM_INLINE +vec2s +glms_vec2_(negate)(vec2s v) { + glm_vec2_negate(v.raw); + return v; +} + +/*! + * @brief normalize vec2 and store result in same vec + * + * @param[in] v vector + * @returns normalized vector + */ +CGLM_INLINE +vec2s +glms_vec2_(normalize)(vec2s v) { + glm_vec2_normalize(v.raw); + return v; +} + +/*! + * @brief rotate vec2 by angle using Rodrigues' rotation formula + * + * @param[in] v vector + * @param[in] angle angle by radians + * @returns rotated vector + */ +CGLM_INLINE +vec2s +glms_vec2_(rotate)(vec2s v, float angle) { + vec2s r; + glm_vec2_rotate(v.raw, angle, r.raw); + return r; +} + +/** + * @brief find center point of two vector + * + * @param[in] a vector1 + * @param[in] b vector2 + * @returns center point + */ +CGLM_INLINE +vec2s +glms_vec2_(center)(vec2s a, vec2s b) { + vec2s r; + glm_vec2_center(a.raw, b.raw, r.raw); + return r; +} + +/** + * @brief distance between two vectors + * + * @param[in] a vector1 + * @param[in] b vector2 + * @return distance + */ +CGLM_INLINE +float +glms_vec2_(distance)(vec2s a, vec2s b) { + return glm_vec2_distance(a.raw, b.raw); +} + +/** + * @brief squared distance between two vectors + * + * @param[in] a vector1 + * @param[in] b vector2 + * @return squared distance (distance * distance) + */ +CGLM_INLINE +float +glms_vec2_(distance2)(vec2s a, vec2s b) { + return glm_vec2_distance2(a.raw, b.raw); +} + +/*! + * @brief max values of vectors + * + * @param[in] a vector1 + * @param[in] b vector2 + * @returns destination + */ +CGLM_INLINE +vec2s +glms_vec2_(maxv)(vec2s a, vec2s b) { + vec2s r; + glm_vec2_maxv(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief min values of vectors + * + * @param[in] a vector1 + * @param[in] b vector2 + * @returns destination + */ +CGLM_INLINE +vec2s +glms_vec2_(minv)(vec2s a, vec2s b) { + vec2s r; + glm_vec2_minv(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief clamp vector's individual members between min and max values + * + * @param[in] v vector + * @param[in] minVal minimum value + * @param[in] maxVal maximum value + * @returns clamped vector + */ +CGLM_INLINE +vec2s +glms_vec2_(clamp)(vec2s v, float minVal, float maxVal) { + glm_vec2_clamp(v.raw, minVal, maxVal); + return v; +} + +/*! + * @brief linear interpolation between two vectors + * + * formula: from + s * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) + * @returns destination + */ +CGLM_INLINE +vec2s +glms_vec2_(lerp)(vec2s from, vec2s to, float t) { + vec2s r; + glm_vec2_lerp(from.raw, to.raw, t, r.raw); + return r; +} + +/*! + * @brief threshold function + * + * @param[in] edge threshold + * @param[in] x value to test against threshold + * @returns destination + */ +CGLM_INLINE +vec2s +glms_vec2_(step)(vec2s edge, vec2s x) { + vec2s r; + glm_vec2_step(edge.raw, x.raw, r.raw); + return r; +} + +/*! + * @brief Create two dimensional vector from pointer + * + * @param[in] src pointer to an array of floats + * @returns constructed 2D vector from raw pointer + */ +CGLM_INLINE +vec2s +glms_vec2_(make)(const float * __restrict src) { + vec2s dest; + glm_vec2_make(src, dest.raw); + return dest; +} + +/*! 
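For example (sketch; pos and target are assumed vec2s state), the lerp/clamp pair above covers simple per-frame smoothing:

    pos = glms_vec2_lerp(pos, target, 0.1f);   /* move 10% of the way toward target */
    pos = glms_vec2_clamp(pos, 0.0f, 1.0f);    /* keep each component in [0, 1] */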
+ * @brief reflection vector using an incident ray and a surface normal + * + * @param[in] I incident vector + * @param[in] N normalized normal vector + * @returns reflection result + */ +CGLM_INLINE +vec2s +glms_vec2_(reflect)(vec2s v, vec2s n) { + vec2s dest; + glm_vec2_reflect(v.raw, n.raw, dest.raw); + return dest; +} + +/*! + * @brief computes refraction vector for an incident vector and a surface normal. + * + * calculates the refraction vector based on Snell's law. If total internal reflection + * occurs (angle too great given eta), dest is set to zero and returns false. + * Otherwise, computes refraction vector, stores it in dest, and returns true. + * + * @param[in] v normalized incident vector + * @param[in] n normalized normal vector + * @param[in] eta ratio of indices of refraction (incident/transmitted) + * @param[out] dest refraction vector if refraction occurs; zero vector otherwise + * + * @returns true if refraction occurs; false if total internal reflection occurs. + */ +CGLM_INLINE +bool +glms_vec2_(refract)(vec2s v, vec2s n, float eta, vec2s * __restrict dest) { + return glm_vec2_refract(v.raw, n.raw, eta, dest->raw); +} + +#endif /* cglms_vec2s_h */ diff --git a/external/cglm/struct/vec3-ext.h b/external/cglm/struct/vec3-ext.h new file mode 100644 index 0000000..6cd8ca0 --- /dev/null +++ b/external/cglm/struct/vec3-ext.h @@ -0,0 +1,325 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/*! + * @brief SIMD like functions + */ + +/* + Functions: + CGLM_INLINE vec3s glms_vec3_broadcast(float val); + CGLM_INLINE vec3s glms_vec3_fill(float val); + CGLM_INLINE bool glms_vec3_eq(vec3s v, float val); + CGLM_INLINE bool glms_vec3_eq_eps(vec3s v, float val); + CGLM_INLINE bool glms_vec3_eq_all(vec3s v); + CGLM_INLINE bool glms_vec3_eqv(vec3s a, vec3s b); + CGLM_INLINE bool glms_vec3_eqv_eps(vec3s a, vec3s b); + CGLM_INLINE float glms_vec3_max(vec3s v); + CGLM_INLINE float glms_vec3_min(vec3s v); + CGLM_INLINE bool glms_vec3_isnan(vec3s v); + CGLM_INLINE bool glms_vec3_isinf(vec3s v); + CGLM_INLINE bool glms_vec3_isvalid(vec3s v); + CGLM_INLINE vec3s glms_vec3_sign(vec3s v); + CGLM_INLINE vec3s glms_vec3_abs(vec3s v); + CGLM_INLINE vec3s glms_vec3_fract(vec3s v); + CGLM_INLINE vec3s glms_vec3_floor(vec3s v); + CGLM_INLINE vec3s glms_vec3_mods(vec3s v, float s); + CGLM_INLINE vec3s glms_vec3_steps(float edge, vec3s v); + CGLM_INLINE vec3s glms_vec3_stepr(vec3s edge, float v); + CGLM_INLINE float glms_vec3_hadd(vec3s v); + CGLM_INLINE vec3s glms_vec3_sqrt(vec3s v); + */ + +#ifndef cglms_vec3s_ext_h +#define cglms_vec3s_ext_h + +#include "../common.h" +#include "../types-struct.h" +#include "../util.h" +#include "../vec3-ext.h" + +/* api definition */ +#define glms_vec3_(NAME) CGLM_STRUCTAPI(vec3, NAME) + +/*! + * @brief fill a vector with specified value + * + * @param[in] val value + * @returns dest + */ +CGLM_INLINE +vec3s +glms_vec3_(broadcast)(float val) { + vec3s r; + glm_vec3_broadcast(val, r.raw); + return r; +} + +/*! + * @brief fill a vector with specified value + * + * @param[in] val value + * @returns dest + */ +CGLM_INLINE +vec3s +glms_vec3_(fill)(float val) { + vec3s r; + glm_vec3_fill(r.raw, val); + return r; +} + +/*! + * @brief check if vector is equal to value (without epsilon) + * + * @param[in] v vector + * @param[in] val value + */ +CGLM_INLINE +bool +glms_vec3_(eq)(vec3s v, float val) { + return glm_vec3_eq(v.raw, val); +} + +/*! 
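A common pattern with the vec2 reflect/refract pair above (sketch; dir and n are assumed unit vectors):

    vec2s out;
    if (!glms_vec2_refract(dir, n, 1.0f / 1.5f, &out))   /* e.g. air -> glass ratio */
      out = glms_vec2_reflect(dir, n);                   /* total internal reflection: fall back to reflection */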
+ * @brief check if vector is equal to value (with epsilon) + * + * @param[in] v vector + * @param[in] val value + */ +CGLM_INLINE +bool +glms_vec3_(eq_eps)(vec3s v, float val) { + return glm_vec3_eq_eps(v.raw, val); +} + +/*! + * @brief check if vector members are equal (without epsilon) + * + * @param[in] v vector + */ +CGLM_INLINE +bool +glms_vec3_(eq_all)(vec3s v) { + return glm_vec3_eq_all(v.raw); +} + +/*! + * @brief check if vector is equal to another (without epsilon) + * + * @param[in] a vector + * @param[in] b vector + */ +CGLM_INLINE +bool +glms_vec3_(eqv)(vec3s a, vec3s b) { + return glm_vec3_eqv(a.raw, b.raw); +} + +/*! + * @brief check if vector is equal to another (with epsilon) + * + * @param[in] a vector + * @param[in] b vector + */ +CGLM_INLINE +bool +glms_vec3_(eqv_eps)(vec3s a, vec3s b) { + return glm_vec3_eqv_eps(a.raw, b.raw); +} + +/*! + * @brief max value of vector + * + * @param[in] v vector + */ +CGLM_INLINE +float +glms_vec3_(max)(vec3s v) { + return glm_vec3_max(v.raw); +} + +/*! + * @brief min value of vector + * + * @param[in] v vector + */ +CGLM_INLINE +float +glms_vec3_(min)(vec3s v) { + return glm_vec3_min(v.raw); +} + +/*! + * @brief check if one of items is NaN (not a number) + * you should only use this in DEBUG mode or very critical asserts + * + * @param[in] v vector + */ +CGLM_INLINE +bool +glms_vec3_(isnan)(vec3s v) { + return glm_vec3_isnan(v.raw); +} + +/*! + * @brief check if one of items is INFINITY + * you should only use this in DEBUG mode or very critical asserts + * + * @param[in] v vector + */ +CGLM_INLINE +bool +glms_vec3_(isinf)(vec3s v) { + return glm_vec3_isinf(v.raw); +} + +/*! + * @brief check if all items are valid number + * you should only use this in DEBUG mode or very critical asserts + * + * @param[in] v vector + */ +CGLM_INLINE +bool +glms_vec3_(isvalid)(vec3s v) { + return glm_vec3_isvalid(v.raw); +} + +/*! + * @brief get sign of 32 bit float as +1, -1, 0 + * + * Important: It returns 0 for zero/NaN input + * + * @param v vector + * @returns sign vector + */ +CGLM_INLINE +vec3s +glms_vec3_(sign)(vec3s v) { + vec3s r; + glm_vec3_sign(v.raw, r.raw); + return r; +} + +/*! + * @brief absolute value of each vector item + * + * @param[in] v vector + * @return destination vector + */ +CGLM_INLINE +vec3s +glms_vec3_(abs)(vec3s v) { + vec3s r; + glm_vec3_abs(v.raw, r.raw); + return r; +} + +/*! + * @brief fractional part of each vector item + * + * @param[in] v vector + * @return dest destination vector + */ +CGLM_INLINE +vec3s +glms_vec3_(fract)(vec3s v) { + vec3s r; + glm_vec3_fract(v.raw, r.raw); + return r; +} + +/*! + * @brief floor of each vector item + * + * @param[in] v vector + * @return dest destination vector + */ +CGLM_INLINE +vec3s +glms_vec3_(floor)(vec3s v) { + vec3s r; + glm_vec3_floor(v.raw, r.raw); + return r; +} + +/*! + * @brief mod of each vector item by scalar + * + * @param[in] v vector + * @param[in] s scalar + * @returns destination vector + */ +CGLM_INLINE +vec3s +glms_vec3_(mods)(vec3s v, float s) { + vec3s r; + glm_vec3_mods(v.raw, s, r.raw); + return r; +} + +/*! + * @brief threshold each vector item with scalar + * condition is: (x[i] < edge) ? 0.0 : 1.0 + * + * @param[in] edge threshold + * @param[in] x vector to test against threshold + * @returns destination + */ +CGLM_INLINE +vec3s +glms_vec3_(steps)(float edge, vec3s x) { + vec3s r; + glm_vec3_steps(edge, x.raw, r.raw); + return r; +} + +/*! + * @brief threshold a value with *vector* as the threshold + * condition is: (x < edge[i]) ? 
0.0 : 1.0 + * + * @param[in] edge threshold vector + * @param[in] x value to test against threshold + * @returns destination + */ +CGLM_INLINE +vec3s +glms_vec3_(stepr)(vec3s edge, float x) { + vec3s r; + glm_vec3_stepr(edge.raw, x, r.raw); + return r; +} + +/*! + * @brief vector reduction by summation + * @warning could overflow + * + * @param[in] v vector + * @return sum of all vector's elements + */ +CGLM_INLINE +float +glms_vec3_(hadd)(vec3s v) { + return glm_vec3_hadd(v.raw); +} + +/*! + * @brief square root of each vector item + * + * @param[in] v vector + * @returns destination vector + */ +CGLM_INLINE +vec3s +glms_vec3_(sqrt)(vec3s v) { + vec3s r; + glm_vec3_sqrt(v.raw, r.raw); + return r; +} + +#endif /* cglms_vec3s_ext_h */ diff --git a/external/cglm/struct/vec3.h b/external/cglm/struct/vec3.h new file mode 100644 index 0000000..a1d901e --- /dev/null +++ b/external/cglm/struct/vec3.h @@ -0,0 +1,1132 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLMS_VEC3_ONE_INIT + GLMS_VEC3_ZERO_INIT + GLMS_VEC3_ONE + GLMS_VEC3_ZERO + GLMS_YUP + GLMS_ZUP + GLMS_XUP + + Functions: + CGLM_INLINE vec3s glms_vec3(vec4s v4); + CGLM_INLINE void glms_vec3_pack(vec3s dst[], vec3 src[], size_t len); + CGLM_INLINE void glms_vec3_unpack(vec3 dst[], vec3s src[], size_t len); + CGLM_INLINE vec3s glms_vec3_zero(void); + CGLM_INLINE vec3s glms_vec3_one(void); + CGLM_INLINE float glms_vec3_dot(vec3s a, vec3s b); + CGLM_INLINE float glms_vec3_norm2(vec3s v); + CGLM_INLINE float glms_vec3_norm(vec3s v); + CGLM_INLINE float glms_vec3_norm_one(vec3s v); + CGLM_INLINE float glms_vec3_norm_inf(vec3s v); + CGLM_INLINE vec3s glms_vec3_add(vec3s a, vec3s b); + CGLM_INLINE vec3s glms_vec3_adds(vec3s a, float s); + CGLM_INLINE vec3s glms_vec3_sub(vec3s a, vec3s b); + CGLM_INLINE vec3s glms_vec3_subs(vec3s a, float s); + CGLM_INLINE vec3s glms_vec3_mul(vec3s a, vec3s b); + CGLM_INLINE vec3s glms_vec3_scale(vec3s v, float s); + CGLM_INLINE vec3s glms_vec3_scale_as(vec3s v, float s); + CGLM_INLINE vec3s glms_vec3_div(vec3s a, vec3s b); + CGLM_INLINE vec3s glms_vec3_divs(vec3s a, float s); + CGLM_INLINE vec3s glms_vec3_addadd(vec3s a, vec3s b, vec3s dest); + CGLM_INLINE vec3s glms_vec3_subadd(vec3s a, vec3s b, vec3s dest); + CGLM_INLINE vec3s glms_vec3_muladd(vec3s a, vec3s b, vec3s dest); + CGLM_INLINE vec3s glms_vec3_muladds(vec3s a, float s, vec3s dest); + CGLM_INLINE vec3s glms_vec3_maxadd(vec3s a, vec3s b, vec3s dest); + CGLM_INLINE vec3s glms_vec3_minadd(vec3s a, vec3s b, vec3s dest); + CGLM_INLINE vec3s glms_vec3_subsub(vec3s a, vec3s b, vec3s dest); + CGLM_INLINE vec3s glms_vec3_addsub(vec3s a, vec3s b, vec3s dest); + CGLM_INLINE vec3s glms_vec3_mulsub(vec3s a, vec3s b, vec3s dest); + CGLM_INLINE vec3s glms_vec3_mulsubs(vec3s a, float s, vec3s dest); + CGLM_INLINE vec3s glms_vec3_maxsub(vec3s a, vec3s b, vec3s dest); + CGLM_INLINE vec3s glms_vec3_minsub(vec3s a, vec3s b, vec3s dest); + CGLM_INLINE vec3s glms_vec3_flipsign(vec3s v); + CGLM_INLINE vec3s glms_vec3_negate(vec3s v); + CGLM_INLINE vec3s glms_vec3_normalize(vec3s v); + CGLM_INLINE vec3s glms_vec3_cross(vec3s a, vec3s b); + CGLM_INLINE vec3s glms_vec3_crossn(vec3s a, vec3s b); + CGLM_INLINE float glms_vec3_angle(vec3s a, vec3s b); + CGLM_INLINE vec3s glms_vec3_rotate(vec3s v, float angle, vec3s axis); + CGLM_INLINE vec3s glms_vec3_rotate_m4(mat4s m, vec3s v); + CGLM_INLINE vec3s glms_vec3_rotate_m3(mat3s m, vec3s v); + CGLM_INLINE 
vec3s glms_vec3_proj(vec3s a, vec3s b); + CGLM_INLINE vec3s glms_vec3_center(vec3s a, vec3s b); + CGLM_INLINE float glms_vec3_distance(vec3s a, vec3s b); + CGLM_INLINE float glms_vec3_distance2(vec3s a, vec3s b); + CGLM_INLINE vec3s glms_vec3_maxv(vec3s a, vec3s b); + CGLM_INLINE vec3s glms_vec3_minv(vec3s a, vec3s b); + CGLM_INLINE vec3s glms_vec3_ortho(vec3s v); + CGLM_INLINE vec3s glms_vec3_clamp(vec3s v, float minVal, float maxVal); + CGLM_INLINE vec3s glms_vec3_lerp(vec3s from, vec3s to, float t); + CGLM_INLINE vec3s glms_vec3_lerpc(vec3s from, vec3s to, float t); + CGLM_INLINE vec3s glms_vec3_mix(vec3s from, vec3s to, float t); + CGLM_INLINE vec3s glms_vec3_mixc(vec3s from, vec3s to, float t); + CGLM_INLINE vec3s glms_vec3_step(vec3s edge, vec3s x); + CGLM_INLINE vec3s glms_vec3_smoothstep_uni(float edge0, float edge1, vec3s x); + CGLM_INLINE vec3s glms_vec3_smoothstep(vec3s edge0, vec3s edge1, vec3s x); + CGLM_INLINE vec3s glms_vec3_smoothinterp(vec3s from, vec3s to, float t); + CGLM_INLINE vec3s glms_vec3_smoothinterpc(vec3s from, vec3s to, float t); + CGLM_INLINE vec3s glms_vec3_swizzle(vec3s v, int mask); + CGLM_INLINE vec3s glms_vec3_make(float * restrict src); + CGLM_INLINE vec3s glms_vec3_faceforward(vec3s n, vec3s v, vec3s nref); + CGLM_INLINE vec3s glms_vec3_reflect(vec3s v, vec3s n); + CGLM_INLINE bool glms_vec3_refract(vec3s v, vec3s n, float eta, vec3s *dest) + + Convenient: + CGLM_INLINE vec3s glms_cross(vec3s a, vec3s b); + CGLM_INLINE float glms_dot(vec3s a, vec3s b); + CGLM_INLINE vec3s glms_normalize(vec3s v); + + Deprecated: + glms_vec3_step_uni --> use glms_vec3_steps + */ + +#ifndef cglms_vec3s_h +#define cglms_vec3s_h + +#include "../common.h" +#include "../types-struct.h" +#include "../util.h" +#include "../vec3.h" +#include "vec3-ext.h" + +/* DEPRECATED! */ +#define glms_vec3_step_uni(edge, x) glms_vec3_steps(edge, x) + +#define GLMS_VEC3_ONE_INIT {GLM_VEC3_ONE_INIT} +#define GLMS_VEC3_ZERO_INIT {GLM_VEC3_ZERO_INIT} + +#define GLMS_VEC3_ONE ((vec3s)GLMS_VEC3_ONE_INIT) +#define GLMS_VEC3_ZERO ((vec3s)GLMS_VEC3_ZERO_INIT) + +#define GLMS_YUP ((vec3s){{0.0f, 1.0f, 0.0f}}) +#define GLMS_ZUP ((vec3s){{0.0f, 0.0f, 1.0f}}) +#define GLMS_XUP ((vec3s){{1.0f, 0.0f, 0.0f}}) + +/*! + * @brief init vec3 using vec4 + * + * @param[in] v4 vector4 + * @returns destination + */ +CGLM_INLINE +vec3s +glms_vec3(vec4s v4) { + vec3s r; + glm_vec3(v4.raw, r.raw); + return r; +} + +/*! + * @brief pack an array of vec3 into an array of vec3s + * + * @param[out] dst array of vec3 + * @param[in] src array of vec3s + * @param[in] len number of elements + */ +CGLM_INLINE +void +glms_vec3_(pack)(vec3s dst[], vec3 src[], size_t len) { + size_t i; + + for (i = 0; i < len; i++) { + glm_vec3_copy(src[i], dst[i].raw); + } +} + +/*! + * @brief unpack an array of vec3s into an array of vec3 + * + * @param[out] dst array of vec3s + * @param[in] src array of vec3 + * @param[in] len number of elements + */ +CGLM_INLINE +void +glms_vec3_(unpack)(vec3 dst[], vec3s src[], size_t len) { + size_t i; + + for (i = 0; i < len; i++) { + glm_vec3_copy(src[i].raw, dst[i]); + } +} + +/*! + * @brief make vector zero + * + * @returns zero vector + */ +CGLM_INLINE +vec3s +glms_vec3_(zero)(void) { + vec3s r; + glm_vec3_zero(r.raw); + return r; +} + +/*! + * @brief make vector one + * + * @returns one vector + */ +CGLM_INLINE +vec3s +glms_vec3_(one)(void) { + vec3s r; + glm_vec3_one(r.raw); + return r; +} + +/*! 
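The axis macros pair naturally with the rotation helpers declared above (glms_vec3_rotate is defined later in this header). Sketch:

    vec3s p = {{1.0f, 0.0f, 0.0f}};
    p = glms_vec3_rotate(p, glm_rad(90.0f), GLMS_YUP);   /* ~ (0, 0, -1): rotated about world up */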
+ * @brief vec3 dot product + * + * @param[in] a vector1 + * @param[in] b vector2 + * + * @return dot product + */ +CGLM_INLINE +float +glms_vec3_(dot)(vec3s a, vec3s b) { + return glm_vec3_dot(a.raw, b.raw); +} + +/*! + * @brief norm * norm (magnitude) of vec + * + * we can use this func instead of calling norm * norm, because it would call + * sqrtf function twice but with this func we can avoid func call, maybe this is + * not good name for this func + * + * @param[in] v vector + * + * @return norm * norm + */ +CGLM_INLINE +float +glms_vec3_(norm2)(vec3s v) { + return glm_vec3_norm2(v.raw); +} + +/*! + * @brief norm (magnitude) of vec3 + * + * @param[in] v vector + * + * @return norm + */ +CGLM_INLINE +float +glms_vec3_(norm)(vec3s v) { + return glm_vec3_norm(v.raw); +} + +/*! + * @brief L1 norm of vec3 + * Also known as Manhattan Distance or Taxicab norm. + * L1 Norm is the sum of the magnitudes of the vectors in a space. + * It is calculated as the sum of the absolute values of the vector components. + * In this norm, all the components of the vector are weighted equally. + * + * This computes: + * R = |v[0]| + |v[1]| + |v[2]| + * + * @param[in] v vector + * + * @return L1 norm + */ +CGLM_INLINE +float +glms_vec3_(norm_one)(vec3s v) { + return glm_vec3_norm_one(v.raw); +} + +/*! + * @brief Infinity norm of vec3 + * Also known as Maximum norm. + * Infinity Norm is the largest magnitude among each element of a vector. + * It is calculated as the maximum of the absolute values of the vector components. + * + * This computes: + * inf norm = max(|v[0]|, |v[1]|, |v[2]|) + * + * @param[in] v vector + * + * @return Infinity norm + */ +CGLM_INLINE +float +glms_vec3_(norm_inf)(vec3s v) { + return glm_vec3_norm_inf(v.raw); +} + +/*! + * @brief add a vector to b vector store result in dest + * + * @param[in] a vector1 + * @param[in] b vector2 + * @returns destination vector + */ +CGLM_INLINE +vec3s +glms_vec3_(add)(vec3s a, vec3s b) { + vec3s r; + glm_vec3_add(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief add scalar to v vector store result in dest (d = v + s) + * + * @param[in] a vector + * @param[in] s scalar + * @returns destination vector + */ +CGLM_INLINE +vec3s +glms_vec3_(adds)(vec3s a, float s) { + vec3s r; + glm_vec3_adds(a.raw, s, r.raw); + return r; +} + +/*! + * @brief subtract b vector from a vector store result in dest + * + * @param[in] a vector1 + * @param[in] b vector2 + * @returns destination vector + */ +CGLM_INLINE +vec3s +glms_vec3_(sub)(vec3s a, vec3s b) { + vec3s r; + glm_vec3_sub(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief subtract scalar from v vector store result in dest (d = v - s) + * + * @param[in] a vector + * @param[in] s scalar + * @returns destination vector + */ +CGLM_INLINE +vec3s +glms_vec3_(subs)(vec3s a, float s) { + vec3s r; + glm_vec3_subs(a.raw, s, r.raw); + return r; +} + +/*! + * @brief multiply two vectors (component-wise multiplication) + * + * @param a vector1 + * @param b vector2 + * @returns v3 = (a[0] * b[0], a[1] * b[1], a[2] * b[2]) + */ +CGLM_INLINE +vec3s +glms_vec3_(mul)(vec3s a, vec3s b) { + vec3s r; + glm_vec3_mul(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief multiply/scale vec3 vector with scalar: result = v * s + * + * @param[in] v vector + * @param[in] s scalar + * @returns destination vector + */ +CGLM_INLINE +vec3s +glms_vec3_(scale)(vec3s v, float s) { + vec3s r; + glm_vec3_scale(v.raw, s, r.raw); + return r; +} + +/*! 
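As the norm2 comment suggests, squared lengths avoid the sqrtf in range checks. Sketch (a, b and radius assumed):

    vec3s d  = glms_vec3_sub(a, b);
    float d2 = glms_vec3_norm2(d);               /* squared distance, no sqrtf */
    if (d2 < radius * radius) { /* in range */ }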
+ * @brief make vec3 vector scale as specified: result = unit(v) * s + * + * @param[in] v vector + * @param[in] s scalar + * @returns destination vector + */ +CGLM_INLINE +vec3s +glms_vec3_(scale_as)(vec3s v, float s) { + vec3s r; + glm_vec3_scale_as(v.raw, s, r.raw); + return r; +} + +/*! + * @brief div vector with another component-wise division: d = a / b + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns result = (a[0]/b[0], a[1]/b[1], a[2]/b[2]) + */ +CGLM_INLINE +vec3s +glms_vec3_(div)(vec3s a, vec3s b) { + vec3s r; + glm_vec3_div(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief div vector with scalar: d = v / s + * + * @param[in] a vector + * @param[in] s scalar + * @returns result = (a[0]/s, a[1]/s, a[2]/s) + */ +CGLM_INLINE +vec3s +glms_vec3_(divs)(vec3s a, float s) { + vec3s r; + glm_vec3_divs(a.raw, s, r.raw); + return r; +} + +/*! + * @brief add two vectors and add result to sum + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest += (a + b) + */ +CGLM_INLINE +vec3s +glms_vec3_(addadd)(vec3s a, vec3s b, vec3s dest) { + glm_vec3_addadd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief sub two vectors and add result to dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest += (a + b) + */ +CGLM_INLINE +vec3s +glms_vec3_(subadd)(vec3s a, vec3s b, vec3s dest) { + glm_vec3_subadd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief mul two vectors and add result to dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest += (a * b) + */ +CGLM_INLINE +vec3s +glms_vec3_(muladd)(vec3s a, vec3s b, vec3s dest) { + glm_vec3_muladd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief mul vector with scalar and add result to sum + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @returns dest += (a * b) + */ +CGLM_INLINE +vec3s +glms_vec3_(muladds)(vec3s a, float s, vec3s dest) { + glm_vec3_muladds(a.raw, s, dest.raw); + return dest; +} + +/*! + * @brief add max of two vectors to result/dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest += max(a, b) + */ +CGLM_INLINE +vec3s +glms_vec3_(maxadd)(vec3s a, vec3s b, vec3s dest) { + glm_vec3_maxadd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief add min of two vectors to result/dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest += min(a, b) + */ +CGLM_INLINE +vec3s +glms_vec3_(minadd)(vec3s a, vec3s b, vec3s dest) { + glm_vec3_minadd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief sub two vectors and sub result to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest -= (a - b) + */ +CGLM_INLINE +vec3s +glms_vec3_(subsub)(vec3s a, vec3s b, vec3s dest) { + glm_vec3_subsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! 
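+ * Editor's note (hedged usage sketch, not upstream cglm text; a, b, c are
+ * illustrative): the compound helpers in this block accumulate into an
+ * already-initialized destination, which the by-value struct API expresses by
+ * taking dest as a parameter and returning the updated value:
+ *
+ *   vec3s acc = GLMS_VEC3_ZERO;
+ *   acc = glms_vec3_muladds(a, 2.0f, acc);   // acc += a * 2
+ *   acc = glms_vec3_addadd(b, c, acc);       // acc += b + c
+ */
+
+/*!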
+ * @brief add two vectors and sub result to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest -= (a + b) + */ +CGLM_INLINE +vec3s +glms_vec3_(addsub)(vec3s a, vec3s b, vec3s dest) { + glm_vec3_addsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief mul two vectors and sub result to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest -= (a * b) + */ +CGLM_INLINE +vec3s +glms_vec3_(mulsub)(vec3s a, vec3s b, vec3s dest) { + glm_vec3_mulsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief mul vector with scalar and sub result to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @returns dest -= (a * b) + */ +CGLM_INLINE +vec3s +glms_vec3_(mulsubs)(vec3s a, float s, vec3s dest) { + glm_vec3_mulsubs(a.raw, s, dest.raw); + return dest; +} + +/*! + * @brief sub max of two vectors to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest -= max(a, b) + */ +CGLM_INLINE +vec3s +glms_vec3_(maxsub)(vec3s a, vec3s b, vec3s dest) { + glm_vec3_maxsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief sub min of two vectors to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest -= min(a, b) + */ +CGLM_INLINE +vec3s +glms_vec3_(minsub)(vec3s a, vec3s b, vec3s dest) { + glm_vec3_minsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief negate vector components and store result in dest + * + * @param[in] v vector + * @returns result vector + */ +CGLM_INLINE +vec3s +glms_vec3_(flipsign)(vec3s v) { + glm_vec3_flipsign(v.raw); + return v; +} + +/*! + * @brief negate vector components + * + * @param[in] v vector + * @returns negated vector + */ +CGLM_INLINE +vec3s +glms_vec3_(negate)(vec3s v) { + glm_vec3_negate(v.raw); + return v; +} + +/*! + * @brief normalize vec3 and store result in same vec + * + * @param[in] v vector + * @returns normalized vector + */ +CGLM_INLINE +vec3s +glms_vec3_(normalize)(vec3s v) { + glm_vec3_normalize(v.raw); + return v; +} + +/*! + * @brief cross product of two vector (RH) + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns destination + */ +CGLM_INLINE +vec3s +glms_vec3_(cross)(vec3s a, vec3s b) { + vec3s r; + glm_vec3_cross(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief cross product of two vector (RH) and normalize the result + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns destination + */ +CGLM_INLINE +vec3s +glms_vec3_(crossn)(vec3s a, vec3s b) { + vec3s r; + glm_vec3_crossn(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief angle between two vector + * + * @param[in] a vector1 + * @param[in] b vector2 + * + * @return angle as radians + */ +CGLM_INLINE +float +glms_vec3_(angle)(vec3s a, vec3s b) { + return glm_vec3_angle(a.raw, b.raw); +} + +/*! + * @brief rotate vec3 around axis by angle using Rodrigues' rotation formula + * + * @param[in] v vector + * @param[in] axis axis vector (must be unit vector) + * @param[in] angle angle by radians + * @returns rotated vector + */ +CGLM_INLINE +vec3s +glms_vec3_(rotate)(vec3s v, float angle, vec3s axis) { + glm_vec3_rotate(v.raw, angle, axis.raw); + return v; +} + +/*! 
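+ * Editor's note (illustrative sketch; the axis/angle values are examples):
+ * glms_vec3_rotate expects a unit-length axis and an angle in radians, so a
+ * quarter turn of the X axis around Z lands on the Y axis, and crossn gives a
+ * unit normal directly:
+ *
+ *   vec3s v = glms_vec3_rotate(GLMS_XUP, GLM_PI_2f, GLMS_ZUP); // ~= GLMS_YUP
+ *   vec3s n = glms_vec3_crossn(a, b);                          // unit normal
+ */
+
+/*!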
+ * @brief apply rotation matrix to vector + * + * matrix format should be (no perspective): + * a b c x + * e f g y + * i j k z + * 0 0 0 w + * + * @param[in] m affine matrix or rot matrix + * @param[in] v vector + * @returns rotated vector + */ +CGLM_INLINE +vec3s +glms_vec3_(rotate_m4)(mat4s m, vec3s v) { + vec3s r; + glm_vec3_rotate_m4(m.raw, v.raw, r.raw); + return r; +} + +/*! + * @brief apply rotation matrix to vector + * + * @param[in] m affine matrix or rot matrix + * @param[in] v vector + * @returns rotated vector + */ +CGLM_INLINE +vec3s +glms_vec3_(rotate_m3)(mat3s m, vec3s v) { + vec3s r; + glm_vec3_rotate_m3(m.raw, v.raw, r.raw); + return r; +} + +/*! + * @brief project a vector onto b vector + * + * @param[in] a vector1 + * @param[in] b vector2 + * @returns projected vector + */ +CGLM_INLINE +vec3s +glms_vec3_(proj)(vec3s a, vec3s b) { + vec3s r; + glm_vec3_proj(a.raw, b.raw, r.raw); + return r; +} + +/** + * @brief find center point of two vector + * + * @param[in] a vector1 + * @param[in] b vector2 + * @returns center point + */ +CGLM_INLINE +vec3s +glms_vec3_(center)(vec3s a, vec3s b) { + vec3s r; + glm_vec3_center(a.raw, b.raw, r.raw); + return r; +} + +/** + * @brief distance between two vectors + * + * @param[in] a vector1 + * @param[in] b vector2 + * @return distance + */ +CGLM_INLINE +float +glms_vec3_(distance)(vec3s a, vec3s b) { + return glm_vec3_distance(a.raw, b.raw); +} + +/** + * @brief squared distance between two vectors + * + * @param[in] a vector1 + * @param[in] b vector2 + * @return squared distance (distance * distance) + */ +CGLM_INLINE +float +glms_vec3_(distance2)(vec3s a, vec3s b) { + return glm_vec3_distance2(a.raw, b.raw); +} + +/*! + * @brief max values of vectors + * + * @param[in] a vector1 + * @param[in] b vector2 + * @returns destination + */ +CGLM_INLINE +vec3s +glms_vec3_(maxv)(vec3s a, vec3s b) { + vec3s r; + glm_vec3_maxv(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief min values of vectors + * + * @param[in] a vector1 + * @param[in] b vector2 + * @returns destination + */ +CGLM_INLINE +vec3s +glms_vec3_(minv)(vec3s a, vec3s b) { + vec3s r; + glm_vec3_minv(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief possible orthogonal/perpendicular vector + * + * @param[in] v vector + * @returns orthogonal/perpendicular vector + */ +CGLM_INLINE +vec3s +glms_vec3_(ortho)(vec3s v) { + vec3s r; + glm_vec3_ortho(v.raw, r.raw); + return r; +} + +/*! + * @brief clamp vector's individual members between min and max values + * + * @param[in] v vector + * @param[in] minVal minimum value + * @param[in] maxVal maximum value + * @returns clamped vector + */ +CGLM_INLINE +vec3s +glms_vec3_(clamp)(vec3s v, float minVal, float maxVal) { + glm_vec3_clamp(v.raw, minVal, maxVal); + return v; +} + +/*! + * @brief linear interpolation between two vectors + * + * formula: from + s * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) + * @returns destination + */ +CGLM_INLINE +vec3s +glms_vec3_(lerp)(vec3s from, vec3s to, float t) { + vec3s r; + glm_vec3_lerp(from.raw, to.raw, t, r.raw); + return r; +} + +/*! 
+ * @brief linear interpolation between two vectors (clamped) + * + * formula: from + s * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) clamped between 0 and 1 + * @returns destination + */ +CGLM_INLINE +vec3s +glms_vec3_(lerpc)(vec3s from, vec3s to, float t) { + vec3s r; + glm_vec3_lerpc(from.raw, to.raw, t, r.raw); + return r; +} + +/*! + * @brief linear interpolation between two vectors + * + * formula: from + s * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) + * @returns destination + */ +CGLM_INLINE +vec3s +glms_vec3_(mix)(vec3s from, vec3s to, float t) { + vec3s r; + glm_vec3_mix(from.raw, to.raw, t, r.raw); + return r; +} + +/*! + * @brief linear interpolation between two vectors (clamped) + * + * formula: from + s * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) clamped between 0 and 1 + * @returns destination + */ +CGLM_INLINE +vec3s +glms_vec3_(mixc)(vec3s from, vec3s to, float t) { + vec3s r; + glm_vec3_mixc(from.raw, to.raw, t, r.raw); + return r; +} + +/*! + * @brief threshold function + * + * @param[in] edge threshold + * @param[in] x value to test against threshold + * @returns 0.0 if x < edge, else 1.0 + */ +CGLM_INLINE +vec3s +glms_vec3_(step)(vec3s edge, vec3s x) { + vec3s r; + glm_vec3_step(edge.raw, x.raw, r.raw); + return r; +} + +/*! + * @brief threshold function with a smooth transition (unidimensional) + * + * @param[in] edge0 low threshold + * @param[in] edge1 high threshold + * @param[in] x value to test against threshold + * @returns destination + */ +CGLM_INLINE +vec3s +glms_vec3_(smoothstep_uni)(float edge0, float edge1, vec3s x) { + vec3s r; + glm_vec3_smoothstep_uni(edge0, edge1, x.raw, r.raw); + return r; +} + +/*! + * @brief threshold function with a smooth transition + * + * @param[in] edge0 low threshold + * @param[in] edge1 high threshold + * @param[in] x value to test against threshold + * @returns destination + */ +CGLM_INLINE +vec3s +glms_vec3_(smoothstep)(vec3s edge0, vec3s edge1, vec3s x) { + vec3s r; + glm_vec3_smoothstep(edge0.raw, edge1.raw, x.raw, r.raw); + return r; +} + +/*! + * @brief smooth Hermite interpolation between two vectors + * + * formula: from + s * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) + * @returns destination + */ +CGLM_INLINE +vec3s +glms_vec3_(smoothinterp)(vec3s from, vec3s to, float t) { + vec3s r; + glm_vec3_smoothinterp(from.raw, to.raw, t, r.raw); + return r; +} + +/*! + * @brief smooth Hermite interpolation between two vectors (clamped) + * + * formula: from + s * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) clamped between 0 and 1 + * @returns destination + */ +CGLM_INLINE +vec3s +glms_vec3_(smoothinterpc)(vec3s from, vec3s to, float t) { + vec3s r; + glm_vec3_smoothinterpc(from.raw, to.raw, t, r.raw); + return r; +} + +/*! + * @brief vec3 cross product + * + * this is just convenient wrapper + * + * @param[in] a source 1 + * @param[in] b source 2 + * @returns destination + */ +CGLM_INLINE +vec3s +glms_cross(vec3s a, vec3s b) { + vec3s r; + glm_cross(a.raw, b.raw, r.raw); + return r; +} + +/*! 
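+ * Editor's note (hedged sketch of the interpolation helpers above; from, to
+ * and t are illustrative): mix follows the same from + t * (to - from)
+ * formula as lerp, the *c variants clamp t to [0, 1], and the smoothinterp
+ * pair eases with the Hermite curve t*t*(3 - 2*t) instead of a straight line:
+ *
+ *   vec3s mid  = glms_vec3_lerp(from, to, 0.5f);        // midpoint
+ *   vec3s ease = glms_vec3_smoothinterpc(from, to, t);  // eased, t clamped
+ */
+
+/*!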
+ * @brief vec3 dot product + * + * this is just convenient wrapper + * + * @param[in] a vector1 + * @param[in] b vector2 + * @return dot product + */ +CGLM_INLINE +float +glms_dot(vec3s a, vec3s b) { + return glm_dot(a.raw, b.raw); +} + +/*! + * @brief normalize vec3 and store result in same vec + * + * this is just convenient wrapper + * + * @param[in] v vector + * @returns normalized vector + */ +CGLM_INLINE +vec3s +glms_normalize(vec3s v) { + glm_normalize(v.raw); + return v; +} + +/*! + * @brief swizzle vector components + * + * you can use existing masks e.g. GLM_XXX, GLM_ZYX + * + * @param[in] v source + * @param[in] mask mask + * @returns swizzled vector + */ +CGLM_INLINE +vec3s +glms_vec3_(swizzle)(vec3s v, int mask) { + vec3s dest; + glm_vec3_swizzle(v.raw, mask, dest.raw); + return dest; +} + +/*! + * @brief Create three dimensional vector from pointer + * + * @param[in] src pointer to an array of floats + * @returns constructed 3D vector from raw pointer + */ +CGLM_INLINE +vec3s +glms_vec3_(make)(const float * __restrict src) { + vec3s dest; + glm_vec3_make(src, dest.raw); + return dest; +} + +/*! + * @brief a vector pointing in the same direction as another + * + * orients a vector to point away from a surface as defined by its normal + * + * @param[in] n vector to orient. + * @param[in] v incident vector + * @param[in] nref reference vector + * @returns oriented vector, pointing away from the surface. + */ +CGLM_INLINE +vec3s +glms_vec3_(faceforward)(vec3s n, vec3s v, vec3s nref) { + vec3s dest; + glm_vec3_faceforward(n.raw, v.raw, nref.raw, dest.raw); + return dest; +} + +/*! + * @brief reflection vector using an incident ray and a surface normal + * + * @param[in] I incident vector + * @param[in] N normalized normal vector + * @returns reflection result + */ +CGLM_INLINE +vec3s +glms_vec3_(reflect)(vec3s v, vec3s n) { + vec3s dest; + glm_vec3_reflect(v.raw, n.raw, dest.raw); + return dest; +} + +/*! + * @brief computes refraction vector for an incident vector and a surface normal. + * + * calculates the refraction vector based on Snell's law. If total internal reflection + * occurs (angle too great given eta), dest is set to zero and returns false. + * Otherwise, computes refraction vector, stores it in dest, and returns true. + * + * @param[in] v normalized incident vector + * @param[in] n normalized normal vector + * @param[in] eta ratio of indices of refraction (incident/transmitted) + * @param[out] dest refraction vector if refraction occurs; zero vector otherwise + * + * @returns true if refraction occurs; false if total internal reflection occurs. + */ +CGLM_INLINE +bool +glms_vec3_(refract)(vec3s v, vec3s n, float eta, vec3s * __restrict dest) { + return glm_vec3_refract(v.raw, n.raw, eta, dest->raw); +} + +#endif /* cglms_vec3s_h */ diff --git a/external/cglm/struct/vec4-ext.h b/external/cglm/struct/vec4-ext.h new file mode 100644 index 0000000..f57348e --- /dev/null +++ b/external/cglm/struct/vec4-ext.h @@ -0,0 +1,325 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/*! 
+ * @brief SIMD like functions + */ + +/* + Functions: + CGLM_INLINE vec4s glms_vec4_broadcast(float val); + CGLM_INLINE vec4s glms_vec4_fill(float val); + CGLM_INLINE bool glms_vec4_eq(vec4s v, float val); + CGLM_INLINE bool glms_vec4_eq_eps(vec4s v, float val); + CGLM_INLINE bool glms_vec4_eq_all(vec4s v); + CGLM_INLINE bool glms_vec4_eqv(vec4s a, vec4s b); + CGLM_INLINE bool glms_vec4_eqv_eps(vec4s a, vec4s b); + CGLM_INLINE float glms_vec4_max(vec4s v); + CGLM_INLINE float glms_vec4_min(vec4s v); + CGLM_INLINE bool glms_vec4_isnan(vec4s v); + CGLM_INLINE bool glms_vec4_isinf(vec4s v); + CGLM_INLINE bool glms_vec4_isvalid(vec4s v); + CGLM_INLINE vec4s glms_vec4_sign(vec4s v); + CGLM_INLINE vec4s glms_vec4_abs(vec4s v); + CGLM_INLINE vec4s glms_vec4_fract(vec4s v); + CGLM_INLINE float glms_vec4_floor(vec4s v); + CGLM_INLINE float glms_vec4_mods(vec4s v, float s); + CGLM_INLINE float glms_vec4_steps(float edge, vec4s v); + CGLM_INLINE void glms_vec4_stepr(vec4s edge, float v); + CGLM_INLINE float glms_vec4_hadd(vec4s v); + CGLM_INLINE vec4s glms_vec4_sqrt(vec4s v); + */ + +#ifndef cglms_vec4s_ext_h +#define cglms_vec4s_ext_h + +#include "../common.h" +#include "../types-struct.h" +#include "../util.h" +#include "../vec4-ext.h" + +/* api definition */ +#define glms_vec4_(NAME) CGLM_STRUCTAPI(vec4, NAME) + +/*! + * @brief fill a vector with specified value + * + * @param val value + * @returns dest + */ +CGLM_INLINE +vec4s +glms_vec4_(broadcast)(float val) { + vec4s r; + glm_vec4_broadcast(val, r.raw); + return r; +} + +/*! + * @brief fill a vector with specified value + * + * @param val value + * @returns dest + */ +CGLM_INLINE +vec4s +glms_vec4_(fill)(float val) { + vec4s r; + glm_vec4_fill(r.raw, val); + return r; +} + +/*! + * @brief check if vector is equal to value (without epsilon) + * + * @param v vector + * @param val value + */ +CGLM_INLINE +bool +glms_vec4_(eq)(vec4s v, float val) { + return glm_vec4_eq(v.raw, val); +} + +/*! + * @brief check if vector is equal to value (with epsilon) + * + * @param v vector + * @param val value + */ +CGLM_INLINE +bool +glms_vec4_(eq_eps)(vec4s v, float val) { + return glm_vec4_eq_eps(v.raw, val); +} + +/*! + * @brief check if vector members are equal (without epsilon) + * + * @param v vector + */ +CGLM_INLINE +bool +glms_vec4_(eq_all)(vec4s v) { + return glm_vec4_eq_all(v.raw); +} + +/*! + * @brief check if vector is equal to another (without epsilon) + * + * @param a vector + * @param b vector + */ +CGLM_INLINE +bool +glms_vec4_(eqv)(vec4s a, vec4s b) { + return glm_vec4_eqv(a.raw, b.raw); +} + +/*! + * @brief check if vector is equal to another (with epsilon) + * + * @param a vector + * @param b vector + */ +CGLM_INLINE +bool +glms_vec4_(eqv_eps)(vec4s a, vec4s b) { + return glm_vec4_eqv_eps(a.raw, b.raw); +} + +/*! + * @brief max value of vector + * + * @param v vector + */ +CGLM_INLINE +float +glms_vec4_(max)(vec4s v) { + return glm_vec4_max(v.raw); +} + +/*! + * @brief min value of vector + * + * @param v vector + */ +CGLM_INLINE +float +glms_vec4_(min)(vec4s v) { + return glm_vec4_min(v.raw); +} + +/*! + * @brief check if one of items is NaN (not a number) + * you should only use this in DEBUG mode or very critical asserts + * + * @param[in] v vector + */ +CGLM_INLINE +bool +glms_vec4_(isnan)(vec4s v) { + return glm_vec4_isnan(v.raw); +} + +/*! 
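+ * Editor's note (illustrative sketch, not upstream cglm text; v, a, b are
+ * examples): these extension helpers are convenient for debug-time sanity
+ * checks and tolerance-based comparisons:
+ *
+ *   assert(glms_vec4_isvalid(v));          // no NaN or infinity components
+ *   if (glms_vec4_eqv_eps(a, b)) { ... }   // equal within a small epsilon
+ *   vec4s half = glms_vec4_fill(0.5f);     // every component set to 0.5
+ */
+
+/*!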
+ * @brief check if one of items is INFINITY + * you should only use this in DEBUG mode or very critical asserts + * + * @param[in] v vector + */ +CGLM_INLINE +bool +glms_vec4_(isinf)(vec4s v) { + return glm_vec4_isinf(v.raw); +} + +/*! + * @brief check if all items are valid number + * you should only use this in DEBUG mode or very critical asserts + * + * @param[in] v vector + */ +CGLM_INLINE +bool +glms_vec4_(isvalid)(vec4s v) { + return glm_vec4_isvalid(v.raw); +} + +/*! + * @brief get sign of 32 bit float as +1, -1, 0 + * + * Important: It returns 0 for zero/NaN input + * + * @param v vector + * @returns sign vector + */ +CGLM_INLINE +vec4s +glms_vec4_(sign)(vec4s v) { + vec4s r; + glm_vec4_sign(v.raw, r.raw); + return r; +} + +/*! + * @brief absolute value of each vector item + * + * @param[in] v vector + * @returns destination vector + */ +CGLM_INLINE +vec4s +glms_vec4_(abs)(vec4s v) { + vec4s r; + glm_vec4_abs(v.raw, r.raw); + return r; +} + +/*! + * @brief fractional part of each vector item + * + * @param[in] v vector + * @returns dest destination vector + */ +CGLM_INLINE +vec4s +glms_vec4_(fract)(vec4s v) { + vec4s r; + glm_vec4_fract(v.raw, r.raw); + return r; +} + +/*! + * @brief floor of each vector item + * + * @param[in] v vector + * @returns dest destination vector + */ +CGLM_INLINE +vec4s +glms_vec4_(floor)(vec4s v) { + vec4s r; + glm_vec4_floor(v.raw, r.raw); + return r; +} + +/*! + * @brief mod of each vector item by scalar + * + * @param[in] v vector + * @param[in] s scalar + * @returns destination vector + */ +CGLM_INLINE +vec4s +glms_vec4_(mods)(vec4s v, float s) { + vec4s r; + glm_vec4_mods(v.raw, s, r.raw); + return r; +} + +/*! + * @brief threshold each vector item with scalar + * condition is: (x[i] < edge) ? 0.0 : 1.0 + * + * @param[in] edge threshold + * @param[in] x vector to test against threshold + * @returns destination + */ +CGLM_INLINE +vec4s +glms_vec4_(steps)(float edge, vec4s x) { + vec4s r; + glm_vec4_steps(edge, x.raw, r.raw); + return r; +} + +/*! + * @brief threshold a value with *vector* as the threshold + * condition is: (x < edge[i]) ? 0.0 : 1.0 + * + * @param[in] edge threshold vector + * @param[in] x value to test against threshold + * @returns destination + */ +CGLM_INLINE +vec4s +glms_vec4_(stepr)(vec4s edge, float x) { + vec4s r; + glm_vec4_stepr(edge.raw, x, r.raw); + return r; +} + +/*! + * @brief vector reduction by summation + * @warning could overflow + * + * @param[in] v vector + * @return sum of all vector's elements + */ +CGLM_INLINE +float +glms_vec4_(hadd)(vec4s v) { + return glm_vec4_hadd(v.raw); +} + +/*! + * @brief square root of each vector item + * + * @param[in] v vector + * @returns destination vector + */ +CGLM_INLINE +vec4s +glms_vec4_(sqrt)(vec4s v) { + vec4s r; + glm_vec4_sqrt(v.raw, r.raw); + return r; +} + +#endif /* cglms_vec4s_ext_h */ diff --git a/external/cglm/struct/vec4.h b/external/cglm/struct/vec4.h new file mode 100644 index 0000000..a64c1a3 --- /dev/null +++ b/external/cglm/struct/vec4.h @@ -0,0 +1,961 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLMS_VEC4_ONE_INIT + GLMS_VEC4_BLACK_INIT + GLMS_VEC4_ZERO_INIT + GLMS_VEC4_ONE + GLMS_VEC4_BLACK + GLMS_VEC4_ZERO + + Functions: + CGLM_INLINE vec4s glms_vec4(vec3s v3, float last); + CGLM_INLINE vec3s glms_vec4_copy3(vec4s v); + CGLM_INLINE vec4s glms_vec4_copy(vec4s v); + CGLM_INLINE vec4s glms_vec4_ucopy(vec4s v); + CGLM_INLINE void glms_vec4_pack(vec4s dst[], vec4 src[], size_t len); + CGLM_INLINE void glms_vec4_unpack(vec4 dst[], vec4s src[], size_t len); + CGLM_INLINE float glms_vec4_dot(vec4s a, vec4s b); + CGLM_INLINE float glms_vec4_norm2(vec4s v); + CGLM_INLINE float glms_vec4_norm(vec4s v); + CGLM_INLINE float glms_vec4_norm_one(vec4s v); + CGLM_INLINE float glms_vec4_norm_inf(vec4s v); + CGLM_INLINE vec4s glms_vec4_add(vec4s a, vec4s b); + CGLM_INLINE vec4s glms_vec4_adds(vec4s v, float s); + CGLM_INLINE vec4s glms_vec4_sub(vec4s a, vec4s b); + CGLM_INLINE vec4s glms_vec4_subs(vec4s v, float s); + CGLM_INLINE vec4s glms_vec4_mul(vec4s a, vec4s b); + CGLM_INLINE vec4s glms_vec4_scale(vec4s v, float s); + CGLM_INLINE vec4s glms_vec4_scale_as(vec4s v, float s); + CGLM_INLINE vec4s glms_vec4_div(vec4s a, vec4s b); + CGLM_INLINE vec4s glms_vec4_divs(vec4s v, float s); + CGLM_INLINE vec4s glms_vec4_addadd(vec4s a, vec4s b, vec4s dest); + CGLM_INLINE vec4s glms_vec4_subadd(vec4s a, vec4s b, vec4s dest); + CGLM_INLINE vec4s glms_vec4_muladd(vec4s a, vec4s b, vec4s dest); + CGLM_INLINE vec4s glms_vec4_muladds(vec4s a, float s, vec4s dest); + CGLM_INLINE vec4s glms_vec4_maxadd(vec4s a, vec4s b, vec4s dest); + CGLM_INLINE vec4s glms_vec4_minadd(vec4s a, vec4s b, vec4s dest); + CGLM_INLINE vec4s glms_vec4_subsub(vec4s a, vec4s b, vec4s dest); + CGLM_INLINE vec4s glms_vec4_addsub(vec4s a, vec4s b, vec4s dest); + CGLM_INLINE vec4s glms_vec4_mulsub(vec4s a, vec4s b, vec4s dest); + CGLM_INLINE vec4s glms_vec4_mulsubs(vec4s a, float s, vec4s dest); + CGLM_INLINE vec4s glms_vec4_maxsub(vec4s a, vec4s b, vec4s dest); + CGLM_INLINE vec4s glms_vec4_minsub(vec4s a, vec4s b, vec4s dest); + CGLM_INLINE vec4s glms_vec4_negate(vec4s v); + CGLM_INLINE vec4s glms_vec4_normalize(vec4s v); + CGLM_INLINE float glms_vec4_distance(vec4s a, vec4s b); + CGLM_INLINE float glms_vec4_distance2(vec4s a, vec4s b); + CGLM_INLINE vec4s glms_vec4_maxv(vec4s a, vec4s b); + CGLM_INLINE vec4s glms_vec4_minv(vec4s a, vec4s b); + CGLM_INLINE vec4s glms_vec4_clamp(vec4s v, float minVal, float maxVal); + CGLM_INLINE vec4s glms_vec4_lerp(vec4s from, vec4s to, float t); + CGLM_INLINE vec4s glms_vec4_lerpc(vec4s from, vec4s to, float t); + CGLM_INLINE vec4s glms_vec4_mix(vec4s from, vec4s to, float t); + CGLM_INLINE vec4s glms_vec4_mixc(vec4s from, vec4s to, float t); + CGLM_INLINE vec4s glms_vec4_step(vec4s edge, vec4s x); + CGLM_INLINE vec4s glms_vec4_smoothstep_uni(float edge0, float edge1, vec4s x); + CGLM_INLINE vec4s glms_vec4_smoothstep(vec4s edge0, vec4s edge1, vec4s x); + CGLM_INLINE vec4s glms_vec4_smoothinterp(vec4s from, vec4s to, float t); + CGLM_INLINE vec4s glms_vec4_smoothinterpc(vec4s from, vec4s to, float t); + CGLM_INLINE vec4s glms_vec4_cubic(float s); + CGLM_INLINE vec4s glms_vec4_swizzle(vec4s v, int mask); + CGLM_INLINE vec4s glms_vec4_make(float * restrict src); + CGLM_INLINE vec4s glms_vec4_reflect(vec4s v, vec4s n); + CGLM_INLINE bool glms_vec4_refract(vec4s v, vec4s n, float eta, vec4s *dest) + + Deprecated: + glms_vec4_step_uni --> use glms_vec4_steps + */ + +#ifndef 
cglms_vec4s_h +#define cglms_vec4s_h + +#include "../common.h" +#include "../types-struct.h" +#include "../util.h" +#include "../vec4.h" +#include "vec4-ext.h" + +/* DEPRECATED! */ +#define glms_vec4_step_uni(edge, x) glms_vec4_steps(edge, x) + +#define GLMS_VEC4_ONE_INIT {GLM_VEC4_ONE_INIT} +#define GLMS_VEC4_BLACK_INIT {GLM_VEC4_BLACK_INIT} +#define GLMS_VEC4_ZERO_INIT {GLM_VEC4_ZERO_INIT} + +#define GLMS_VEC4_ONE ((vec4s)GLM_VEC4_ONE_INIT) +#define GLMS_VEC4_BLACK ((vec4s)GLM_VEC4_BLACK_INIT) +#define GLMS_VEC4_ZERO ((vec4s)GLM_VEC4_ZERO_INIT) + +/*! + * @brief init vec4 using vec3 + * + * @param[in] v3 vector3 + * @param[in] last last item + * @returns destination + */ +CGLM_INLINE +vec4s +glms_vec4(vec3s v3, float last) { + vec4s r; + glm_vec4(v3.raw, last, r.raw); + return r; +} + +/*! + * @brief copy first 3 members of [a] to [dest] + * + * @param[in] v source + * @returns vec3 + */ +CGLM_INLINE +vec3s +glms_vec4_(copy3)(vec4s v) { + vec3s r; + glm_vec4_copy3(v.raw, r.raw); + return r; +} + +/*! + * @brief copy all members of [a] to [dest] + * + * @param[in] v source + * @returns destination + */ +CGLM_INLINE +vec4s +glms_vec4_(copy)(vec4s v) { + vec4s r; + glm_vec4_copy(v.raw, r.raw); + return r; +} + +/*! + * @brief copy all members of [a] to [dest] + * + * alignment is not required + * + * @param[in] v source + * @returns destination + */ +CGLM_INLINE +vec4s +glms_vec4_(ucopy)(vec4s v) { + vec4s r; + glm_vec4_ucopy(v.raw, r.raw); + return r; +} + +/*! + * @brief pack an array of vec4 into an array of vec4s + * + * @param[out] dst array of vec4 + * @param[in] src array of vec4s + * @param[in] len number of elements + */ +CGLM_INLINE +void +glms_vec4_(pack)(vec4s dst[], vec4 src[], size_t len) { + size_t i; + + for (i = 0; i < len; i++) { + glm_vec4_copy(src[i], dst[i].raw); + } +} + +/*! + * @brief unpack an array of vec4s into an array of vec4 + * + * @param[out] dst array of vec4s + * @param[in] src array of vec4 + * @param[in] len number of elements + */ +CGLM_INLINE +void +glms_vec4_(unpack)(vec4 dst[], vec4s src[], size_t len) { + size_t i; + + for (i = 0; i < len; i++) { + glm_vec4_copy(src[i].raw, dst[i]); + } +} + +/*! + * @brief make vector zero + * + * @returns zero vector + */ +CGLM_INLINE +vec4s +glms_vec4_(zero)(void) { + vec4s r; + glm_vec4_zero(r.raw); + return r; +} + +/*! + * @brief make vector one + * + * @returns one vector + */ +CGLM_INLINE +vec4s +glms_vec4_(one)(void) { + vec4s r; + glm_vec4_one(r.raw); + return r; +} + +/*! + * @brief vec4 dot product + * + * @param[in] a vector1 + * @param[in] b vector2 + * + * @return dot product + */ +CGLM_INLINE +float +glms_vec4_(dot)(vec4s a, vec4s b) { + return glm_vec4_dot(a.raw, b.raw); +} + +/*! + * @brief norm * norm (magnitude) of vec + * + * we can use this func instead of calling norm * norm, because it would call + * sqrtf function twice but with this func we can avoid func call, maybe this is + * not good name for this func + * + * @param[in] v vec4 + * + * @return norm * norm + */ +CGLM_INLINE +float +glms_vec4_(norm2)(vec4s v) { + return glm_vec4_norm2(v.raw); +} + +/*! + * @brief norm (magnitude) of vec4 + * + * @param[in] v vector + * + * @return norm + */ +CGLM_INLINE +float +glms_vec4_(norm)(vec4s v) { + return glm_vec4_norm(v.raw); +} + +/*! + * @brief L1 norm of vec4 + * Also known as Manhattan Distance or Taxicab norm. + * L1 Norm is the sum of the magnitudes of the vectors in a space. + * It is calculated as the sum of the absolute values of the vector components. 
+ * In this norm, all the components of the vector are weighted equally. + * + * This computes: + * R = |v[0]| + |v[1]| + |v[2]| + |v[3]| + * + * @param[in] v vector + * + * @return L1 norm + */ +CGLM_INLINE +float +glms_vec4_(norm_one)(vec4s v) { + return glm_vec4_norm_one(v.raw); +} + +/*! + * @brief Infinity norm of vec4 + * Also known as Maximum norm. + * Infinity Norm is the largest magnitude among each element of a vector. + * It is calculated as the maximum of the absolute values of the vector components. + * + * This computes: + * inf norm = max(|v[0]|, |v[1]|, |v[2]|, |v[3]|) + * + * @param[in] v vector + * + * @return Infinity norm + */ +CGLM_INLINE +float +glms_vec4_(norm_inf)(vec4s v) { + return glm_vec4_norm_inf(v.raw); +} + +/*! + * @brief add b vector to a vector store result in dest + * + * @param[in] a vector1 + * @param[in] b vector2 + * @returns destination vector + */ +CGLM_INLINE +vec4s +glms_vec4_(add)(vec4s a, vec4s b) { + vec4s r; + glm_vec4_add(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief add scalar to v vector store result in dest (d = v + vec(s)) + * + * @param[in] v vector + * @param[in] s scalar + * @returns destination vector + */ +CGLM_INLINE +vec4s +glms_vec4_(adds)(vec4s v, float s) { + vec4s r; + glm_vec4_adds(v.raw, s, r.raw); + return r; +} + +/*! + * @brief subtract b vector from a vector store result in dest (d = a - b) + * + * @param[in] a vector1 + * @param[in] b vector2 + * @returns destination vector + */ +CGLM_INLINE +vec4s +glms_vec4_(sub)(vec4s a, vec4s b) { + vec4s r; + glm_vec4_sub(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief subtract scalar from v vector store result in dest (d = v - vec(s)) + * + * @param[in] v vector + * @param[in] s scalar + * @returns destination vector + */ +CGLM_INLINE +vec4s +glms_vec4_(subs)(vec4s v, float s) { + vec4s r; + glm_vec4_subs(v.raw, s, r.raw); + return r; +} + +/*! + * @brief multiply two vectors (component-wise multiplication) + * + * @param a vector1 + * @param b vector2 + * @returns dest = (a[0] * b[0], a[1] * b[1], a[2] * b[2], a[3] * b[3]) + */ +CGLM_INLINE +vec4s +glms_vec4_(mul)(vec4s a, vec4s b) { + vec4s r; + glm_vec4_mul(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief multiply/scale vec4 vector with scalar: result = v * s + * + * @param[in] v vector + * @param[in] s scalar + * @returns destination vector + */ +CGLM_INLINE +vec4s +glms_vec4_(scale)(vec4s v, float s) { + vec4s r; + glm_vec4_scale(v.raw, s, r.raw); + return r; +} + +/*! + * @brief make vec4 vector scale as specified: result = unit(v) * s + * + * @param[in] v vector + * @param[in] s scalar + * @returns destination vector + */ +CGLM_INLINE +vec4s +glms_vec4_(scale_as)(vec4s v, float s) { + vec4s r; + glm_vec4_scale_as(v.raw, s, r.raw); + return r; +} + +/*! + * @brief div vector with another component-wise division: d = a / b + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns result = (a[0]/b[0], a[1]/b[1], a[2]/b[2], a[3]/b[3]) + */ +CGLM_INLINE +vec4s +glms_vec4_(div)(vec4s a, vec4s b) { + vec4s r; + glm_vec4_div(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief div vec4 vector with scalar: d = v / s + * + * @param[in] v vector + * @param[in] s scalar + * @returns destination vector + */ +CGLM_INLINE +vec4s +glms_vec4_(divs)(vec4s v, float s) { + vec4s r; + glm_vec4_divs(v.raw, s, r.raw); + return r; +} + +/*! 
+ * @brief add two vectors and add result to sum + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest += (a + b) + */ +CGLM_INLINE +vec4s +glms_vec4_(addadd)(vec4s a, vec4s b, vec4s dest) { + glm_vec4_addadd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief sub two vectors and add result to dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest += (a - b) + */ +CGLM_INLINE +vec4s +glms_vec4_(subadd)(vec4s a, vec4s b, vec4s dest) { + glm_vec4_subadd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief mul two vectors and add result to dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest += (a * b) + */ +CGLM_INLINE +vec4s +glms_vec4_(muladd)(vec4s a, vec4s b, vec4s dest) { + glm_vec4_muladd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief mul vector with scalar and add result to sum + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @returns dest += (a * b) + */ +CGLM_INLINE +vec4s +glms_vec4_(muladds)(vec4s a, float s, vec4s dest) { + glm_vec4_muladds(a.raw, s, dest.raw); + return dest; +} + +/*! + * @brief add max of two vectors to result/dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest += max(a, b) + */ +CGLM_INLINE +vec4s +glms_vec4_(maxadd)(vec4s a, vec4s b, vec4s dest) { + glm_vec4_maxadd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief add min of two vectors to result/dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest += min(a, b) + */ +CGLM_INLINE +vec4s +glms_vec4_(minadd)(vec4s a, vec4s b, vec4s dest) { + glm_vec4_minadd(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief sub two vectors and sub result to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest -= (a + b) + */ +CGLM_INLINE +vec4s +glms_vec4_(subsub)(vec4s a, vec4s b, vec4s dest) { + glm_vec4_subsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief add two vectors and sub result to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest -= (a + b) + */ +CGLM_INLINE +vec4s +glms_vec4_(addsub)(vec4s a, vec4s b, vec4s dest) { + glm_vec4_addsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief mul two vectors and sub result to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest -= (a * b) + */ +CGLM_INLINE +vec4s +glms_vec4_(mulsub)(vec4s a, vec4s b, vec4s dest) { + glm_vec4_mulsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief mul vector with scalar and sub result to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @returns dest -= (a * b) + */ +CGLM_INLINE +vec4s +glms_vec4_(mulsubs)(vec4s a, float s, vec4s dest) { + glm_vec4_mulsubs(a.raw, s, dest.raw); + return dest; +} + +/*! 
+ * @brief sub max of two vectors to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest -= max(a, b) + */ +CGLM_INLINE +vec4s +glms_vec4_(maxsub)(vec4s a, vec4s b, vec4s dest) { + glm_vec4_maxsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief sub min of two vectors to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @returns dest -= min(a, b) + */ +CGLM_INLINE +vec4s +glms_vec4_(minsub)(vec4s a, vec4s b, vec4s dest) { + glm_vec4_minsub(a.raw, b.raw, dest.raw); + return dest; +} + +/*! + * @brief negate vector components and store result in dest + * + * @param[in] v vector + * @returns result vector + */ +CGLM_INLINE +vec4s +glms_vec4_(negate)(vec4s v) { + glm_vec4_negate(v.raw); + return v; +} + +/*! + * @brief normalize vec4 and store result in same vec + * + * @param[in] v vector + * @returns normalized vector + */ +CGLM_INLINE +vec4s +glms_vec4_(normalize)(vec4s v) { + glm_vec4_normalize(v.raw); + return v; +} + +/** + * @brief distance between two vectors + * + * @param[in] a vector1 + * @param[in] b vector2 + * @return returns distance + */ +CGLM_INLINE +float +glms_vec4_(distance)(vec4s a, vec4s b) { + return glm_vec4_distance(a.raw, b.raw); +} + +/** + * @brief squared distance between two vectors + * + * @param[in] a vector1 + * @param[in] b vector2 + * @return returns squared distance + */ +CGLM_INLINE +float +glms_vec4_(distance2)(vec4s a, vec4s b) { + return glm_vec4_distance2(a.raw, b.raw); +} + +/*! + * @brief max values of vectors + * + * @param[in] a vector1 + * @param[in] b vector2 + * @returns destination + */ +CGLM_INLINE +vec4s +glms_vec4_(maxv)(vec4s a, vec4s b) { + vec4s r; + glm_vec4_maxv(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief min values of vectors + * + * @param[in] a vector1 + * @param[in] b vector2 + * @returns destination + */ +CGLM_INLINE +vec4s +glms_vec4_(minv)(vec4s a, vec4s b) { + vec4s r; + glm_vec4_minv(a.raw, b.raw, r.raw); + return r; +} + +/*! + * @brief clamp vector's individual members between min and max values + * + * @param[in] v vector + * @param[in] minVal minimum value + * @param[in] maxVal maximum value + * @returns clamped vector + */ +CGLM_INLINE +vec4s +glms_vec4_(clamp)(vec4s v, float minVal, float maxVal) { + glm_vec4_clamp(v.raw, minVal, maxVal); + return v; +} + +/*! + * @brief linear interpolation between two vectors + * + * formula: from + s * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) + * @returns destination + */ +CGLM_INLINE +vec4s +glms_vec4_(lerp)(vec4s from, vec4s to, float t) { + vec4s r; + glm_vec4_lerp(from.raw, to.raw, t, r.raw); + return r; +} + +/*! + * @brief linear interpolation between two vectors (clamped) + * + * formula: from + s * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) clamped between 0 and 1 + * @returns destination + */ +CGLM_INLINE +vec4s +glms_vec4_(lerpc)(vec4s from, vec4s to, float t) { + vec4s r; + glm_vec4_lerpc(from.raw, to.raw, t, r.raw); + return r; +} + +/*! 
+ * @brief linear interpolation between two vectors + * + * formula: from + s * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) + * @returns destination + */ +CGLM_INLINE +vec4s +glms_vec4_(mix)(vec4s from, vec4s to, float t) { + vec4s r; + glm_vec4_mix(from.raw, to.raw, t, r.raw); + return r; +} + +/*! + * @brief linear interpolation between two vectors (clamped) + * + * formula: from + s * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) clamped between 0 and 1 + * @returns destination + */ +CGLM_INLINE +vec4s +glms_vec4_(mixc)(vec4s from, vec4s to, float t) { + vec4s r; + glm_vec4_mixc(from.raw, to.raw, t, r.raw); + return r; +} + +/*! + * @brief threshold function + * + * @param[in] edge threshold + * @param[in] x value to test against threshold + * @returns 0.0 if x < edge, else 1.0 + */ +CGLM_INLINE +vec4s +glms_vec4_(step)(vec4s edge, vec4s x) { + vec4s r; + glm_vec4_step(edge.raw, x.raw, r.raw); + return r; +} + +/*! + * @brief threshold function with a smooth transition (unidimensional) + * + * @param[in] edge0 low threshold + * @param[in] edge1 high threshold + * @param[in] x value to test against threshold + * @returns destination + */ +CGLM_INLINE +vec4s +glms_vec4_(smoothstep_uni)(float edge0, float edge1, vec4s x) { + vec4s r; + glm_vec4_smoothstep_uni(edge0, edge1, x.raw, r.raw); + return r; +} + +/*! + * @brief threshold function with a smooth transition + * + * @param[in] edge0 low threshold + * @param[in] edge1 high threshold + * @param[in] x value to test against threshold + * @returns destination + */ +CGLM_INLINE +vec4s +glms_vec4_(smoothstep)(vec4s edge0, vec4s edge1, vec4s x) { + vec4s r; + glm_vec4_smoothstep(edge0.raw, edge1.raw, x.raw, r.raw); + return r; +} + +/*! + * @brief smooth Hermite interpolation between two vectors + * + * formula: from + s * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) + * @returns destination + */ +CGLM_INLINE +vec4s +glms_vec4_(smoothinterp)(vec4s from, vec4s to, float t) { + vec4s r; + glm_vec4_smoothinterp(from.raw, to.raw, t, r.raw); + return r; +} + +/*! + * @brief smooth Hermite interpolation between two vectors (clamped) + * + * formula: from + s * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) clamped between 0 and 1 + * @returns destination + */ +CGLM_INLINE +vec4s +glms_vec4_(smoothinterpc)(vec4s from, vec4s to, float t) { + vec4s r; + glm_vec4_smoothinterpc(from.raw, to.raw, t, r.raw); + return r; +} + +/*! + * @brief helper to fill vec4 as [S^3, S^2, S, 1] + * + * @param[in] s parameter + * @returns destination + */ +CGLM_INLINE +vec4s +glms_vec4_(cubic)(float s) { + vec4s r; + glm_vec4_cubic(s, r.raw); + return r; +} + +/*! + * @brief swizzle vector components + * + * you can use existing masks e.g. GLM_XXXX, GLM_WZYX + * + * @param[in] v source + * @param[in] mask mask + * @returns swizzled vector + */ +CGLM_INLINE +vec4s +glms_vec4_(swizzle)(vec4s v, int mask) { + vec4s dest; + glm_vec4_swizzle(v.raw, mask, dest.raw); + return dest; +} + +/*! + * @brief Create four dimensional vector from pointer + * + * @param[in] src pointer to an array of floats + * @returns constructed 4D vector from raw pointer + */ +CGLM_INLINE +vec4s +glms_vec4_(make)(const float * __restrict src) { + vec4s dest; + glm_vec4_make(src, dest.raw); + return dest; +} + +/*! 
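+ * Editor's note (hedged sketch; t and v are illustrative, and pairing cubic
+ * with a spline basis matrix is an assumed use case, not something this
+ * header mandates): cubic packs the monomials of a curve parameter, and
+ * swizzle reorders components via the GLM_* masks:
+ *
+ *   vec4s s   = glms_vec4_cubic(t);               // {t^3, t^2, t, 1}
+ *   vec4s rev = glms_vec4_swizzle(v, GLM_WZYX);   // reversed component order
+ */
+
+/*!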
+ * @brief reflection vector using an incident ray and a surface normal + * + * @param[in] v incident vector + * @param[in] n normalized normal vector + * @returns reflection result + */ +CGLM_INLINE +vec4s +glms_vec4_(reflect)(vec4s v, vec4s n) { + vec4s dest; + glm_vec4_reflect(v.raw, n.raw, dest.raw); + return dest; +} + +/*! + * @brief computes refraction vector for an incident vector and a surface normal. + * + * calculates the refraction vector based on Snell's law. If total internal reflection + * occurs (angle too great given eta), dest is set to zero and returns false. + * Otherwise, computes refraction vector, stores it in dest, and returns true. + * + * this implementation does not explicitly preserve the 'w' component of the + * incident vector 'I' in the output 'dest', users requiring the preservation of + * the 'w' component should manually adjust 'dest' after calling this function. + * + * @param[in] v normalized incident vector + * @param[in] n normalized normal vector + * @param[in] eta ratio of indices of refraction (incident/transmitted) + * @param[out] dest refraction vector if refraction occurs; zero vector otherwise + * + * @returns true if refraction occurs; false if total internal reflection occurs. + */ +CGLM_INLINE +bool +glms_vec4_(refract)(vec4s v, vec4s n, float eta, vec4s * __restrict dest) { + return glm_vec4_refract(v.raw, n.raw, eta, dest->raw); +} + +#endif /* cglms_vec4s_h */ diff --git a/external/cglm/types-struct.h b/external/cglm/types-struct.h new file mode 100644 index 0000000..d93152e --- /dev/null +++ b/external/cglm/types-struct.h @@ -0,0 +1,303 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_types_struct_h +#define cglm_types_struct_h + +#include "types.h" + +/* + * Anonymous structs are available since C11, but we'd like to be compatible + * with C99 and C89 too. So let's figure out if we should be using them or not. + * It's simply a convenience feature, you can e.g. build the library with + * anonymous structs and your application without them and they'll still be + * compatible, cglm doesn't use the anonymous structs internally. + */ +#ifndef CGLM_USE_ANONYMOUS_STRUCT + /* If the user doesn't explicitly specify if they want anonymous structs or + * not, then we'll try to intuit an appropriate choice. */ +# if defined(CGLM_NO_ANONYMOUS_STRUCT) + /* The user has defined CGLM_NO_ANONYMOUS_STRUCT. This used to be the + * only #define governing the use of anonymous structs, so for backward + * compatibility, we still honor that choice and disable them. */ +# define CGLM_USE_ANONYMOUS_STRUCT 0 + /* Disable anonymous structs if strict ANSI mode is enabled for C89 or C99 */ +# elif defined(__STRICT_ANSI__) && \ + (!defined(__STDC_VERSION__) || (__STDC_VERSION__ < 201112L)) + /* __STRICT_ANSI__ is defined and we're in C89 + * or C99 mode (C11 or later not detected) */ +# define CGLM_USE_ANONYMOUS_STRUCT 0 +# elif (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || \ + (defined(__cplusplus) && __cplusplus >= 201103L) + /* We're compiling for C11 or this is the MSVC compiler. In either + * case, anonymous structs are available, so use them. 
*/ +# define CGLM_USE_ANONYMOUS_STRUCT 1 +# elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) + /* GCC 4.6 and onwards support anonymous structs as an extension */ +# define CGLM_USE_ANONYMOUS_STRUCT 1 +# elif defined(__clang__) && __clang_major__ >= 3 + /* Clang 3.0 and onwards support anonymous structs as an extension */ +# define CGLM_USE_ANONYMOUS_STRUCT 1 +# elif defined(_MSC_VER) && (_MSC_VER >= 1900) /* Visual Studio 2015 */ + /* We can support anonymous structs + * since Visual Studio 2015 or 2017 (1910) maybe? */ +# define CGLM_USE_ANONYMOUS_STRUCT 1 +# else + /* Otherwise, we're presumably building for C99 or C89 and can't rely + * on anonymous structs being available. Turn them off. */ +# define CGLM_USE_ANONYMOUS_STRUCT 0 +# endif +#endif + +typedef union vec2s { + vec2 raw; +#if CGLM_USE_ANONYMOUS_STRUCT + struct { + float x; + float y; + }; + + struct { + float r; + float i; + }; + + struct { + float u; + float v; + }; + + struct { + float s; + float t; + }; +#endif +} vec2s; + +typedef union vec3s { + vec3 raw; +#if CGLM_USE_ANONYMOUS_STRUCT + struct { + float x; + float y; + float z; + }; + + struct { + float r; + float g; + float b; + }; +#endif +} vec3s; + +typedef union ivec2s { + ivec2 raw; +#if CGLM_USE_ANONYMOUS_STRUCT + struct { + int x; + int y; + }; + + struct { + int r; + int i; + }; + + struct { + int u; + int v; + }; + + struct { + int s; + int t; + }; +#endif +} ivec2s; + +typedef union ivec3s { + ivec3 raw; +#if CGLM_USE_ANONYMOUS_STRUCT + struct { + int x; + int y; + int z; + }; + + struct { + int r; + int g; + int b; + }; +#endif +} ivec3s; + +typedef union ivec4s { + ivec4 raw; +#if CGLM_USE_ANONYMOUS_STRUCT + struct { + int x; + int y; + int z; + int w; + }; + + struct { + int r; + int g; + int b; + int a; + }; +#endif +} ivec4s; + +typedef union CGLM_ALIGN_IF(16) vec4s { + vec4 raw; +#if CGLM_USE_ANONYMOUS_STRUCT + struct { + float x; + float y; + float z; + float w; + }; + + struct { + float r; + float g; + float b; + float a; + }; +#endif +} vec4s; + +typedef union CGLM_ALIGN_IF(16) versors { + vec4 raw; +#if CGLM_USE_ANONYMOUS_STRUCT + struct { + float x; + float y; + float z; + float w; + }; + + struct { + vec3s imag; + float real; + }; +#endif +} versors; + +typedef union mat2s { + mat2 raw; + vec2s col[2]; +#if CGLM_USE_ANONYMOUS_STRUCT + struct { + float m00, m01; + float m10, m11; + }; +#endif +} mat2s; + +typedef union mat2x3s { + mat2x3 raw; + vec3s col[2]; /* [col (2), row (3)] */ +#if CGLM_USE_ANONYMOUS_STRUCT + struct { + float m00, m01, m02; + float m10, m11, m12; + }; +#endif +} mat2x3s; + +typedef union mat2x4s { + mat2x4 raw; + vec4s col[2]; /* [col (2), row (4)] */ +#if CGLM_USE_ANONYMOUS_STRUCT + struct { + float m00, m01, m02, m03; + float m10, m11, m12, m13; + }; +#endif +} mat2x4s; + +typedef union mat3s { + mat3 raw; + vec3s col[3]; +#if CGLM_USE_ANONYMOUS_STRUCT + struct { + float m00, m01, m02; + float m10, m11, m12; + float m20, m21, m22; + }; +#endif +} mat3s; + +typedef union mat3x2s { + mat3x2 raw; + vec2s col[3]; /* [col (3), row (2)] */ +#if CGLM_USE_ANONYMOUS_STRUCT + struct { + float m00, m01; + float m10, m11; + float m20, m21; + }; +#endif +} mat3x2s; + +typedef union mat3x4s { + mat3x4 raw; + vec4s col[3]; /* [col (3), row (4)] */ +#if CGLM_USE_ANONYMOUS_STRUCT + struct { + float m00, m01, m02, m03; + float m10, m11, m12, m13; + float m20, m21, m22, m23; + }; +#endif +} mat3x4s; + +typedef union CGLM_ALIGN_MAT mat4s { + mat4 raw; + vec4s col[4]; +#if CGLM_USE_ANONYMOUS_STRUCT + struct { 
+ float m00, m01, m02, m03; + float m10, m11, m12, m13; + float m20, m21, m22, m23; + float m30, m31, m32, m33; + }; +#endif +} mat4s; + +typedef union mat4x2s { + mat4x2 raw; + vec2s col[4]; /* [col (4), row (2)] */ +#if CGLM_USE_ANONYMOUS_STRUCT + struct { + float m00, m01; + float m10, m11; + float m20, m21; + float m30, m31; + }; +#endif +} mat4x2s; + +typedef union mat4x3s { + mat4x3 raw; + vec3s col[4]; /* [col (4), row (3)] */ +#if CGLM_USE_ANONYMOUS_STRUCT + struct { + float m00, m01, m02; + float m10, m11, m12; + float m20, m21, m22; + float m30, m31, m32; + }; +#endif +} mat4x3s; + +#endif /* cglm_types_struct_h */ diff --git a/external/cglm/types.h b/external/cglm/types.h new file mode 100644 index 0000000..7a482c0 --- /dev/null +++ b/external/cglm/types.h @@ -0,0 +1,144 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_types_h +#define cglm_types_h + +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) +# include +#endif + +#if defined(_MSC_VER) +/* do not use alignment for older visual studio versions */ +/* also ARM32 also causes similar error, disable it for now on ARM32 too */ +# if _MSC_VER < 1913 || _M_ARM /* Visual Studio 2017 version 15.6 */ +# define CGLM_ALL_UNALIGNED +# define CGLM_ALIGN(X) /* no alignment */ +# else +# define CGLM_ALIGN(X) __declspec(align(X)) +# endif +#else +# define CGLM_ALIGN(X) __attribute((aligned(X))) +#endif + +#ifndef CGLM_ALL_UNALIGNED +# define CGLM_ALIGN_IF(X) CGLM_ALIGN(X) +#else +# define CGLM_ALIGN_IF(X) /* no alignment */ +#endif + +#ifdef __AVX__ +# define CGLM_ALIGN_MAT CGLM_ALIGN(32) +#else +# define CGLM_ALIGN_MAT CGLM_ALIGN(16) +#endif + +#ifndef CGLM_HAVE_BUILTIN_ASSUME_ALIGNED + +# if defined(__has_builtin) +# if __has_builtin(__builtin_assume_aligned) +# define CGLM_HAVE_BUILTIN_ASSUME_ALIGNED 1 +# endif +# elif defined(__GNUC__) && defined(__GNUC_MINOR__) +# if __GNUC__ >= 4 && __GNUC_MINOR__ >= 7 +# define CGLM_HAVE_BUILTIN_ASSUME_ALIGNED 1 +# endif +# endif + +# ifndef CGLM_HAVE_BUILTIN_ASSUME_ALIGNED +# define CGLM_HAVE_BUILTIN_ASSUME_ALIGNED 0 +# endif + +#endif + +#if CGLM_HAVE_BUILTIN_ASSUME_ALIGNED +# define CGLM_ASSUME_ALIGNED(expr, alignment) \ + __builtin_assume_aligned((expr), (alignment)) +#else +# define CGLM_ASSUME_ALIGNED(expr, alignment) (expr) +#endif + +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) +# define CGLM_CASTPTR_ASSUME_ALIGNED(expr, type) \ + ((type*)CGLM_ASSUME_ALIGNED((expr), alignof(type))) +#elif defined(_MSC_VER) +# define CGLM_CASTPTR_ASSUME_ALIGNED(expr, type) \ + ((type*)CGLM_ASSUME_ALIGNED((expr), __alignof(type))) +#else +# define CGLM_CASTPTR_ASSUME_ALIGNED(expr, type) \ + ((type*)CGLM_ASSUME_ALIGNED((expr), __alignof__(type))) +#endif + +typedef int ivec2[2]; +typedef int ivec3[3]; +typedef int ivec4[4]; + +typedef float vec2[2]; +typedef float vec3[3]; +typedef CGLM_ALIGN_IF(16) float vec4[4]; +typedef vec4 versor; /* |x, y, z, w| -> w is the last */ +typedef vec3 mat3[3]; +typedef vec2 mat3x2[3]; /* [col (3), row (2)] */ +typedef vec4 mat3x4[3]; /* [col (3), row (4)] */ +typedef CGLM_ALIGN_IF(16) vec2 mat2[2]; +typedef vec3 mat2x3[2]; /* [col (2), row (3)] */ +typedef vec4 mat2x4[2]; /* [col (2), row (4)] */ +typedef CGLM_ALIGN_MAT vec4 mat4[4]; +typedef vec2 mat4x2[4]; /* [col (4), row (2)] */ +typedef vec3 mat4x3[4]; /* [col (4), row (3)] */ + +/* + Important: cglm stores quaternion as [x, y, z, w] in memory since v0.4.0 + it was 
[w, x, y, z] before v0.4.0 ( v0.3.5 and earlier ). w is real part. +*/ + +#define GLM_E 2.71828182845904523536028747135266250 /* e */ +#define GLM_LOG2E 1.44269504088896340735992468100189214 /* log2(e) */ +#define GLM_LOG10E 0.434294481903251827651128918916605082 /* log10(e) */ +#define GLM_LN2 0.693147180559945309417232121458176568 /* loge(2) */ +#define GLM_LN10 2.30258509299404568401799145468436421 /* loge(10) */ +#define GLM_PI 3.14159265358979323846264338327950288 /* pi */ +#define GLM_PI_2 1.57079632679489661923132169163975144 /* pi/2 */ +#define GLM_PI_4 0.785398163397448309615660845819875721 /* pi/4 */ +#define GLM_1_PI 0.318309886183790671537767526745028724 /* 1/pi */ +#define GLM_2_PI 0.636619772367581343075535053490057448 /* 2/pi */ +#define GLM_TAU 6.283185307179586476925286766559005768 /* tau */ +#define GLM_TAU_2 GLM_PI /* tau/2 */ +#define GLM_TAU_4 GLM_PI_2 /* tau/4 */ +#define GLM_1_TAU 0.159154943091895335768883763372514362 /* 1/tau */ +#define GLM_2_TAU 0.318309886183790671537767526745028724 /* 2/tau */ +#define GLM_2_SQRTPI 1.12837916709551257389615890312154517 /* 2/sqrt(pi) */ +#define GLM_SQRTTAU 2.506628274631000502415765284811045253 /* sqrt(tau) */ +#define GLM_SQRT2 1.41421356237309504880168872420969808 /* sqrt(2) */ +#define GLM_SQRT1_2 0.707106781186547524400844362104849039 /* 1/sqrt(2) */ + +#define GLM_Ef ((float)GLM_E) +#define GLM_LOG2Ef ((float)GLM_LOG2E) +#define GLM_LOG10Ef ((float)GLM_LOG10E) +#define GLM_LN2f ((float)GLM_LN2) +#define GLM_LN10f ((float)GLM_LN10) +#define GLM_PIf ((float)GLM_PI) +#define GLM_PI_2f ((float)GLM_PI_2) +#define GLM_PI_4f ((float)GLM_PI_4) +#define GLM_1_PIf ((float)GLM_1_PI) +#define GLM_2_PIf ((float)GLM_2_PI) +#define GLM_TAUf ((float)GLM_TAU) +#define GLM_TAU_2f ((float)GLM_TAU_2) +#define GLM_TAU_4f ((float)GLM_TAU_4) +#define GLM_1_TAUf ((float)GLM_1_TAU) +#define GLM_2_TAUf ((float)GLM_2_TAU) +#define GLM_2_SQRTPIf ((float)GLM_2_SQRTPI) +#define GLM_2_SQRTTAUf ((float)GLM_SQRTTAU) +#define GLM_SQRT2f ((float)GLM_SQRT2) +#define GLM_SQRT1_2f ((float)GLM_SQRT1_2) + +/* DEPRECATED! use GLM_PI and friends */ +#define CGLM_PI GLM_PIf +#define CGLM_PI_2 GLM_PI_2f +#define CGLM_PI_4 GLM_PI_4f + +#endif /* cglm_types_h */ diff --git a/external/cglm/util.h b/external/cglm/util.h new file mode 100644 index 0000000..8c5f2cb --- /dev/null +++ b/external/cglm/util.h @@ -0,0 +1,375 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE int glm_sign(int val); + CGLM_INLINE float glm_signf(float val); + CGLM_INLINE float glm_rad(float deg); + CGLM_INLINE float glm_deg(float rad); + CGLM_INLINE void glm_make_rad(float *deg); + CGLM_INLINE void glm_make_deg(float *rad); + CGLM_INLINE float glm_pow2(float x); + CGLM_INLINE float glm_min(float a, float b); + CGLM_INLINE float glm_max(float a, float b); + CGLM_INLINE float glm_clamp(float val, float minVal, float maxVal); + CGLM_INLINE float glm_clamp_zo(float val, float minVal, float maxVal); + CGLM_INLINE float glm_lerp(float from, float to, float t); + CGLM_INLINE float glm_lerpc(float from, float to, float t); + CGLM_INLINE float glm_step(float edge, float x); + CGLM_INLINE float glm_smooth(float t); + CGLM_INLINE float glm_smoothstep(float edge0, float edge1, float x); + CGLM_INLINE float glm_smoothinterp(float from, float to, float t); + CGLM_INLINE float glm_smoothinterpc(float from, float to, float t); + CGLM_INLINE bool glm_eq(float a, float b); + CGLM_INLINE float glm_percent(float from, float to, float current); + CGLM_INLINE float glm_percentc(float from, float to, float current); + */ + +#ifndef cglm_util_h +#define cglm_util_h + +#include "common.h" + +#define GLM_MIN(X, Y) (((X) < (Y)) ? (X) : (Y)) +#define GLM_MAX(X, Y) (((X) > (Y)) ? (X) : (Y)) + +/*! + * @brief get sign of 32 bit integer as +1, -1, 0 + * + * Important: It returns 0 for zero input + * + * @param val integer value + */ +CGLM_INLINE +int +glm_sign(int val) { + return ((val >> 31) - (-val >> 31)); +} + +/*! + * @brief get sign of 32 bit float as +1, -1, 0 + * + * Important: It returns 0 for zero/NaN input + * + * @param val float value + */ +CGLM_INLINE +float +glm_signf(float val) { + return (float)((val > 0.0f) - (val < 0.0f)); +} + +/*! + * @brief convert degree to radians + * + * @param[in] deg angle in degrees + */ +CGLM_INLINE +float +glm_rad(float deg) { + return deg * GLM_PIf / 180.0f; +} + +/*! + * @brief convert radians to degree + * + * @param[in] rad angle in radians + */ +CGLM_INLINE +float +glm_deg(float rad) { + return rad * 180.0f / GLM_PIf; +} + +/*! + * @brief convert existing degree to radians. this will override degrees value + * + * @param[in, out] deg pointer to angle in degrees + */ +CGLM_INLINE +void +glm_make_rad(float *deg) { + *deg = *deg * GLM_PIf / 180.0f; +} + +/*! + * @brief convert existing radians to degree. this will override radians value + * + * @param[in, out] rad pointer to angle in radians + */ +CGLM_INLINE +void +glm_make_deg(float *rad) { + *rad = *rad * 180.0f / GLM_PIf; +} + +/*! + * @brief multiplies given parameter with itself = x * x or powf(x, 2) + * + * @param[in] x x + */ +CGLM_INLINE +float +glm_pow2(float x) { + return x * x; +} + +/*! + * @brief find minimum of given two values + * + * @param[in] a number 1 + * @param[in] b number 2 + */ +CGLM_INLINE +float +glm_min(float a, float b) { + if (a < b) + return a; + return b; +} + +/*! + * @brief find maximum of given two values + * + * @param[in] a number 1 + * @param[in] b number 2 + */ +CGLM_INLINE +float +glm_max(float a, float b) { + if (a > b) + return a; + return b; +} + +/*! + * @brief find minimum of given two values + * + * @param[in] a number 1 + * @param[in] b number 2 + * + * @return smallest of the two values + */ +CGLM_INLINE +int +glm_imin(int a, int b) { + if (a < b) + return a; + return b; +} + +/*! 
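+ * Editor's note (worked one-liners for the scalar helpers in this header;
+ * values are examples and equalities hold up to float rounding):
+ *
+ *   glm_rad(180.0f)              ==  GLM_PIf   // degrees to radians
+ *   glm_signf(-3.5f)             == -1.0f
+ *   glm_clamp(1.5f, 0.0f, 1.0f)  ==  1.0f      // defined just below
+ *   glm_lerp(2.0f, 4.0f, 0.25f)  ==  2.5f
+ */
+
+/*!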
+ * @brief find maximum of given two values + * + * @param[in] a number 1 + * @param[in] b number 2 + * + * @return largest of the two values + */ +CGLM_INLINE +int +glm_imax(int a, int b) { + if (a > b) + return a; + return b; +} + +/*! + * @brief clamp a number between min and max + * + * @param[in] val value to clamp + * @param[in] minVal minimum value + * @param[in] maxVal maximum value + */ +CGLM_INLINE +float +glm_clamp(float val, float minVal, float maxVal) { + return glm_min(glm_max(val, minVal), maxVal); +} + +/*! + * @brief clamp a number to zero and one + * + * @param[in] val value to clamp + */ +CGLM_INLINE +float +glm_clamp_zo(float val) { + return glm_clamp(val, 0.0f, 1.0f); +} + +/*! + * @brief linear interpolation between two numbers + * + * formula: from + t * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) + */ +CGLM_INLINE +float +glm_lerp(float from, float to, float t) { + return from + t * (to - from); +} + +/*! + * @brief clamped linear interpolation between two numbers + * + * formula: from + t * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) clamped between 0 and 1 + */ +CGLM_INLINE +float +glm_lerpc(float from, float to, float t) { + return glm_lerp(from, to, glm_clamp_zo(t)); +} + +/*! + * @brief threshold function + * + * @param[in] edge threshold + * @param[in] x value to test against threshold + * @return returns 0.0 if x < edge, else 1.0 + */ +CGLM_INLINE +float +glm_step(float edge, float x) { + /* branching - no type conversion */ + return (x < edge) ? 0.0f : 1.0f; + /* + * An alternative implementation without branching + * but with type conversion could be: + * return !(x < edge); + */ +} + +/*! + * @brief smooth Hermite interpolation + * + * formula: t^2 * (3-2t) + * + * @param[in] t interpolant (amount) + */ +CGLM_INLINE +float +glm_smooth(float t) { + return t * t * (3.0f - 2.0f * t); +} + +/*! + * @brief threshold function with a smooth transition (according to OpenCL specs) + * + * formula: t^2 * (3-2t) + * + * @param[in] edge0 low threshold + * @param[in] edge1 high threshold + * @param[in] x interpolant (amount) + */ +CGLM_INLINE +float +glm_smoothstep(float edge0, float edge1, float x) { + float t; + t = glm_clamp_zo((x - edge0) / (edge1 - edge0)); + return glm_smooth(t); +} + +/*! + * @brief smoothstep interpolation between two numbers + * + * formula: from + smoothstep(t) * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) + */ +CGLM_INLINE +float +glm_smoothinterp(float from, float to, float t) { + return from + glm_smooth(t) * (to - from); +} + +/*! + * @brief clamped smoothstep interpolation between two numbers + * + * formula: from + smoothstep(t) * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) clamped between 0 and 1 + */ +CGLM_INLINE +float +glm_smoothinterpc(float from, float to, float t) { + return glm_smoothinterp(from, to, glm_clamp_zo(t)); +} + +/*! + * @brief check if two float equal with using EPSILON + * + * @param[in] a a + * @param[in] b b + */ +CGLM_INLINE +bool +glm_eq(float a, float b) { + return fabsf(a - b) <= GLM_FLT_EPSILON; +} + +/*! + * @brief percentage of current value between start and end value + * + * maybe fraction could be alternative name. 
+ * + * @param[in] from from value + * @param[in] to to value + * @param[in] current current value + */ +CGLM_INLINE +float +glm_percent(float from, float to, float current) { + float t; + + if ((t = to - from) == 0.0f) + return 1.0f; + + return (current - from) / t; +} + +/*! + * @brief clamped percentage of current value between start and end value + * + * @param[in] from from value + * @param[in] to to value + * @param[in] current current value + */ +CGLM_INLINE +float +glm_percentc(float from, float to, float current) { + return glm_clamp_zo(glm_percent(from, to, current)); +} + +/*! +* @brief swap two float values +* +* @param[in] a float value 1 (pointer) +* @param[in] b float value 2 (pointer) +*/ +CGLM_INLINE +void +glm_swapf(float * __restrict a, float * __restrict b) { + float t; + t = *a; + *a = *b; + *b = t; +} + +#endif /* cglm_util_h */ diff --git a/external/cglm/vec2-ext.h b/external/cglm/vec2-ext.h new file mode 100644 index 0000000..6186f07 --- /dev/null +++ b/external/cglm/vec2-ext.h @@ -0,0 +1,337 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Functions: + CGLM_INLINE void glm_vec2_fill(vec2 v, float val) + CGLM_INLINE bool glm_vec2_eq(vec2 v, float val); + CGLM_INLINE bool glm_vec2_eq_eps(vec2 v, float val); + CGLM_INLINE bool glm_vec2_eq_all(vec2 v); + CGLM_INLINE bool glm_vec2_eqv(vec2 a, vec2 b); + CGLM_INLINE bool glm_vec2_eqv_eps(vec2 a, vec2 b); + CGLM_INLINE float glm_vec2_max(vec2 v); + CGLM_INLINE float glm_vec2_min(vec2 v); + CGLM_INLINE bool glm_vec2_isnan(vec2 v); + CGLM_INLINE bool glm_vec2_isinf(vec2 v); + CGLM_INLINE bool glm_vec2_isvalid(vec2 v); + CGLM_INLINE void glm_vec2_sign(vec2 v, vec2 dest); + CGLM_INLINE void glm_vec2_abs(vec2 v, vec2 dest); + CGLM_INLINE void glm_vec2_fract(vec2 v, vec2 dest); + CGLM_INLINE void glm_vec2_floor(vec2 v, vec2 dest); + CGLM_INLINE float glm_vec2_mods(vec2 v, float s, vec2 dest); + CGLM_INLINE float glm_vec2_steps(float edge, vec2 v, vec2 dest); + CGLM_INLINE void glm_vec2_stepr(vec2 edge, float v, vec2 dest); + CGLM_INLINE void glm_vec2_sqrt(vec2 v, vec2 dest); + CGLM_INLINE void glm_vec2_complex_mul(vec2 a, vec2 b, vec2 dest) + CGLM_INLINE void glm_vec2_complex_div(vec2 a, vec2 b, vec2 dest) + CGLM_INLINE void glm_vec2_complex_conjugate(vec2 a, vec2 dest) + */ + +#ifndef cglm_vec2_ext_h +#define cglm_vec2_ext_h + +#include "common.h" +#include "util.h" + +/*! + * @brief fill a vector with specified value + * + * @param[out] v dest + * @param[in] val value + */ +CGLM_INLINE +void +glm_vec2_fill(vec2 v, float val) { + v[0] = v[1] = val; +} + +/*! + * @brief check if vector is equal to value (without epsilon) + * + * @param[in] v vector + * @param[in] val value + */ +CGLM_INLINE +bool +glm_vec2_eq(vec2 v, float val) { + return v[0] == val && v[0] == v[1]; +} + +/*! + * @brief check if vector is equal to value (with epsilon) + * + * @param[in] v vector + * @param[in] val value + */ +CGLM_INLINE +bool +glm_vec2_eq_eps(vec2 v, float val) { + return fabsf(v[0] - val) <= GLM_FLT_EPSILON + && fabsf(v[1] - val) <= GLM_FLT_EPSILON; +} + +/*! + * @brief check if vector members are equal (without epsilon) + * + * @param[in] v vector + */ +CGLM_INLINE +bool +glm_vec2_eq_all(vec2 v) { + return glm_vec2_eq_eps(v, v[0]); +} + +/*! 
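
/* --- Editor's sketch (not part of this commit): glm_percent maps a value to
   its fraction of the [from, to] range and glm_percentc clamps that fraction
   to [0, 1]; glm_vec2_fill/glm_vec2_eq from vec2-ext.h above pair the same
   way. Include paths are assumptions based on the vendored layout. --- */
#include <stdbool.h>
#include "cglm/vec2-ext.h"

static void percent_and_fill_example(void)
{
  float t  = glm_percent(10.0f, 20.0f, 15.0f);   /* 0.5                 */
  float tc = glm_percentc(10.0f, 20.0f, 35.0f);  /* 2.5 clamped to 1.0  */

  vec2 v;
  glm_vec2_fill(v, 4.0f);
  bool all_four = glm_vec2_eq(v, 4.0f);          /* true */
  (void)t; (void)tc; (void)all_four;
}
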
+ * @brief check if vector is equal to another (without epsilon) + * + * @param[in] a vector + * @param[in] b vector + */ +CGLM_INLINE +bool +glm_vec2_eqv(vec2 a, vec2 b) { + return a[0] == b[0] && a[1] == b[1]; +} + +/*! + * @brief check if vector is equal to another (with epsilon) + * + * @param[in] a vector + * @param[in] b vector + */ +CGLM_INLINE +bool +glm_vec2_eqv_eps(vec2 a, vec2 b) { + return fabsf(a[0] - b[0]) <= GLM_FLT_EPSILON + && fabsf(a[1] - b[1]) <= GLM_FLT_EPSILON; +} + +/*! + * @brief max value of vector + * + * @param[in] v vector + */ +CGLM_INLINE +float +glm_vec2_max(vec2 v) { + return glm_max(v[0], v[1]); +} + +/*! + * @brief min value of vector + * + * @param[in] v vector + */ +CGLM_INLINE +float +glm_vec2_min(vec2 v) { + return glm_min(v[0], v[1]); +} + +/*! + * @brief check if one of items is NaN (not a number) + * you should only use this in DEBUG mode or very critical asserts + * + * @param[in] v vector + */ +CGLM_INLINE +bool +glm_vec2_isnan(vec2 v) { +#ifndef CGLM_FAST_MATH + return isnan(v[0]) || isnan(v[1]); +#else + return false; +#endif +} + +/*! + * @brief check if one of items is INFINITY + * you should only use this in DEBUG mode or very critical asserts + * + * @param[in] v vector + */ +CGLM_INLINE +bool +glm_vec2_isinf(vec2 v) { +#ifndef CGLM_FAST_MATH + return isinf(v[0]) || isinf(v[1]); +#else + return false; +#endif +} + +/*! + * @brief check if all items are valid number + * you should only use this in DEBUG mode or very critical asserts + * + * @param[in] v vector + */ +CGLM_INLINE +bool +glm_vec2_isvalid(vec2 v) { + return !glm_vec2_isnan(v) && !glm_vec2_isinf(v); +} + +/*! + * @brief get sign of 32 bit float as +1, -1, 0 + * + * Important: It returns 0 for zero/NaN input + * + * @param v vector + */ +CGLM_INLINE +void +glm_vec2_sign(vec2 v, vec2 dest) { + dest[0] = glm_signf(v[0]); + dest[1] = glm_signf(v[1]); +} + +/*! + * @brief absolute value of v + * + * @param[in] v vector + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec2_abs(vec2 v, vec2 dest) { + dest[0] = fabsf(v[0]); + dest[1] = fabsf(v[1]); +} + +/*! + * @brief fractional part of each vector item + * + * @param[in] v vector + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec2_fract(vec2 v, vec2 dest) { + dest[0] = fminf(v[0] - floorf(v[0]), 0.999999940395355224609375f); + dest[1] = fminf(v[1] - floorf(v[1]), 0.999999940395355224609375f); +} + +/*! + * @brief floor of each vector item + * + * @param[in] v vector + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec2_floor(vec2 v, vec2 dest) { + dest[0] = floorf(v[0]); + dest[1] = floorf(v[1]); +} + +/*! + * @brief mod of each vector item, result is written to dest (dest = v % s) + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec2_mods(vec2 v, float s, vec2 dest) { + dest[0] = fmodf(v[0], s); + dest[1] = fmodf(v[1], s); +} + +/*! + * @brief square root of each vector item + * + * @param[in] v vector + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec2_sqrt(vec2 v, vec2 dest) { + dest[0] = sqrtf(v[0]); + dest[1] = sqrtf(v[1]); +} + +/*! + * @brief treat vectors as complex numbers and multiply them as such. 
+ * + * @param[in] a left number + * @param[in] b right number + * @param[out] dest destination number + */ +CGLM_INLINE +void +glm_vec2_complex_mul(vec2 a, vec2 b, vec2 dest) { + float tr, ti; + tr = a[0] * b[0] - a[1] * b[1]; + ti = a[0] * b[1] + a[1] * b[0]; + dest[0] = tr; + dest[1] = ti; +} + +/*! + * @brief threshold each vector item with scalar + * condition is: (x[i] < edge) ? 0.0 : 1.0 + * + * @param[in] edge threshold + * @param[in] x vector to test against threshold + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec2_steps(float edge, vec2 x, vec2 dest) { + dest[0] = glm_step(edge, x[0]); + dest[1] = glm_step(edge, x[1]); +} + +/*! + * @brief threshold a value with *vector* as the threshold + * condition is: (x < edge[i]) ? 0.0 : 1.0 + * + * @param[in] edge threshold vector + * @param[in] x value to test against threshold + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec2_stepr(vec2 edge, float x, vec2 dest) { + dest[0] = glm_step(edge[0], x); + dest[1] = glm_step(edge[1], x); +} + +/*! + * @brief treat vectors as complex numbers and divide them as such. + * + * @param[in] a left number (numerator) + * @param[in] b right number (denominator) + * @param[out] dest destination number + */ +CGLM_INLINE +void +glm_vec2_complex_div(vec2 a, vec2 b, vec2 dest) { + float tr, ti; + float const ibnorm2 = 1.0f / (b[0] * b[0] + b[1] * b[1]); + tr = ibnorm2 * (a[0] * b[0] + a[1] * b[1]); + ti = ibnorm2 * (a[1] * b[0] - a[0] * b[1]); + dest[0] = tr; + dest[1] = ti; +} + +/*! + * @brief treat the vector as a complex number and conjugate it as such. + * + * @param[in] a the number + * @param[out] dest destination number + */ +CGLM_INLINE +void +glm_vec2_complex_conjugate(vec2 a, vec2 dest) { + dest[0] = a[0]; + dest[1] = -a[1]; +} + +#endif /* cglm_vec2_ext_h */ diff --git a/external/cglm/vec2.h b/external/cglm/vec2.h new file mode 100644 index 0000000..655fb4b --- /dev/null +++ b/external/cglm/vec2.h @@ -0,0 +1,798 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLM_VEC2_ONE_INIT + GLM_VEC2_ZERO_INIT + GLM_VEC2_ONE + GLM_VEC2_ZERO + + Functions: + CGLM_INLINE void glm_vec2(float * __restrict v, vec2 dest) + CGLM_INLINE void glm_vec2_copy(vec2 a, vec2 dest) + CGLM_INLINE void glm_vec2_zero(vec2 v) + CGLM_INLINE void glm_vec2_one(vec2 v) + CGLM_INLINE float glm_vec2_dot(vec2 a, vec2 b) + CGLM_INLINE float glm_vec2_cross(vec2 a, vec2 b) + CGLM_INLINE float glm_vec2_norm2(vec2 v) + CGLM_INLINE float glm_vec2_norm(vec2 vec) + CGLM_INLINE void glm_vec2_add(vec2 a, vec2 b, vec2 dest) + CGLM_INLINE void glm_vec2_adds(vec2 v, float s, vec2 dest) + CGLM_INLINE void glm_vec2_sub(vec2 a, vec2 b, vec2 dest) + CGLM_INLINE void glm_vec2_subs(vec2 v, float s, vec2 dest) + CGLM_INLINE void glm_vec2_mul(vec2 a, vec2 b, vec2 d) + CGLM_INLINE void glm_vec2_scale(vec2 v, float s, vec2 dest) + CGLM_INLINE void glm_vec2_scale_as(vec2 v, float s, vec2 dest) + CGLM_INLINE void glm_vec2_div(vec2 a, vec2 b, vec2 dest) + CGLM_INLINE void glm_vec2_divs(vec2 v, float s, vec2 dest) + CGLM_INLINE void glm_vec2_addadd(vec2 a, vec2 b, vec2 dest) + CGLM_INLINE void glm_vec2_subadd(vec2 a, vec2 b, vec2 dest) + CGLM_INLINE void glm_vec2_muladd(vec2 a, vec2 b, vec2 dest) + CGLM_INLINE void glm_vec2_muladds(vec2 a, float s, vec2 dest) + CGLM_INLINE void glm_vec2_maxadd(vec2 a, vec2 b, vec2 dest) + CGLM_INLINE void glm_vec2_minadd(vec2 a, vec2 b, vec2 dest) + CGLM_INLINE void glm_vec2_subsub(vec2 a, vec2 b, vec2 dest) + CGLM_INLINE void glm_vec2_addsub(vec2 a, vec2 b, vec2 dest) + CGLM_INLINE void glm_vec2_mulsub(vec2 a, vec2 b, vec2 dest) + CGLM_INLINE void glm_vec2_mulsubs(vec2 a, float s, vec2 dest) + CGLM_INLINE void glm_vec2_maxsub(vec2 a, vec2 b, vec2 dest) + CGLM_INLINE void glm_vec2_minsub(vec2 a, vec2 b, vec2 dest) + CGLM_INLINE void glm_vec2_negate_to(vec2 v, vec2 dest) + CGLM_INLINE void glm_vec2_negate(vec2 v) + CGLM_INLINE void glm_vec2_normalize(vec2 v) + CGLM_INLINE void glm_vec2_normalize_to(vec2 vec, vec2 dest) + CGLM_INLINE void glm_vec2_rotate(vec2 v, float angle, vec2 dest) + CGLM_INLINE void glm_vec2_center(vec2 a, vec2 b, vec2 dest) + CGLM_INLINE float glm_vec2_distance2(vec2 a, vec2 b) + CGLM_INLINE float glm_vec2_distance(vec2 a, vec2 b) + CGLM_INLINE void glm_vec2_maxv(vec2 v1, vec2 v2, vec2 dest) + CGLM_INLINE void glm_vec2_minv(vec2 v1, vec2 v2, vec2 dest) + CGLM_INLINE void glm_vec2_clamp(vec2 v, float minVal, float maxVal) + CGLM_INLINE void glm_vec2_swizzle(vec2 v, int mask, vec2 dest) + CGLM_INLINE void glm_vec2_lerp(vec2 from, vec2 to, float t, vec2 dest) + CGLM_INLINE void glm_vec2_step(vec2 edge, vec2 x, vec2 dest) + CGLM_INLINE void glm_vec2_make(float * restrict src, vec2 dest) + CGLM_INLINE void glm_vec2_reflect(vec2 v, vec2 n, vec2 dest) + CGLM_INLINE void glm_vec2_refract(vec2 v, vec2 n, float eta, vec2 dest) + */ + +#ifndef cglm_vec2_h +#define cglm_vec2_h + +#include "common.h" +#include "util.h" +#include "vec2-ext.h" + +#define GLM_VEC2_ONE_INIT {1.0f, 1.0f} +#define GLM_VEC2_ZERO_INIT {0.0f, 0.0f} + +#define GLM_VEC2_ONE ((vec2)GLM_VEC2_ONE_INIT) +#define GLM_VEC2_ZERO ((vec2)GLM_VEC2_ZERO_INIT) + +/*! + * @brief init vec2 using another vector + * + * @param[in] v a vector + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec2(float * __restrict v, vec2 dest) { + dest[0] = v[0]; + dest[1] = v[1]; +} + +/*! 
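
/* --- Editor's sketch (not part of this commit): the two ways vec2.h above
   offers to initialize a vec2 -- the *_INIT macros for declarations and
   glm_vec2() for copying from a raw float pointer. Include path assumed. --- */
#include "cglm/vec2.h"

static void vec2_init_example(void)
{
  float raw[2] = {2.0f, 5.0f};

  vec2 ones = GLM_VEC2_ONE_INIT;   /* {1.0f, 1.0f}            */
  vec2 from_ptr;
  glm_vec2(raw, from_ptr);         /* from_ptr = {2.0f, 5.0f} */
  (void)ones;
}
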
+ * @brief copy all members of [a] to [dest] + * + * @param[in] a source + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec2_copy(vec2 a, vec2 dest) { + dest[0] = a[0]; + dest[1] = a[1]; +} + +/*! + * @brief make vector zero + * + * @param[in, out] v vector + */ +CGLM_INLINE +void +glm_vec2_zero(vec2 v) { + v[0] = v[1] = 0.0f; +} + +/*! + * @brief make vector one + * + * @param[in, out] v vector + */ +CGLM_INLINE +void +glm_vec2_one(vec2 v) { + v[0] = v[1] = 1.0f; +} + +/*! + * @brief vec2 dot product + * + * @param[in] a vector1 + * @param[in] b vector2 + * + * @return dot product + */ +CGLM_INLINE +float +glm_vec2_dot(vec2 a, vec2 b) { + return a[0] * b[0] + a[1] * b[1]; +} + +/*! + * @brief vec2 cross product + * + * REF: http://allenchou.net/2013/07/cross-product-of-2d-vectors/ + * + * @param[in] a vector1 + * @param[in] b vector2 + * + * @return Z component of cross product + */ +CGLM_INLINE +float +glm_vec2_cross(vec2 a, vec2 b) { + /* just calculate the z-component */ + return a[0] * b[1] - a[1] * b[0]; +} + +/*! + * @brief norm * norm (magnitude) of vec + * + * we can use this func instead of calling norm * norm, because it would call + * sqrtf function twice but with this func we can avoid func call, maybe this is + * not good name for this func + * + * @param[in] v vector + * + * @return norm * norm + */ +CGLM_INLINE +float +glm_vec2_norm2(vec2 v) { + return glm_vec2_dot(v, v); +} + +/*! + * @brief norm (magnitude) of vec2 + * + * @param[in] vec vector + * + * @return norm + */ +CGLM_INLINE +float +glm_vec2_norm(vec2 vec) { + return sqrtf(glm_vec2_norm2(vec)); +} + +/*! + * @brief add a vector to b vector store result in dest + * + * @param[in] a vector1 + * @param[in] b vector2 + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec2_add(vec2 a, vec2 b, vec2 dest) { + dest[0] = a[0] + b[0]; + dest[1] = a[1] + b[1]; +} + +/*! + * @brief add scalar to v vector store result in dest (d = v + s) + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec2_adds(vec2 v, float s, vec2 dest) { + dest[0] = v[0] + s; + dest[1] = v[1] + s; +} + +/*! + * @brief subtract b vector from a vector store result in dest + * + * @param[in] a vector1 + * @param[in] b vector2 + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec2_sub(vec2 a, vec2 b, vec2 dest) { + dest[0] = a[0] - b[0]; + dest[1] = a[1] - b[1]; +} + +/*! + * @brief subtract scalar from v vector store result in dest (d = v - s) + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec2_subs(vec2 v, float s, vec2 dest) { + dest[0] = v[0] - s; + dest[1] = v[1] - s; +} + +/*! + * @brief multiply two vectors (component-wise multiplication) + * + * @param a v1 + * @param b v2 + * @param dest v3 = (a[0] * b[0], a[1] * b[1]) + */ +CGLM_INLINE +void +glm_vec2_mul(vec2 a, vec2 b, vec2 dest) { + dest[0] = a[0] * b[0]; + dest[1] = a[1] * b[1]; +} + +/*! + * @brief multiply/scale vector with scalar: result = v * s + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec2_scale(vec2 v, float s, vec2 dest) { + dest[0] = v[0] * s; + dest[1] = v[1] * s; +} + +/*! 
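
/* --- Editor's sketch (not part of this commit): dot, 2D cross (z component
   only) and norm on concrete values, matching the definitions above.
   Include path assumed. --- */
#include "cglm/vec2.h"

static void vec2_products_example(void)
{
  vec2 a = {3.0f, 4.0f};
  vec2 b = {1.0f, 0.0f};

  float d   = glm_vec2_dot(a, b);    /* 3*1 + 4*0 = 3.0       */
  float z   = glm_vec2_cross(a, b);  /* 3*0 - 4*1 = -4.0      */
  float len = glm_vec2_norm(a);      /* sqrt(9 + 16) = 5.0    */
  (void)d; (void)z; (void)len;
}
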
+ * @brief scale as vector specified: result = unit(v) * s + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec2_scale_as(vec2 v, float s, vec2 dest) { + float norm; + norm = glm_vec2_norm(v); + + if (CGLM_UNLIKELY(norm < FLT_EPSILON)) { + glm_vec2_zero(dest); + return; + } + + glm_vec2_scale(v, s / norm, dest); +} + +/*! + * @brief div vector with another component-wise division: d = a / b + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest result = (a[0]/b[0], a[1]/b[1]) + */ +CGLM_INLINE +void +glm_vec2_div(vec2 a, vec2 b, vec2 dest) { + dest[0] = a[0] / b[0]; + dest[1] = a[1] / b[1]; +} + +/*! + * @brief div vector with scalar: d = v / s + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest result = (a[0]/s, a[1]/s) + */ +CGLM_INLINE +void +glm_vec2_divs(vec2 v, float s, vec2 dest) { + dest[0] = v[0] / s; + dest[1] = v[1] / s; +} + +/*! + * @brief add two vectors and add result to sum + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest += (a + b) + */ +CGLM_INLINE +void +glm_vec2_addadd(vec2 a, vec2 b, vec2 dest) { + dest[0] += a[0] + b[0]; + dest[1] += a[1] + b[1]; +} + +/*! + * @brief sub two vectors and add result to dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest += (a + b) + */ +CGLM_INLINE +void +glm_vec2_subadd(vec2 a, vec2 b, vec2 dest) { + dest[0] += a[0] - b[0]; + dest[1] += a[1] - b[1]; +} + +/*! + * @brief mul two vectors and add result to dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest += (a * b) + */ +CGLM_INLINE +void +glm_vec2_muladd(vec2 a, vec2 b, vec2 dest) { + dest[0] += a[0] * b[0]; + dest[1] += a[1] * b[1]; +} + +/*! + * @brief mul vector with scalar and add result to sum + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[out] dest dest += (a * b) + */ +CGLM_INLINE +void +glm_vec2_muladds(vec2 a, float s, vec2 dest) { + dest[0] += a[0] * s; + dest[1] += a[1] * s; +} + +/*! + * @brief add max of two vectors to result/dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest += max(a, b) + */ +CGLM_INLINE +void +glm_vec2_maxadd(vec2 a, vec2 b, vec2 dest) { + dest[0] += glm_max(a[0], b[0]); + dest[1] += glm_max(a[1], b[1]); +} + +/*! + * @brief add min of two vectors to result/dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest += min(a, b) + */ +CGLM_INLINE +void +glm_vec2_minadd(vec2 a, vec2 b, vec2 dest) { + dest[0] += glm_min(a[0], b[0]); + dest[1] += glm_min(a[1], b[1]); +} + +/*! + * @brief sub two vectors and sub result to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest -= (a - b) + */ +CGLM_INLINE +void +glm_vec2_subsub(vec2 a, vec2 b, vec2 dest) { + dest[0] -= a[0] - b[0]; + dest[1] -= a[1] - b[1]; +} + +/*! 
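
/* --- Editor's sketch (not part of this commit): the *add helpers above
   accumulate into dest with +=, so dest must be initialized first -- here a
   weighted sum built with glm_vec2_muladds. Include path assumed. --- */
#include "cglm/vec2.h"

static void vec2_accumulate_example(void)
{
  vec2 dir_a = {1.0f, 0.0f};
  vec2 dir_b = {0.0f, 2.0f};

  vec2 acc = GLM_VEC2_ZERO_INIT;       /* start from zero before accumulating */
  glm_vec2_muladds(dir_a, 0.5f, acc);  /* acc = {0.5, 0.0} */
  glm_vec2_muladds(dir_b, 0.5f, acc);  /* acc = {0.5, 1.0} */
  (void)acc;
}
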
+ * @brief add two vectors and sub result to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest -= (a + b) + */ +CGLM_INLINE +void +glm_vec2_addsub(vec2 a, vec2 b, vec2 dest) { + dest[0] -= a[0] + b[0]; + dest[1] -= a[1] + b[1]; +} + +/*! + * @brief mul two vectors and sub result to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest -= (a * b) + */ +CGLM_INLINE +void +glm_vec2_mulsub(vec2 a, vec2 b, vec2 dest) { + dest[0] -= a[0] * b[0]; + dest[1] -= a[1] * b[1]; +} + +/*! + * @brief mul vector with scalar and sub result to sum + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[out] dest dest -= (a * b) + */ +CGLM_INLINE +void +glm_vec2_mulsubs(vec2 a, float s, vec2 dest) { + dest[0] -= a[0] * s; + dest[1] -= a[1] * s; +} + +/*! + * @brief sub max of two vectors to result/dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest -= max(a, b) + */ +CGLM_INLINE +void +glm_vec2_maxsub(vec2 a, vec2 b, vec2 dest) { + dest[0] -= glm_max(a[0], b[0]); + dest[1] -= glm_max(a[1], b[1]); +} + +/*! + * @brief sub min of two vectors to result/dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest -= min(a, b) + */ +CGLM_INLINE +void +glm_vec2_minsub(vec2 a, vec2 b, vec2 dest) { + dest[0] -= glm_min(a[0], b[0]); + dest[1] -= glm_min(a[1], b[1]); +} + +/*! + * @brief negate vector components and store result in dest + * + * @param[in] v vector + * @param[out] dest result vector + */ +CGLM_INLINE +void +glm_vec2_negate_to(vec2 v, vec2 dest) { + dest[0] = -v[0]; + dest[1] = -v[1]; +} + +/*! + * @brief negate vector components + * + * @param[in, out] v vector + */ +CGLM_INLINE +void +glm_vec2_negate(vec2 v) { + glm_vec2_negate_to(v, v); +} + +/*! + * @brief normalize vector and store result in same vec + * + * @param[in, out] v vector + */ +CGLM_INLINE +void +glm_vec2_normalize(vec2 v) { + float norm; + + norm = glm_vec2_norm(v); + + if (CGLM_UNLIKELY(norm < FLT_EPSILON)) { + v[0] = v[1] = 0.0f; + return; + } + + glm_vec2_scale(v, 1.0f / norm, v); +} + +/*! + * @brief normalize vector to dest + * + * @param[in] v source + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec2_normalize_to(vec2 v, vec2 dest) { + float norm; + + norm = glm_vec2_norm(v); + + if (CGLM_UNLIKELY(norm < FLT_EPSILON)) { + glm_vec2_zero(dest); + return; + } + + glm_vec2_scale(v, 1.0f / norm, dest); +} + +/*! 
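
/* --- Editor's sketch (not part of this commit): glm_vec2_normalize_to as
   defined above writes a zero vector rather than dividing by zero when the
   input norm is below FLT_EPSILON. Include path assumed. --- */
#include "cglm/vec2.h"

static void vec2_normalize_example(void)
{
  vec2 v    = {3.0f, 4.0f};
  vec2 zero = GLM_VEC2_ZERO_INIT;
  vec2 unit, safe;

  glm_vec2_normalize_to(v, unit);     /* unit = {0.6, 0.8}          */
  glm_vec2_normalize_to(zero, safe);  /* safe = {0, 0}, no NaN/Inf  */
  (void)unit; (void)safe;
}
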
+ * @brief rotate vec2 around origin by angle (CCW: counterclockwise) + * + * Formula: + * 𝑥2 = cos(a)𝑥1 − sin(a)𝑦1 + * 𝑦2 = sin(a)𝑥1 + cos(a)𝑦1 + * + * @param[in] v vector to rotate + * @param[in] angle angle by radians + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec2_rotate(vec2 v, float angle, vec2 dest) { + float c, s, x1, y1; + + c = cosf(angle); + s = sinf(angle); + + x1 = v[0]; + y1 = v[1]; + + dest[0] = c * x1 - s * y1; + dest[1] = s * x1 + c * y1; +} + +/** + * @brief find center point of two vector + * + * @param[in] a vector1 + * @param[in] b vector2 + * @param[out] dest center point + */ +CGLM_INLINE +void +glm_vec2_center(vec2 a, vec2 b, vec2 dest) { + glm_vec2_add(a, b, dest); + glm_vec2_scale(dest, 0.5f, dest); +} + +/** + * @brief squared distance between two vectors + * + * @param[in] a vector1 + * @param[in] b vector2 + * @return returns squared distance (distance * distance) + */ +CGLM_INLINE +float +glm_vec2_distance2(vec2 a, vec2 b) { + return glm_pow2(b[0] - a[0]) + glm_pow2(b[1] - a[1]); +} + +/** + * @brief distance between two vectors + * + * @param[in] a vector1 + * @param[in] b vector2 + * @return returns distance + */ +CGLM_INLINE +float +glm_vec2_distance(vec2 a, vec2 b) { + return sqrtf(glm_vec2_distance2(a, b)); +} + +/*! + * @brief max values of vectors + * + * @param[in] a vector1 + * @param[in] b vector2 + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec2_maxv(vec2 a, vec2 b, vec2 dest) { + dest[0] = glm_max(a[0], b[0]); + dest[1] = glm_max(a[1], b[1]); +} + +/*! + * @brief min values of vectors + * + * @param[in] a vector1 + * @param[in] b vector2 + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec2_minv(vec2 a, vec2 b, vec2 dest) { + dest[0] = glm_min(a[0], b[0]); + dest[1] = glm_min(a[1], b[1]); +} + +/*! + * @brief clamp vector's individual members between min and max values + * + * @param[in, out] v vector + * @param[in] minval minimum value + * @param[in] maxval maximum value + */ +CGLM_INLINE +void +glm_vec2_clamp(vec2 v, float minval, float maxval) { + v[0] = glm_clamp(v[0], minval, maxval); + v[1] = glm_clamp(v[1], minval, maxval); +} + +/*! + * @brief swizzle vector components + * + * @param[in] v source + * @param[in] mask mask + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec2_swizzle(vec2 v, int mask, vec2 dest) { + vec2 t; + + t[0] = v[(mask & (3 << 0))]; + t[1] = v[(mask & (3 << 2)) >> 2]; + + glm_vec2_copy(t, dest); +} + +/*! + * @brief linear interpolation between two vector + * + * formula: from + s * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) clamped between 0 and 1 + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec2_lerp(vec2 from, vec2 to, float t, vec2 dest) { + vec2 s, v; + + /* from + s * (to - from) */ + glm_vec2_fill(s, glm_clamp_zo(t)); + glm_vec2_sub(to, from, v); + glm_vec2_mul(s, v, v); + glm_vec2_add(from, v, dest); +} + +/*! + * @brief threshold function + * + * @param[in] edge threshold + * @param[in] x value to test against threshold + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec2_step(vec2 edge, vec2 x, vec2 dest) { + dest[0] = glm_step(edge[0], x[0]); + dest[1] = glm_step(edge[1], x[1]); +} + +/*! 
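
/* --- Editor's sketch (not part of this commit): glm_vec2_rotate is CCW about
   the origin, so a quarter turn maps +X onto +Y; glm_vec2_distance on the same
   values for contrast. Include path assumed. --- */
#include "cglm/vec2.h"

static void vec2_rotate_example(void)
{
  vec2 x_axis = {1.0f, 0.0f};
  vec2 turned;

  glm_vec2_rotate(x_axis, GLM_PI_2f, turned);   /* turned ~= {0, 1} */
  float d = glm_vec2_distance(x_axis, turned);  /* ~= sqrt(2)       */
  (void)d;
}
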
+ * @brief Create two dimensional vector from pointer + * + * @param[in] src pointer to an array of floats + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec2_make(const float * __restrict src, vec2 dest) { + dest[0] = src[0]; dest[1] = src[1]; +} + +/*! + * @brief reflection vector using an incident ray and a surface normal + * + * @param[in] v incident vector + * @param[in] n normalized normal vector + * @param[out] dest destination vector for the reflection result + */ +CGLM_INLINE +void +glm_vec2_reflect(vec2 v, vec2 n, vec2 dest) { + vec2 temp; + glm_vec2_scale(n, 2.0f * glm_vec2_dot(v, n), temp); + glm_vec2_sub(v, temp, dest); +} + +/*! + * @brief computes refraction vector for an incident vector and a surface normal. + * + * calculates the refraction vector based on Snell's law. If total internal reflection + * occurs (angle too great given eta), dest is set to zero and returns false. + * Otherwise, computes refraction vector, stores it in dest, and returns true. + * + * @param[in] v normalized incident vector + * @param[in] n normalized normal vector + * @param[in] eta ratio of indices of refraction (incident/transmitted) + * @param[out] dest refraction vector if refraction occurs; zero vector otherwise + * + * @returns true if refraction occurs; false if total internal reflection occurs. + */ +CGLM_INLINE +bool +glm_vec2_refract(vec2 v, vec2 n, float eta, vec2 dest) { + float ndi, eni, k; + + ndi = glm_vec2_dot(n, v); + eni = eta * ndi; + k = 1.0f - eta * eta + eni * eni; + + if (k < 0.0f) { + glm_vec2_zero(dest); + return false; + } + + glm_vec2_scale(v, eta, dest); + glm_vec2_mulsubs(n, eni + sqrtf(k), dest); + return true; +} + +#endif /* cglm_vec2_h */ diff --git a/external/cglm/vec3-ext.h b/external/cglm/vec3-ext.h new file mode 100644 index 0000000..4413cc2 --- /dev/null +++ b/external/cglm/vec3-ext.h @@ -0,0 +1,345 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/*! + * @brief SIMD like functions + */ + +/* + Functions: + CGLM_INLINE void glm_vec3_broadcast(float val, vec3 d); + CGLM_INLINE void glm_vec3_fill(vec3 v, float val); + CGLM_INLINE bool glm_vec3_eq(vec3 v, float val); + CGLM_INLINE bool glm_vec3_eq_eps(vec3 v, float val); + CGLM_INLINE bool glm_vec3_eq_all(vec3 v); + CGLM_INLINE bool glm_vec3_eqv(vec3 a, vec3 b); + CGLM_INLINE bool glm_vec3_eqv_eps(vec3 a, vec3 b); + CGLM_INLINE float glm_vec3_max(vec3 v); + CGLM_INLINE float glm_vec3_min(vec3 v); + CGLM_INLINE bool glm_vec3_isnan(vec3 v); + CGLM_INLINE bool glm_vec3_isinf(vec3 v); + CGLM_INLINE bool glm_vec3_isvalid(vec3 v); + CGLM_INLINE void glm_vec3_sign(vec3 v, vec3 dest); + CGLM_INLINE void glm_vec3_abs(vec3 v, vec3 dest); + CGLM_INLINE void glm_vec3_fract(vec3 v, vec3 dest); + CGLM_INLINE void glm_vec3_floor(vec3 v, vec3 dest); + CGLM_INLINE float glm_vec3_mods(vec3 v, float s, vec3 dest); + CGLM_INLINE float glm_vec3_steps(float edge, vec3 v, vec3 dest); + CGLM_INLINE void glm_vec3_stepr(vec3 edge, float v, vec3 dest); + CGLM_INLINE float glm_vec3_hadd(vec3 v); + CGLM_INLINE void glm_vec3_sqrt(vec3 v, vec3 dest); + */ + +#ifndef cglm_vec3_ext_h +#define cglm_vec3_ext_h + +#include "common.h" +#include "util.h" + +/*! + * @brief fill a vector with specified value + * + * @param[in] val value + * @param[out] d dest + */ +CGLM_INLINE +void +glm_vec3_broadcast(float val, vec3 d) { + d[0] = d[1] = d[2] = val; +} + +/*! 
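
/* --- Editor's sketch (not part of this commit): glm_vec2_reflect with a unit
   upward normal, mirroring a downward-travelling ray, per the definition just
   above in vec2.h. Include path assumed. --- */
#include "cglm/vec2.h"

static void vec2_reflect_example(void)
{
  vec2 v = {1.0f, -1.0f};  /* incident ray heading down and to the right */
  vec2 n = {0.0f,  1.0f};  /* unit surface normal pointing up            */
  vec2 r;

  glm_vec2_reflect(v, n, r);  /* r = {1, 1}: the vertical component flips */
  (void)r;
}
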
+ * @brief fill a vector with specified value + * + * @param[out] v dest + * @param[in] val value + */ +CGLM_INLINE +void +glm_vec3_fill(vec3 v, float val) { + v[0] = v[1] = v[2] = val; +} + +/*! + * @brief check if vector is equal to value (without epsilon) + * + * @param[in] v vector + * @param[in] val value + */ +CGLM_INLINE +bool +glm_vec3_eq(vec3 v, float val) { + return v[0] == val && v[0] == v[1] && v[0] == v[2]; +} + +/*! + * @brief check if vector is equal to value (with epsilon) + * + * @param[in] v vector + * @param[in] val value + */ +CGLM_INLINE +bool +glm_vec3_eq_eps(vec3 v, float val) { + return fabsf(v[0] - val) <= GLM_FLT_EPSILON + && fabsf(v[1] - val) <= GLM_FLT_EPSILON + && fabsf(v[2] - val) <= GLM_FLT_EPSILON; +} + +/*! + * @brief check if vector members are equal (without epsilon) + * + * @param[in] v vector + */ +CGLM_INLINE +bool +glm_vec3_eq_all(vec3 v) { + return glm_vec3_eq_eps(v, v[0]); +} + +/*! + * @brief check if vector is equal to another (without epsilon) + * + * @param[in] a vector + * @param[in] b vector + */ +CGLM_INLINE +bool +glm_vec3_eqv(vec3 a, vec3 b) { + return a[0] == b[0] + && a[1] == b[1] + && a[2] == b[2]; +} + +/*! + * @brief check if vector is equal to another (with epsilon) + * + * @param[in] a vector + * @param[in] b vector + */ +CGLM_INLINE +bool +glm_vec3_eqv_eps(vec3 a, vec3 b) { + return fabsf(a[0] - b[0]) <= GLM_FLT_EPSILON + && fabsf(a[1] - b[1]) <= GLM_FLT_EPSILON + && fabsf(a[2] - b[2]) <= GLM_FLT_EPSILON; +} + +/*! + * @brief max value of vector + * + * @param[in] v vector + */ +CGLM_INLINE +float +glm_vec3_max(vec3 v) { + float max; + + max = v[0]; + if (v[1] > max) + max = v[1]; + if (v[2] > max) + max = v[2]; + + return max; +} + +/*! + * @brief min value of vector + * + * @param[in] v vector + */ +CGLM_INLINE +float +glm_vec3_min(vec3 v) { + float min; + + min = v[0]; + if (v[1] < min) + min = v[1]; + if (v[2] < min) + min = v[2]; + + return min; +} + +/*! + * @brief check if one of items is NaN (not a number) + * you should only use this in DEBUG mode or very critical asserts + * + * @param[in] v vector + */ +CGLM_INLINE +bool +glm_vec3_isnan(vec3 v) { +#ifndef CGLM_FAST_MATH + return isnan(v[0]) || isnan(v[1]) || isnan(v[2]); +#else + return false; +#endif +} + +/*! + * @brief check if one of items is INFINITY + * you should only use this in DEBUG mode or very critical asserts + * + * @param[in] v vector + */ +CGLM_INLINE +bool +glm_vec3_isinf(vec3 v) { +#ifndef CGLM_FAST_MATH + return isinf(v[0]) || isinf(v[1]) || isinf(v[2]); +#else + return false; +#endif +} + +/*! + * @brief check if all items are valid number + * you should only use this in DEBUG mode or very critical asserts + * + * @param[in] v vector + */ +CGLM_INLINE +bool +glm_vec3_isvalid(vec3 v) { + return !glm_vec3_isnan(v) && !glm_vec3_isinf(v); +} + +/*! + * @brief get sign of 32 bit float as +1, -1, 0 + * + * Important: It returns 0 for zero/NaN input + * + * @param v vector + */ +CGLM_INLINE +void +glm_vec3_sign(vec3 v, vec3 dest) { + dest[0] = glm_signf(v[0]); + dest[1] = glm_signf(v[1]); + dest[2] = glm_signf(v[2]); +} + +/*! + * @brief absolute value of each vector item + * + * @param[in] v vector + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec3_abs(vec3 v, vec3 dest) { + dest[0] = fabsf(v[0]); + dest[1] = fabsf(v[1]); + dest[2] = fabsf(v[2]); +} + +/*! 
+ * @brief fractional part of each vector item + * + * @param[in] v vector + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec3_fract(vec3 v, vec3 dest) { + dest[0] = fminf(v[0] - floorf(v[0]), 0.999999940395355224609375f); + dest[1] = fminf(v[1] - floorf(v[1]), 0.999999940395355224609375f); + dest[2] = fminf(v[2] - floorf(v[2]), 0.999999940395355224609375f); +} + +/*! + * @brief floor of each vector item + * + * @param[in] v vector + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec3_floor(vec3 v, vec3 dest) { + dest[0] = floorf(v[0]); + dest[1] = floorf(v[1]); + dest[2] = floorf(v[2]); +} + +/*! + * @brief mod of each vector item, result is written to dest (dest = v % s) + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec3_mods(vec3 v, float s, vec3 dest) { + dest[0] = fmodf(v[0], s); + dest[1] = fmodf(v[1], s); + dest[2] = fmodf(v[2], s); +} + +/*! + * @brief threshold each vector item with scalar + * condition is: (x[i] < edge) ? 0.0 : 1.0 + * + * @param[in] edge threshold + * @param[in] x vector to test against threshold + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec3_steps(float edge, vec3 x, vec3 dest) { + dest[0] = glm_step(edge, x[0]); + dest[1] = glm_step(edge, x[1]); + dest[2] = glm_step(edge, x[2]); +} + +/*! + * @brief threshold a value with *vector* as the threshold + * condition is: (x < edge[i]) ? 0.0 : 1.0 + * + * @param[in] edge threshold vector + * @param[in] x value to test against threshold + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec3_stepr(vec3 edge, float x, vec3 dest) { + dest[0] = glm_step(edge[0], x); + dest[1] = glm_step(edge[1], x); + dest[2] = glm_step(edge[2], x); +} + +/*! + * @brief vector reduction by summation + * @warning could overflow + * + * @param[in] v vector + * @return sum of all vector's elements + */ +CGLM_INLINE +float +glm_vec3_hadd(vec3 v) { + return v[0] + v[1] + v[2]; +} + +/*! + * @brief square root of each vector item + * + * @param[in] v vector + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec3_sqrt(vec3 v, vec3 dest) { + dest[0] = sqrtf(v[0]); + dest[1] = sqrtf(v[1]); + dest[2] = sqrtf(v[2]); +} + +#endif /* cglm_vec3_ext_h */ diff --git a/external/cglm/vec3.h b/external/cglm/vec3.h new file mode 100644 index 0000000..1350818 --- /dev/null +++ b/external/cglm/vec3.h @@ -0,0 +1,1264 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLM_VEC3_ONE_INIT + GLM_VEC3_ZERO_INIT + GLM_VEC3_ONE + GLM_VEC3_ZERO + GLM_YUP + GLM_ZUP + GLM_XUP + + Functions: + CGLM_INLINE void glm_vec3(vec4 v4, vec3 dest); + CGLM_INLINE void glm_vec3_copy(vec3 a, vec3 dest); + CGLM_INLINE void glm_vec3_zero(vec3 v); + CGLM_INLINE void glm_vec3_one(vec3 v); + CGLM_INLINE float glm_vec3_dot(vec3 a, vec3 b); + CGLM_INLINE float glm_vec3_norm2(vec3 v); + CGLM_INLINE float glm_vec3_norm(vec3 v); + CGLM_INLINE float glm_vec3_norm_one(vec3 v); + CGLM_INLINE float glm_vec3_norm_inf(vec3 v); + CGLM_INLINE void glm_vec3_add(vec3 a, vec3 b, vec3 dest); + CGLM_INLINE void glm_vec3_adds(vec3 a, float s, vec3 dest); + CGLM_INLINE void glm_vec3_sub(vec3 a, vec3 b, vec3 dest); + CGLM_INLINE void glm_vec3_subs(vec3 a, float s, vec3 dest); + CGLM_INLINE void glm_vec3_mul(vec3 a, vec3 b, vec3 dest); + CGLM_INLINE void glm_vec3_scale(vec3 v, float s, vec3 dest); + CGLM_INLINE void glm_vec3_scale_as(vec3 v, float s, vec3 dest); + CGLM_INLINE void glm_vec3_div(vec3 a, vec3 b, vec3 dest); + CGLM_INLINE void glm_vec3_divs(vec3 a, float s, vec3 dest); + CGLM_INLINE void glm_vec3_addadd(vec3 a, vec3 b, vec3 dest); + CGLM_INLINE void glm_vec3_subadd(vec3 a, vec3 b, vec3 dest); + CGLM_INLINE void glm_vec3_muladd(vec3 a, vec3 b, vec3 dest); + CGLM_INLINE void glm_vec3_muladds(vec3 a, float s, vec3 dest); + CGLM_INLINE void glm_vec3_maxadd(vec3 a, vec3 b, vec3 dest); + CGLM_INLINE void glm_vec3_minadd(vec3 a, vec3 b, vec3 dest); + CGLM_INLINE void glm_vec3_subsub(vec3 a, vec3 b, vec3 dest); + CGLM_INLINE void glm_vec3_addsub(vec3 a, vec3 b, vec3 dest); + CGLM_INLINE void glm_vec3_mulsub(vec3 a, vec3 b, vec3 dest); + CGLM_INLINE void glm_vec3_mulsubs(vec3 a, float s, vec3 dest); + CGLM_INLINE void glm_vec3_maxsub(vec3 a, vec3 b, vec3 dest); + CGLM_INLINE void glm_vec3_minsub(vec3 a, vec3 b, vec3 dest); + CGLM_INLINE void glm_vec3_flipsign(vec3 v); + CGLM_INLINE void glm_vec3_flipsign_to(vec3 v, vec3 dest); + CGLM_INLINE void glm_vec3_negate_to(vec3 v, vec3 dest); + CGLM_INLINE void glm_vec3_negate(vec3 v); + CGLM_INLINE void glm_vec3_inv(vec3 v); + CGLM_INLINE void glm_vec3_inv_to(vec3 v, vec3 dest); + CGLM_INLINE void glm_vec3_normalize(vec3 v); + CGLM_INLINE void glm_vec3_normalize_to(vec3 v, vec3 dest); + CGLM_INLINE void glm_vec3_cross(vec3 a, vec3 b, vec3 d); + CGLM_INLINE void glm_vec3_crossn(vec3 a, vec3 b, vec3 dest); + CGLM_INLINE float glm_vec3_angle(vec3 a, vec3 b); + CGLM_INLINE void glm_vec3_rotate(vec3 v, float angle, vec3 axis); + CGLM_INLINE void glm_vec3_rotate_m4(mat4 m, vec3 v, vec3 dest); + CGLM_INLINE void glm_vec3_rotate_m3(mat3 m, vec3 v, vec3 dest); + CGLM_INLINE void glm_vec3_proj(vec3 a, vec3 b, vec3 dest); + CGLM_INLINE void glm_vec3_center(vec3 a, vec3 b, vec3 dest); + CGLM_INLINE float glm_vec3_distance(vec3 a, vec3 b); + CGLM_INLINE float glm_vec3_distance2(vec3 a, vec3 b); + CGLM_INLINE void glm_vec3_maxv(vec3 a, vec3 b, vec3 dest); + CGLM_INLINE void glm_vec3_minv(vec3 a, vec3 b, vec3 dest); + CGLM_INLINE void glm_vec3_ortho(vec3 v, vec3 dest); + CGLM_INLINE void glm_vec3_clamp(vec3 v, float minVal, float maxVal); + CGLM_INLINE void glm_vec3_lerp(vec3 from, vec3 to, float t, vec3 dest); + CGLM_INLINE void glm_vec3_lerpc(vec3 from, vec3 to, float t, vec3 dest); + CGLM_INLINE void glm_vec3_mix(vec3 from, vec3 to, float t, vec3 dest); + CGLM_INLINE void glm_vec3_mixc(vec3 from, vec3 to, float t, vec3 dest); + 
CGLM_INLINE void glm_vec3_step(vec3 edge, vec3 x, vec3 dest); + CGLM_INLINE void glm_vec3_smoothstep_uni(float edge0, float edge1, vec3 x, vec3 dest); + CGLM_INLINE void glm_vec3_smoothstep(vec3 edge0, vec3 edge1, vec3 x, vec3 dest); + CGLM_INLINE void glm_vec3_smoothinterp(vec3 from, vec3 to, float t, vec3 dest); + CGLM_INLINE void glm_vec3_smoothinterpc(vec3 from, vec3 to, float t, vec3 dest); + CGLM_INLINE void glm_vec3_swizzle(vec3 v, int mask, vec3 dest); + CGLM_INLINE void glm_vec3_make(float * restrict src, vec3 dest); + CGLM_INLINE void glm_vec3_faceforward(vec3 n, vec3 v, vec3 nref, vec3 dest); + CGLM_INLINE void glm_vec3_reflect(vec3 v, vec3 n, vec3 dest); + CGLM_INLINE void glm_vec3_refract(vec3 v, vec3 n, float eta, vec3 dest); + + Convenient: + CGLM_INLINE void glm_cross(vec3 a, vec3 b, vec3 d); + CGLM_INLINE float glm_dot(vec3 a, vec3 b); + CGLM_INLINE void glm_normalize(vec3 v); + CGLM_INLINE void glm_normalize_to(vec3 v, vec3 dest); + + DEPRECATED: + glm_vec3_dup + glm_vec3_flipsign + glm_vec3_flipsign_to + glm_vec3_inv + glm_vec3_inv_to + glm_vec3_mulv + glm_vec3_step_uni --> use glm_vec3_steps + */ + +#ifndef cglm_vec3_h +#define cglm_vec3_h + +#include "common.h" +#include "vec4.h" +#include "vec3-ext.h" +#include "util.h" + +/* DEPRECATED! use _copy, _ucopy versions */ +#define glm_vec3_dup(v, dest) glm_vec3_copy(v, dest) +#define glm_vec3_flipsign(v) glm_vec3_negate(v) +#define glm_vec3_flipsign_to(v, dest) glm_vec3_negate_to(v, dest) +#define glm_vec3_inv(v) glm_vec3_negate(v) +#define glm_vec3_inv_to(v, dest) glm_vec3_negate_to(v, dest) +#define glm_vec3_mulv(a, b, d) glm_vec3_mul(a, b, d) +#define glm_vec3_step_uni(edge, x, dest) glm_vec3_steps(edge, x, dest) + +#define GLM_VEC3_ONE_INIT {1.0f, 1.0f, 1.0f} +#define GLM_VEC3_ZERO_INIT {0.0f, 0.0f, 0.0f} + +#define GLM_VEC3_ONE ((vec3)GLM_VEC3_ONE_INIT) +#define GLM_VEC3_ZERO ((vec3)GLM_VEC3_ZERO_INIT) + +#define GLM_YUP ((vec3){0.0f, 1.0f, 0.0f}) +#define GLM_ZUP ((vec3){0.0f, 0.0f, 1.0f}) +#define GLM_XUP ((vec3){1.0f, 0.0f, 0.0f}) +#define GLM_FORWARD ((vec3){0.0f, 0.0f, -1.0f}) + +#define GLM_XXX GLM_SHUFFLE3(0, 0, 0) +#define GLM_YYY GLM_SHUFFLE3(1, 1, 1) +#define GLM_ZZZ GLM_SHUFFLE3(2, 2, 2) +#define GLM_ZYX GLM_SHUFFLE3(0, 1, 2) + +/*! + * @brief init vec3 using vec4 + * + * @param[in] v4 vector4 + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec3(vec4 v4, vec3 dest) { + dest[0] = v4[0]; + dest[1] = v4[1]; + dest[2] = v4[2]; +} + +/*! + * @brief copy all members of [a] to [dest] + * + * @param[in] a source + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec3_copy(vec3 a, vec3 dest) { + dest[0] = a[0]; + dest[1] = a[1]; + dest[2] = a[2]; +} + +/*! + * @brief make vector zero + * + * @param[in, out] v vector + */ +CGLM_INLINE +void +glm_vec3_zero(vec3 v) { + v[0] = v[1] = v[2] = 0.0f; +} + +/*! + * @brief make vector one + * + * @param[in, out] v vector + */ +CGLM_INLINE +void +glm_vec3_one(vec3 v) { + v[0] = v[1] = v[2] = 1.0f; +} + +/*! + * @brief vec3 dot product + * + * @param[in] a vector1 + * @param[in] b vector2 + * + * @return dot product + */ +CGLM_INLINE +float +glm_vec3_dot(vec3 a, vec3 b) { + return a[0] * b[0] + a[1] * b[1] + a[2] * b[2]; +} + +/*! 
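
/* --- Editor's sketch (not part of this commit): dropping the w component of a
   vec4 with glm_vec3() and measuring a vector's "up" component against the
   GLM_YUP constant defined above. Include path assumed. --- */
#include "cglm/vec3.h"

static void vec3_basics_example(void)
{
  vec4 v4 = {1.0f, 2.0f, 3.0f, 1.0f};
  vec3 v3;

  glm_vec3(v4, v3);                      /* v3 = {1, 2, 3} */
  float up = glm_vec3_dot(v3, GLM_YUP);  /* 2.0            */
  (void)up;
}
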
+ * @brief norm * norm (magnitude) of vec + * + * we can use this func instead of calling norm * norm, because it would call + * sqrtf function twice but with this func we can avoid func call, maybe this is + * not good name for this func + * + * @param[in] v vector + * + * @return norm * norm + */ +CGLM_INLINE +float +glm_vec3_norm2(vec3 v) { + return glm_vec3_dot(v, v); +} + +/*! + * @brief euclidean norm (magnitude), also called L2 norm + * this will give magnitude of vector in euclidean space + * + * @param[in] v vector + * + * @return norm + */ +CGLM_INLINE +float +glm_vec3_norm(vec3 v) { + return sqrtf(glm_vec3_norm2(v)); +} + +/*! + * @brief L1 norm of vec3 + * Also known as Manhattan Distance or Taxicab norm. + * L1 Norm is the sum of the magnitudes of the vectors in a space. + * It is calculated as the sum of the absolute values of the vector components. + * In this norm, all the components of the vector are weighted equally. + * + * This computes: + * R = |v[0]| + |v[1]| + |v[2]| + * + * @param[in] v vector + * + * @return L1 norm + */ +CGLM_INLINE +float +glm_vec3_norm_one(vec3 v) { + vec3 t; + glm_vec3_abs(v, t); + return glm_vec3_hadd(t); +} + +/*! + * @brief infinity norm of vec3 + * Also known as Maximum norm. + * Infinity Norm is the largest magnitude among each element of a vector. + * It is calculated as the maximum of the absolute values of the vector components. + * + * This computes: + * inf norm = max(|v[0]|, |v[1]|, |v[2]|) + * + * @param[in] v vector + * + * @return infinity norm + */ +CGLM_INLINE +float +glm_vec3_norm_inf(vec3 v) { + vec3 t; + glm_vec3_abs(v, t); + return glm_vec3_max(t); +} + +/*! + * @brief add a vector to b vector store result in dest + * + * @param[in] a vector1 + * @param[in] b vector2 + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec3_add(vec3 a, vec3 b, vec3 dest) { + dest[0] = a[0] + b[0]; + dest[1] = a[1] + b[1]; + dest[2] = a[2] + b[2]; +} + +/*! + * @brief add scalar to v vector store result in dest (d = v + s) + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec3_adds(vec3 v, float s, vec3 dest) { + dest[0] = v[0] + s; + dest[1] = v[1] + s; + dest[2] = v[2] + s; +} + +/*! + * @brief subtract b vector from a vector store result in dest + * + * @param[in] a vector1 + * @param[in] b vector2 + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec3_sub(vec3 a, vec3 b, vec3 dest) { + dest[0] = a[0] - b[0]; + dest[1] = a[1] - b[1]; + dest[2] = a[2] - b[2]; +} + +/*! + * @brief subtract scalar from v vector store result in dest (d = v - s) + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec3_subs(vec3 v, float s, vec3 dest) { + dest[0] = v[0] - s; + dest[1] = v[1] - s; + dest[2] = v[2] - s; +} + +/*! + * @brief multiply two vectors (component-wise multiplication) + * + * @param a vector1 + * @param b vector2 + * @param dest v3 = (a[0] * b[0], a[1] * b[1], a[2] * b[2]) + */ +CGLM_INLINE +void +glm_vec3_mul(vec3 a, vec3 b, vec3 dest) { + dest[0] = a[0] * b[0]; + dest[1] = a[1] * b[1]; + dest[2] = a[2] * b[2]; +} + +/*! + * @brief multiply/scale vec3 vector with scalar: result = v * s + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec3_scale(vec3 v, float s, vec3 dest) { + dest[0] = v[0] * s; + dest[1] = v[1] * s; + dest[2] = v[2] * s; +} + +/*! 
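
/* --- Editor's sketch (not part of this commit): the three norms defined above
   on one concrete vector -- Euclidean, L1 (taxicab) and infinity (max).
   Include path assumed. --- */
#include "cglm/vec3.h"

static void vec3_norms_example(void)
{
  vec3 v = {1.0f, -2.0f, 2.0f};

  float l2   = glm_vec3_norm(v);       /* sqrt(1 + 4 + 4) = 3.0    */
  float l1   = glm_vec3_norm_one(v);   /* |1| + |-2| + |2| = 5.0   */
  float linf = glm_vec3_norm_inf(v);   /* max(1, 2, 2)     = 2.0   */
  (void)l2; (void)l1; (void)linf;
}
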
+ * @brief make vec3 vector scale as specified: result = unit(v) * s + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec3_scale_as(vec3 v, float s, vec3 dest) { + float norm; + norm = glm_vec3_norm(v); + + if (CGLM_UNLIKELY(norm < FLT_EPSILON)) { + glm_vec3_zero(dest); + return; + } + + glm_vec3_scale(v, s / norm, dest); +} + +/*! + * @brief div vector with another component-wise division: d = a / b + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest result = (a[0]/b[0], a[1]/b[1], a[2]/b[2]) + */ +CGLM_INLINE +void +glm_vec3_div(vec3 a, vec3 b, vec3 dest) { + dest[0] = a[0] / b[0]; + dest[1] = a[1] / b[1]; + dest[2] = a[2] / b[2]; +} + +/*! + * @brief div vector with scalar: d = v / s + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest result = (a[0]/s, a[1]/s, a[2]/s) + */ +CGLM_INLINE +void +glm_vec3_divs(vec3 v, float s, vec3 dest) { + dest[0] = v[0] / s; + dest[1] = v[1] / s; + dest[2] = v[2] / s; +} + +/*! + * @brief add two vectors and add result to sum + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest += (a + b) + */ +CGLM_INLINE +void +glm_vec3_addadd(vec3 a, vec3 b, vec3 dest) { + dest[0] += a[0] + b[0]; + dest[1] += a[1] + b[1]; + dest[2] += a[2] + b[2]; +} + +/*! + * @brief sub two vectors and add result to dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest += (a + b) + */ +CGLM_INLINE +void +glm_vec3_subadd(vec3 a, vec3 b, vec3 dest) { + dest[0] += a[0] - b[0]; + dest[1] += a[1] - b[1]; + dest[2] += a[2] - b[2]; +} + +/*! + * @brief mul two vectors and add result to dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest += (a * b) + */ +CGLM_INLINE +void +glm_vec3_muladd(vec3 a, vec3 b, vec3 dest) { + dest[0] += a[0] * b[0]; + dest[1] += a[1] * b[1]; + dest[2] += a[2] * b[2]; +} + +/*! + * @brief mul vector with scalar and add result to sum + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[out] dest dest += (a * b) + */ +CGLM_INLINE +void +glm_vec3_muladds(vec3 a, float s, vec3 dest) { + dest[0] += a[0] * s; + dest[1] += a[1] * s; + dest[2] += a[2] * s; +} + +/*! + * @brief add max of two vectors to result/dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest += max(a, b) + */ +CGLM_INLINE +void +glm_vec3_maxadd(vec3 a, vec3 b, vec3 dest) { + dest[0] += glm_max(a[0], b[0]); + dest[1] += glm_max(a[1], b[1]); + dest[2] += glm_max(a[2], b[2]); +} + +/*! + * @brief add min of two vectors to result/dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest += min(a, b) + */ +CGLM_INLINE +void +glm_vec3_minadd(vec3 a, vec3 b, vec3 dest) { + dest[0] += glm_min(a[0], b[0]); + dest[1] += glm_min(a[1], b[1]); + dest[2] += glm_min(a[2], b[2]); +} + +/*! 
+ * @brief sub two vectors and sub result to dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest -= (a - b) + */ +CGLM_INLINE +void +glm_vec3_subsub(vec3 a, vec3 b, vec3 dest) { + dest[0] -= a[0] - b[0]; + dest[1] -= a[1] - b[1]; + dest[2] -= a[2] - b[2]; +} + +/*! + * @brief add two vectors and sub result to dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest -= (a + b) + */ +CGLM_INLINE +void +glm_vec3_addsub(vec3 a, vec3 b, vec3 dest) { + dest[0] -= a[0] + b[0]; + dest[1] -= a[1] + b[1]; + dest[2] -= a[2] + b[2]; +} + +/*! + * @brief mul two vectors and sub result to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest -= (a * b) + */ +CGLM_INLINE +void +glm_vec3_mulsub(vec3 a, vec3 b, vec3 dest) { + dest[0] -= a[0] * b[0]; + dest[1] -= a[1] * b[1]; + dest[2] -= a[2] * b[2]; +} + +/*! + * @brief mul vector with scalar and sub result to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[out] dest dest -= (a * b) + */ +CGLM_INLINE +void +glm_vec3_mulsubs(vec3 a, float s, vec3 dest) { + dest[0] -= a[0] * s; + dest[1] -= a[1] * s; + dest[2] -= a[2] * s; +} + +/*! + * @brief sub max of two vectors to result/dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest -= max(a, b) + */ +CGLM_INLINE +void +glm_vec3_maxsub(vec3 a, vec3 b, vec3 dest) { + dest[0] -= glm_max(a[0], b[0]); + dest[1] -= glm_max(a[1], b[1]); + dest[2] -= glm_max(a[2], b[2]); +} + +/*! + * @brief sub min of two vectors to result/dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest -= min(a, b) + */ +CGLM_INLINE +void +glm_vec3_minsub(vec3 a, vec3 b, vec3 dest) { + dest[0] -= glm_min(a[0], b[0]); + dest[1] -= glm_min(a[1], b[1]); + dest[2] -= glm_min(a[2], b[2]); +} + +/*! + * @brief negate vector components and store result in dest + * + * @param[in] v vector + * @param[out] dest result vector + */ +CGLM_INLINE +void +glm_vec3_negate_to(vec3 v, vec3 dest) { + dest[0] = -v[0]; + dest[1] = -v[1]; + dest[2] = -v[2]; +} + +/*! + * @brief negate vector components + * + * @param[in, out] v vector + */ +CGLM_INLINE +void +glm_vec3_negate(vec3 v) { + glm_vec3_negate_to(v, v); +} + +/*! + * @brief normalize vec3 and store result in same vec + * + * @param[in, out] v vector + */ +CGLM_INLINE +void +glm_vec3_normalize(vec3 v) { + float norm; + + norm = glm_vec3_norm(v); + + if (CGLM_UNLIKELY(norm < FLT_EPSILON)) { + v[0] = v[1] = v[2] = 0.0f; + return; + } + + glm_vec3_scale(v, 1.0f / norm, v); +} + +/*! + * @brief normalize vec3 to dest + * + * @param[in] v source + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec3_normalize_to(vec3 v, vec3 dest) { + float norm; + + norm = glm_vec3_norm(v); + + if (CGLM_UNLIKELY(norm < FLT_EPSILON)) { + glm_vec3_zero(dest); + return; + } + + glm_vec3_scale(v, 1.0f / norm, dest); +} + +/*! 
+ * @brief cross product of two vector (RH) + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec3_cross(vec3 a, vec3 b, vec3 dest) { + vec3 c; + /* (u2.v3 - u3.v2, u3.v1 - u1.v3, u1.v2 - u2.v1) */ + c[0] = a[1] * b[2] - a[2] * b[1]; + c[1] = a[2] * b[0] - a[0] * b[2]; + c[2] = a[0] * b[1] - a[1] * b[0]; + glm_vec3_copy(c, dest); +} + +/*! + * @brief cross product of two vector (RH) and normalize the result + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec3_crossn(vec3 a, vec3 b, vec3 dest) { + glm_vec3_cross(a, b, dest); + glm_vec3_normalize(dest); +} + +/*! + * @brief angle between two vector + * + * @param[in] a vector1 + * @param[in] b vector2 + * + * @return angle as radians + */ +CGLM_INLINE +float +glm_vec3_angle(vec3 a, vec3 b) { + float norm, dot; + + /* maybe compiler generate approximation instruction (rcp) */ + norm = 1.0f / (glm_vec3_norm(a) * glm_vec3_norm(b)); + dot = glm_vec3_dot(a, b) * norm; + + if (dot > 1.0f) + return 0.0f; + else if (dot < -1.0f) + return CGLM_PI; + + return acosf(dot); +} + +/*! + * @brief rotate vec3 around axis by angle using Rodrigues' rotation formula + * + * @param[in, out] v vector + * @param[in] axis axis vector (must be unit vector) + * @param[in] angle angle by radians + */ +CGLM_INLINE +void +glm_vec3_rotate(vec3 v, float angle, vec3 axis) { + vec3 v1, v2, k; + float c, s; + + c = cosf(angle); + s = sinf(angle); + + glm_vec3_normalize_to(axis, k); + + /* Right Hand, Rodrigues' rotation formula: + v = v*cos(t) + (kxv)sin(t) + k*(k.v)(1 - cos(t)) + */ + glm_vec3_scale(v, c, v1); + + glm_vec3_cross(k, v, v2); + glm_vec3_scale(v2, s, v2); + + glm_vec3_add(v1, v2, v1); + + glm_vec3_scale(k, glm_vec3_dot(k, v) * (1.0f - c), v2); + glm_vec3_add(v1, v2, v); +} + +/*! + * @brief apply rotation matrix to vector + * + * matrix format should be (no perspective): + * a b c x + * e f g y + * i j k z + * 0 0 0 w + * + * @param[in] m affine matrix or rot matrix + * @param[in] v vector + * @param[out] dest rotated vector + */ +CGLM_INLINE +void +glm_vec3_rotate_m4(mat4 m, vec3 v, vec3 dest) { + vec4 x, y, z, res; + + glm_vec4_normalize_to(m[0], x); + glm_vec4_normalize_to(m[1], y); + glm_vec4_normalize_to(m[2], z); + + glm_vec4_scale(x, v[0], res); + glm_vec4_muladds(y, v[1], res); + glm_vec4_muladds(z, v[2], res); + + glm_vec3(res, dest); +} + +/*! + * @brief apply rotation matrix to vector + * + * @param[in] m affine matrix or rot matrix + * @param[in] v vector + * @param[out] dest rotated vector + */ +CGLM_INLINE +void +glm_vec3_rotate_m3(mat3 m, vec3 v, vec3 dest) { + vec4 res, x, y, z; + + glm_vec4(m[0], 0.0f, x); + glm_vec4(m[1], 0.0f, y); + glm_vec4(m[2], 0.0f, z); + + glm_vec4_normalize(x); + glm_vec4_normalize(y); + glm_vec4_normalize(z); + + glm_vec4_scale(x, v[0], res); + glm_vec4_muladds(y, v[1], res); + glm_vec4_muladds(z, v[2], res); + + glm_vec3(res, dest); +} + +/*! 
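
/* --- Editor's sketch (not part of this commit): right-handed cross product and
   angle of the X/Y basis vectors, then the Rodrigues rotation above applied to
   +X for a quarter turn around +Z (the axis must be a unit vector, and GLM_ZUP
   is one). Include path assumed. --- */
#include "cglm/vec3.h"

static void vec3_rotation_example(void)
{
  vec3 x = {1.0f, 0.0f, 0.0f};
  vec3 y = {0.0f, 1.0f, 0.0f};
  vec3 z;

  glm_vec3_cross(x, y, z);                 /* z = {0, 0, 1}             */
  float a = glm_vec3_angle(x, y);          /* ~= GLM_PI_2f              */

  glm_vec3_rotate(x, GLM_PI_2f, GLM_ZUP);  /* in place: x ~= {0, 1, 0}  */
  (void)a;
}
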
+ * @brief project a vector onto b vector + * + * @param[in] a vector1 + * @param[in] b vector2 + * @param[out] dest projected vector + */ +CGLM_INLINE +void +glm_vec3_proj(vec3 a, vec3 b, vec3 dest) { + glm_vec3_scale(b, + glm_vec3_dot(a, b) / glm_vec3_norm2(b), + dest); +} + +/** + * @brief find center point of two vector + * + * @param[in] a vector1 + * @param[in] b vector2 + * @param[out] dest center point + */ +CGLM_INLINE +void +glm_vec3_center(vec3 a, vec3 b, vec3 dest) { + glm_vec3_add(a, b, dest); + glm_vec3_scale(dest, 0.5f, dest); +} + +/** + * @brief squared distance between two vectors + * + * @param[in] a vector1 + * @param[in] b vector2 + * @return returns squared distance (distance * distance) + */ +CGLM_INLINE +float +glm_vec3_distance2(vec3 a, vec3 b) { + return glm_pow2(a[0] - b[0]) + + glm_pow2(a[1] - b[1]) + + glm_pow2(a[2] - b[2]); +} + +/** + * @brief distance between two vectors + * + * @param[in] a vector1 + * @param[in] b vector2 + * @return returns distance + */ +CGLM_INLINE +float +glm_vec3_distance(vec3 a, vec3 b) { + return sqrtf(glm_vec3_distance2(a, b)); +} + +/*! + * @brief max values of vectors + * + * @param[in] a vector1 + * @param[in] b vector2 + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec3_maxv(vec3 a, vec3 b, vec3 dest) { + dest[0] = glm_max(a[0], b[0]); + dest[1] = glm_max(a[1], b[1]); + dest[2] = glm_max(a[2], b[2]); +} + +/*! + * @brief min values of vectors + * + * @param[in] a vector1 + * @param[in] b vector2 + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec3_minv(vec3 a, vec3 b, vec3 dest) { + dest[0] = glm_min(a[0], b[0]); + dest[1] = glm_min(a[1], b[1]); + dest[2] = glm_min(a[2], b[2]); +} + +/*! + * @brief possible orthogonal/perpendicular vector + * + * @param[in] v vector + * @param[out] dest orthogonal/perpendicular vector + */ +CGLM_INLINE +void +glm_vec3_ortho(vec3 v, vec3 dest) { + float ignore; + float f = modff(fabsf(v[0]) + 0.5f, &ignore); + vec3 result = {-v[1], v[0] - f * v[2], f * v[1]}; + glm_vec3_copy(result, dest); +} + +/*! + * @brief clamp vector's individual members between min and max values + * + * @param[in, out] v vector + * @param[in] minVal minimum value + * @param[in] maxVal maximum value + */ +CGLM_INLINE +void +glm_vec3_clamp(vec3 v, float minVal, float maxVal) { + v[0] = glm_clamp(v[0], minVal, maxVal); + v[1] = glm_clamp(v[1], minVal, maxVal); + v[2] = glm_clamp(v[2], minVal, maxVal); +} + +/*! + * @brief linear interpolation between two vectors + * + * formula: from + s * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec3_lerp(vec3 from, vec3 to, float t, vec3 dest) { + vec3 s, v; + + /* from + s * (to - from) */ + glm_vec3_broadcast(t, s); + glm_vec3_sub(to, from, v); + glm_vec3_mul(s, v, v); + glm_vec3_add(from, v, dest); +} + +/*! + * @brief linear interpolation between two vectors (clamped) + * + * formula: from + s * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) clamped between 0 and 1 + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec3_lerpc(vec3 from, vec3 to, float t, vec3 dest) { + glm_vec3_lerp(from, to, glm_clamp_zo(t), dest); +} + +/*! 
+ * @brief linear interpolation between two vectors + * + * formula: from + s * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec3_mix(vec3 from, vec3 to, float t, vec3 dest) { + glm_vec3_lerp(from, to, t, dest); +} + +/*! + * @brief linear interpolation between two vectors (clamped) + * + * formula: from + s * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) clamped between 0 and 1 + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec3_mixc(vec3 from, vec3 to, float t, vec3 dest) { + glm_vec3_lerpc(from, to, t, dest); +} + +/*! + * @brief threshold function + * + * @param[in] edge threshold + * @param[in] x value to test against threshold + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec3_step(vec3 edge, vec3 x, vec3 dest) { + dest[0] = glm_step(edge[0], x[0]); + dest[1] = glm_step(edge[1], x[1]); + dest[2] = glm_step(edge[2], x[2]); +} + +/*! + * @brief threshold function with a smooth transition (unidimensional) + * + * @param[in] edge0 low threshold + * @param[in] edge1 high threshold + * @param[in] x value to test against threshold + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec3_smoothstep_uni(float edge0, float edge1, vec3 x, vec3 dest) { + dest[0] = glm_smoothstep(edge0, edge1, x[0]); + dest[1] = glm_smoothstep(edge0, edge1, x[1]); + dest[2] = glm_smoothstep(edge0, edge1, x[2]); +} + +/*! + * @brief threshold function with a smooth transition + * + * @param[in] edge0 low threshold + * @param[in] edge1 high threshold + * @param[in] x value to test against threshold + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec3_smoothstep(vec3 edge0, vec3 edge1, vec3 x, vec3 dest) { + dest[0] = glm_smoothstep(edge0[0], edge1[0], x[0]); + dest[1] = glm_smoothstep(edge0[1], edge1[1], x[1]); + dest[2] = glm_smoothstep(edge0[2], edge1[2], x[2]); +} + +/*! + * @brief smooth Hermite interpolation between two vectors + * + * formula: from + s * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec3_smoothinterp(vec3 from, vec3 to, float t, vec3 dest) { + vec3 s, v; + + /* from + s * (to - from) */ + glm_vec3_broadcast(glm_smooth(t), s); + glm_vec3_sub(to, from, v); + glm_vec3_mul(s, v, v); + glm_vec3_add(from, v, dest); +} + +/*! + * @brief smooth Hermite interpolation between two vectors (clamped) + * + * formula: from + s * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) clamped between 0 and 1 + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec3_smoothinterpc(vec3 from, vec3 to, float t, vec3 dest) { + glm_vec3_smoothinterp(from, to, glm_clamp_zo(t), dest); +} + +/*! + * @brief swizzle vector components + * + * you can use existing masks e.g. GLM_XXX, GLM_ZYX + * + * @param[in] v source + * @param[in] mask mask + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec3_swizzle(vec3 v, int mask, vec3 dest) { + vec3 t; + + t[0] = v[(mask & (3 << 0))]; + t[1] = v[(mask & (3 << 2)) >> 2]; + t[2] = v[(mask & (3 << 4)) >> 4]; + + glm_vec3_copy(t, dest); +} + +/*! 
+ * @brief vec3 cross product + * + * this is just convenient wrapper + * + * @param[in] a source 1 + * @param[in] b source 2 + * @param[out] d destination + */ +CGLM_INLINE +void +glm_cross(vec3 a, vec3 b, vec3 d) { + glm_vec3_cross(a, b, d); +} + +/*! + * @brief vec3 dot product + * + * this is just convenient wrapper + * + * @param[in] a vector1 + * @param[in] b vector2 + * + * @return dot product + */ +CGLM_INLINE +float +glm_dot(vec3 a, vec3 b) { + return glm_vec3_dot(a, b); +} + +/*! + * @brief normalize vec3 and store result in same vec + * + * this is just convenient wrapper + * + * @param[in, out] v vector + */ +CGLM_INLINE +void +glm_normalize(vec3 v) { + glm_vec3_normalize(v); +} + +/*! + * @brief normalize vec3 to dest + * + * this is just convenient wrapper + * + * @param[in] v source + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_normalize_to(vec3 v, vec3 dest) { + glm_vec3_normalize_to(v, dest); +} + +/*! + * @brief Create three dimensional vector from pointer + * + * @param[in] src pointer to an array of floats + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec3_make(const float * __restrict src, vec3 dest) { + dest[0] = src[0]; + dest[1] = src[1]; + dest[2] = src[2]; +} + +/*! + * @brief a vector pointing in the same direction as another + * + * orients a vector to point away from a surface as defined by its normal + * + * @param[in] n vector to orient + * @param[in] v incident vector + * @param[in] nref reference vector + * @param[out] dest oriented vector, pointing away from the surface + */ +CGLM_INLINE +void +glm_vec3_faceforward(vec3 n, vec3 v, vec3 nref, vec3 dest) { + if (glm_vec3_dot(v, nref) < 0.0f) { + /* N is facing away from I */ + glm_vec3_copy(n, dest); + } else { + /* N is facing towards I, negate it */ + glm_vec3_negate_to(n, dest); + } +} + +/*! + * @brief reflection vector using an incident ray and a surface normal + * + * @param[in] v incident vector + * @param[in] n normalized normal vector + * @param[out] dest reflection result + */ +CGLM_INLINE +void +glm_vec3_reflect(vec3 v, vec3 n, vec3 dest) { + vec3 temp; + glm_vec3_scale(n, 2.0f * glm_vec3_dot(v, n), temp); + glm_vec3_sub(v, temp, dest); +} + +/*! + * @brief computes refraction vector for an incident vector and a surface normal. + * + * calculates the refraction vector based on Snell's law. If total internal reflection + * occurs (angle too great given eta), dest is set to zero and returns false. + * Otherwise, computes refraction vector, stores it in dest, and returns true. + * + * @param[in] v normalized incident vector + * @param[in] n normalized normal vector + * @param[in] eta ratio of indices of refraction (incident/transmitted) + * @param[out] dest refraction vector if refraction occurs; zero vector otherwise + * + * @returns true if refraction occurs; false if total internal reflection occurs. + */ +CGLM_INLINE +bool +glm_vec3_refract(vec3 v, vec3 n, float eta, vec3 dest) { + float ndi, eni, k; + + ndi = glm_vec3_dot(n, v); + eni = eta * ndi; + k = 1.0f - eta * eta + eni * eni; + + if (k < 0.0f) { + glm_vec3_zero(dest); + return false; + } + + glm_vec3_scale(v, eta, dest); + glm_vec3_mulsubs(n, eni + sqrtf(k), dest); + return true; +} + +#endif /* cglm_vec3_h */ diff --git a/external/cglm/vec4-ext.h b/external/cglm/vec4-ext.h new file mode 100644 index 0000000..193a5e9 --- /dev/null +++ b/external/cglm/vec4-ext.h @@ -0,0 +1,400 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/*! + * @brief SIMD like functions + */ + +/* + Functions: + CGLM_INLINE void glm_vec4_broadcast(float val, vec4 d); + CGLM_INLINE void glm_vec4_fill(vec4 v, float val); + CGLM_INLINE bool glm_vec4_eq(vec4 v, float val); + CGLM_INLINE bool glm_vec4_eq_eps(vec4 v, float val); + CGLM_INLINE bool glm_vec4_eq_all(vec4 v); + CGLM_INLINE bool glm_vec4_eqv(vec4 a, vec4 b); + CGLM_INLINE bool glm_vec4_eqv_eps(vec4 a, vec4 b); + CGLM_INLINE float glm_vec4_max(vec4 v); + CGLM_INLINE float glm_vec4_min(vec4 v); + CGLM_INLINE bool glm_vec4_isnan(vec4 v); + CGLM_INLINE bool glm_vec4_isinf(vec4 v); + CGLM_INLINE bool glm_vec4_isvalid(vec4 v); + CGLM_INLINE void glm_vec4_sign(vec4 v, vec4 dest); + CGLM_INLINE void glm_vec4_abs(vec4 v, vec4 dest); + CGLM_INLINE void glm_vec4_fract(vec4 v, vec4 dest); + CGLM_INLINE void glm_vec4_floor(vec4 v, vec4 dest); + CGLM_INLINE float glm_vec4_mods(vec4 v, float s, vec4 dest); + CGLM_INLINE float glm_vec4_steps(float edge, vec4 v, vec4 dest); + CGLM_INLINE void glm_vec4_stepr(vec4 edge, float v, vec4 dest); + CGLM_INLINE float glm_vec4_hadd(vec4 v); + CGLM_INLINE void glm_vec4_sqrt(vec4 v, vec4 dest); + */ + +#ifndef cglm_vec4_ext_h +#define cglm_vec4_ext_h + +#include "common.h" +#include "vec3-ext.h" + +/*! + * @brief fill a vector with specified value + * + * @param val value + * @param d dest + */ +CGLM_INLINE +void +glm_vec4_broadcast(float val, vec4 d) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_store(d, wasm_f32x4_splat(val)); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glmm_store(d, glmm_set1(val)); +#else + d[0] = d[1] = d[2] = d[3] = val; +#endif +} + +/*! + * @brief fill a vector with specified value + * + * @param v dest + * @param val value + */ +CGLM_INLINE +void +glm_vec4_fill(vec4 v, float val) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_store(v, wasm_f32x4_splat(val)); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glmm_store(v, glmm_set1(val)); +#else + v[0] = v[1] = v[2] = v[3] = val; +#endif +} + +/*! + * @brief check if vector is equal to value (without epsilon) + * + * @param v vector + * @param val value + */ +CGLM_INLINE +bool +glm_vec4_eq(vec4 v, float val) { + return v[0] == val + && v[0] == v[1] + && v[0] == v[2] + && v[0] == v[3]; +} + +/*! + * @brief check if vector is equal to value (with epsilon) + * + * @param v vector + * @param val value + */ +CGLM_INLINE +bool +glm_vec4_eq_eps(vec4 v, float val) { + return fabsf(v[0] - val) <= GLM_FLT_EPSILON + && fabsf(v[1] - val) <= GLM_FLT_EPSILON + && fabsf(v[2] - val) <= GLM_FLT_EPSILON + && fabsf(v[3] - val) <= GLM_FLT_EPSILON; +} + +/*! + * @brief check if vector members are equal (without epsilon) + * + * @param v vector + */ +CGLM_INLINE +bool +glm_vec4_eq_all(vec4 v) { + return glm_vec4_eq_eps(v, v[0]); +} + +/*! + * @brief check if vector is equal to another (without epsilon) + * + * @param a vector + * @param b vector + */ +CGLM_INLINE +bool +glm_vec4_eqv(vec4 a, vec4 b) { + return a[0] == b[0] + && a[1] == b[1] + && a[2] == b[2] + && a[3] == b[3]; +} + +/*! + * @brief check if vector is equal to another (with epsilon) + * + * @param a vector + * @param b vector + */ +CGLM_INLINE +bool +glm_vec4_eqv_eps(vec4 a, vec4 b) { + return fabsf(a[0] - b[0]) <= GLM_FLT_EPSILON + && fabsf(a[1] - b[1]) <= GLM_FLT_EPSILON + && fabsf(a[2] - b[2]) <= GLM_FLT_EPSILON + && fabsf(a[3] - b[3]) <= GLM_FLT_EPSILON; +} + +/*! 
+ * @brief max value of vector + * + * @param v vector + */ +CGLM_INLINE +float +glm_vec4_max(vec4 v) { + float max; + + max = glm_vec3_max(v); + if (v[3] > max) + max = v[3]; + + return max; +} + +/*! + * @brief min value of vector + * + * @param v vector + */ +CGLM_INLINE +float +glm_vec4_min(vec4 v) { + float min; + + min = glm_vec3_min(v); + if (v[3] < min) + min = v[3]; + + return min; +} + +/*! + * @brief check if one of items is NaN (not a number) + * you should only use this in DEBUG mode or very critical asserts + * + * @param[in] v vector + */ +CGLM_INLINE +bool +glm_vec4_isnan(vec4 v) { +#ifndef CGLM_FAST_MATH + return isnan(v[0]) || isnan(v[1]) || isnan(v[2]) || isnan(v[3]); +#else + return false; +#endif +} + +/*! + * @brief check if one of items is INFINITY + * you should only use this in DEBUG mode or very critical asserts + * + * @param[in] v vector + */ +CGLM_INLINE +bool +glm_vec4_isinf(vec4 v) { +#ifndef CGLM_FAST_MATH + return isinf(v[0]) || isinf(v[1]) || isinf(v[2]) || isinf(v[3]); +#else + return false; +#endif +} + +/*! + * @brief check if all items are valid number + * you should only use this in DEBUG mode or very critical asserts + * + * @param[in] v vector + */ +CGLM_INLINE +bool +glm_vec4_isvalid(vec4 v) { + return !glm_vec4_isnan(v) && !glm_vec4_isinf(v); +} + +/*! + * @brief get sign of 32 bit float as +1, -1, 0 + * + * Important: It returns 0 for zero/NaN input + * + * @param v vector + */ +CGLM_INLINE +void +glm_vec4_sign(vec4 v, vec4 dest) { +#if defined( __SSE__ ) || defined( __SSE2__ ) + __m128 x0, x1, x2, x3, x4; + + x0 = glmm_load(v); + x1 = _mm_set_ps(0.0f, 0.0f, 1.0f, -1.0f); + x2 = glmm_splat(x1, 2); + + x3 = _mm_and_ps(_mm_cmpgt_ps(x0, x2), glmm_splat(x1, 1)); + x4 = _mm_and_ps(_mm_cmplt_ps(x0, x2), glmm_splat(x1, 0)); + + glmm_store(dest, _mm_or_ps(x3, x4)); +#else + dest[0] = glm_signf(v[0]); + dest[1] = glm_signf(v[1]); + dest[2] = glm_signf(v[2]); + dest[3] = glm_signf(v[3]); +#endif +} + +/*! + * @brief absolute value of each vector item + * + * @param[in] v vector + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec4_abs(vec4 v, vec4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_store(dest, glmm_abs(glmm_load(v))); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glmm_store(dest, glmm_abs(glmm_load(v))); +#elif defined(CGLM_NEON_FP) + vst1q_f32(dest, vabsq_f32(vld1q_f32(v))); +#else + dest[0] = fabsf(v[0]); + dest[1] = fabsf(v[1]); + dest[2] = fabsf(v[2]); + dest[3] = fabsf(v[3]); +#endif +} + +/*! + * @brief fractional part of each vector item + * + * @param[in] v vector + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec4_fract(vec4 v, vec4 dest) { + dest[0] = fminf(v[0] - floorf(v[0]), 0.999999940395355224609375f); + dest[1] = fminf(v[1] - floorf(v[1]), 0.999999940395355224609375f); + dest[2] = fminf(v[2] - floorf(v[2]), 0.999999940395355224609375f); + dest[3] = fminf(v[3] - floorf(v[3]), 0.999999940395355224609375f); +} + +/*! + * @brief floor of each vector item + * + * @param[in] v vector + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec4_floor(vec4 v, vec4 dest) { + dest[0] = floorf(v[0]); + dest[1] = floorf(v[1]); + dest[2] = floorf(v[2]); + dest[3] = floorf(v[3]); +} + +/*! 
+ * @brief mod of each vector item, result is written to dest (dest = v % s) + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec4_mods(vec4 v, float s, vec4 dest) { + dest[0] = fmodf(v[0], s); + dest[1] = fmodf(v[1], s); + dest[2] = fmodf(v[2], s); + dest[3] = fmodf(v[3], s); +} + +/*! + * @brief threshold each vector item with scalar + * condition is: (x[i] < edge) ? 0.0 : 1.0 + * + * @param[in] edge threshold + * @param[in] x vector to test against threshold + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec4_steps(float edge, vec4 x, vec4 dest) { + dest[0] = glm_step(edge, x[0]); + dest[1] = glm_step(edge, x[1]); + dest[2] = glm_step(edge, x[2]); + dest[3] = glm_step(edge, x[3]); +} + +/*! + * @brief threshold a value with *vector* as the threshold + * condition is: (x < edge[i]) ? 0.0 : 1.0 + * + * @param[in] edge threshold vector + * @param[in] x value to test against threshold + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec4_stepr(vec4 edge, float x, vec4 dest) { + dest[0] = glm_step(edge[0], x); + dest[1] = glm_step(edge[1], x); + dest[2] = glm_step(edge[2], x); + dest[3] = glm_step(edge[3], x); +} + +/*! + * @brief vector reduction by summation + * @warning could overflow + * + * @param[in] v vector + * @return sum of all vector's elements + */ +CGLM_INLINE +float +glm_vec4_hadd(vec4 v) { +#if defined(__wasm__) && defined(__wasm_simd128__) + return glmm_hadd(glmm_load(v)); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + return glmm_hadd(glmm_load(v)); +#else + return v[0] + v[1] + v[2] + v[3]; +#endif +} + +/*! + * @brief square root of each vector item + * + * @param[in] v vector + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec4_sqrt(vec4 v, vec4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_store(dest, wasm_f32x4_sqrt(glmm_load(v))); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glmm_store(dest, _mm_sqrt_ps(glmm_load(v))); +#else + dest[0] = sqrtf(v[0]); + dest[1] = sqrtf(v[1]); + dest[2] = sqrtf(v[2]); + dest[3] = sqrtf(v[3]); +#endif +} + +#endif /* cglm_vec4_ext_h */ diff --git a/external/cglm/vec4.h b/external/cglm/vec4.h new file mode 100644 index 0000000..ded09c9 --- /dev/null +++ b/external/cglm/vec4.h @@ -0,0 +1,1348 @@ +/* + * Copyright (c), Recep Aslantas. 
+ * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +/* + Macros: + GLM_VEC4_ONE_INIT + GLM_VEC4_BLACK_INIT + GLM_VEC4_ZERO_INIT + GLM_VEC4_ONE + GLM_VEC4_BLACK + GLM_VEC4_ZERO + + Functions: + CGLM_INLINE void glm_vec4(vec3 v3, float last, vec4 dest); + CGLM_INLINE void glm_vec4_copy3(vec4 a, vec3 dest); + CGLM_INLINE void glm_vec4_copy(vec4 v, vec4 dest); + CGLM_INLINE void glm_vec4_ucopy(vec4 v, vec4 dest); + CGLM_INLINE float glm_vec4_dot(vec4 a, vec4 b); + CGLM_INLINE float glm_vec4_norm2(vec4 v); + CGLM_INLINE float glm_vec4_norm(vec4 v); + CGLM_INLINE float glm_vec4_norm_one(vec4 v); + CGLM_INLINE float glm_vec4_norm_inf(vec4 v); + CGLM_INLINE void glm_vec4_add(vec4 a, vec4 b, vec4 dest); + CGLM_INLINE void glm_vec4_adds(vec4 v, float s, vec4 dest); + CGLM_INLINE void glm_vec4_sub(vec4 a, vec4 b, vec4 dest); + CGLM_INLINE void glm_vec4_subs(vec4 v, float s, vec4 dest); + CGLM_INLINE void glm_vec4_mul(vec4 a, vec4 b, vec4 dest); + CGLM_INLINE void glm_vec4_scale(vec4 v, float s, vec4 dest); + CGLM_INLINE void glm_vec4_scale_as(vec4 v, float s, vec4 dest); + CGLM_INLINE void glm_vec4_div(vec4 a, vec4 b, vec4 dest); + CGLM_INLINE void glm_vec4_divs(vec4 v, float s, vec4 dest); + CGLM_INLINE void glm_vec4_addadd(vec4 a, vec4 b, vec4 dest); + CGLM_INLINE void glm_vec4_subadd(vec4 a, vec4 b, vec4 dest); + CGLM_INLINE void glm_vec4_muladd(vec4 a, vec4 b, vec4 dest); + CGLM_INLINE void glm_vec4_muladds(vec4 a, float s, vec4 dest); + CGLM_INLINE void glm_vec4_maxadd(vec4 a, vec4 b, vec4 dest); + CGLM_INLINE void glm_vec4_minadd(vec4 a, vec4 b, vec4 dest); + CGLM_INLINE void glm_vec4_subsub(vec4 a, vec4 b, vec4 dest); + CGLM_INLINE void glm_vec4_addsub(vec4 a, vec4 b, vec4 dest); + CGLM_INLINE void glm_vec4_mulsub(vec4 a, vec4 b, vec4 dest); + CGLM_INLINE void glm_vec4_mulsubs(vec4 a, float s, vec4 dest); + CGLM_INLINE void glm_vec4_maxsub(vec4 a, vec4 b, vec4 dest); + CGLM_INLINE void glm_vec4_minsub(vec4 a, vec4 b, vec4 dest); + CGLM_INLINE void glm_vec4_negate(vec4 v); + CGLM_INLINE void glm_vec4_inv(vec4 v); + CGLM_INLINE void glm_vec4_inv_to(vec4 v, vec4 dest); + CGLM_INLINE void glm_vec4_normalize(vec4 v); + CGLM_INLINE void glm_vec4_normalize_to(vec4 vec, vec4 dest); + CGLM_INLINE float glm_vec4_distance(vec4 a, vec4 b); + CGLM_INLINE float glm_vec4_distance2(vec4 a, vec4 b); + CGLM_INLINE void glm_vec4_maxv(vec4 a, vec4 b, vec4 dest); + CGLM_INLINE void glm_vec4_minv(vec4 a, vec4 b, vec4 dest); + CGLM_INLINE void glm_vec4_clamp(vec4 v, float minVal, float maxVal); + CGLM_INLINE void glm_vec4_lerp(vec4 from, vec4 to, float t, vec4 dest); + CGLM_INLINE void glm_vec4_lerpc(vec4 from, vec4 to, float t, vec4 dest); + CGLM_INLINE void glm_vec4_step(vec4 edge, vec4 x, vec4 dest); + CGLM_INLINE void glm_vec4_smoothstep_uni(float edge0, float edge1, vec4 x, vec4 dest); + CGLM_INLINE void glm_vec4_smoothstep(vec4 edge0, vec4 edge1, vec4 x, vec4 dest); + CGLM_INLINE void glm_vec4_smoothinterp(vec4 from, vec4 to, float t, vec4 dest); + CGLM_INLINE void glm_vec4_smoothinterpc(vec4 from, vec4 to, float t, vec4 dest); + CGLM_INLINE void glm_vec4_swizzle(vec4 v, int mask, vec4 dest); + CGLM_INLINE void glm_vec4_make(float * restrict src, vec4 dest); + CGLM_INLINE void glm_vec4_reflect(vec4 v, vec4 n, vec4 dest); + CGLM_INLINE void glm_vec4_refract(vec4 v, vec4 n, float eta, vec4 dest); + + DEPRECATED: + glm_vec4_dup + glm_vec4_flipsign + glm_vec4_flipsign_to + glm_vec4_inv + glm_vec4_inv_to + glm_vec4_mulv + glm_vec4_step_uni --> use 
glm_vec4_steps + */ + +#ifndef cglm_vec4_h +#define cglm_vec4_h + +#include "common.h" +#include "vec4-ext.h" +#include "util.h" + +/* DEPRECATED! functions */ +#define glm_vec4_dup3(v, dest) glm_vec4_copy3(v, dest) +#define glm_vec4_dup(v, dest) glm_vec4_copy(v, dest) +#define glm_vec4_flipsign(v) glm_vec4_negate(v) +#define glm_vec4_flipsign_to(v, dest) glm_vec4_negate_to(v, dest) +#define glm_vec4_inv(v) glm_vec4_negate(v) +#define glm_vec4_inv_to(v, dest) glm_vec4_negate_to(v, dest) +#define glm_vec4_mulv(a, b, d) glm_vec4_mul(a, b, d) +#define glm_vec4_step_uni(edge, x, dest) glm_vec4_steps(edge, x, dest) + +#define GLM_VEC4_ONE_INIT {1.0f, 1.0f, 1.0f, 1.0f} +#define GLM_VEC4_BLACK_INIT {0.0f, 0.0f, 0.0f, 1.0f} +#define GLM_VEC4_ZERO_INIT {0.0f, 0.0f, 0.0f, 0.0f} + +#define GLM_VEC4_ONE ((vec4)GLM_VEC4_ONE_INIT) +#define GLM_VEC4_BLACK ((vec4)GLM_VEC4_BLACK_INIT) +#define GLM_VEC4_ZERO ((vec4)GLM_VEC4_ZERO_INIT) + +#define GLM_XXXX GLM_SHUFFLE4(0, 0, 0, 0) +#define GLM_YYYY GLM_SHUFFLE4(1, 1, 1, 1) +#define GLM_ZZZZ GLM_SHUFFLE4(2, 2, 2, 2) +#define GLM_WWWW GLM_SHUFFLE4(3, 3, 3, 3) +#define GLM_WZYX GLM_SHUFFLE4(0, 1, 2, 3) + +/*! + * @brief init vec4 using vec3 + * + * @param[in] v3 vector3 + * @param[in] last last item + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec4(vec3 v3, float last, vec4 dest) { + dest[0] = v3[0]; + dest[1] = v3[1]; + dest[2] = v3[2]; + dest[3] = last; +} + +/*! + * @brief copy first 3 members of [a] to [dest] + * + * @param[in] a source + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec4_copy3(vec4 a, vec3 dest) { + dest[0] = a[0]; + dest[1] = a[1]; + dest[2] = a[2]; +} + +/*! + * @brief copy all members of [a] to [dest] + * + * @param[in] v source + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec4_copy(vec4 v, vec4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_store(dest, glmm_load(v)); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glmm_store(dest, glmm_load(v)); +#elif defined(CGLM_NEON_FP) + vst1q_f32(dest, vld1q_f32(v)); +#else + dest[0] = v[0]; + dest[1] = v[1]; + dest[2] = v[2]; + dest[3] = v[3]; +#endif +} + +/*! + * @brief copy all members of [a] to [dest] + * + * alignment is not required + * + * @param[in] v source + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec4_ucopy(vec4 v, vec4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + /* note here wasm v128.load/v128.store support unaligned loads and stores */ + wasm_v128_store(dest, wasm_v128_load(v)); +#else + dest[0] = v[0]; + dest[1] = v[1]; + dest[2] = v[2]; + dest[3] = v[3]; +#endif +} + +/*! + * @brief make vector zero + * + * @param[in, out] v vector + */ +CGLM_INLINE +void +glm_vec4_zero(vec4 v) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_store(v, wasm_f32x4_const_splat(0.f)); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glmm_store(v, _mm_setzero_ps()); +#elif defined(CGLM_NEON_FP) + vst1q_f32(v, vdupq_n_f32(0.0f)); +#else + v[0] = 0.0f; + v[1] = 0.0f; + v[2] = 0.0f; + v[3] = 0.0f; +#endif +} + +/*! + * @brief make vector one + * + * @param[in, out] v vector + */ +CGLM_INLINE +void +glm_vec4_one(vec4 v) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_store(v, wasm_f32x4_const_splat(1.0f)); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glmm_store(v, glmm_set1_rval(1.0f)); +#elif defined(CGLM_NEON_FP) + vst1q_f32(v, vdupq_n_f32(1.0f)); +#else + v[0] = 1.0f; + v[1] = 1.0f; + v[2] = 1.0f; + v[3] = 1.0f; +#endif +} + +/*! 
+ * @brief vec4 dot product + * + * @param[in] a vector1 + * @param[in] b vector2 + * + * @return dot product + */ +CGLM_INLINE +float +glm_vec4_dot(vec4 a, vec4 b) { +#if defined(CGLM_SIMD) + return glmm_dot(glmm_load(a), glmm_load(b)); +#else + return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3]; +#endif +} + +/*! + * @brief norm * norm (magnitude) of vec + * + * we can use this func instead of calling norm * norm, because it would call + * sqrtf function twice but with this func we can avoid func call, maybe this is + * not good name for this func + * + * @param[in] v vec4 + * + * @return norm * norm + */ +CGLM_INLINE +float +glm_vec4_norm2(vec4 v) { + return glm_vec4_dot(v, v); +} + +/*! + * @brief euclidean norm (magnitude), also called L2 norm + * this will give magnitude of vector in euclidean space + * + * @param[in] v vector + * + * @return norm + */ +CGLM_INLINE +float +glm_vec4_norm(vec4 v) { +#if defined(CGLM_SIMD) + return glmm_norm(glmm_load(v)); +#else + return sqrtf(glm_vec4_dot(v, v)); +#endif +} + +/*! + * @brief L1 norm of vec4 + * Also known as Manhattan Distance or Taxicab norm. + * L1 Norm is the sum of the magnitudes of the vectors in a space. + * It is calculated as the sum of the absolute values of the vector components. + * In this norm, all the components of the vector are weighted equally. + * + * This computes: + * L1 norm = |v[0]| + |v[1]| + |v[2]| + |v[3]| + * + * @param[in] v vector + * + * @return L1 norm + */ +CGLM_INLINE +float +glm_vec4_norm_one(vec4 v) { +#if defined(CGLM_SIMD) + return glmm_norm_one(glmm_load(v)); +#else + vec4 t; + glm_vec4_abs(v, t); + return glm_vec4_hadd(t); +#endif +} + +/*! + * @brief infinity norm of vec4 + * Also known as Maximum norm. + * Infinity Norm is the largest magnitude among each element of a vector. + * It is calculated as the maximum of the absolute values of the vector components. + * + * This computes: + * inf norm = max(|v[0]|, |v[1]|, |v[2]|, |v[3]|) + * + * @param[in] v vector + * + * @return infinity norm + */ +CGLM_INLINE +float +glm_vec4_norm_inf(vec4 v) { +#if defined(CGLM_SIMD) + return glmm_norm_inf(glmm_load(v)); +#else + vec4 t; + glm_vec4_abs(v, t); + return glm_vec4_max(t); +#endif +} + +/*! + * @brief add b vector to a vector store result in dest + * + * @param[in] a vector1 + * @param[in] b vector2 + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec4_add(vec4 a, vec4 b, vec4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_store(dest, wasm_f32x4_add(glmm_load(a), glmm_load(b))); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glmm_store(dest, _mm_add_ps(glmm_load(a), glmm_load(b))); +#elif defined(CGLM_NEON_FP) + vst1q_f32(dest, vaddq_f32(vld1q_f32(a), vld1q_f32(b))); +#else + dest[0] = a[0] + b[0]; + dest[1] = a[1] + b[1]; + dest[2] = a[2] + b[2]; + dest[3] = a[3] + b[3]; +#endif +} + +/*! + * @brief add scalar to v vector store result in dest (d = v + vec(s)) + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec4_adds(vec4 v, float s, vec4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_store(dest, wasm_f32x4_add(glmm_load(v), wasm_f32x4_splat(s))); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glmm_store(dest, _mm_add_ps(glmm_load(v), glmm_set1(s))); +#elif defined(CGLM_NEON_FP) + vst1q_f32(dest, vaddq_f32(vld1q_f32(v), vdupq_n_f32(s))); +#else + dest[0] = v[0] + s; + dest[1] = v[1] + s; + dest[2] = v[2] + s; + dest[3] = v[3] + s; +#endif +} + +/*! 
+ * @brief subtract b vector from a vector store result in dest (d = a - b) + * + * @param[in] a vector1 + * @param[in] b vector2 + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec4_sub(vec4 a, vec4 b, vec4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_store(dest, wasm_f32x4_sub(glmm_load(a), glmm_load(b))); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glmm_store(dest, _mm_sub_ps(glmm_load(a), glmm_load(b))); +#elif defined(CGLM_NEON_FP) + vst1q_f32(dest, vsubq_f32(vld1q_f32(a), vld1q_f32(b))); +#else + dest[0] = a[0] - b[0]; + dest[1] = a[1] - b[1]; + dest[2] = a[2] - b[2]; + dest[3] = a[3] - b[3]; +#endif +} + +/*! + * @brief subtract scalar from v vector store result in dest (d = v - vec(s)) + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec4_subs(vec4 v, float s, vec4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_store(dest, wasm_f32x4_sub(glmm_load(v), wasm_f32x4_splat(s))); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glmm_store(dest, _mm_sub_ps(glmm_load(v), glmm_set1(s))); +#elif defined(CGLM_NEON_FP) + vst1q_f32(dest, vsubq_f32(vld1q_f32(v), vdupq_n_f32(s))); +#else + dest[0] = v[0] - s; + dest[1] = v[1] - s; + dest[2] = v[2] - s; + dest[3] = v[3] - s; +#endif +} + +/*! + * @brief multiply two vectors (component-wise multiplication) + * + * @param a vector1 + * @param b vector2 + * @param dest dest = (a[0] * b[0], a[1] * b[1], a[2] * b[2], a[3] * b[3]) + */ +CGLM_INLINE +void +glm_vec4_mul(vec4 a, vec4 b, vec4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_store(dest, wasm_f32x4_mul(glmm_load(a), glmm_load(b))); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glmm_store(dest, _mm_mul_ps(glmm_load(a), glmm_load(b))); +#elif defined(CGLM_NEON_FP) + vst1q_f32(dest, vmulq_f32(vld1q_f32(a), vld1q_f32(b))); +#else + dest[0] = a[0] * b[0]; + dest[1] = a[1] * b[1]; + dest[2] = a[2] * b[2]; + dest[3] = a[3] * b[3]; +#endif +} + +/*! + * @brief multiply/scale vec4 vector with scalar: result = v * s + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec4_scale(vec4 v, float s, vec4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_store(dest, wasm_f32x4_mul(glmm_load(v), wasm_f32x4_splat(s))); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glmm_store(dest, _mm_mul_ps(glmm_load(v), glmm_set1(s))); +#elif defined(CGLM_NEON_FP) + vst1q_f32(dest, vmulq_f32(vld1q_f32(v), vdupq_n_f32(s))); +#else + dest[0] = v[0] * s; + dest[1] = v[1] * s; + dest[2] = v[2] * s; + dest[3] = v[3] * s; +#endif +} + +/*! + * @brief make vec4 vector scale as specified: result = unit(v) * s + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec4_scale_as(vec4 v, float s, vec4 dest) { + float norm; + norm = glm_vec4_norm(v); + + if (CGLM_UNLIKELY(norm < FLT_EPSILON)) { + glm_vec4_zero(dest); + return; + } + + glm_vec4_scale(v, s / norm, dest); +} + +/*! 
+ * @brief div vector with another component-wise division: d = a / b + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest result = (a[0]/b[0], a[1]/b[1], a[2]/b[2], a[3]/b[3]) + */ +CGLM_INLINE +void +glm_vec4_div(vec4 a, vec4 b, vec4 dest) { +#if defined(CGLM_SIMD) + glmm_store(dest, glmm_div(glmm_load(a), glmm_load(b))); +#else + dest[0] = a[0] / b[0]; + dest[1] = a[1] / b[1]; + dest[2] = a[2] / b[2]; + dest[3] = a[3] / b[3]; +#endif +} + +/*! + * @brief div vec4 vector with scalar: d = v / s + * + * @param[in] v vector + * @param[in] s scalar + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec4_divs(vec4 v, float s, vec4 dest) { +#if defined(CGLM_SIMD) + glmm_store(dest, glmm_div(glmm_load(v), glmm_set1(s))); +#else + glm_vec4_scale(v, 1.0f / s, dest); +#endif +} + +/*! + * @brief add two vectors and add result to sum + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest += (a + b) + */ +CGLM_INLINE +void +glm_vec4_addadd(vec4 a, vec4 b, vec4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_store(dest, wasm_f32x4_add( + glmm_load(dest), + wasm_f32x4_add(glmm_load(a), glmm_load(b)))); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glmm_store(dest, _mm_add_ps(glmm_load(dest), + _mm_add_ps(glmm_load(a), + glmm_load(b)))); +#elif defined(CGLM_NEON_FP) + vst1q_f32(dest, vaddq_f32(vld1q_f32(dest), + vaddq_f32(vld1q_f32(a), + vld1q_f32(b)))); +#else + dest[0] += a[0] + b[0]; + dest[1] += a[1] + b[1]; + dest[2] += a[2] + b[2]; + dest[3] += a[3] + b[3]; +#endif +} + +/*! + * @brief sub two vectors and add result to dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest += (a - b) + */ +CGLM_INLINE +void +glm_vec4_subadd(vec4 a, vec4 b, vec4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_store(dest, wasm_f32x4_add( + glmm_load(dest), + wasm_f32x4_sub(glmm_load(a), glmm_load(b)))); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glmm_store(dest, _mm_add_ps(glmm_load(dest), + _mm_sub_ps(glmm_load(a), + glmm_load(b)))); +#elif defined(CGLM_NEON_FP) + vst1q_f32(dest, vaddq_f32(vld1q_f32(dest), + vsubq_f32(vld1q_f32(a), + vld1q_f32(b)))); +#else + dest[0] += a[0] - b[0]; + dest[1] += a[1] - b[1]; + dest[2] += a[2] - b[2]; + dest[3] += a[3] - b[3]; +#endif +} + +/*! + * @brief mul two vectors and add result to dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest += (a * b) + */ +CGLM_INLINE +void +glm_vec4_muladd(vec4 a, vec4 b, vec4 dest) { +#if defined(CGLM_SIMD) + glmm_store(dest, glmm_fmadd(glmm_load(a), glmm_load(b), glmm_load(dest))); +#else + dest[0] += a[0] * b[0]; + dest[1] += a[1] * b[1]; + dest[2] += a[2] * b[2]; + dest[3] += a[3] * b[3]; +#endif +} + +/*! + * @brief mul vector with scalar and add result to sum + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[out] dest dest += (a * b) + */ +CGLM_INLINE +void +glm_vec4_muladds(vec4 a, float s, vec4 dest) { +#if defined(CGLM_SIMD) + glmm_store(dest, glmm_fmadd(glmm_load(a), glmm_set1(s), glmm_load(dest))); +#else + dest[0] += a[0] * s; + dest[1] += a[1] * s; + dest[2] += a[2] * s; + dest[3] += a[3] * s; +#endif +} + +/*! 
+ * @brief add max of two vectors to result/dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest += max(a, b) + */ +CGLM_INLINE +void +glm_vec4_maxadd(vec4 a, vec4 b, vec4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_store(dest, wasm_f32x4_add(glmm_load(dest), + glmm_max(glmm_load(a), glmm_load(b)))); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glmm_store(dest, _mm_add_ps(glmm_load(dest), + glmm_max(glmm_load(a), glmm_load(b)))); +#elif defined(CGLM_NEON_FP) + glmm_store(dest, vaddq_f32(glmm_load(dest), + glmm_max(glmm_load(a), glmm_load(b)))); +#else + dest[0] += glm_max(a[0], b[0]); + dest[1] += glm_max(a[1], b[1]); + dest[2] += glm_max(a[2], b[2]); + dest[3] += glm_max(a[3], b[3]); +#endif +} + +/*! + * @brief add min of two vectors to result/dest + * + * it applies += operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest += min(a, b) + */ +CGLM_INLINE +void +glm_vec4_minadd(vec4 a, vec4 b, vec4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_store(dest, wasm_f32x4_add(glmm_load(dest), + glmm_min(glmm_load(a), glmm_load(b)))); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glmm_store(dest, _mm_add_ps(glmm_load(dest), + glmm_min(glmm_load(a), glmm_load(b)))); +#elif defined(CGLM_NEON_FP) + glmm_store(dest, vaddq_f32(glmm_load(dest), + glmm_min(glmm_load(a), glmm_load(b)))); +#else + dest[0] += glm_min(a[0], b[0]); + dest[1] += glm_min(a[1], b[1]); + dest[2] += glm_min(a[2], b[2]); + dest[3] += glm_min(a[3], b[3]); +#endif +} + +/*! + * @brief sub two vectors and sub result to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest -= (a - b) + */ +CGLM_INLINE +void +glm_vec4_subsub(vec4 a, vec4 b, vec4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_store(dest, wasm_f32x4_sub( + glmm_load(dest), + wasm_f32x4_sub(glmm_load(a), glmm_load(b)))); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glmm_store(dest, _mm_sub_ps(glmm_load(dest), + _mm_sub_ps(glmm_load(a), + glmm_load(b)))); +#elif defined(CGLM_NEON_FP) + vst1q_f32(dest, vsubq_f32(vld1q_f32(dest), + vsubq_f32(vld1q_f32(a), + vld1q_f32(b)))); +#else + dest[0] -= a[0] - b[0]; + dest[1] -= a[1] - b[1]; + dest[2] -= a[2] - b[2]; + dest[3] -= a[3] - b[3]; +#endif +} + +/*! + * @brief add two vectors and sub result to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest -= (a + b) + */ +CGLM_INLINE +void +glm_vec4_addsub(vec4 a, vec4 b, vec4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_store(dest, wasm_f32x4_sub( + glmm_load(dest), + wasm_f32x4_add(glmm_load(a), glmm_load(b)))); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glmm_store(dest, _mm_sub_ps(glmm_load(dest), + _mm_add_ps(glmm_load(a), + glmm_load(b)))); +#elif defined(CGLM_NEON_FP) + vst1q_f32(dest, vsubq_f32(vld1q_f32(dest), + vaddq_f32(vld1q_f32(a), + vld1q_f32(b)))); +#else + dest[0] -= a[0] + b[0]; + dest[1] -= a[1] + b[1]; + dest[2] -= a[2] + b[2]; + dest[3] -= a[3] + b[3]; +#endif +} + +/*! 
+ * @brief mul two vectors and sub result to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest -= (a * b) + */ +CGLM_INLINE +void +glm_vec4_mulsub(vec4 a, vec4 b, vec4 dest) { +#if defined(CGLM_SIMD) + glmm_store(dest, glmm_fnmadd(glmm_load(a), glmm_load(b), glmm_load(dest))); +#else + dest[0] -= a[0] * b[0]; + dest[1] -= a[1] * b[1]; + dest[2] -= a[2] * b[2]; + dest[3] -= a[3] * b[3]; +#endif +} + +/*! + * @brief mul vector with scalar and sub result to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector + * @param[in] s scalar + * @param[out] dest dest -= (a * b) + */ +CGLM_INLINE +void +glm_vec4_mulsubs(vec4 a, float s, vec4 dest) { +#if defined(CGLM_SIMD) + glmm_store(dest, glmm_fnmadd(glmm_load(a), glmm_set1(s), glmm_load(dest))); +#else + dest[0] -= a[0] * s; + dest[1] -= a[1] * s; + dest[2] -= a[2] * s; + dest[3] -= a[3] * s; +#endif +} + +/*! + * @brief sub max of two vectors to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest -= max(a, b) + */ +CGLM_INLINE +void +glm_vec4_maxsub(vec4 a, vec4 b, vec4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_store(dest, wasm_f32x4_sub(glmm_load(dest), + glmm_max(glmm_load(a), glmm_load(b)))); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glmm_store(dest, _mm_sub_ps(glmm_load(dest), + glmm_max(glmm_load(a), glmm_load(b)))); +#elif defined(CGLM_NEON_FP) + glmm_store(dest, vsubq_f32(glmm_load(dest), + glmm_max(glmm_load(a), glmm_load(b)))); +#else + dest[0] -= glm_max(a[0], b[0]); + dest[1] -= glm_max(a[1], b[1]); + dest[2] -= glm_max(a[2], b[2]); + dest[3] -= glm_max(a[3], b[3]); +#endif +} + +/*! + * @brief sub min of two vectors to dest + * + * it applies -= operator so dest must be initialized + * + * @param[in] a vector 1 + * @param[in] b vector 2 + * @param[out] dest dest -= min(a, b) + */ +CGLM_INLINE +void +glm_vec4_minsub(vec4 a, vec4 b, vec4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_store(dest, wasm_f32x4_sub(glmm_load(dest), + glmm_min(glmm_load(a), glmm_load(b)))); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glmm_store(dest, _mm_sub_ps(glmm_load(dest), + glmm_min(glmm_load(a), glmm_load(b)))); +#elif defined(CGLM_NEON_FP) + glmm_store(dest, vsubq_f32(vld1q_f32(dest), + glmm_min(glmm_load(a), glmm_load(b)))); +#else + dest[0] -= glm_min(a[0], b[0]); + dest[1] -= glm_min(a[1], b[1]); + dest[2] -= glm_min(a[2], b[2]); + dest[3] -= glm_min(a[3], b[3]); +#endif +} + +/*! + * @brief negate vector components and store result in dest + * + * @param[in] v vector + * @param[out] dest result vector + */ +CGLM_INLINE +void +glm_vec4_negate_to(vec4 v, vec4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_store(dest, wasm_f32x4_neg(glmm_load(v))); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glmm_store(dest, _mm_xor_ps(glmm_load(v), glmm_float32x4_SIGNMASK_NEG)); +#elif defined(CGLM_NEON_FP) + vst1q_f32(dest, vnegq_f32(vld1q_f32(v))); +#else + dest[0] = -v[0]; + dest[1] = -v[1]; + dest[2] = -v[2]; + dest[3] = -v[3]; +#endif +} + +/*! + * @brief flip sign of all vec4 members + * + * @param[in, out] v vector + */ +CGLM_INLINE +void +glm_vec4_negate(vec4 v) { + glm_vec4_negate_to(v, v); +} + +/*! 
+ * @brief normalize vec4 to dest + * + * @param[in] v source + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec4_normalize_to(vec4 v, vec4 dest) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_128 xdot, x0; + float dot; + + x0 = glmm_load(v); + xdot = glmm_vdot(x0, x0); + /* dot = _mm_cvtss_f32(xdot); */ + dot = wasm_f32x4_extract_lane(xdot, 0); + + if (CGLM_UNLIKELY(dot < FLT_EPSILON)) { + glmm_store(dest, wasm_f32x4_const_splat(0.f)); + return; + } + + glmm_store(dest, glmm_div(x0, wasm_f32x4_sqrt(xdot))); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + __m128 xdot, x0; + float dot; + + x0 = glmm_load(v); + xdot = glmm_vdot(x0, x0); + dot = _mm_cvtss_f32(xdot); + + if (CGLM_UNLIKELY(dot < FLT_EPSILON)) { + glmm_store(dest, _mm_setzero_ps()); + return; + } + + glmm_store(dest, glmm_div(x0, _mm_sqrt_ps(xdot))); +#else + float norm; + + norm = glm_vec4_norm(v); + + if (CGLM_UNLIKELY(norm < FLT_EPSILON)) { + glm_vec4_zero(dest); + return; + } + + glm_vec4_scale(v, 1.0f / norm, dest); +#endif +} + +/*! + * @brief normalize vec4 and store result in same vec + * + * @param[in, out] v vector + */ +CGLM_INLINE +void +glm_vec4_normalize(vec4 v) { + glm_vec4_normalize_to(v, v); +} + +/** + * @brief distance between two vectors + * + * @param[in] a vector1 + * @param[in] b vector2 + * @return returns distance + */ +CGLM_INLINE +float +glm_vec4_distance(vec4 a, vec4 b) { +#if defined(__wasm__) && defined(__wasm_simd128__) + return glmm_norm(wasm_f32x4_sub(glmm_load(a), glmm_load(b))); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + return glmm_norm(_mm_sub_ps(glmm_load(a), glmm_load(b))); +#elif defined(CGLM_NEON_FP) + return glmm_norm(vsubq_f32(glmm_load(a), glmm_load(b))); +#else + return sqrtf(glm_pow2(a[0] - b[0]) + + glm_pow2(a[1] - b[1]) + + glm_pow2(a[2] - b[2]) + + glm_pow2(a[3] - b[3])); +#endif +} + +/** + * @brief squared distance between two vectors + * + * @param[in] a vector1 + * @param[in] b vector2 + * @return returns squared distance + */ +CGLM_INLINE +float +glm_vec4_distance2(vec4 a, vec4 b) { +#if defined(__wasm__) && defined(__wasm_simd128__) + return glmm_norm2(wasm_f32x4_sub(glmm_load(a), glmm_load(b))); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + return glmm_norm2(_mm_sub_ps(glmm_load(a), glmm_load(b))); +#elif defined(CGLM_NEON_FP) + return glmm_norm2(vsubq_f32(glmm_load(a), glmm_load(b))); +#else + return glm_pow2(a[0] - b[0]) + + glm_pow2(a[1] - b[1]) + + glm_pow2(a[2] - b[2]) + + glm_pow2(a[3] - b[3]); +#endif +} + +/*! + * @brief max values of vectors + * + * @param[in] a vector1 + * @param[in] b vector2 + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec4_maxv(vec4 a, vec4 b, vec4 dest) { +#if defined(CGLM_SIMD) + glmm_store(dest, glmm_max(glmm_load(a), glmm_load(b))); +#else + dest[0] = glm_max(a[0], b[0]); + dest[1] = glm_max(a[1], b[1]); + dest[2] = glm_max(a[2], b[2]); + dest[3] = glm_max(a[3], b[3]); +#endif +} + +/*! + * @brief min values of vectors + * + * @param[in] a vector1 + * @param[in] b vector2 + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec4_minv(vec4 a, vec4 b, vec4 dest) { +#if defined(CGLM_SIMD) + glmm_store(dest, glmm_min(glmm_load(a), glmm_load(b))); +#else + dest[0] = glm_min(a[0], b[0]); + dest[1] = glm_min(a[1], b[1]); + dest[2] = glm_min(a[2], b[2]); + dest[3] = glm_min(a[3], b[3]); +#endif +} + +/*! 
+ * @brief clamp vector's individual members between min and max values + * + * @param[in, out] v vector + * @param[in] minVal minimum value + * @param[in] maxVal maximum value + */ +CGLM_INLINE +void +glm_vec4_clamp(vec4 v, float minVal, float maxVal) { +#if defined(__wasm__) && defined(__wasm_simd128__) + glmm_store(v, glmm_min(glmm_max(glmm_load(v), wasm_f32x4_splat(minVal)), + wasm_f32x4_splat(maxVal))); +#elif defined( __SSE__ ) || defined( __SSE2__ ) + glmm_store(v, glmm_min(glmm_max(glmm_load(v), glmm_set1(minVal)), + glmm_set1(maxVal))); +#elif defined(CGLM_NEON_FP) + glmm_store(v, glmm_min(glmm_max(vld1q_f32(v), vdupq_n_f32(minVal)), + vdupq_n_f32(maxVal))); +#else + v[0] = glm_clamp(v[0], minVal, maxVal); + v[1] = glm_clamp(v[1], minVal, maxVal); + v[2] = glm_clamp(v[2], minVal, maxVal); + v[3] = glm_clamp(v[3], minVal, maxVal); +#endif +} + +/*! + * @brief linear interpolation between two vectors + * + * formula: from + t * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec4_lerp(vec4 from, vec4 to, float t, vec4 dest) { + vec4 s, v; + + /* from + s * (to - from) */ + glm_vec4_broadcast(t, s); + glm_vec4_sub(to, from, v); + glm_vec4_mul(s, v, v); + glm_vec4_add(from, v, dest); +} + +/*! + * @brief linear interpolation between two vectors (clamped) + * + * formula: from + t * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) clamped between 0 and 1 + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec4_lerpc(vec4 from, vec4 to, float t, vec4 dest) { + glm_vec4_lerp(from, to, glm_clamp_zo(t), dest); +} + +/*! + * @brief linear interpolation between two vectors + * + * formula: from + t * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec4_mix(vec4 from, vec4 to, float t, vec4 dest) { + glm_vec4_lerp(from, to, t, dest); +} + +/*! + * @brief linear interpolation between two vectors (clamped) + * + * formula: from + t * (to - from) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) clamped between 0 and 1 + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec4_mixc(vec4 from, vec4 to, float t, vec4 dest) { + glm_vec4_lerpc(from, to, t, dest); +} + +/*! + * @brief threshold function + * + * @param[in] edge threshold + * @param[in] x value to test against threshold + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec4_step(vec4 edge, vec4 x, vec4 dest) { + dest[0] = glm_step(edge[0], x[0]); + dest[1] = glm_step(edge[1], x[1]); + dest[2] = glm_step(edge[2], x[2]); + dest[3] = glm_step(edge[3], x[3]); +} + +/*! + * @brief threshold function with a smooth transition (unidimensional) + * + * @param[in] edge0 low threshold + * @param[in] edge1 high threshold + * @param[in] x value to test against threshold + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec4_smoothstep_uni(float edge0, float edge1, vec4 x, vec4 dest) { + dest[0] = glm_smoothstep(edge0, edge1, x[0]); + dest[1] = glm_smoothstep(edge0, edge1, x[1]); + dest[2] = glm_smoothstep(edge0, edge1, x[2]); + dest[3] = glm_smoothstep(edge0, edge1, x[3]); +} + +/*! 
+ * @brief threshold function with a smooth transition + * + * @param[in] edge0 low threshold + * @param[in] edge1 high threshold + * @param[in] x value to test against threshold + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec4_smoothstep(vec4 edge0, vec4 edge1, vec4 x, vec4 dest) { + dest[0] = glm_smoothstep(edge0[0], edge1[0], x[0]); + dest[1] = glm_smoothstep(edge0[1], edge1[1], x[1]); + dest[2] = glm_smoothstep(edge0[2], edge1[2], x[2]); + dest[3] = glm_smoothstep(edge0[3], edge1[3], x[3]); +} + +/*! + * @brief smooth Hermite interpolation between two vectors + * + * formula: t^2 * (3 - 2*t) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec4_smoothinterp(vec4 from, vec4 to, float t, vec4 dest) { + vec4 s, v; + + /* from + smoothstep * (to - from) */ + glm_vec4_broadcast(glm_smooth(t), s); + glm_vec4_sub(to, from, v); + glm_vec4_mul(s, v, v); + glm_vec4_add(from, v, dest); +} + +/*! + * @brief smooth Hermite interpolation between two vectors (clamped) + * + * formula: t^2 * (3 - 2*t) + * + * @param[in] from from value + * @param[in] to to value + * @param[in] t interpolant (amount) clamped between 0 and 1 + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec4_smoothinterpc(vec4 from, vec4 to, float t, vec4 dest) { + glm_vec4_smoothinterp(from, to, glm_clamp_zo(t), dest); +} + +/*! + * @brief helper to fill vec4 as [S^3, S^2, S, 1] + * + * @param[in] s parameter + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec4_cubic(float s, vec4 dest) { + float ss; + + ss = s * s; + + dest[0] = ss * s; + dest[1] = ss; + dest[2] = s; + dest[3] = 1.0f; +} + +/*! + * @brief swizzle vector components + * + * you can use existing masks e.g. GLM_XXXX, GLM_WZYX + * + * @param[in] v source + * @param[in] mask mask + * @param[out] dest destination + */ +CGLM_INLINE +void +glm_vec4_swizzle(vec4 v, int mask, vec4 dest) { + vec4 t; + + t[0] = v[(mask & (3 << 0))]; + t[1] = v[(mask & (3 << 2)) >> 2]; + t[2] = v[(mask & (3 << 4)) >> 4]; + t[3] = v[(mask & (3 << 6)) >> 6]; + + glm_vec4_copy(t, dest); +} + +/*! + * @brief Create four dimensional vector from pointer + * + * @param[in] src pointer to an array of floats + * @param[out] dest destination vector + */ +CGLM_INLINE +void +glm_vec4_make(const float * __restrict src, vec4 dest) { + dest[0] = src[0]; dest[1] = src[1]; + dest[2] = src[2]; dest[3] = src[3]; +} + +/*! + * @brief reflection vector using an incident ray and a surface normal + * + * @param[in] v incident vector + * @param[in] n normalized normal vector + * @param[out] dest destination vector for the reflection result + */ +CGLM_INLINE +void +glm_vec4_reflect(vec4 v, vec4 n, vec4 dest) { + vec4 temp; + + /* TODO: direct simd touch */ + glm_vec4_scale(n, 2.0f * glm_vec4_dot(v, n), temp); + glm_vec4_sub(v, temp, dest); + + dest[3] = v[3]; +} + +/*! + * @brief computes refraction vector for an incident vector and a surface normal. + * + * calculates the refraction vector based on Snell's law. If total internal reflection + * occurs (angle too great given eta), dest is set to zero and returns false. + * Otherwise, computes refraction vector, stores it in dest, and returns true. + * + * this implementation does not explicitly preserve the 'w' component of the + * incident vector 'I' in the output 'dest', users requiring the preservation of + * the 'w' component should manually adjust 'dest' after calling this function. 
+ * + * @param[in] v normalized incident vector + * @param[in] n normalized normal vector + * @param[in] eta ratio of indices of refraction (incident/transmitted) + * @param[out] dest refraction vector if refraction occurs; zero vector otherwise + * + * @returns true if refraction occurs; false if total internal reflection occurs. + */ +CGLM_INLINE +bool +glm_vec4_refract(vec4 v, vec4 n, float eta, vec4 dest) { + float ndi, eni, k; + + ndi = glm_vec4_dot(n, v); + eni = eta * ndi; + k = 1.0f - eta * eta + eni * eni; + + if (k < 0.0f) { + glm_vec4_zero(dest); + return false; + } + + glm_vec4_scale(v, eta, dest); + glm_vec4_mulsubs(n, eni + sqrtf(k), dest); + return true; +} + +#endif /* cglm_vec4_h */ diff --git a/external/cglm/version.h b/external/cglm/version.h new file mode 100644 index 0000000..9e815d4 --- /dev/null +++ b/external/cglm/version.h @@ -0,0 +1,15 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_version_h +#define cglm_version_h + +#define CGLM_VERSION_MAJOR 0 +#define CGLM_VERSION_MINOR 9 +#define CGLM_VERSION_PATCH 6 + +#endif /* cglm_version_h */ diff --git a/external/m3d/m3d.c b/external/m3d/m3d.c new file mode 100644 index 0000000..769d9a4 --- /dev/null +++ b/external/m3d/m3d.c @@ -0,0 +1,3 @@ +#define M3D_IMPLEMENTATION + +#include "m3d.h" diff --git a/external/m3d/m3d.h b/external/m3d/m3d.h new file mode 100644 index 0000000..8bba183 --- /dev/null +++ b/external/m3d/m3d.h @@ -0,0 +1,6574 @@ +/* + * m3d.h + * https://gitlab.com/bztsrc/model3d + * + * Copyright (C) 2020 bzt (bztsrc@gitlab) + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ *
+ * @brief ANSI C89 / C++11 single header importer / exporter SDK for the Model 3D (.M3D) format
+ * https://gitlab.com/bztsrc/model3d
+ *
+ * PNG decompressor included from (with minor modifications to make it C89 valid):
+ * stb_image - v2.13 - public domain image loader - http://nothings.org/stb_image.h
+ *
+ * @version: 1.0.0
+ */
+
+#ifndef _M3D_H_
+#define _M3D_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+/*** configuration ***/
+#ifndef M3D_MALLOC
+# define M3D_MALLOC(sz) malloc(sz)
+#endif
+#ifndef M3D_REALLOC
+# define M3D_REALLOC(p,nsz) realloc(p,nsz)
+#endif
+#ifndef M3D_FREE
+# define M3D_FREE(p) free(p)
+#endif
+#ifndef M3D_LOG
+# define M3D_LOG(x)
+#endif
+#ifndef M3D_APIVERSION
+#define M3D_APIVERSION 0x0100
+#ifndef M3D_DOUBLE
+typedef float M3D_FLOAT;
+#ifndef M3D_EPSILON
+/* carefully chosen for IEEE 754, don't change */
+#define M3D_EPSILON ((M3D_FLOAT)1e-7)
+#endif
+#else
+typedef double M3D_FLOAT;
+#ifndef M3D_EPSILON
+#define M3D_EPSILON ((M3D_FLOAT)1e-14)
+#endif
+#endif
+#if !defined(M3D_SMALLINDEX)
+typedef uint32_t M3D_INDEX;
+typedef uint16_t M3D_VOXEL;
+#define M3D_UNDEF 0xffffffff
+#define M3D_INDEXMAX 0xfffffffe
+#define M3D_VOXUNDEF 0xffff
+#define M3D_VOXCLEAR 0xfffe
+#else
+typedef uint16_t M3D_INDEX;
+typedef uint8_t M3D_VOXEL;
+#define M3D_UNDEF 0xffff
+#define M3D_INDEXMAX 0xfffe
+#define M3D_VOXUNDEF 0xff
+#define M3D_VOXCLEAR 0xfe
+#endif
+#define M3D_NOTDEFINED 0xffffffff
+#ifndef M3D_NUMBONE
+#define M3D_NUMBONE 4
+#endif
+#ifndef M3D_BONEMAXLEVEL
+#define M3D_BONEMAXLEVEL 64
+#endif
+#ifndef _MSC_VER
+#ifndef _inline
+#define _inline __inline__
+#endif
+#define _pack __attribute__((packed))
+#define _unused __attribute__((unused))
+#else
+#define _inline
+#define _pack
+#define _unused __pragma(warning(suppress:4100))
+#endif
+#ifndef __cplusplus
+#define _register register
+#else
+#define _register
+#endif
+
+/*** File format structures ***/
+
+/**
+ * M3D file format structure
+ * 3DMO m3dchunk_t file header chunk, may be followed by compressed data
+ * PRVW preview chunk (optional)
+ * HEAD m3dhdr_t model header chunk
+ * n x m3dchunk_t more chunks follow
+ * CMAP color map chunk (optional)
+ * TMAP texture map chunk (optional)
+ * VRTS vertex data chunk (optional if it's a material library)
+ * BONE bind-pose skeleton, bone hierarchy chunk (optional)
+ * n x m3db_t contains probably more, but at least one bone
+ * n x m3ds_t skin group records
+ * MTRL* material chunk(s), can be more (optional)
+ * n x m3dp_t each material contains probably more, but at least one property
+ * the properties are configurable with a static array, see m3d_propertytypes
+ * n x m3dchunk_t at least one, but maybe more face chunks
+ * PROC* procedural face, or
+ * MESH* triangle mesh (vertex index list) or
+ * VOXT, VOXD* voxel image (converted to mesh) or
+ * SHPE* mathematical shapes like parameterized surfaces
+ * LBLS* annotation label chunks, can be more (optional)
+ * ACTN* action chunk(s), animation-pose skeletons, can be more (optional)
+ * n x m3dfr_t each action contains probably more, but at least one frame
+ * n x m3dtr_t each frame contains probably more, but at least one transformation
+ * ASET* inlined asset chunk(s), can be more (optional)
+ * OMD3 end chunk
+ *
+ * Typical chunks for a game engine: 3DMO, HEAD, CMAP, TMAP, VRTS, BONE, MTRL, MESH, ACTN, OMD3
+ * Typical chunks for distribution: 3DMO, PRVW, HEAD, CMAP, TMAP, VRTS, BONE, MTRL, MESH, ACTN, ASET, OMD3
+ * Typical chunks for voxel image: 3DMO, HEAD, CMAP, MTRL, VOXT, VOXD,
VOXD, VOXD, OMD3 + * Typical chunks for CAD software: 3DMO, PRVW, HEAD, CMAP, TMAP, VRTS, MTRL, SHPE, LBLS, OMD3 + */ +#ifdef _MSC_VER +#pragma pack(push) +#pragma pack(1) +#endif + +typedef struct { + char magic[4]; + uint32_t length; + float scale; /* deliberately not M3D_FLOAT */ + uint32_t types; +} _pack m3dhdr_t; + +typedef struct { + char magic[4]; + uint32_t length; +} _pack m3dchunk_t; + +#ifdef _MSC_VER +#pragma pack(pop) +#endif + +/*** in-memory model structure ***/ + +/* textmap entry */ +typedef struct { + M3D_FLOAT u; + M3D_FLOAT v; +} m3dti_t; +#define m3d_textureindex_t m3dti_t + +/* texture */ +typedef struct { + char *name; /* texture name */ + uint8_t *d; /* pixels data */ + uint16_t w; /* width */ + uint16_t h; /* height */ + uint8_t f; /* format, 1 = grayscale, 2 = grayscale+alpha, 3 = rgb, 4 = rgba */ +} m3dtx_t; +#define m3d_texturedata_t m3dtx_t + +typedef struct { + M3D_INDEX vertexid; + M3D_FLOAT weight; +} m3dw_t; +#define m3d_weight_t m3dw_t + +/* bone entry */ +typedef struct { + M3D_INDEX parent; /* parent bone index */ + char *name; /* name for this bone */ + M3D_INDEX pos; /* vertex index position */ + M3D_INDEX ori; /* vertex index orientation (quaternion) */ + M3D_INDEX numweight; /* number of controlled vertices */ + m3dw_t *weight; /* weights for those vertices */ + M3D_FLOAT mat4[16]; /* transformation matrix */ +} m3db_t; +#define m3d_bone_t m3db_t + +/* skin: bone per vertex entry */ +typedef struct { + M3D_INDEX boneid[M3D_NUMBONE]; + M3D_FLOAT weight[M3D_NUMBONE]; +} m3ds_t; +#define m3d_skin_t m3ds_t + +/* vertex entry */ +typedef struct { + M3D_FLOAT x; /* 3D coordinates and weight */ + M3D_FLOAT y; + M3D_FLOAT z; + M3D_FLOAT w; + uint32_t color; /* default vertex color */ + M3D_INDEX skinid; /* skin index */ +#ifdef M3D_VERTEXTYPE + uint8_t type; +#endif +} m3dv_t; +#define m3d_vertex_t m3dv_t + +/* material property formats */ +enum { + m3dpf_color, + m3dpf_uint8, + m3dpf_uint16, + m3dpf_uint32, + m3dpf_float, + m3dpf_map +}; +typedef struct { + uint8_t format; + uint8_t id; +#ifdef M3D_ASCII +#define M3D_PROPERTYDEF(f,i,n) { (f), (i), (char*)(n) } + char *key; +#endif +#ifndef M3D_ASCII +#define M3D_PROPERTYDEF(f,i,n) { (f), (i) } +#endif +} m3dpd_t; + +/* material property types */ +/* You shouldn't change the first 8 display and first 4 physical property. Assign the rest as you like. 
*/ +enum { + m3dp_Kd = 0, /* scalar display properties */ + m3dp_Ka, + m3dp_Ks, + m3dp_Ns, + m3dp_Ke, + m3dp_Tf, + m3dp_Km, + m3dp_d, + m3dp_il, + + m3dp_Pr = 64, /* scalar physical properties */ + m3dp_Pm, + m3dp_Ps, + m3dp_Ni, + m3dp_Nt, + + m3dp_map_Kd = 128, /* textured display map properties */ + m3dp_map_Ka, + m3dp_map_Ks, + m3dp_map_Ns, + m3dp_map_Ke, + m3dp_map_Tf, + m3dp_map_Km, /* bump map */ + m3dp_map_D, + m3dp_map_N, /* normal map */ + + m3dp_map_Pr = 192, /* textured physical map properties */ + m3dp_map_Pm, + m3dp_map_Ps, + m3dp_map_Ni, + m3dp_map_Nt +}; +enum { /* aliases */ + m3dp_bump = m3dp_map_Km, + m3dp_map_il = m3dp_map_N, + m3dp_refl = m3dp_map_Pm +}; + +/* material property */ +typedef struct { + uint8_t type; /* property type, see "m3dp_*" enumeration */ + union { + uint32_t color; /* if value is a color, m3dpf_color */ + uint32_t num; /* if value is a number, m3dpf_uint8, m3pf_uint16, m3dpf_uint32 */ + float fnum; /* if value is a floating point number, m3dpf_float */ + M3D_INDEX textureid; /* if value is a texture, m3dpf_map */ + } value; +} m3dp_t; +#define m3d_property_t m3dp_t + +/* material entry */ +typedef struct { + char *name; /* name of the material */ + uint8_t numprop; /* number of properties */ + m3dp_t *prop; /* properties array */ +} m3dm_t; +#define m3d_material_t m3dm_t + +/* face entry */ +typedef struct { + M3D_INDEX materialid; /* material index */ + M3D_INDEX vertex[3]; /* 3D points of the triangle in CCW order */ + M3D_INDEX normal[3]; /* normal vectors */ + M3D_INDEX texcoord[3]; /* UV coordinates */ +#ifdef M3D_VERTEXMAX + M3D_INDEX paramid; /* parameter index */ + M3D_INDEX vertmax[3]; /* maximum 3D points of the triangle in CCW order */ +#endif +} m3df_t; +#define m3d_face_t m3df_t + +typedef struct { + uint16_t count; + char *name; +} m3dvi_t; +#define m3d_voxelitem_t m3dvi_t +#define m3d_parameter_t m3dvi_t + +/* voxel types (voxel palette) */ +typedef struct { + char *name; /* technical name of the voxel */ + uint8_t rotation; /* rotation info */ + uint16_t voxshape; /* voxel shape */ + M3D_INDEX materialid; /* material index */ + uint32_t color; /* default voxel color */ + M3D_INDEX skinid; /* skin index */ + uint8_t numitem; /* number of sub-voxels */ + m3dvi_t *item; /* list of sub-voxels */ +} m3dvt_t; +#define m3d_voxeltype_t m3dvt_t + +/* voxel data blocks */ +typedef struct { + char *name; /* name of the block */ + int32_t x, y, z; /* position */ + uint32_t w, h, d; /* dimension */ + uint8_t uncertain; /* probability */ + uint8_t groupid; /* block group id */ + M3D_VOXEL *data; /* voxel data, indices to voxel type */ +} m3dvx_t; +#define m3d_voxel_t m3dvx_t + +/* shape command types. 
must match the row in m3d_commandtypes */ +enum { + /* special commands */ + m3dc_use = 0, /* use material */ + m3dc_inc, /* include another shape */ + m3dc_mesh, /* include part of polygon mesh */ + /* approximations */ + m3dc_div, /* subdivision by constant resolution for both u, v */ + m3dc_sub, /* subdivision by constant, different for u and v */ + m3dc_len, /* spacial subdivision by maxlength */ + m3dc_dist, /* subdivision by maxdistance and maxangle */ + /* modifiers */ + m3dc_degu, /* degree for both u, v */ + m3dc_deg, /* separate degree for u and v */ + m3dc_rangeu, /* range for u */ + m3dc_range, /* range for u and v */ + m3dc_paru, /* u parameters (knots) */ + m3dc_parv, /* v parameters */ + m3dc_trim, /* outer trimming curve */ + m3dc_hole, /* inner trimming curve */ + m3dc_scrv, /* spacial curve */ + m3dc_sp, /* special points */ + /* helper curves */ + m3dc_bez1, /* Bezier 1D */ + m3dc_bsp1, /* B-spline 1D */ + m3dc_bez2, /* bezier 2D */ + m3dc_bsp2, /* B-spline 2D */ + /* surfaces */ + m3dc_bezun, /* Bezier 3D with control, UV, normal */ + m3dc_bezu, /* with control and UV */ + m3dc_bezn, /* with control and normal */ + m3dc_bez, /* control points only */ + m3dc_nurbsun, /* B-spline 3D */ + m3dc_nurbsu, + m3dc_nurbsn, + m3dc_nurbs, + m3dc_conn, /* connect surfaces */ + /* geometrical */ + m3dc_line, + m3dc_polygon, + m3dc_circle, + m3dc_cylinder, + m3dc_shpere, + m3dc_torus, + m3dc_cone, + m3dc_cube +}; + +/* shape command argument types */ +enum { + m3dcp_mi_t = 1, /* material index */ + m3dcp_hi_t, /* shape index */ + m3dcp_fi_t, /* face index */ + m3dcp_ti_t, /* texture map index */ + m3dcp_vi_t, /* vertex index */ + m3dcp_qi_t, /* vertex index for quaternions */ + m3dcp_vc_t, /* coordinate or radius, float scalar */ + m3dcp_i1_t, /* int8 scalar */ + m3dcp_i2_t, /* int16 scalar */ + m3dcp_i4_t, /* int32 scalar */ + m3dcp_va_t /* variadic arguments */ +}; + +#define M3D_CMDMAXARG 8 /* if you increase this, add more arguments to the macro below */ +typedef struct { +#ifdef M3D_ASCII +#define M3D_CMDDEF(t,n,p,a,b,c,d,e,f,g,h) { (char*)(n), (p), { (a), (b), (c), (d), (e), (f), (g), (h) } } + char *key; +#endif +#ifndef M3D_ASCII +#define M3D_CMDDEF(t,n,p,a,b,c,d,e,f,g,h) { (p), { (a), (b), (c), (d), (e), (f), (g), (h) } } +#endif + uint8_t p; + uint8_t a[M3D_CMDMAXARG]; +} m3dcd_t; + +/* shape command */ +typedef struct { + uint16_t type; /* shape type */ + uint32_t *arg; /* arguments array */ +} m3dc_t; +#define m3d_shapecommand_t m3dc_t + +/* shape entry */ +typedef struct { + char *name; /* name of the mathematical shape */ + M3D_INDEX group; /* group this shape belongs to or -1 */ + uint32_t numcmd; /* number of commands */ + m3dc_t *cmd; /* commands array */ +} m3dh_t; +#define m3d_shape_t m3dh_t + +/* label entry */ +typedef struct { + char *name; /* name of the annotation layer or NULL */ + char *lang; /* language code or NULL */ + char *text; /* the label text */ + uint32_t color; /* color */ + M3D_INDEX vertexid; /* the vertex the label refers to */ +} m3dl_t; +#define m3d_label_t m3dl_t + +/* frame transformations / working copy skeleton entry */ +typedef struct { + M3D_INDEX boneid; /* selects a node in bone hierarchy */ + M3D_INDEX pos; /* vertex index new position */ + M3D_INDEX ori; /* vertex index new orientation (quaternion) */ +} m3dtr_t; +#define m3d_transform_t m3dtr_t + +/* animation frame entry */ +typedef struct { + uint32_t msec; /* frame's position on the timeline, timestamp */ + M3D_INDEX numtransform; /* number of transformations in this frame */ + 
m3dtr_t *transform; /* transformations */ +} m3dfr_t; +#define m3d_frame_t m3dfr_t + +/* model action entry */ +typedef struct { + char *name; /* name of the action */ + uint32_t durationmsec; /* duration in millisec (1/1000 sec) */ + M3D_INDEX numframe; /* number of frames in this animation */ + m3dfr_t *frame; /* frames array */ +} m3da_t; +#define m3d_action_t m3da_t + +/* inlined asset */ +typedef struct { + char *name; /* asset name (same pointer as in texture[].name) */ + uint8_t *data; /* compressed asset data */ + uint32_t length; /* compressed data length */ +} m3di_t; +#define m3d_inlinedasset_t m3di_t + +/*** in-memory model structure ***/ +#define M3D_FLG_FREERAW (1<<0) +#define M3D_FLG_FREESTR (1<<1) +#define M3D_FLG_MTLLIB (1<<2) +#define M3D_FLG_GENNORM (1<<3) + +typedef struct { + m3dhdr_t *raw; /* pointer to raw data */ + char flags; /* internal flags */ + signed char errcode; /* returned error code */ + char vc_s, vi_s, si_s, ci_s, ti_s, bi_s, nb_s, sk_s, fc_s, hi_s, fi_s, vd_s, vp_s; /* decoded sizes for types */ + char *name; /* name of the model, like "Utah teapot" */ + char *license; /* usage condition or license, like "MIT", "LGPL" or "BSD-3clause" */ + char *author; /* nickname, email, homepage or github URL etc. */ + char *desc; /* comments, descriptions. May contain '\n' newline character */ + M3D_FLOAT scale; /* the model's bounding cube's size in SI meters */ + M3D_INDEX numcmap; + uint32_t *cmap; /* color map */ + M3D_INDEX numtmap; + m3dti_t *tmap; /* texture map indices */ + M3D_INDEX numtexture; + m3dtx_t *texture; /* uncompressed textures */ + M3D_INDEX numbone; + m3db_t *bone; /* bone hierarchy */ + M3D_INDEX numvertex; + m3dv_t *vertex; /* vertex data */ + M3D_INDEX numskin; + m3ds_t *skin; /* skin data */ + M3D_INDEX nummaterial; + m3dm_t *material; /* material list */ +#ifdef M3D_VERTEXMAX + M3D_INDEX numparam; + m3dvi_t *param; /* parameters and their values list */ +#endif + M3D_INDEX numface; + m3df_t *face; /* model face, polygon (triangle) mesh */ + M3D_INDEX numvoxtype; + m3dvt_t *voxtype; /* model face, voxel types */ + M3D_INDEX numvoxel; + m3dvx_t *voxel; /* model face, cubes compressed into voxels */ + M3D_INDEX numshape; + m3dh_t *shape; /* model face, shape commands */ + M3D_INDEX numlabel; + m3dl_t *label; /* annotation labels */ + M3D_INDEX numaction; + m3da_t *action; /* action animations */ + M3D_INDEX numinlined; + m3di_t *inlined; /* inlined assets */ + M3D_INDEX numextra; + m3dchunk_t **extra; /* unknown chunks, application / engine specific data probably */ + m3di_t preview; /* preview chunk */ +} m3d_t; + +/*** export parameters ***/ +#define M3D_EXP_INT8 0 +#define M3D_EXP_INT16 1 +#define M3D_EXP_FLOAT 2 +#define M3D_EXP_DOUBLE 3 + +#define M3D_EXP_NOCMAP (1<<0) +#define M3D_EXP_NOMATERIAL (1<<1) +#define M3D_EXP_NOFACE (1<<2) +#define M3D_EXP_NONORMAL (1<<3) +#define M3D_EXP_NOTXTCRD (1<<4) +#define M3D_EXP_FLIPTXTCRD (1<<5) +#define M3D_EXP_NORECALC (1<<6) +#define M3D_EXP_IDOSUCK (1<<7) +#define M3D_EXP_NOBONE (1<<8) +#define M3D_EXP_NOACTION (1<<9) +#define M3D_EXP_INLINE (1<<10) +#define M3D_EXP_EXTRA (1<<11) +#define M3D_EXP_NOZLIB (1<<14) +#define M3D_EXP_ASCII (1<<15) +#define M3D_EXP_NOVRTMAX (1<<16) + +/*** error codes ***/ +#define M3D_SUCCESS 0 +#define M3D_ERR_ALLOC -1 +#define M3D_ERR_BADFILE -2 +#define M3D_ERR_UNIMPL -65 +#define M3D_ERR_UNKPROP -66 +#define M3D_ERR_UNKMESH -67 +#define M3D_ERR_UNKIMG -68 +#define M3D_ERR_UNKFRAME -69 +#define M3D_ERR_UNKCMD -70 +#define M3D_ERR_UNKVOX -71 +#define M3D_ERR_TRUNC 
-72 +#define M3D_ERR_CMAP -73 +#define M3D_ERR_TMAP -74 +#define M3D_ERR_VRTS -75 +#define M3D_ERR_BONE -76 +#define M3D_ERR_MTRL -77 +#define M3D_ERR_SHPE -78 +#define M3D_ERR_VOXT -79 + +#define M3D_ERR_ISFATAL(x) ((x) < 0 && (x) > -65) + +/* callbacks */ +typedef unsigned char *(*m3dread_t)(char *filename, unsigned int *size); /* read file contents into buffer */ +typedef void (*m3dfree_t)(void *buffer); /* free file contents buffer */ +typedef int (*m3dtxsc_t)(const char *name, const void *script, uint32_t len, m3dtx_t *output); /* interpret texture script */ +typedef int (*m3dprsc_t)(const char *name, const void *script, uint32_t len, m3d_t *model); /* interpret surface script */ +#endif /* ifndef M3D_APIVERSION */ + +/*** C prototypes ***/ +/* import / export */ +m3d_t *m3d_load(unsigned char *data, m3dread_t readfilecb, m3dfree_t freecb, m3d_t *mtllib); +unsigned char *m3d_save(m3d_t *model, int quality, int flags, unsigned int *size); +void m3d_free(m3d_t *model); +/* generate animation pose skeleton */ +m3dtr_t *m3d_frame(m3d_t *model, M3D_INDEX actionid, M3D_INDEX frameid, m3dtr_t *skeleton); +m3db_t *m3d_pose(m3d_t *model, M3D_INDEX actionid, uint32_t msec); + +/* private prototypes used by both importer and exporter */ +char *_m3d_safestr(char *in, int morelines); + +/*** C implementation ***/ +#ifdef M3D_IMPLEMENTATION +#if !defined(M3D_NOIMPORTER) || defined(M3D_EXPORTER) +/* material property definitions */ +static m3dpd_t m3d_propertytypes[] = { + M3D_PROPERTYDEF(m3dpf_color, m3dp_Kd, "Kd"), /* diffuse color */ + M3D_PROPERTYDEF(m3dpf_color, m3dp_Ka, "Ka"), /* ambient color */ + M3D_PROPERTYDEF(m3dpf_color, m3dp_Ks, "Ks"), /* specular color */ + M3D_PROPERTYDEF(m3dpf_float, m3dp_Ns, "Ns"), /* specular exponent */ + M3D_PROPERTYDEF(m3dpf_color, m3dp_Ke, "Ke"), /* emissive (emitting light of this color) */ + M3D_PROPERTYDEF(m3dpf_color, m3dp_Tf, "Tf"), /* transmission color */ + M3D_PROPERTYDEF(m3dpf_float, m3dp_Km, "Km"), /* bump strength */ + M3D_PROPERTYDEF(m3dpf_float, m3dp_d, "d"), /* dissolve (transparency) */ + M3D_PROPERTYDEF(m3dpf_uint8, m3dp_il, "il"), /* illumination model (informational, ignored by PBR-shaders) */ + + M3D_PROPERTYDEF(m3dpf_float, m3dp_Pr, "Pr"), /* roughness */ + M3D_PROPERTYDEF(m3dpf_float, m3dp_Pm, "Pm"), /* metallic, also reflection */ + M3D_PROPERTYDEF(m3dpf_float, m3dp_Ps, "Ps"), /* sheen */ + M3D_PROPERTYDEF(m3dpf_float, m3dp_Ni, "Ni"), /* index of refraction (optical density) */ + M3D_PROPERTYDEF(m3dpf_float, m3dp_Nt, "Nt"), /* thickness of face in millimeter, for printing */ + + /* aliases, note that "map_*" aliases are handled automatically */ + M3D_PROPERTYDEF(m3dpf_map, m3dp_map_Km, "bump"), + M3D_PROPERTYDEF(m3dpf_map, m3dp_map_N, "map_N"),/* as normal map has no scalar version, it's counterpart is 'il' */ + M3D_PROPERTYDEF(m3dpf_map, m3dp_map_Pm, "refl") +}; +/* shape command definitions. 
if more commands start with the same string, the longer must come first */ +static m3dcd_t m3d_commandtypes[] = { + /* technical */ + M3D_CMDDEF(m3dc_use, "use", 1, m3dcp_mi_t, 0, 0, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_inc, "inc", 3, m3dcp_hi_t, m3dcp_vi_t, m3dcp_qi_t, m3dcp_vi_t, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_mesh, "mesh", 1, m3dcp_fi_t, m3dcp_fi_t, m3dcp_vi_t, m3dcp_qi_t, m3dcp_vi_t, 0, 0, 0), + /* approximations */ + M3D_CMDDEF(m3dc_div, "div", 1, m3dcp_vc_t, 0, 0, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_sub, "sub", 2, m3dcp_vc_t, m3dcp_vc_t, 0, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_len, "len", 1, m3dcp_vc_t, 0, 0, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_dist, "dist", 2, m3dcp_vc_t, m3dcp_vc_t, 0, 0, 0, 0, 0, 0), + /* modifiers */ + M3D_CMDDEF(m3dc_degu, "degu", 1, m3dcp_i1_t, 0, 0, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_deg, "deg", 2, m3dcp_i1_t, m3dcp_i1_t, 0, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_rangeu, "rangeu", 1, m3dcp_ti_t, 0, 0, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_range, "range", 2, m3dcp_ti_t, m3dcp_ti_t, 0, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_paru, "paru", 2, m3dcp_va_t, m3dcp_vc_t, 0, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_parv, "parv", 2, m3dcp_va_t, m3dcp_vc_t, 0, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_trim, "trim", 3, m3dcp_va_t, m3dcp_ti_t, m3dcp_i2_t, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_hole, "hole", 3, m3dcp_va_t, m3dcp_ti_t, m3dcp_i2_t, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_scrv, "scrv", 3, m3dcp_va_t, m3dcp_ti_t, m3dcp_i2_t, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_sp, "sp", 2, m3dcp_va_t, m3dcp_vi_t, 0, 0, 0, 0, 0, 0), + /* helper curves */ + M3D_CMDDEF(m3dc_bez1, "bez1", 2, m3dcp_va_t, m3dcp_vi_t, 0, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_bsp1, "bsp1", 2, m3dcp_va_t, m3dcp_vi_t, 0, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_bez2, "bez2", 2, m3dcp_va_t, m3dcp_vi_t, 0, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_bsp2, "bsp2", 2, m3dcp_va_t, m3dcp_vi_t, 0, 0, 0, 0, 0, 0), + /* surfaces */ + M3D_CMDDEF(m3dc_bezun, "bezun", 4, m3dcp_va_t, m3dcp_vi_t, m3dcp_ti_t, m3dcp_vi_t, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_bezu, "bezu", 3, m3dcp_va_t, m3dcp_vi_t, m3dcp_ti_t, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_bezn, "bezn", 3, m3dcp_va_t, m3dcp_vi_t, m3dcp_vi_t, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_bez, "bez", 2, m3dcp_va_t, m3dcp_vi_t, 0, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_nurbsun, "nurbsun", 4, m3dcp_va_t, m3dcp_vi_t, m3dcp_ti_t, m3dcp_vi_t, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_nurbsu, "nurbsu", 3, m3dcp_va_t, m3dcp_vi_t, m3dcp_ti_t, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_nurbsn, "nurbsn", 3, m3dcp_va_t, m3dcp_vi_t, m3dcp_vi_t, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_nurbs, "nurbs", 2, m3dcp_va_t, m3dcp_vi_t, 0, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_conn, "conn", 6, m3dcp_i2_t, m3dcp_ti_t, m3dcp_i2_t, m3dcp_i2_t, m3dcp_ti_t, m3dcp_i2_t, 0, 0), + /* geometrical */ + M3D_CMDDEF(m3dc_line, "line", 2, m3dcp_va_t, m3dcp_vi_t, 0, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_polygon, "polygon", 2, m3dcp_va_t, m3dcp_vi_t, 0, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_circle, "circle", 3, m3dcp_vi_t, m3dcp_qi_t, m3dcp_vc_t, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_cylinder,"cylinder",6, m3dcp_vi_t, m3dcp_qi_t, m3dcp_vc_t, m3dcp_vi_t, m3dcp_qi_t, m3dcp_vc_t, 0, 0), + M3D_CMDDEF(m3dc_shpere, "shpere", 2, m3dcp_vi_t, m3dcp_vc_t, 0, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_torus, "torus", 4, m3dcp_vi_t, m3dcp_qi_t, m3dcp_vc_t, m3dcp_vc_t, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_cone, "cone", 3, m3dcp_vi_t, m3dcp_vi_t, m3dcp_vi_t, 0, 0, 0, 0, 0), + M3D_CMDDEF(m3dc_cube, "cube", 3, m3dcp_vi_t, m3dcp_vi_t, m3dcp_vi_t, 0, 0, 0, 0, 0) +}; +#endif + +#include +#include + +/* we'll need this with M3D_NOTEXTURE */ +char 
*stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header); + +#ifndef M3D_NOTEXTURE +#if !defined(M3D_NOIMPORTER) && !defined(STBI_INCLUDE_STB_IMAGE_H) +/* PNG decompressor from + + stb_image - v2.23 - public domain image loader - http://nothings.org/stb_image.h +*/ +static const char *_m3dstbi__g_failure_reason; + +enum +{ + STBI_default = 0, + + STBI_grey = 1, + STBI_grey_alpha = 2, + STBI_rgb = 3, + STBI_rgb_alpha = 4 +}; + +enum +{ + STBI__SCAN_load=0, + STBI__SCAN_type, + STBI__SCAN_header +}; + +typedef unsigned short _m3dstbi_us; + +typedef uint16_t _m3dstbi__uint16; +typedef int16_t _m3dstbi__int16; +typedef uint32_t _m3dstbi__uint32; +typedef int32_t _m3dstbi__int32; + +typedef struct +{ + _m3dstbi__uint32 img_x, img_y; + int img_n, img_out_n; + + void *io_user_data; + + int read_from_callbacks; + int buflen; + unsigned char buffer_start[128]; + + unsigned char *img_buffer, *img_buffer_end; + unsigned char *img_buffer_original, *img_buffer_original_end; +} _m3dstbi__context; + +typedef struct +{ + int bits_per_channel; + int num_channels; + int channel_order; +} _m3dstbi__result_info; + +#define STBI_ASSERT(v) +#define STBI_NOTUSED(v) (void)sizeof(v) +#define STBI__BYTECAST(x) ((unsigned char) ((x) & 255)) +#define STBI_MALLOC(sz) M3D_MALLOC(sz) +#define STBI_REALLOC(p,newsz) M3D_REALLOC(p,newsz) +#define STBI_FREE(p) M3D_FREE(p) +#define STBI_REALLOC_SIZED(p,oldsz,newsz) STBI_REALLOC(p,newsz) + +_inline static unsigned char _m3dstbi__get8(_m3dstbi__context *s) +{ + if (s->img_buffer < s->img_buffer_end) + return *s->img_buffer++; + return 0; +} + +_inline static int _m3dstbi__at_eof(_m3dstbi__context *s) +{ + return s->img_buffer >= s->img_buffer_end; +} + +static void _m3dstbi__skip(_m3dstbi__context *s, int n) +{ + if (n < 0) { + s->img_buffer = s->img_buffer_end; + return; + } + s->img_buffer += n; +} + +static int _m3dstbi__getn(_m3dstbi__context *s, unsigned char *buffer, int n) +{ + if (s->img_buffer+n <= s->img_buffer_end) { + memcpy(buffer, s->img_buffer, n); + s->img_buffer += n; + return 1; + } else + return 0; +} + +static int _m3dstbi__get16be(_m3dstbi__context *s) +{ + int z = _m3dstbi__get8(s); + return (z << 8) + _m3dstbi__get8(s); +} + +static _m3dstbi__uint32 _m3dstbi__get32be(_m3dstbi__context *s) +{ + _m3dstbi__uint32 z = _m3dstbi__get16be(s); + return (z << 16) + _m3dstbi__get16be(s); +} + +#define _m3dstbi__err(x,y) _m3dstbi__errstr(y) +static int _m3dstbi__errstr(const char *str) +{ + _m3dstbi__g_failure_reason = str; + return 0; +} + +_inline static void *_m3dstbi__malloc(size_t size) +{ + return STBI_MALLOC(size); +} + +static int _m3dstbi__addsizes_valid(int a, int b) +{ + if (b < 0) return 0; + return a <= 2147483647 - b; +} + +static int _m3dstbi__mul2sizes_valid(int a, int b) +{ + if (a < 0 || b < 0) return 0; + if (b == 0) return 1; + return a <= 2147483647/b; +} + +static int _m3dstbi__mad2sizes_valid(int a, int b, int add) +{ + return _m3dstbi__mul2sizes_valid(a, b) && _m3dstbi__addsizes_valid(a*b, add); +} + +static int _m3dstbi__mad3sizes_valid(int a, int b, int c, int add) +{ + return _m3dstbi__mul2sizes_valid(a, b) && _m3dstbi__mul2sizes_valid(a*b, c) && + _m3dstbi__addsizes_valid(a*b*c, add); +} + +static void *_m3dstbi__malloc_mad2(int a, int b, int add) +{ + if (!_m3dstbi__mad2sizes_valid(a, b, add)) return NULL; + return _m3dstbi__malloc(a*b + add); +} + +static void *_m3dstbi__malloc_mad3(int a, int b, int c, int add) +{ + if (!_m3dstbi__mad3sizes_valid(a, b, c, add)) return 
NULL; + return _m3dstbi__malloc(a*b*c + add); +} + +static unsigned char _m3dstbi__compute_y(int r, int g, int b) +{ + return (unsigned char) (((r*77) + (g*150) + (29*b)) >> 8); +} + +static unsigned char *_m3dstbi__convert_format(unsigned char *data, int img_n, int req_comp, unsigned int x, unsigned int y) +{ + int i,j; + unsigned char *good; + + if (req_comp == img_n) return data; + STBI_ASSERT(req_comp >= 1 && req_comp <= 4); + + good = (unsigned char *) _m3dstbi__malloc_mad3(req_comp, x, y, 0); + if (good == NULL) { + STBI_FREE(data); + _m3dstbi__err("outofmem", "Out of memory"); + return NULL; + } + + for (j=0; j < (int) y; ++j) { + unsigned char *src = data + j * x * img_n ; + unsigned char *dest = good + j * x * req_comp; + + #define STBI__COMBO(a,b) ((a)*8+(b)) + #define STBI__CASE(a,b) case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b) + switch (STBI__COMBO(img_n, req_comp)) { + STBI__CASE(1,2) { dest[0]=src[0], dest[1]=255; } break; + STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0], dest[3]=255; } break; + STBI__CASE(2,1) { dest[0]=src[0]; } break; + STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0], dest[3]=src[1]; } break; + STBI__CASE(3,4) { dest[0]=src[0],dest[1]=src[1],dest[2]=src[2],dest[3]=255; } break; + STBI__CASE(3,1) { dest[0]=_m3dstbi__compute_y(src[0],src[1],src[2]); } break; + STBI__CASE(3,2) { dest[0]=_m3dstbi__compute_y(src[0],src[1],src[2]), dest[1] = 255; } break; + STBI__CASE(4,1) { dest[0]=_m3dstbi__compute_y(src[0],src[1],src[2]); } break; + STBI__CASE(4,2) { dest[0]=_m3dstbi__compute_y(src[0],src[1],src[2]), dest[1] = src[3]; } break; + STBI__CASE(4,3) { dest[0]=src[0],dest[1]=src[1],dest[2]=src[2]; } break; + default: STBI_ASSERT(0); + } + #undef STBI__CASE + } + + STBI_FREE(data); + return good; +} + +static _m3dstbi__uint16 _m3dstbi__compute_y_16(int r, int g, int b) +{ + return (_m3dstbi__uint16) (((r*77) + (g*150) + (29*b)) >> 8); +} + +static _m3dstbi__uint16 *_m3dstbi__convert_format16(_m3dstbi__uint16 *data, int img_n, int req_comp, unsigned int x, unsigned int y) +{ + int i,j; + _m3dstbi__uint16 *good; + + if (req_comp == img_n) return data; + STBI_ASSERT(req_comp >= 1 && req_comp <= 4); + + good = (_m3dstbi__uint16 *) _m3dstbi__malloc(req_comp * x * y * 2); + if (good == NULL) { + STBI_FREE(data); + _m3dstbi__err("outofmem", "Out of memory"); + return NULL; + } + + for (j=0; j < (int) y; ++j) { + _m3dstbi__uint16 *src = data + j * x * img_n ; + _m3dstbi__uint16 *dest = good + j * x * req_comp; + + #define STBI__COMBO(a,b) ((a)*8+(b)) + #define STBI__CASE(a,b) case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b) + switch (STBI__COMBO(img_n, req_comp)) { + STBI__CASE(1,2) { dest[0]=src[0], dest[1]=0xffff; } break; + STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0], dest[3]=0xffff; } break; + STBI__CASE(2,1) { dest[0]=src[0]; } break; + STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0], dest[3]=src[1]; } break; + STBI__CASE(3,4) { dest[0]=src[0],dest[1]=src[1],dest[2]=src[2],dest[3]=0xffff; } break; + STBI__CASE(3,1) { dest[0]=_m3dstbi__compute_y_16(src[0],src[1],src[2]); } break; + STBI__CASE(3,2) { dest[0]=_m3dstbi__compute_y_16(src[0],src[1],src[2]), dest[1] = 0xffff; } break; + STBI__CASE(4,1) { dest[0]=_m3dstbi__compute_y_16(src[0],src[1],src[2]); } break; + STBI__CASE(4,2) { 
dest[0]=_m3dstbi__compute_y_16(src[0],src[1],src[2]), dest[1] = src[3]; } break; + STBI__CASE(4,3) { dest[0]=src[0],dest[1]=src[1],dest[2]=src[2]; } break; + default: STBI_ASSERT(0); + } + #undef STBI__CASE + } + + STBI_FREE(data); + return good; +} + +#define STBI__ZFAST_BITS 9 +#define STBI__ZFAST_MASK ((1 << STBI__ZFAST_BITS) - 1) + +typedef struct +{ + _m3dstbi__uint16 fast[1 << STBI__ZFAST_BITS]; + _m3dstbi__uint16 firstcode[16]; + int maxcode[17]; + _m3dstbi__uint16 firstsymbol[16]; + unsigned char size[288]; + _m3dstbi__uint16 value[288]; +} _m3dstbi__zhuffman; + +_inline static int _m3dstbi__bitreverse16(int n) +{ + n = ((n & 0xAAAA) >> 1) | ((n & 0x5555) << 1); + n = ((n & 0xCCCC) >> 2) | ((n & 0x3333) << 2); + n = ((n & 0xF0F0) >> 4) | ((n & 0x0F0F) << 4); + n = ((n & 0xFF00) >> 8) | ((n & 0x00FF) << 8); + return n; +} + +_inline static int _m3dstbi__bit_reverse(int v, int bits) +{ + STBI_ASSERT(bits <= 16); + return _m3dstbi__bitreverse16(v) >> (16-bits); +} + +static int _m3dstbi__zbuild_huffman(_m3dstbi__zhuffman *z, unsigned char *sizelist, int num) +{ + int i,k=0; + int code, next_code[16], sizes[17]; + + memset(sizes, 0, sizeof(sizes)); + memset(z->fast, 0, sizeof(z->fast)); + for (i=0; i < num; ++i) + ++sizes[sizelist[i]]; + sizes[0] = 0; + for (i=1; i < 16; ++i) + if (sizes[i] > (1 << i)) + return _m3dstbi__err("bad sizes", "Corrupt PNG"); + code = 0; + for (i=1; i < 16; ++i) { + next_code[i] = code; + z->firstcode[i] = (_m3dstbi__uint16) code; + z->firstsymbol[i] = (_m3dstbi__uint16) k; + code = (code + sizes[i]); + if (sizes[i]) + if (code-1 >= (1 << i)) return _m3dstbi__err("bad codelengths","Corrupt PNG"); + z->maxcode[i] = code << (16-i); + code <<= 1; + k += sizes[i]; + } + z->maxcode[16] = 0x10000; + for (i=0; i < num; ++i) { + int s = sizelist[i]; + if (s) { + int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s]; + _m3dstbi__uint16 fastv = (_m3dstbi__uint16) ((s << 9) | i); + z->size [c] = (unsigned char ) s; + z->value[c] = (_m3dstbi__uint16) i; + if (s <= STBI__ZFAST_BITS) { + int j = _m3dstbi__bit_reverse(next_code[s],s); + while (j < (1 << STBI__ZFAST_BITS)) { + z->fast[j] = fastv; + j += (1 << s); + } + } + ++next_code[s]; + } + } + return 1; +} + +typedef struct +{ + unsigned char *zbuffer, *zbuffer_end; + int num_bits; + _m3dstbi__uint32 code_buffer; + + char *zout; + char *zout_start; + char *zout_end; + int z_expandable; + + _m3dstbi__zhuffman z_length, z_distance; +} _m3dstbi__zbuf; + +_inline static unsigned char _m3dstbi__zget8(_m3dstbi__zbuf *z) +{ + if (z->zbuffer >= z->zbuffer_end) return 0; + return *z->zbuffer++; +} + +static void _m3dstbi__fill_bits(_m3dstbi__zbuf *z) +{ + do { + STBI_ASSERT(z->code_buffer < (1U << z->num_bits)); + z->code_buffer |= (unsigned int) _m3dstbi__zget8(z) << z->num_bits; + z->num_bits += 8; + } while (z->num_bits <= 24); +} + +_inline static unsigned int _m3dstbi__zreceive(_m3dstbi__zbuf *z, int n) +{ + unsigned int k; + if (z->num_bits < n) _m3dstbi__fill_bits(z); + k = z->code_buffer & ((1 << n) - 1); + z->code_buffer >>= n; + z->num_bits -= n; + return k; +} + +static int _m3dstbi__zhuffman_decode_slowpath(_m3dstbi__zbuf *a, _m3dstbi__zhuffman *z) +{ + int b,s,k; + k = _m3dstbi__bit_reverse(a->code_buffer, 16); + for (s=STBI__ZFAST_BITS+1; ; ++s) + if (k < z->maxcode[s]) + break; + if (s == 16) return -1; + b = (k >> (16-s)) - z->firstcode[s] + z->firstsymbol[s]; + STBI_ASSERT(z->size[b] == s); + a->code_buffer >>= s; + a->num_bits -= s; + return z->value[b]; +} + +_inline static int 
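+/* Huffman decode: consult the 9-bit fast lookup table first (code length packed in
+   the high bits, symbol in the low 9); longer codes fall back to the bit-reversed
+   slow path above. */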
_m3dstbi__zhuffman_decode(_m3dstbi__zbuf *a, _m3dstbi__zhuffman *z) +{ + int b,s; + if (a->num_bits < 16) _m3dstbi__fill_bits(a); + b = z->fast[a->code_buffer & STBI__ZFAST_MASK]; + if (b) { + s = b >> 9; + a->code_buffer >>= s; + a->num_bits -= s; + return b & 511; + } + return _m3dstbi__zhuffman_decode_slowpath(a, z); +} + +static int _m3dstbi__zexpand(_m3dstbi__zbuf *z, char *zout, int n) +{ + char *q; + int cur, limit, old_limit; + z->zout = zout; + if (!z->z_expandable) return _m3dstbi__err("output buffer limit","Corrupt PNG"); + cur = (int) (z->zout - z->zout_start); + limit = old_limit = (int) (z->zout_end - z->zout_start); + while (cur + n > limit) + limit *= 2; + q = (char *) STBI_REALLOC_SIZED(z->zout_start, old_limit, limit); + STBI_NOTUSED(old_limit); + if (q == NULL) return _m3dstbi__err("outofmem", "Out of memory"); + z->zout_start = q; + z->zout = q + cur; + z->zout_end = q + limit; + return 1; +} + +static int _m3dstbi__zlength_base[31] = { + 3,4,5,6,7,8,9,10,11,13, + 15,17,19,23,27,31,35,43,51,59, + 67,83,99,115,131,163,195,227,258,0,0 }; + +static int _m3dstbi__zlength_extra[31]= +{ 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 }; + +static int _m3dstbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193, +257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0}; + +static int _m3dstbi__zdist_extra[32] = +{ 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13}; + +static int _m3dstbi__parse_huffman_block(_m3dstbi__zbuf *a) +{ + char *zout = a->zout; + for(;;) { + int z = _m3dstbi__zhuffman_decode(a, &a->z_length); + if (z < 256) { + if (z < 0) return _m3dstbi__err("bad huffman code","Corrupt PNG"); + if (zout >= a->zout_end) { + if (!_m3dstbi__zexpand(a, zout, 1)) return 0; + zout = a->zout; + } + *zout++ = (char) z; + } else { + unsigned char *p; + int len,dist; + if (z == 256) { + a->zout = zout; + return 1; + } + z -= 257; + len = _m3dstbi__zlength_base[z]; + if (_m3dstbi__zlength_extra[z]) len += _m3dstbi__zreceive(a, _m3dstbi__zlength_extra[z]); + z = _m3dstbi__zhuffman_decode(a, &a->z_distance); + if (z < 0) return _m3dstbi__err("bad huffman code","Corrupt PNG"); + dist = _m3dstbi__zdist_base[z]; + if (_m3dstbi__zdist_extra[z]) dist += _m3dstbi__zreceive(a, _m3dstbi__zdist_extra[z]); + if (zout - a->zout_start < dist) return _m3dstbi__err("bad dist","Corrupt PNG"); + if (zout + len > a->zout_end) { + if (!_m3dstbi__zexpand(a, zout, len)) return 0; + zout = a->zout; + } + p = (unsigned char *) (zout - dist); + if (dist == 1) { + unsigned char v = *p; + if (len) { do *zout++ = v; while (--len); } + } else { + if (len) { do *zout++ = *p++; while (--len); } + } + } + } +} + +static int _m3dstbi__compute_huffman_codes(_m3dstbi__zbuf *a) +{ + static unsigned char length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 }; + _m3dstbi__zhuffman z_codelength; + unsigned char lencodes[286+32+137]; + unsigned char codelength_sizes[19]; + int i,n; + + int hlit = _m3dstbi__zreceive(a,5) + 257; + int hdist = _m3dstbi__zreceive(a,5) + 1; + int hclen = _m3dstbi__zreceive(a,4) + 4; + int ntot = hlit + hdist; + + memset(codelength_sizes, 0, sizeof(codelength_sizes)); + for (i=0; i < hclen; ++i) { + int s = _m3dstbi__zreceive(a,3); + codelength_sizes[length_dezigzag[i]] = (unsigned char) s; + } + if (!_m3dstbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0; + + n = 0; + while (n < ntot) { + int c = _m3dstbi__zhuffman_decode(a, &z_codelength); + if (c < 0 || c >= 19) return _m3dstbi__err("bad 
codelengths", "Corrupt PNG"); + if (c < 16) + lencodes[n++] = (unsigned char) c; + else { + unsigned char fill = 0; + if (c == 16) { + c = _m3dstbi__zreceive(a,2)+3; + if (n == 0) return _m3dstbi__err("bad codelengths", "Corrupt PNG"); + fill = lencodes[n-1]; + } else if (c == 17) + c = _m3dstbi__zreceive(a,3)+3; + else { + STBI_ASSERT(c == 18); + c = _m3dstbi__zreceive(a,7)+11; + } + if (ntot - n < c) return _m3dstbi__err("bad codelengths", "Corrupt PNG"); + memset(lencodes+n, fill, c); + n += c; + } + } + if (n != ntot) return _m3dstbi__err("bad codelengths","Corrupt PNG"); + if (!_m3dstbi__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0; + if (!_m3dstbi__zbuild_huffman(&a->z_distance, lencodes+hlit, hdist)) return 0; + return 1; +} + +_inline static int _m3dstbi__parse_uncompressed_block(_m3dstbi__zbuf *a) +{ + unsigned char header[4]; + int len,nlen,k; + if (a->num_bits & 7) + _m3dstbi__zreceive(a, a->num_bits & 7); + k = 0; + while (a->num_bits > 0) { + header[k++] = (unsigned char) (a->code_buffer & 255); + a->code_buffer >>= 8; + a->num_bits -= 8; + } + STBI_ASSERT(a->num_bits == 0); + while (k < 4) + header[k++] = _m3dstbi__zget8(a); + len = header[1] * 256 + header[0]; + nlen = header[3] * 256 + header[2]; + if (nlen != (len ^ 0xffff)) return _m3dstbi__err("zlib corrupt","Corrupt PNG"); + if (a->zbuffer + len > a->zbuffer_end) return _m3dstbi__err("read past buffer","Corrupt PNG"); + if (a->zout + len > a->zout_end) + if (!_m3dstbi__zexpand(a, a->zout, len)) return 0; + memcpy(a->zout, a->zbuffer, len); + a->zbuffer += len; + a->zout += len; + return 1; +} + +static int _m3dstbi__parse_zlib_header(_m3dstbi__zbuf *a) +{ + int cmf = _m3dstbi__zget8(a); + int cm = cmf & 15; + /* int cinfo = cmf >> 4; */ + int flg = _m3dstbi__zget8(a); + if ((cmf*256+flg) % 31 != 0) return _m3dstbi__err("bad zlib header","Corrupt PNG"); + if (flg & 32) return _m3dstbi__err("no preset dict","Corrupt PNG"); + if (cm != 8) return _m3dstbi__err("bad compression","Corrupt PNG"); + return 1; +} + +static unsigned char _m3dstbi__zdefault_length[288], _m3dstbi__zdefault_distance[32]; +static void _m3dstbi__init_zdefaults(void) +{ + int i; + for (i=0; i <= 143; ++i) _m3dstbi__zdefault_length[i] = 8; + for ( ; i <= 255; ++i) _m3dstbi__zdefault_length[i] = 9; + for ( ; i <= 279; ++i) _m3dstbi__zdefault_length[i] = 7; + for ( ; i <= 287; ++i) _m3dstbi__zdefault_length[i] = 8; + + for (i=0; i <= 31; ++i) _m3dstbi__zdefault_distance[i] = 5; +} + +static int _m3dstbi__parse_zlib(_m3dstbi__zbuf *a, int parse_header) +{ + int final, type; + if (parse_header) + if (!_m3dstbi__parse_zlib_header(a)) return 0; + a->num_bits = 0; + a->code_buffer = 0; + do { + final = _m3dstbi__zreceive(a,1); + type = _m3dstbi__zreceive(a,2); + if (type == 0) { + if (!_m3dstbi__parse_uncompressed_block(a)) return 0; + } else if (type == 3) { + return 0; + } else { + if (type == 1) { + if (!_m3dstbi__zbuild_huffman(&a->z_length , _m3dstbi__zdefault_length , 288)) return 0; + if (!_m3dstbi__zbuild_huffman(&a->z_distance, _m3dstbi__zdefault_distance, 32)) return 0; + } else { + if (!_m3dstbi__compute_huffman_codes(a)) return 0; + } + if (!_m3dstbi__parse_huffman_block(a)) return 0; + } + } while (!final); + return 1; +} + +static int _m3dstbi__do_zlib(_m3dstbi__zbuf *a, char *obuf, int olen, int exp, int parse_header) +{ + a->zout_start = obuf; + a->zout = obuf; + a->zout_end = obuf + olen; + a->z_expandable = exp; + _m3dstbi__init_zdefaults(); + return _m3dstbi__parse_zlib(a, parse_header); +} + +char 
*_m3dstbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header) +{ + _m3dstbi__zbuf a; + char *p = (char *) _m3dstbi__malloc(initial_size); + if (p == NULL) return NULL; + a.zbuffer = (unsigned char *) buffer; + a.zbuffer_end = (unsigned char *) buffer + len; + if (_m3dstbi__do_zlib(&a, p, initial_size, 1, parse_header)) { + if (outlen) *outlen = (int) (a.zout - a.zout_start); + return a.zout_start; + } else { + STBI_FREE(a.zout_start); + return NULL; + } +} + +typedef struct +{ + _m3dstbi__uint32 length; + _m3dstbi__uint32 type; +} _m3dstbi__pngchunk; + +static _m3dstbi__pngchunk _m3dstbi__get_chunk_header(_m3dstbi__context *s) +{ + _m3dstbi__pngchunk c; + c.length = _m3dstbi__get32be(s); + c.type = _m3dstbi__get32be(s); + return c; +} + +_inline static int _m3dstbi__check_png_header(_m3dstbi__context *s) +{ + static unsigned char png_sig[8] = { 137,80,78,71,13,10,26,10 }; + int i; + for (i=0; i < 8; ++i) + if (_m3dstbi__get8(s) != png_sig[i]) return _m3dstbi__err("bad png sig","Not a PNG"); + return 1; +} + +typedef struct +{ + _m3dstbi__context *s; + unsigned char *idata, *expanded, *out; + int depth; +} _m3dstbi__png; + + +enum { + STBI__F_none=0, + STBI__F_sub=1, + STBI__F_up=2, + STBI__F_avg=3, + STBI__F_paeth=4, + STBI__F_avg_first, + STBI__F_paeth_first +}; + +static unsigned char first_row_filter[5] = +{ + STBI__F_none, + STBI__F_sub, + STBI__F_none, + STBI__F_avg_first, + STBI__F_paeth_first +}; + +static int _m3dstbi__paeth(int a, int b, int c) +{ + int p = a + b - c; + int pa = abs(p-a); + int pb = abs(p-b); + int pc = abs(p-c); + if (pa <= pb && pa <= pc) return a; + if (pb <= pc) return b; + return c; +} + +static unsigned char _m3dstbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 }; + +static int _m3dstbi__create_png_image_raw(_m3dstbi__png *a, unsigned char *raw, _m3dstbi__uint32 raw_len, int out_n, _m3dstbi__uint32 x, _m3dstbi__uint32 y, int depth, int color) +{ + int bytes = (depth == 16? 
2 : 1); + _m3dstbi__context *s = a->s; + _m3dstbi__uint32 i,j,stride = x*out_n*bytes; + _m3dstbi__uint32 img_len, img_width_bytes; + int k; + int img_n = s->img_n; + + int output_bytes = out_n*bytes; + int filter_bytes = img_n*bytes; + int width = x; + + STBI_ASSERT(out_n == s->img_n || out_n == s->img_n+1); + a->out = (unsigned char *) _m3dstbi__malloc_mad3(x, y, output_bytes, 0); + if (!a->out) return _m3dstbi__err("outofmem", "Out of memory"); + + if (!_m3dstbi__mad3sizes_valid(img_n, x, depth, 7)) return _m3dstbi__err("too large", "Corrupt PNG"); + img_width_bytes = (((img_n * x * depth) + 7) >> 3); + img_len = (img_width_bytes + 1) * y; + if (s->img_x == x && s->img_y == y) { + if (raw_len != img_len) return _m3dstbi__err("not enough pixels","Corrupt PNG"); + } else { + if (raw_len < img_len) return _m3dstbi__err("not enough pixels","Corrupt PNG"); + } + + for (j=0; j < y; ++j) { + unsigned char *cur = a->out + stride*j; + unsigned char *prior = cur - stride; + int filter = *raw++; + + if (filter > 4) + return _m3dstbi__err("invalid filter","Corrupt PNG"); + + if (depth < 8) { + STBI_ASSERT(img_width_bytes <= x); + cur += x*out_n - img_width_bytes; + filter_bytes = 1; + width = img_width_bytes; + } + prior = cur - stride; + + if (j == 0) filter = first_row_filter[filter]; + + for (k=0; k < filter_bytes; ++k) { + switch (filter) { + case STBI__F_none : cur[k] = raw[k]; break; + case STBI__F_sub : cur[k] = raw[k]; break; + case STBI__F_up : cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break; + case STBI__F_avg : cur[k] = STBI__BYTECAST(raw[k] + (prior[k]>>1)); break; + case STBI__F_paeth : cur[k] = STBI__BYTECAST(raw[k] + _m3dstbi__paeth(0,prior[k],0)); break; + case STBI__F_avg_first : cur[k] = raw[k]; break; + case STBI__F_paeth_first: cur[k] = raw[k]; break; + } + } + + if (depth == 8) { + if (img_n != out_n) + cur[img_n] = 255; + raw += img_n; + cur += out_n; + prior += out_n; + } else if (depth == 16) { + if (img_n != out_n) { + cur[filter_bytes] = 255; + cur[filter_bytes+1] = 255; + } + raw += filter_bytes; + cur += output_bytes; + prior += output_bytes; + } else { + raw += 1; + cur += 1; + prior += 1; + } + + if (depth < 8 || img_n == out_n) { + int nk = (width - 1)*filter_bytes; + #define STBI__CASE(f) \ + case f: \ + for (k=0; k < nk; ++k) + switch (filter) { + case STBI__F_none: memcpy(cur, raw, nk); break; + STBI__CASE(STBI__F_sub) { cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); } break; + STBI__CASE(STBI__F_up) { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break; + STBI__CASE(STBI__F_avg) { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); } break; + STBI__CASE(STBI__F_paeth) { cur[k] = STBI__BYTECAST(raw[k] + _m3dstbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); } break; + STBI__CASE(STBI__F_avg_first) { cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); } break; + STBI__CASE(STBI__F_paeth_first) { cur[k] = STBI__BYTECAST(raw[k] + _m3dstbi__paeth(cur[k-filter_bytes],0,0)); } break; + } + #undef STBI__CASE + raw += nk; + } else { + STBI_ASSERT(img_n+1 == out_n); + #define STBI__CASE(f) \ + case f: \ + for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \ + for (k=0; k < filter_bytes; ++k) + switch (filter) { + STBI__CASE(STBI__F_none) { cur[k] = raw[k]; } break; + STBI__CASE(STBI__F_sub) { cur[k] = STBI__BYTECAST(raw[k] + cur[k- output_bytes]); } break; + STBI__CASE(STBI__F_up) { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break; + STBI__CASE(STBI__F_avg) { 
cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k- output_bytes])>>1)); } break; + STBI__CASE(STBI__F_paeth) { cur[k] = STBI__BYTECAST(raw[k] + _m3dstbi__paeth(cur[k- output_bytes],prior[k],prior[k- output_bytes])); } break; + STBI__CASE(STBI__F_avg_first) { cur[k] = STBI__BYTECAST(raw[k] + (cur[k- output_bytes] >> 1)); } break; + STBI__CASE(STBI__F_paeth_first) { cur[k] = STBI__BYTECAST(raw[k] + _m3dstbi__paeth(cur[k- output_bytes],0,0)); } break; + } + #undef STBI__CASE + + if (depth == 16) { + cur = a->out + stride*j; + for (i=0; i < x; ++i,cur+=output_bytes) { + cur[filter_bytes+1] = 255; + } + } + } + } + + if (depth < 8) { + for (j=0; j < y; ++j) { + unsigned char *cur = a->out + stride*j; + unsigned char *in = a->out + stride*j + x*out_n - img_width_bytes; + unsigned char scale = (color == 0) ? _m3dstbi__depth_scale_table[depth] : 1; + + if (depth == 4) { + for (k=x*img_n; k >= 2; k-=2, ++in) { + *cur++ = scale * ((*in >> 4) ); + *cur++ = scale * ((*in ) & 0x0f); + } + if (k > 0) *cur++ = scale * ((*in >> 4) ); + } else if (depth == 2) { + for (k=x*img_n; k >= 4; k-=4, ++in) { + *cur++ = scale * ((*in >> 6) ); + *cur++ = scale * ((*in >> 4) & 0x03); + *cur++ = scale * ((*in >> 2) & 0x03); + *cur++ = scale * ((*in ) & 0x03); + } + if (k > 0) *cur++ = scale * ((*in >> 6) ); + if (k > 1) *cur++ = scale * ((*in >> 4) & 0x03); + if (k > 2) *cur++ = scale * ((*in >> 2) & 0x03); + } else if (depth == 1) { + for (k=x*img_n; k >= 8; k-=8, ++in) { + *cur++ = scale * ((*in >> 7) ); + *cur++ = scale * ((*in >> 6) & 0x01); + *cur++ = scale * ((*in >> 5) & 0x01); + *cur++ = scale * ((*in >> 4) & 0x01); + *cur++ = scale * ((*in >> 3) & 0x01); + *cur++ = scale * ((*in >> 2) & 0x01); + *cur++ = scale * ((*in >> 1) & 0x01); + *cur++ = scale * ((*in ) & 0x01); + } + if (k > 0) *cur++ = scale * ((*in >> 7) ); + if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01); + if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01); + if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01); + if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01); + if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01); + if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01); + } + if (img_n != out_n) { + int q; + cur = a->out + stride*j; + if (img_n == 1) { + for (q=x-1; q >= 0; --q) { + cur[q*2+1] = 255; + cur[q*2+0] = cur[q]; + } + } else { + STBI_ASSERT(img_n == 3); + for (q=x-1; q >= 0; --q) { + cur[q*4+3] = 255; + cur[q*4+2] = cur[q*3+2]; + cur[q*4+1] = cur[q*3+1]; + cur[q*4+0] = cur[q*3+0]; + } + } + } + } + } else if (depth == 16) { + unsigned char *cur = a->out; + _m3dstbi__uint16 *cur16 = (_m3dstbi__uint16*)cur; + + for(i=0; i < x*y*out_n; ++i,cur16++,cur+=2) { + *cur16 = (cur[0] << 8) | cur[1]; + } + } + + return 1; +} + +static int _m3dstbi__create_png_image(_m3dstbi__png *a, unsigned char *image_data, _m3dstbi__uint32 image_data_len, int out_n, int depth, int color, int interlaced) +{ + int bytes = (depth == 16 ? 
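+/* Interlaced (Adam7) images are decoded below as seven reduced passes, each copied
+   into its final position using the xorig/yorig/xspc/yspc pass grids. */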
2 : 1); + int out_bytes = out_n * bytes; + unsigned char *final; + int p; + if (!interlaced) + return _m3dstbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color); + + final = (unsigned char *) _m3dstbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0); + for (p=0; p < 7; ++p) { + int xorig[] = { 0,4,0,2,0,1,0 }; + int yorig[] = { 0,0,4,0,2,0,1 }; + int xspc[] = { 8,8,4,4,2,2,1 }; + int yspc[] = { 8,8,8,4,4,2,2 }; + int i,j,x,y; + x = (a->s->img_x - xorig[p] + xspc[p]-1) / xspc[p]; + y = (a->s->img_y - yorig[p] + yspc[p]-1) / yspc[p]; + if (x && y) { + _m3dstbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y; + if (!_m3dstbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) { + STBI_FREE(final); + return 0; + } + for (j=0; j < y; ++j) { + for (i=0; i < x; ++i) { + int out_y = j*yspc[p]+yorig[p]; + int out_x = i*xspc[p]+xorig[p]; + memcpy(final + out_y*a->s->img_x*out_bytes + out_x*out_bytes, + a->out + (j*x+i)*out_bytes, out_bytes); + } + } + STBI_FREE(a->out); + image_data += img_len; + image_data_len -= img_len; + } + } + a->out = final; + + return 1; +} + +static int _m3dstbi__compute_transparency(_m3dstbi__png *z, unsigned char tc[3], int out_n) +{ + _m3dstbi__context *s = z->s; + _m3dstbi__uint32 i, pixel_count = s->img_x * s->img_y; + unsigned char *p = z->out; + + STBI_ASSERT(out_n == 2 || out_n == 4); + + if (out_n == 2) { + for (i=0; i < pixel_count; ++i) { + p[1] = (p[0] == tc[0] ? 0 : 255); + p += 2; + } + } else { + for (i=0; i < pixel_count; ++i) { + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + p[3] = 0; + p += 4; + } + } + return 1; +} + +static int _m3dstbi__compute_transparency16(_m3dstbi__png *z, _m3dstbi__uint16 tc[3], int out_n) +{ + _m3dstbi__context *s = z->s; + _m3dstbi__uint32 i, pixel_count = s->img_x * s->img_y; + _m3dstbi__uint16 *p = (_m3dstbi__uint16*) z->out; + + STBI_ASSERT(out_n == 2 || out_n == 4); + + if (out_n == 2) { + for (i = 0; i < pixel_count; ++i) { + p[1] = (p[0] == tc[0] ? 
0 : 65535); + p += 2; + } + } else { + for (i = 0; i < pixel_count; ++i) { + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + p[3] = 0; + p += 4; + } + } + return 1; +} + +static int _m3dstbi__expand_png_palette(_m3dstbi__png *a, unsigned char *palette, int len, int pal_img_n) +{ + _m3dstbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y; + unsigned char *p, *temp_out, *orig = a->out; + + p = (unsigned char *) _m3dstbi__malloc_mad2(pixel_count, pal_img_n, 0); + if (p == NULL) return _m3dstbi__err("outofmem", "Out of memory"); + + temp_out = p; + + if (pal_img_n == 3) { + for (i=0; i < pixel_count; ++i) { + int n = orig[i]*4; + p[0] = palette[n ]; + p[1] = palette[n+1]; + p[2] = palette[n+2]; + p += 3; + } + } else { + for (i=0; i < pixel_count; ++i) { + int n = orig[i]*4; + p[0] = palette[n ]; + p[1] = palette[n+1]; + p[2] = palette[n+2]; + p[3] = palette[n+3]; + p += 4; + } + } + STBI_FREE(a->out); + a->out = temp_out; + + STBI_NOTUSED(len); + + return 1; +} + +#define STBI__PNG_TYPE(a,b,c,d) (((unsigned) (a) << 24) + ((unsigned) (b) << 16) + ((unsigned) (c) << 8) + (unsigned) (d)) + +static int _m3dstbi__parse_png_file(_m3dstbi__png *z, int scan, int req_comp) +{ + unsigned char palette[1024], pal_img_n=0; + unsigned char has_trans=0, tc[3]; + _m3dstbi__uint16 tc16[3]; + _m3dstbi__uint32 ioff=0, idata_limit=0, i, pal_len=0; + int first=1,k,interlace=0, color=0; + _m3dstbi__context *s = z->s; + + z->expanded = NULL; + z->idata = NULL; + z->out = NULL; + + if (!_m3dstbi__check_png_header(s)) return 0; + + if (scan == STBI__SCAN_type) return 1; + + for (;;) { + _m3dstbi__pngchunk c = _m3dstbi__get_chunk_header(s); + switch (c.type) { + case STBI__PNG_TYPE('C','g','B','I'): + _m3dstbi__skip(s, c.length); + break; + case STBI__PNG_TYPE('I','H','D','R'): { + int comp,filter; + if (!first) return _m3dstbi__err("multiple IHDR","Corrupt PNG"); + first = 0; + if (c.length != 13) return _m3dstbi__err("bad IHDR len","Corrupt PNG"); + s->img_x = _m3dstbi__get32be(s); if (s->img_x > (1 << 24)) return _m3dstbi__err("too large","Very large image (corrupt?)"); + s->img_y = _m3dstbi__get32be(s); if (s->img_y > (1 << 24)) return _m3dstbi__err("too large","Very large image (corrupt?)"); + z->depth = _m3dstbi__get8(s); if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16) return _m3dstbi__err("1/2/4/8/16-bit only","PNG not supported: 1/2/4/8/16-bit only"); + color = _m3dstbi__get8(s); if (color > 6) return _m3dstbi__err("bad ctype","Corrupt PNG"); + if (color == 3 && z->depth == 16) return _m3dstbi__err("bad ctype","Corrupt PNG"); + if (color == 3) pal_img_n = 3; else if (color & 1) return _m3dstbi__err("bad ctype","Corrupt PNG"); + comp = _m3dstbi__get8(s); if (comp) return _m3dstbi__err("bad comp method","Corrupt PNG"); + filter= _m3dstbi__get8(s); if (filter) return _m3dstbi__err("bad filter method","Corrupt PNG"); + interlace = _m3dstbi__get8(s); if (interlace>1) return _m3dstbi__err("bad interlace method","Corrupt PNG"); + if (!s->img_x || !s->img_y) return _m3dstbi__err("0-pixel image","Corrupt PNG"); + if (!pal_img_n) { + s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 
1 : 0); + if ((1 << 30) / s->img_x / s->img_n < s->img_y) return _m3dstbi__err("too large", "Image too large to decode"); + if (scan == STBI__SCAN_header) return 1; + } else { + s->img_n = 1; + if ((1 << 30) / s->img_x / 4 < s->img_y) return _m3dstbi__err("too large","Corrupt PNG"); + } + break; + } + + case STBI__PNG_TYPE('P','L','T','E'): { + if (first) return _m3dstbi__err("first not IHDR", "Corrupt PNG"); + if (c.length > 256*3) return _m3dstbi__err("invalid PLTE","Corrupt PNG"); + pal_len = c.length / 3; + if (pal_len * 3 != c.length) return _m3dstbi__err("invalid PLTE","Corrupt PNG"); + for (i=0; i < pal_len; ++i) { + palette[i*4+0] = _m3dstbi__get8(s); + palette[i*4+1] = _m3dstbi__get8(s); + palette[i*4+2] = _m3dstbi__get8(s); + palette[i*4+3] = 255; + } + break; + } + + case STBI__PNG_TYPE('t','R','N','S'): { + if (first) return _m3dstbi__err("first not IHDR", "Corrupt PNG"); + if (z->idata) return _m3dstbi__err("tRNS after IDAT","Corrupt PNG"); + if (pal_img_n) { + if (scan == STBI__SCAN_header) { s->img_n = 4; return 1; } + if (pal_len == 0) return _m3dstbi__err("tRNS before PLTE","Corrupt PNG"); + if (c.length > pal_len) return _m3dstbi__err("bad tRNS len","Corrupt PNG"); + pal_img_n = 4; + for (i=0; i < c.length; ++i) + palette[i*4+3] = _m3dstbi__get8(s); + } else { + if (!(s->img_n & 1)) return _m3dstbi__err("tRNS with alpha","Corrupt PNG"); + if (c.length != (_m3dstbi__uint32) s->img_n*2) return _m3dstbi__err("bad tRNS len","Corrupt PNG"); + has_trans = 1; + if (z->depth == 16) { + for (k = 0; k < s->img_n; ++k) tc16[k] = (_m3dstbi__uint16)_m3dstbi__get16be(s); + } else { + for (k = 0; k < s->img_n; ++k) tc[k] = (unsigned char)(_m3dstbi__get16be(s) & 255) * _m3dstbi__depth_scale_table[z->depth]; + } + } + break; + } + + case STBI__PNG_TYPE('I','D','A','T'): { + if (first) return _m3dstbi__err("first not IHDR", "Corrupt PNG"); + if (pal_img_n && !pal_len) return _m3dstbi__err("no PLTE","Corrupt PNG"); + if (scan == STBI__SCAN_header) { s->img_n = pal_img_n; return 1; } + if ((int)(ioff + c.length) < (int)ioff) return 0; + if (ioff + c.length > idata_limit) { + _m3dstbi__uint32 idata_limit_old = idata_limit; + unsigned char *p; + if (idata_limit == 0) idata_limit = c.length > 4096 ? 
c.length : 4096; + while (ioff + c.length > idata_limit) + idata_limit *= 2; + STBI_NOTUSED(idata_limit_old); + p = (unsigned char *) STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); if (p == NULL) return _m3dstbi__err("outofmem", "Out of memory"); + z->idata = p; + } + if (!_m3dstbi__getn(s, z->idata+ioff,c.length)) return _m3dstbi__err("outofdata","Corrupt PNG"); + ioff += c.length; + break; + } + + case STBI__PNG_TYPE('I','E','N','D'): { + _m3dstbi__uint32 raw_len, bpl; + if (first) return _m3dstbi__err("first not IHDR", "Corrupt PNG"); + if (scan != STBI__SCAN_load) return 1; + if (z->idata == NULL) return _m3dstbi__err("no IDAT","Corrupt PNG"); + bpl = (s->img_x * z->depth + 7) / 8; + raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */; + z->expanded = (unsigned char *) _m3dstbi_zlib_decode_malloc_guesssize_headerflag((char *) z->idata, ioff, raw_len, (int *) &raw_len, 1); + if (z->expanded == NULL) return 0; + STBI_FREE(z->idata); z->idata = NULL; + if ((req_comp == s->img_n+1 && req_comp != 3 && !pal_img_n) || has_trans) + s->img_out_n = s->img_n+1; + else + s->img_out_n = s->img_n; + if (!_m3dstbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) return 0; + if (has_trans) { + if (z->depth == 16) { + if (!_m3dstbi__compute_transparency16(z, tc16, s->img_out_n)) return 0; + } else { + if (!_m3dstbi__compute_transparency(z, tc, s->img_out_n)) return 0; + } + } + if (pal_img_n) { + s->img_n = pal_img_n; + s->img_out_n = pal_img_n; + if (req_comp >= 3) s->img_out_n = req_comp; + if (!_m3dstbi__expand_png_palette(z, palette, pal_len, s->img_out_n)) + return 0; + } else if (has_trans) { + ++s->img_n; + } + STBI_FREE(z->expanded); z->expanded = NULL; + return 1; + } + + default: + if (first) return _m3dstbi__err("first not IHDR", "Corrupt PNG"); + if ((c.type & (1 << 29)) == 0) { + return _m3dstbi__err("invalid_chunk", "PNG not supported: unknown PNG chunk type"); + } + _m3dstbi__skip(s, c.length); + break; + } + _m3dstbi__get32be(s); + } +} + +static void *_m3dstbi__do_png(_m3dstbi__png *p, int *x, int *y, int *n, int req_comp, _m3dstbi__result_info *ri) +{ + void *result=NULL; + if (req_comp < 0 || req_comp > 4) { _m3dstbi__err("bad req_comp", "Internal error"); return NULL; } + if (_m3dstbi__parse_png_file(p, STBI__SCAN_load, req_comp)) { + if (p->depth < 8) + ri->bits_per_channel = 8; + else + ri->bits_per_channel = p->depth; + result = p->out; + p->out = NULL; + if (req_comp && req_comp != p->s->img_out_n) { + if (ri->bits_per_channel == 8) + result = _m3dstbi__convert_format((unsigned char *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); + else + result = _m3dstbi__convert_format16((_m3dstbi__uint16 *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); + p->s->img_out_n = req_comp; + if (result == NULL) return result; + } + *x = p->s->img_x; + *y = p->s->img_y; + if (n) *n = p->s->img_n; + } + STBI_FREE(p->out); p->out = NULL; + STBI_FREE(p->expanded); p->expanded = NULL; + STBI_FREE(p->idata); p->idata = NULL; + + return result; +} + +static void *_m3dstbi__png_load(_m3dstbi__context *s, int *x, int *y, int *comp, int req_comp, _m3dstbi__result_info *ri) +{ + _m3dstbi__png p; + p.s = s; + return _m3dstbi__do_png(&p, x,y,comp,req_comp, ri); +} +#define stbi__context _m3dstbi__context +#define stbi__result_info _m3dstbi__result_info +#define stbi__png_load _m3dstbi__png_load +#define stbi_zlib_decode_malloc_guesssize_headerflag _m3dstbi_zlib_decode_malloc_guesssize_headerflag 
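+/* The defines above alias the embedded PNG/zlib decoder to the stbi_* names, so the
+   importer calls the same names whether this embedded copy or an external
+   stb_image.h provides the implementation. */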
+#endif +#if !defined(M3D_NOIMPORTER) && defined(STBI_INCLUDE_STB_IMAGE_H) && !defined(STB_IMAGE_IMPLEMENTATION) +#error "stb_image.h included without STB_IMAGE_IMPLEMENTATION. Sorry, we need some stuff defined inside the ifguard for proper integration" +#endif +#else +#if !defined(STBI_INCLUDE_STB_IMAGE_H) || defined(STBI_NO_ZLIB) +#error "stb_image.h not included or STBI_NO_ZLIB defined. Sorry, we need its zlib implementation for proper integration" +#endif +#endif /* M3D_NOTEXTURE */ + +#if defined(M3D_EXPORTER) && !defined(INCLUDE_STB_IMAGE_WRITE_H) +/* zlib_compressor from + + stb_image_write - v1.13 - public domain - http://nothings.org/stb/stb_image_write.h +*/ +typedef unsigned char _m3dstbiw__uc; +typedef unsigned short _m3dstbiw__us; + +typedef uint16_t _m3dstbiw__uint16; +typedef int16_t _m3dstbiw__int16; +typedef uint32_t _m3dstbiw__uint32; +typedef int32_t _m3dstbiw__int32; + +#define STBIW_MALLOC(s) M3D_MALLOC(s) +#define STBIW_REALLOC(p,ns) M3D_REALLOC(p,ns) +#define STBIW_REALLOC_SIZED(p,oldsz,newsz) STBIW_REALLOC(p,newsz) +#define STBIW_FREE M3D_FREE +#define STBIW_MEMMOVE memmove +#define STBIW_UCHAR (uint8_t) +#define STBIW_ASSERT(x) +#define _m3dstbiw___sbraw(a) ((int *) (a) - 2) +#define _m3dstbiw___sbm(a) _m3dstbiw___sbraw(a)[0] +#define _m3dstbiw___sbn(a) _m3dstbiw___sbraw(a)[1] + +#define _m3dstbiw___sbneedgrow(a,n) ((a)==0 || _m3dstbiw___sbn(a)+n >= _m3dstbiw___sbm(a)) +#define _m3dstbiw___sbmaybegrow(a,n) (_m3dstbiw___sbneedgrow(a,(n)) ? _m3dstbiw___sbgrow(a,n) : 0) +#define _m3dstbiw___sbgrow(a,n) _m3dstbiw___sbgrowf((void **) &(a), (n), sizeof(*(a))) + +#define _m3dstbiw___sbpush(a, v) (_m3dstbiw___sbmaybegrow(a,1), (a)[_m3dstbiw___sbn(a)++] = (v)) +#define _m3dstbiw___sbcount(a) ((a) ? _m3dstbiw___sbn(a) : 0) +#define _m3dstbiw___sbfree(a) ((a) ? STBIW_FREE(_m3dstbiw___sbraw(a)),0 : 0) + +static void *_m3dstbiw___sbgrowf(void **arr, int increment, int itemsize) +{ + int m = *arr ? 2*_m3dstbiw___sbm(*arr)+increment : increment+1; + void *p = STBIW_REALLOC_SIZED(*arr ? _m3dstbiw___sbraw(*arr) : 0, *arr ? 
(_m3dstbiw___sbm(*arr)*itemsize + sizeof(int)*2) : 0, itemsize * m + sizeof(int)*2); + STBIW_ASSERT(p); + if (p) { + if (!*arr) ((int *) p)[1] = 0; + *arr = (void *) ((int *) p + 2); + _m3dstbiw___sbm(*arr) = m; + } + return *arr; +} + +static unsigned char *_m3dstbiw___zlib_flushf(unsigned char *data, unsigned int *bitbuffer, int *bitcount) +{ + while (*bitcount >= 8) { + _m3dstbiw___sbpush(data, STBIW_UCHAR(*bitbuffer)); + *bitbuffer >>= 8; + *bitcount -= 8; + } + return data; +} + +static int _m3dstbiw___zlib_bitrev(int code, int codebits) +{ + int res=0; + while (codebits--) { + res = (res << 1) | (code & 1); + code >>= 1; + } + return res; +} + +static unsigned int _m3dstbiw___zlib_countm(unsigned char *a, unsigned char *b, int limit) +{ + int i; + for (i=0; i < limit && i < 258; ++i) + if (a[i] != b[i]) break; + return i; +} + +static unsigned int _m3dstbiw___zhash(unsigned char *data) +{ + _m3dstbiw__uint32 hash = data[0] + (data[1] << 8) + (data[2] << 16); + hash ^= hash << 3; + hash += hash >> 5; + hash ^= hash << 4; + hash += hash >> 17; + hash ^= hash << 25; + hash += hash >> 6; + return hash; +} + +#define _m3dstbiw___zlib_flush() (out = _m3dstbiw___zlib_flushf(out, &bitbuf, &bitcount)) +#define _m3dstbiw___zlib_add(code,codebits) \ + (bitbuf |= (code) << bitcount, bitcount += (codebits), _m3dstbiw___zlib_flush()) +#define _m3dstbiw___zlib_huffa(b,c) _m3dstbiw___zlib_add(_m3dstbiw___zlib_bitrev(b,c),c) +#define _m3dstbiw___zlib_huff1(n) _m3dstbiw___zlib_huffa(0x30 + (n), 8) +#define _m3dstbiw___zlib_huff2(n) _m3dstbiw___zlib_huffa(0x190 + (n)-144, 9) +#define _m3dstbiw___zlib_huff3(n) _m3dstbiw___zlib_huffa(0 + (n)-256,7) +#define _m3dstbiw___zlib_huff4(n) _m3dstbiw___zlib_huffa(0xc0 + (n)-280,8) +#define _m3dstbiw___zlib_huff(n) ((n) <= 143 ? _m3dstbiw___zlib_huff1(n) : (n) <= 255 ? _m3dstbiw___zlib_huff2(n) : (n) <= 279 ? _m3dstbiw___zlib_huff3(n) : _m3dstbiw___zlib_huff4(n)) +#define _m3dstbiw___zlib_huffb(n) ((n) <= 143 ? 
_m3dstbiw___zlib_huff1(n) : _m3dstbiw___zlib_huff2(n)) + +#define _m3dstbiw___ZHASH 16384 + +unsigned char * _m3dstbi_zlib_compress(unsigned char *data, int data_len, int *out_len, int quality) +{ + static unsigned short lengthc[] = { 3,4,5,6,7,8,9,10,11,13,15,17,19,23,27,31,35,43,51,59,67,83,99,115,131,163,195,227,258, 259 }; + static unsigned char lengtheb[]= { 0,0,0,0,0,0,0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0 }; + static unsigned short distc[] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577, 32768 }; + static unsigned char disteb[] = { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 }; + unsigned int bitbuf=0; + int i,j, bitcount=0; + unsigned char *out = NULL; + unsigned char ***hash_table = (unsigned char***) STBIW_MALLOC(_m3dstbiw___ZHASH * sizeof(char**)); + if (hash_table == NULL) + return NULL; + if (quality < 5) quality = 5; + + _m3dstbiw___sbpush(out, 0x78); + _m3dstbiw___sbpush(out, 0x5e); + _m3dstbiw___zlib_add(1,1); + _m3dstbiw___zlib_add(1,2); + + for (i=0; i < _m3dstbiw___ZHASH; ++i) + hash_table[i] = NULL; + + i=0; + while (i < data_len-3) { + int h = _m3dstbiw___zhash(data+i)&(_m3dstbiw___ZHASH-1), best=3; + unsigned char *bestloc = 0; + unsigned char **hlist = hash_table[h]; + int n = _m3dstbiw___sbcount(hlist); + for (j=0; j < n; ++j) { + if (hlist[j]-data > i-32768) { + int d = _m3dstbiw___zlib_countm(hlist[j], data+i, data_len-i); + if (d >= best) best=d,bestloc=hlist[j]; + } + } + if (hash_table[h] && _m3dstbiw___sbn(hash_table[h]) == 2*quality) { + STBIW_MEMMOVE(hash_table[h], hash_table[h]+quality, sizeof(hash_table[h][0])*quality); + _m3dstbiw___sbn(hash_table[h]) = quality; + } + _m3dstbiw___sbpush(hash_table[h],data+i); + + if (bestloc) { + h = _m3dstbiw___zhash(data+i+1)&(_m3dstbiw___ZHASH-1); + hlist = hash_table[h]; + n = _m3dstbiw___sbcount(hlist); + for (j=0; j < n; ++j) { + if (hlist[j]-data > i-32767) { + int e = _m3dstbiw___zlib_countm(hlist[j], data+i+1, data_len-i-1); + if (e > best) { + bestloc = NULL; + break; + } + } + } + } + + if (bestloc) { + int d = (int) (data+i - bestloc); + STBIW_ASSERT(d <= 32767 && best <= 258); + for (j=0; best > lengthc[j+1]-1; ++j); + _m3dstbiw___zlib_huff(j+257); + if (lengtheb[j]) _m3dstbiw___zlib_add(best - lengthc[j], lengtheb[j]); + for (j=0; d > distc[j+1]-1; ++j); + _m3dstbiw___zlib_add(_m3dstbiw___zlib_bitrev(j,5),5); + if (disteb[j]) _m3dstbiw___zlib_add(d - distc[j], disteb[j]); + i += best; + } else { + _m3dstbiw___zlib_huffb(data[i]); + ++i; + } + } + for (;i < data_len; ++i) + _m3dstbiw___zlib_huffb(data[i]); + _m3dstbiw___zlib_huff(256); + while (bitcount) + _m3dstbiw___zlib_add(0,1); + + for (i=0; i < _m3dstbiw___ZHASH; ++i) + (void) _m3dstbiw___sbfree(hash_table[i]); + STBIW_FREE(hash_table); + + { + unsigned int s1=1, s2=0; + int blocklen = (int) (data_len % 5552); + j=0; + while (j < data_len) { + for (i=0; i < blocklen; ++i) s1 += data[j+i], s2 += s1; + s1 %= 65521, s2 %= 65521; + j += blocklen; + blocklen = 5552; + } + _m3dstbiw___sbpush(out, STBIW_UCHAR(s2 >> 8)); + _m3dstbiw___sbpush(out, STBIW_UCHAR(s2)); + _m3dstbiw___sbpush(out, STBIW_UCHAR(s1 >> 8)); + _m3dstbiw___sbpush(out, STBIW_UCHAR(s1)); + } + *out_len = _m3dstbiw___sbn(out); + STBIW_MEMMOVE(_m3dstbiw___sbraw(out), out, *out_len); + return (unsigned char *) _m3dstbiw___sbraw(out); +} +#define stbi_zlib_compress _m3dstbi_zlib_compress +#else +unsigned char * _m3dstbi_zlib_compress(unsigned char *data, int data_len, int 
*out_len, int quality);
+#endif
+
+#define M3D_CHUNKMAGIC(m, a,b,c,d) ((m)[0]==(a) && (m)[1]==(b) && (m)[2]==(c) && (m)[3]==(d))
+
+#ifdef M3D_ASCII
+#include <stdio.h>   /* get sprintf */
+#include <locale.h>  /* sprintf and strtod care about number locale */
+#endif
+#ifdef M3D_PROFILING
+#include <sys/time.h>
+#endif
+
+#if !defined(M3D_NOIMPORTER) && defined(M3D_ASCII)
+/* helper functions for the ASCII parser */
+static char *_m3d_findarg(char *s) {
+    while(s && *s && *s != ' ' && *s != '\t' && *s != '\r' && *s != '\n') s++;
+    while(s && *s && (*s == ' ' || *s == '\t')) s++;
+    return s;
+}
+static char *_m3d_findnl(char *s) {
+    while(s && *s && *s != '\r' && *s != '\n') s++;
+    if(*s == '\r') s++;
+    if(*s == '\n') s++;
+    return s;
+}
+static char *_m3d_gethex(char *s, uint32_t *ret)
+{
+    if(*s == '#') s++;
+    *ret = 0;
+    for(; *s; s++) {
+        if(*s >= '0' && *s <= '9') { *ret <<= 4; *ret += (uint32_t)(*s-'0'); }
+        else if(*s >= 'a' && *s <= 'f') { *ret <<= 4; *ret += (uint32_t)(*s-'a'+10); }
+        else if(*s >= 'A' && *s <= 'F') { *ret <<= 4; *ret += (uint32_t)(*s-'A'+10); }
+        else break;
+    }
+    return _m3d_findarg(s);
+}
+static char *_m3d_getint(char *s, uint32_t *ret)
+{
+    char *e = s;
+    if(!s || !*s || *s == '\r' || *s == '\n') return s;
+    for(; *e >= '0' && *e <= '9'; e++);
+    *ret = atoi(s);
+    return e;
+}
+static char *_m3d_getfloat(char *s, M3D_FLOAT *ret)
+{
+    char *e = s;
+    if(!s || !*s || *s == '\r' || *s == '\n') return s;
+    for(; *e == '-' || *e == '+' || *e == '.' || (*e >= '0' && *e <= '9') || *e == 'e' || *e == 'E'; e++);
+    *ret = (M3D_FLOAT)strtod(s, NULL);
+    return _m3d_findarg(e);
+}
+#endif
+#if !defined(M3D_NODUP) && (!defined(M3D_NOIMPORTER) || defined(M3D_ASCII) || defined(M3D_EXPORTER))
+/* helper function to create safe strings */
+char *_m3d_safestr(char *in, int morelines)
+{
+    char *out, *o, *i = in;
+    int l;
+    if(!in || !*in) {
+        out = (char*)M3D_MALLOC(1);
+        if(!out) return NULL;
+        out[0] = 0;
+    } else {
+        for(o = in, l = 0; *o && ((morelines & 1) || (*o != '\r' && *o != '\n')) && l < 256; o++, l++);
+        out = o = (char*)M3D_MALLOC(l+1);
+        if(!out) return NULL;
+        while(*i == ' ' || *i == '\t' || *i == '\r' || (morelines && *i == '\n')) i++;
+        for(; *i && (morelines || (*i != '\r' && *i != '\n')); i++) {
+            if(*i == '\r') continue;
+            if(*i == '\n') {
+                if(morelines >= 3 && o > out && *(o-1) == '\n') break;
+                if(i > in && *(i-1) == '\n') continue;
+                if(morelines & 1) {
+                    if(morelines == 1) *o++ = '\r';
+                    *o++ = '\n';
+                } else
+                    break;
+            } else
+            if(*i == ' ' || *i == '\t') {
+                *o++ = morelines? ' ' : '_';
+            } else
+                *o++ = !morelines && (*i == '/' || *i == '\\') ? '_' : *i;
+        }
+        for(; o > out && (*(o-1) == ' ' || *(o-1) == '\t' || *(o-1) == '\r' || *(o-1) == '\n'); o--);
+        *o = 0;
+        out = (char*)M3D_REALLOC(out, (uintptr_t)o - (uintptr_t)out + 1);
+    }
+    return out;
+}
+#endif
+#ifndef M3D_NOIMPORTER
+/* helper function to load and decode/generate a texture */
+M3D_INDEX _m3d_gettx(m3d_t *model, m3dread_t readfilecb, m3dfree_t freecb, char *fn)
+{
+    unsigned int i, len = 0;
+    unsigned char *buff = NULL;
+    char *fn2;
+#ifndef M3D_NOTEXTURE
+    unsigned int w, h;
+    stbi__context s;
+    stbi__result_info ri;
+#endif
+
+    /* failsafe */
+    if(!fn || !*fn) return M3D_UNDEF;
+    /* have we loaded this texture already?
*/ + for(i = 0; i < model->numtexture; i++) + if(!strcmp(fn, model->texture[i].name)) return i; + /* see if it's inlined in the model */ + if(model->inlined) { + for(i = 0; i < model->numinlined; i++) + if(!strcmp(fn, model->inlined[i].name)) { + buff = model->inlined[i].data; + len = model->inlined[i].length; + freecb = NULL; + break; + } + } + /* try to load from external source */ + if(!buff && readfilecb) { + i = (unsigned int)strlen(fn); + if(i < 5 || fn[i - 4] != '.') { + fn2 = (char*)M3D_MALLOC(i + 5); + if(!fn2) { model->errcode = M3D_ERR_ALLOC; return M3D_UNDEF; } + memcpy(fn2, fn, i); + memcpy(fn2+i, ".png", 5); + buff = (*readfilecb)(fn2, &len); + M3D_FREE(fn2); + } + if(!buff) { + buff = (*readfilecb)(fn, &len); + if(!buff) return M3D_UNDEF; + } + } + /* add to textures array */ + i = model->numtexture++; + model->texture = (m3dtx_t*)M3D_REALLOC(model->texture, model->numtexture * sizeof(m3dtx_t)); + if(!model->texture) { + if(buff && freecb) (*freecb)(buff); + model->errcode = M3D_ERR_ALLOC; + model->numtexture = 0; + return M3D_UNDEF; + } + memset(&model->texture[i], 0, sizeof(m3dtx_t)); + model->texture[i].name = fn; + if(buff) { + if(buff[0] == 0x89 && buff[1] == 'P' && buff[2] == 'N' && buff[3] == 'G') { +#ifndef M3D_NOTEXTURE + /* return pixel buffer of the decoded texture */ + memset(&s, 0, sizeof(s)); + memset(&ri, 0, sizeof(ri)); + s.img_buffer = s.img_buffer_original = (unsigned char *) buff; + s.img_buffer_end = s.img_buffer_original_end = (unsigned char *) buff+len; + /* don't use model->texture[i].w directly, it's a uint16_t */ + w = h = len = 0; + ri.bits_per_channel = 8; + model->texture[i].d = (uint8_t*)stbi__png_load(&s, (int*)&w, (int*)&h, (int*)&len, 0, &ri); + model->texture[i].w = w; + model->texture[i].h = h; + model->texture[i].f = (uint8_t)len; +#else + /* return only the raw undecoded texture */ + if((model->texture[i].d = (uint8_t*)M3D_MALLOC(len))) { + memcpy(model->texture[i].d, buff, len); + model->texture[i].w = len & 0xffff; + model->texture[i].h = (len >> 16) & 0xffff; + model->texture[i].f = 0; + } else + model->errcode = M3D_ERR_ALLOC; +#endif + } else { +#ifdef M3D_TX_INTERP + if((model->errcode = M3D_TX_INTERP(fn, buff, len, &model->texture[i])) != M3D_SUCCESS) { + M3D_LOG("Unable to generate texture"); + M3D_LOG(fn); + } +#else + M3D_LOG("Unimplemented interpreter"); + M3D_LOG(fn); +#endif + } + if(freecb) (*freecb)(buff); + } + if(!model->texture[i].d) + model->errcode = M3D_ERR_UNKIMG; + return i; +} + +/* helper function to load and generate a procedural surface */ +void _m3d_getpr(m3d_t *model, _unused m3dread_t readfilecb, _unused m3dfree_t freecb, _unused char *fn) +{ +#ifdef M3D_PR_INTERP + unsigned int i, len = 0; + unsigned char *buff = readfilecb && fn && *fn ? 
(*readfilecb)(fn, &len) : NULL; + + if(!buff && fn && *fn && model->inlined) { + for(i = 0; i < model->numinlined; i++) + if(!strcmp(fn, model->inlined[i].name)) { + buff = model->inlined[i].data; + len = model->inlined[i].length; + freecb = NULL; + break; + } + } + if(!buff || !len || (model->errcode = M3D_PR_INTERP(fn, buff, len, model)) != M3D_SUCCESS) { + M3D_LOG("Unable to generate procedural surface"); + M3D_LOG(fn); + model->errcode = M3D_ERR_UNKIMG; + } + if(freecb && buff) (*freecb)(buff); +#else + (void)readfilecb; + (void)freecb; + (void)fn; + M3D_LOG("Unimplemented interpreter"); + M3D_LOG(fn); + model->errcode = M3D_ERR_UNIMPL; +#endif +} +/* helpers to read indices from data stream */ +#define M3D_GETSTR(x) do{offs=0;data=_m3d_getidx(data,model->si_s,&offs);x=offs?((char*)model->raw+16+offs):NULL;}while(0) +_inline static unsigned char *_m3d_getidx(unsigned char *data, char type, M3D_INDEX *idx) +{ + switch(type) { + case 1: *idx = data[0] > 253 ? (int8_t)data[0] : data[0]; data++; break; + case 2: *idx = *((uint16_t*)data) > 65533 ? *((int16_t*)data) : *((uint16_t*)data); data += 2; break; + case 4: *idx = *((int32_t*)data); data += 4; break; + } + return data; +} + +#ifndef M3D_NOANIMATION +/* multiply 4 x 4 matrices. Do not use float *r[16] as argument, because some compilers misinterpret that as + * 16 pointers each pointing to a float, but we need a single pointer to 16 floats. */ +void _m3d_mul(M3D_FLOAT *r, M3D_FLOAT *a, M3D_FLOAT *b) +{ + r[ 0] = b[ 0] * a[ 0] + b[ 4] * a[ 1] + b[ 8] * a[ 2] + b[12] * a[ 3]; + r[ 1] = b[ 1] * a[ 0] + b[ 5] * a[ 1] + b[ 9] * a[ 2] + b[13] * a[ 3]; + r[ 2] = b[ 2] * a[ 0] + b[ 6] * a[ 1] + b[10] * a[ 2] + b[14] * a[ 3]; + r[ 3] = b[ 3] * a[ 0] + b[ 7] * a[ 1] + b[11] * a[ 2] + b[15] * a[ 3]; + r[ 4] = b[ 0] * a[ 4] + b[ 4] * a[ 5] + b[ 8] * a[ 6] + b[12] * a[ 7]; + r[ 5] = b[ 1] * a[ 4] + b[ 5] * a[ 5] + b[ 9] * a[ 6] + b[13] * a[ 7]; + r[ 6] = b[ 2] * a[ 4] + b[ 6] * a[ 5] + b[10] * a[ 6] + b[14] * a[ 7]; + r[ 7] = b[ 3] * a[ 4] + b[ 7] * a[ 5] + b[11] * a[ 6] + b[15] * a[ 7]; + r[ 8] = b[ 0] * a[ 8] + b[ 4] * a[ 9] + b[ 8] * a[10] + b[12] * a[11]; + r[ 9] = b[ 1] * a[ 8] + b[ 5] * a[ 9] + b[ 9] * a[10] + b[13] * a[11]; + r[10] = b[ 2] * a[ 8] + b[ 6] * a[ 9] + b[10] * a[10] + b[14] * a[11]; + r[11] = b[ 3] * a[ 8] + b[ 7] * a[ 9] + b[11] * a[10] + b[15] * a[11]; + r[12] = b[ 0] * a[12] + b[ 4] * a[13] + b[ 8] * a[14] + b[12] * a[15]; + r[13] = b[ 1] * a[12] + b[ 5] * a[13] + b[ 9] * a[14] + b[13] * a[15]; + r[14] = b[ 2] * a[12] + b[ 6] * a[13] + b[10] * a[14] + b[14] * a[15]; + r[15] = b[ 3] * a[12] + b[ 7] * a[13] + b[11] * a[14] + b[15] * a[15]; +} +/* calculate 4 x 4 matrix inverse */ +void _m3d_inv(M3D_FLOAT *m) +{ + M3D_FLOAT r[16]; + M3D_FLOAT det = + m[ 0]*m[ 5]*m[10]*m[15] - m[ 0]*m[ 5]*m[11]*m[14] + m[ 0]*m[ 6]*m[11]*m[13] - m[ 0]*m[ 6]*m[ 9]*m[15] + + m[ 0]*m[ 7]*m[ 9]*m[14] - m[ 0]*m[ 7]*m[10]*m[13] - m[ 1]*m[ 6]*m[11]*m[12] + m[ 1]*m[ 6]*m[ 8]*m[15] + - m[ 1]*m[ 7]*m[ 8]*m[14] + m[ 1]*m[ 7]*m[10]*m[12] - m[ 1]*m[ 4]*m[10]*m[15] + m[ 1]*m[ 4]*m[11]*m[14] + + m[ 2]*m[ 7]*m[ 8]*m[13] - m[ 2]*m[ 7]*m[ 9]*m[12] + m[ 2]*m[ 4]*m[ 9]*m[15] - m[ 2]*m[ 4]*m[11]*m[13] + + m[ 2]*m[ 5]*m[11]*m[12] - m[ 2]*m[ 5]*m[ 8]*m[15] - m[ 3]*m[ 4]*m[ 9]*m[14] + m[ 3]*m[ 4]*m[10]*m[13] + - m[ 3]*m[ 5]*m[10]*m[12] + m[ 3]*m[ 5]*m[ 8]*m[14] - m[ 3]*m[ 6]*m[ 8]*m[13] + m[ 3]*m[ 6]*m[ 9]*m[12]; + if(det == (M3D_FLOAT)0.0 || det == (M3D_FLOAT)-0.0) det = (M3D_FLOAT)1.0; else det = (M3D_FLOAT)1.0 / det; + r[ 0] = det *(m[ 5]*(m[10]*m[15] - m[11]*m[14]) 
+ m[ 6]*(m[11]*m[13] - m[ 9]*m[15]) + m[ 7]*(m[ 9]*m[14] - m[10]*m[13]));
+    r[ 1] = -det*(m[ 1]*(m[10]*m[15] - m[11]*m[14]) + m[ 2]*(m[11]*m[13] - m[ 9]*m[15]) + m[ 3]*(m[ 9]*m[14] - m[10]*m[13]));
+    r[ 2] = det *(m[ 1]*(m[ 6]*m[15] - m[ 7]*m[14]) + m[ 2]*(m[ 7]*m[13] - m[ 5]*m[15]) + m[ 3]*(m[ 5]*m[14] - m[ 6]*m[13]));
+    r[ 3] = -det*(m[ 1]*(m[ 6]*m[11] - m[ 7]*m[10]) + m[ 2]*(m[ 7]*m[ 9] - m[ 5]*m[11]) + m[ 3]*(m[ 5]*m[10] - m[ 6]*m[ 9]));
+    r[ 4] = -det*(m[ 4]*(m[10]*m[15] - m[11]*m[14]) + m[ 6]*(m[11]*m[12] - m[ 8]*m[15]) + m[ 7]*(m[ 8]*m[14] - m[10]*m[12]));
+    r[ 5] = det *(m[ 0]*(m[10]*m[15] - m[11]*m[14]) + m[ 2]*(m[11]*m[12] - m[ 8]*m[15]) + m[ 3]*(m[ 8]*m[14] - m[10]*m[12]));
+    r[ 6] = -det*(m[ 0]*(m[ 6]*m[15] - m[ 7]*m[14]) + m[ 2]*(m[ 7]*m[12] - m[ 4]*m[15]) + m[ 3]*(m[ 4]*m[14] - m[ 6]*m[12]));
+    r[ 7] = det *(m[ 0]*(m[ 6]*m[11] - m[ 7]*m[10]) + m[ 2]*(m[ 7]*m[ 8] - m[ 4]*m[11]) + m[ 3]*(m[ 4]*m[10] - m[ 6]*m[ 8]));
+    r[ 8] = det *(m[ 4]*(m[ 9]*m[15] - m[11]*m[13]) + m[ 5]*(m[11]*m[12] - m[ 8]*m[15]) + m[ 7]*(m[ 8]*m[13] - m[ 9]*m[12]));
+    r[ 9] = -det*(m[ 0]*(m[ 9]*m[15] - m[11]*m[13]) + m[ 1]*(m[11]*m[12] - m[ 8]*m[15]) + m[ 3]*(m[ 8]*m[13] - m[ 9]*m[12]));
+    r[10] = det *(m[ 0]*(m[ 5]*m[15] - m[ 7]*m[13]) + m[ 1]*(m[ 7]*m[12] - m[ 4]*m[15]) + m[ 3]*(m[ 4]*m[13] - m[ 5]*m[12]));
+    r[11] = -det*(m[ 0]*(m[ 5]*m[11] - m[ 7]*m[ 9]) + m[ 1]*(m[ 7]*m[ 8] - m[ 4]*m[11]) + m[ 3]*(m[ 4]*m[ 9] - m[ 5]*m[ 8]));
+    r[12] = -det*(m[ 4]*(m[ 9]*m[14] - m[10]*m[13]) + m[ 5]*(m[10]*m[12] - m[ 8]*m[14]) + m[ 6]*(m[ 8]*m[13] - m[ 9]*m[12]));
+    r[13] = det *(m[ 0]*(m[ 9]*m[14] - m[10]*m[13]) + m[ 1]*(m[10]*m[12] - m[ 8]*m[14]) + m[ 2]*(m[ 8]*m[13] - m[ 9]*m[12]));
+    r[14] = -det*(m[ 0]*(m[ 5]*m[14] - m[ 6]*m[13]) + m[ 1]*(m[ 6]*m[12] - m[ 4]*m[14]) + m[ 2]*(m[ 4]*m[13] - m[ 5]*m[12]));
+    r[15] = det *(m[ 0]*(m[ 5]*m[10] - m[ 6]*m[ 9]) + m[ 1]*(m[ 6]*m[ 8] - m[ 4]*m[10]) + m[ 2]*(m[ 4]*m[ 9] - m[ 5]*m[ 8]));
+    memcpy(m, &r, sizeof(r));
+}
+/* compose a column major 4 x 4 matrix from vec3 position and vec4 orientation/rotation quaternion */
+void _m3d_mat(M3D_FLOAT *r, m3dv_t *p, m3dv_t *q)
+{
+    if(q->x == (M3D_FLOAT)0.0 && q->y == (M3D_FLOAT)0.0 && q->z >=(M3D_FLOAT) 0.7071065 && q->z <= (M3D_FLOAT)0.7071075 &&
+        q->w == (M3D_FLOAT)0.0) {
+        r[ 1] = r[ 2] = r[ 4] = r[ 6] = r[ 8] = r[ 9] = (M3D_FLOAT)0.0;
+        r[ 0] = r[ 5] = r[10] = (M3D_FLOAT)-1.0;
+    } else {
+        r[ 0] = 1 - 2 * (q->y * q->y + q->z * q->z); if(r[ 0]>-M3D_EPSILON && r[ 0]<M3D_EPSILON) r[ 0]=(M3D_FLOAT)0.0;
+        r[ 1] = 2 * (q->x * q->y - q->z * q->w); if(r[ 1]>-M3D_EPSILON && r[ 1]<M3D_EPSILON) r[ 1]=(M3D_FLOAT)0.0;
+        r[ 2] = 2 * (q->x * q->z + q->y * q->w); if(r[ 2]>-M3D_EPSILON && r[ 2]<M3D_EPSILON) r[ 2]=(M3D_FLOAT)0.0;
+        r[ 4] = 2 * (q->x * q->y + q->z * q->w); if(r[ 4]>-M3D_EPSILON && r[ 4]<M3D_EPSILON) r[ 4]=(M3D_FLOAT)0.0;
+        r[ 5] = 1 - 2 * (q->x * q->x + q->z * q->z); if(r[ 5]>-M3D_EPSILON && r[ 5]<M3D_EPSILON) r[ 5]=(M3D_FLOAT)0.0;
+        r[ 6] = 2 * (q->y * q->z - q->x * q->w); if(r[ 6]>-M3D_EPSILON && r[ 6]<M3D_EPSILON) r[ 6]=(M3D_FLOAT)0.0;
+        r[ 8] = 2 * (q->x * q->z - q->y * q->w); if(r[ 8]>-M3D_EPSILON && r[ 8]<M3D_EPSILON) r[ 8]=(M3D_FLOAT)0.0;
+        r[ 9] = 2 * (q->y * q->z + q->x * q->w); if(r[ 9]>-M3D_EPSILON && r[ 9]<M3D_EPSILON) r[ 9]=(M3D_FLOAT)0.0;
+        r[10] = 1 - 2 * (q->x * q->x + q->y * q->y); if(r[10]>-M3D_EPSILON && r[10]<M3D_EPSILON) r[10]=(M3D_FLOAT)0.0;
+    }
+    r[ 3] = p->x; r[ 7] = p->y; r[11] = p->z;
+    r[12] = 0; r[13] = 0; r[14] = 0; r[15] = 1;
+}
+#endif
+#if !defined(M3D_NOANIMATION) || !defined(M3D_NONORMALS)
+/* portable fast inverse square root calculation.
returns 1/sqrt(x) */ +static M3D_FLOAT _m3d_rsq(M3D_FLOAT x) +{ +#ifdef M3D_DOUBLE + return ((M3D_FLOAT)15.0/(M3D_FLOAT)8.0) + ((M3D_FLOAT)-5.0/(M3D_FLOAT)4.0)*x + ((M3D_FLOAT)3.0/(M3D_FLOAT)8.0)*x*x; +#else + /* John Carmack's */ + float x2 = x * 0.5f; + uint32_t *i = (uint32_t*)&x; + *i = (0x5f3759df - (*i >> 1)); + return x * (1.5f - (x2 * x * x)); +#endif +} +#endif + +/** + * Function to decode a Model 3D into in-memory format + */ +m3d_t *m3d_load(unsigned char *data, m3dread_t readfilecb, m3dfree_t freecb, m3d_t *mtllib) +{ + unsigned char *end, *chunk, *buff, weights[8]; + unsigned int i, j, k, l, n, am, len = 0, reclen, offs; +#ifndef M3D_NOVOXELS + int32_t min_x, min_y, min_z, max_x, max_y, max_z, sx, sy, sz, x, y, z; + M3D_INDEX edge[8], enorm; +#endif + char *name, *lang; + float f; + m3d_t *model; + M3D_INDEX mi; +#ifdef M3D_VERTEXMAX + M3D_INDEX pi; +#endif + M3D_FLOAT w; + m3dcd_t *cd; + m3dtx_t *tx; + m3dh_t *h; + m3dm_t *m; + m3da_t *a; + m3di_t *t; +#ifndef M3D_NONORMALS + char neednorm = 0; + m3dv_t *norm = NULL, *v0, *v1, *v2, va, vb; +#endif +#ifndef M3D_NOANIMATION + M3D_FLOAT r[16]; +#endif +#if !defined(M3D_NOWEIGHTS) || !defined(M3D_NOANIMATION) + m3db_t *b; +#endif +#ifndef M3D_NOWEIGHTS + m3ds_t *sk; +#endif +#ifdef M3D_ASCII + m3ds_t s; + M3D_INDEX bi[M3D_BONEMAXLEVEL+1], level; + const char *ol; + char *ptr, *pe, *fn; +#endif +#ifdef M3D_PROFILING + struct timeval tv0, tv1, tvd; + gettimeofday(&tv0, NULL); +#endif + + if(!data || (!M3D_CHUNKMAGIC(data, '3','D','M','O') +#ifdef M3D_ASCII + && !M3D_CHUNKMAGIC(data, '3','d','m','o') +#endif + )) return NULL; + model = (m3d_t*)M3D_MALLOC(sizeof(m3d_t)); + if(!model) { + M3D_LOG("Out of memory"); + return NULL; + } + memset(model, 0, sizeof(m3d_t)); + + if(mtllib) { + model->nummaterial = mtllib->nummaterial; + model->material = mtllib->material; + model->numtexture = mtllib->numtexture; + model->texture = mtllib->texture; + model->flags |= M3D_FLG_MTLLIB; + } +#ifdef M3D_ASCII + /* ASCII variant? */ + if(M3D_CHUNKMAGIC(data, '3','d','m','o')) { + model->errcode = M3D_ERR_BADFILE; + model->flags |= M3D_FLG_FREESTR; + model->raw = (m3dhdr_t*)data; + ptr = (char*)data; + ol = setlocale(LC_NUMERIC, NULL); + setlocale(LC_NUMERIC, "C"); + /* parse header. 
Don't use sscanf, that's incredibly slow */ + ptr = _m3d_findarg(ptr); + if(!*ptr || *ptr == '\r' || *ptr == '\n') goto asciiend; + pe = _m3d_findnl(ptr); + model->scale = (float)strtod(ptr, NULL); ptr = pe; + if(model->scale <= (M3D_FLOAT)0.0) model->scale = (M3D_FLOAT)1.0; + model->name = _m3d_safestr(ptr, 2); ptr = _m3d_findnl(ptr); + if(!*ptr) goto asciiend; + model->license = _m3d_safestr(ptr, 2); ptr = _m3d_findnl(ptr); + if(!*ptr) goto asciiend; + model->author = _m3d_safestr(ptr, 2); ptr = _m3d_findnl(ptr); + if(!*ptr) goto asciiend; + if(*ptr != '\r' && *ptr != '\n') + model->desc = _m3d_safestr(ptr, 3); + while(*ptr) { + while(*ptr && *ptr!='\n') ptr++; + ptr++; if(*ptr=='\r') ptr++; + if(*ptr == '\n') break; + } + + /* the main chunk reader loop */ + while(*ptr) { + while(*ptr && (*ptr == '\r' || *ptr == '\n')) ptr++; + if(!*ptr || (ptr[0]=='E' && ptr[1]=='n' && ptr[2]=='d')) break; + /* make sure there's at least one data row */ + pe = ptr; ptr = _m3d_findnl(ptr); + if(!*ptr || *ptr == '\r' || *ptr == '\n') goto asciiend; + /* Preview chunk */ + if(!memcmp(pe, "Preview", 7)) { + if(readfilecb) { + pe = _m3d_safestr(ptr, 0); + if(!pe || !*pe) goto asciiend; + model->preview.data = (*readfilecb)(pe, &model->preview.length); + M3D_FREE(pe); + } + while(*ptr && *ptr != '\r' && *ptr != '\n') + ptr = _m3d_findnl(ptr); + } else + /* texture map chunk */ + if(!memcmp(pe, "Textmap", 7)) { + if(model->tmap) { M3D_LOG("More texture map chunks, should be unique"); goto asciiend; } + while(*ptr && *ptr != '\r' && *ptr != '\n') { + i = model->numtmap++; + model->tmap = (m3dti_t*)M3D_REALLOC(model->tmap, model->numtmap * sizeof(m3dti_t)); + if(!model->tmap) goto memerr; + ptr = _m3d_getfloat(ptr, &model->tmap[i].u); + if(!*ptr || *ptr == '\r' || *ptr == '\n') goto asciiend; + _m3d_getfloat(ptr, &model->tmap[i].v); + ptr = _m3d_findnl(ptr); + } + } else + /* vertex chunk */ + if(!memcmp(pe, "Vertex", 6)) { + if(model->vertex) { M3D_LOG("More vertex chunks, should be unique"); goto asciiend; } + while(*ptr && *ptr != '\r' && *ptr != '\n') { + i = model->numvertex++; + model->vertex = (m3dv_t*)M3D_REALLOC(model->vertex, model->numvertex * sizeof(m3dv_t)); + if(!model->vertex) goto memerr; + memset(&model->vertex[i], 0, sizeof(m3dv_t)); + model->vertex[i].skinid = M3D_UNDEF; + model->vertex[i].color = 0; + model->vertex[i].w = (M3D_FLOAT)1.0; + ptr = _m3d_getfloat(ptr, &model->vertex[i].x); + if(!*ptr || *ptr == '\r' || *ptr == '\n') goto asciiend; + ptr = _m3d_getfloat(ptr, &model->vertex[i].y); + if(!*ptr || *ptr == '\r' || *ptr == '\n') goto asciiend; + ptr = _m3d_getfloat(ptr, &model->vertex[i].z); + if(!*ptr || *ptr == '\r' || *ptr == '\n') goto asciiend; + ptr = _m3d_getfloat(ptr, &model->vertex[i].w); + if(!*ptr) goto asciiend; + if(*ptr == '#') { + ptr = _m3d_gethex(ptr, &model->vertex[i].color); + if(!*ptr) goto asciiend; + } + /* parse skin */ + memset(&s, 0, sizeof(m3ds_t)); + for(j = 0, w = (M3D_FLOAT)0.0; j < M3D_NUMBONE && *ptr && *ptr != '\r' && *ptr != '\n'; j++) { + ptr = _m3d_findarg(ptr); + if(!*ptr || *ptr == '\r' || *ptr == '\n') goto asciiend; + ptr = _m3d_getint(ptr, &k); + s.boneid[j] = (M3D_INDEX)k; + if(*ptr == ':') { + ptr++; + ptr = _m3d_getfloat(ptr, &s.weight[j]); + w += s.weight[j]; + } else if(!j) + s.weight[j] = (M3D_FLOAT)1.0; + if(!*ptr) goto asciiend; + } + if(s.boneid[0] != M3D_UNDEF && s.weight[0] > (M3D_FLOAT)0.0) { + if(w != (M3D_FLOAT)1.0 && w != (M3D_FLOAT)0.0) + for(j = 0; j < M3D_NUMBONE && s.weight[j] > (M3D_FLOAT)0.0; j++) + s.weight[j] /= w; + k = 
M3D_NOTDEFINED; + if(model->skin) { + for(j = 0; j < model->numskin; j++) + if(!memcmp(&model->skin[j], &s, sizeof(m3ds_t))) { k = j; break; } + } + if(k == M3D_NOTDEFINED) { + k = model->numskin++; + model->skin = (m3ds_t*)M3D_REALLOC(model->skin, model->numskin * sizeof(m3ds_t)); + if(!model->skin) goto memerr; + memcpy(&model->skin[k], &s, sizeof(m3ds_t)); + } + model->vertex[i].skinid = (M3D_INDEX)k; + } + ptr = _m3d_findnl(ptr); + } + } else + /* Skeleton, bone hierarchy */ + if(!memcmp(pe, "Bones", 5)) { + if(model->bone) { M3D_LOG("More bones chunks, should be unique"); goto asciiend; } + bi[0] = M3D_UNDEF; + while(*ptr && *ptr != '\r' && *ptr != '\n') { + i = model->numbone++; + model->bone = (m3db_t*)M3D_REALLOC(model->bone, model->numbone * sizeof(m3db_t)); + if(!model->bone) goto memerr; + for(level = 0; *ptr == '/'; ptr++, level++); + if(level > M3D_BONEMAXLEVEL || !*ptr || *ptr == '\r' || *ptr == '\n') goto asciiend; + bi[level+1] = i; + model->bone[i].numweight = 0; + model->bone[i].weight = NULL; + model->bone[i].parent = bi[level]; + ptr = _m3d_getint(ptr, &k); + ptr = _m3d_findarg(ptr); + if(!*ptr || *ptr == '\r' || *ptr == '\n') goto asciiend; + model->bone[i].pos = (M3D_INDEX)k; + ptr = _m3d_getint(ptr, &k); + ptr = _m3d_findarg(ptr); + if(!*ptr || *ptr == '\r' || *ptr == '\n') goto asciiend; + model->bone[i].ori = (M3D_INDEX)k; + model->vertex[k].skinid = M3D_INDEXMAX; + pe = _m3d_safestr(ptr, 0); + if(!pe || !*pe) goto asciiend; + model->bone[i].name = pe; + ptr = _m3d_findnl(ptr); + } + } else + /* material chunk */ + if(!memcmp(pe, "Material", 8)) { + pe = _m3d_findarg(pe); + if(!*pe || *pe == '\r' || *pe == '\n') goto asciiend; + pe = _m3d_safestr(pe, 0); + if(!pe || !*pe) goto asciiend; + for(i = 0; i < model->nummaterial; i++) + if(!strcmp(pe, model->material[i].name)) { + M3D_LOG("Multiple definitions for material"); + M3D_LOG(pe); + M3D_FREE(pe); + pe = NULL; + while(*ptr && *ptr != '\r' && *ptr != '\n') ptr = _m3d_findnl(ptr); + break; + } + if(!pe) continue; + i = model->nummaterial++; + if(model->flags & M3D_FLG_MTLLIB) { + m = model->material; + model->material = (m3dm_t*)M3D_MALLOC(model->nummaterial * sizeof(m3dm_t)); + if(!model->material) goto memerr; + memcpy(model->material, m, (model->nummaterial - 1) * sizeof(m3dm_t)); + if(model->texture) { + tx = model->texture; + model->texture = (m3dtx_t*)M3D_MALLOC(model->numtexture * sizeof(m3dtx_t)); + if(!model->texture) goto memerr; + memcpy(model->texture, tx, model->numtexture * sizeof(m3dm_t)); + } + model->flags &= ~M3D_FLG_MTLLIB; + } else { + model->material = (m3dm_t*)M3D_REALLOC(model->material, model->nummaterial * sizeof(m3dm_t)); + if(!model->material) goto memerr; + } + m = &model->material[i]; + m->name = pe; + m->numprop = 0; + m->prop = NULL; + while(*ptr && *ptr != '\r' && *ptr != '\n') { + k = n = 256; + if(*ptr == 'm' && *(ptr+1) == 'a' && *(ptr+2) == 'p' && *(ptr+3) == '_') { + k = m3dpf_map; + ptr += 4; + } + for(j = 0; j < sizeof(m3d_propertytypes)/sizeof(m3d_propertytypes[0]); j++) + if(!memcmp(ptr, m3d_propertytypes[j].key, strlen(m3d_propertytypes[j].key))) { + n = m3d_propertytypes[j].id; + if(k != m3dpf_map) k = m3d_propertytypes[j].format; + break; + } + if(n != 256 && k != 256) { + ptr = _m3d_findarg(ptr); + if(!*ptr || *ptr == '\r' || *ptr == '\n') goto asciiend; + j = m->numprop++; + m->prop = (m3dp_t*)M3D_REALLOC(m->prop, m->numprop * sizeof(m3dp_t)); + if(!m->prop) goto memerr; + m->prop[j].type = n + (k == m3dpf_map && n < 128 ? 
128 : 0); + switch(k) { + case m3dpf_color: ptr = _m3d_gethex(ptr, &m->prop[j].value.color); break; + case m3dpf_uint8: + case m3dpf_uint16: + case m3dpf_uint32: ptr = _m3d_getint(ptr, &m->prop[j].value.num); break; + case m3dpf_float: ptr = _m3d_getfloat(ptr, &m->prop[j].value.fnum); break; + case m3dpf_map: + pe = _m3d_safestr(ptr, 0); + if(!pe || !*pe) goto asciiend; + m->prop[j].value.textureid = _m3d_gettx(model, readfilecb, freecb, pe); + if(model->errcode == M3D_ERR_ALLOC) { M3D_FREE(pe); goto memerr; } + /* this error code only returned if readfilecb was specified */ + if(m->prop[j].value.textureid == M3D_UNDEF) { + M3D_LOG("Texture not found"); + M3D_LOG(pe); + m->numprop--; + } + M3D_FREE(pe); + break; + } + } else { + M3D_LOG("Unknown material property in"); + M3D_LOG(m->name); + model->errcode = M3D_ERR_UNKPROP; + } + ptr = _m3d_findnl(ptr); + } + if(!m->numprop) model->nummaterial--; + } else + /* procedural */ + if(!memcmp(pe, "Procedural", 10)) { + pe = _m3d_safestr(ptr, 0); + _m3d_getpr(model, readfilecb, freecb, pe); + M3D_FREE(pe); + while(*ptr && *ptr != '\r' && *ptr != '\n') ptr = _m3d_findnl(ptr); + } else + /* mesh */ + if(!memcmp(pe, "Mesh", 4)) { + mi = M3D_UNDEF; +#ifdef M3D_VERTEXMAX + pi = M3D_UNDEF; +#endif + while(*ptr && *ptr != '\r' && *ptr != '\n') { + if(*ptr == 'u') { + ptr = _m3d_findarg(ptr); + if(!*ptr) goto asciiend; + mi = M3D_UNDEF; + if(*ptr != '\r' && *ptr != '\n') { + pe = _m3d_safestr(ptr, 0); + if(!pe || !*pe) goto asciiend; + for(j = 0; j < model->nummaterial; j++) + if(!strcmp(pe, model->material[j].name)) { mi = (M3D_INDEX)j; break; } + if(mi == M3D_UNDEF && !(model->flags & M3D_FLG_MTLLIB)) { + mi = model->nummaterial++; + model->material = (m3dm_t*)M3D_REALLOC(model->material, model->nummaterial * sizeof(m3dm_t)); + if(!model->material) goto memerr; + model->material[mi].name = pe; + model->material[mi].numprop = 1; + model->material[mi].prop = NULL; + } else + M3D_FREE(pe); + } + } else + if(*ptr == 'p') { + ptr = _m3d_findarg(ptr); + if(!*ptr) goto asciiend; +#ifdef M3D_VERTEXMAX + pi = M3D_UNDEF; + if(*ptr != '\r' && *ptr != '\n') { + pe = _m3d_safestr(ptr, 0); + if(!pe || !*pe) goto asciiend; + for(j = 0; j < model->numparam; j++) + if(!strcmp(pe, model->param[j].name)) { pi = (M3D_INDEX)j; break; } + if(pi == M3D_UNDEF) { + pi = model->numparam++; + model->param = (m3dvi_t*)M3D_REALLOC(model->param, model->numparam * sizeof(m3dvi_t)); + if(!model->param) goto memerr; + model->param[pi].name = pe; + model->param[pi].count = 0; + } else + M3D_FREE(pe); + } +#endif + } else { + i = model->numface++; + model->face = (m3df_t*)M3D_REALLOC(model->face, model->numface * sizeof(m3df_t)); + if(!model->face) goto memerr; + memset(&model->face[i], 255, sizeof(m3df_t)); /* set all index to -1 by default */ + model->face[i].materialid = mi; +#ifdef M3D_VERTEXMAX + model->face[i].paramid = pi; +#endif + /* hardcoded triangles. 
*/ + for(j = 0; j < 3; j++) { + /* vertex */ + ptr = _m3d_getint(ptr, &k); + model->face[i].vertex[j] = (M3D_INDEX)k; + if(!*ptr) goto asciiend; + if(*ptr == '/') { + ptr++; + if(*ptr != '/') { + /* texcoord */ + ptr = _m3d_getint(ptr, &k); + model->face[i].texcoord[j] = (M3D_INDEX)k; + if(!*ptr) goto asciiend; + } + if(*ptr == '/') { + ptr++; + /* normal */ + ptr = _m3d_getint(ptr, &k); + model->face[i].normal[j] = (M3D_INDEX)k; + if(!*ptr) goto asciiend; + } + if(*ptr == '/') { + ptr++; + /* maximum */ + ptr = _m3d_getint(ptr, &k); +#ifdef M3D_VERTEXMAX + model->face[i].vertmax[j] = (M3D_INDEX)k; +#endif + if(!*ptr) goto asciiend; + } + } +#ifndef M3D_NONORMALS + if(model->face[i].normal[j] == M3D_UNDEF) neednorm = 1; +#endif + ptr = _m3d_findarg(ptr); + } + } + ptr = _m3d_findnl(ptr); + } + } else + /* voxel types chunk */ + if(!memcmp(pe, "VoxTypes", 8) || !memcmp(pe, "Voxtypes", 8)) { + if(model->voxtype) { M3D_LOG("More voxel types chunks, should be unique"); goto asciiend; } + while(*ptr && *ptr != '\r' && *ptr != '\n') { + i = model->numvoxtype++; + model->voxtype = (m3dvt_t*)M3D_REALLOC(model->voxtype, model->numvoxtype * sizeof(m3dvt_t)); + if(!model->voxtype) goto memerr; + memset(&model->voxtype[i], 0, sizeof(m3dvt_t)); + model->voxtype[i].materialid = M3D_UNDEF; + model->voxtype[i].skinid = M3D_UNDEF; + ptr = _m3d_gethex(ptr, &model->voxtype[i].color); + if(!*ptr) goto asciiend; + if(*ptr == '/') { + ptr = _m3d_gethex(ptr, &k); + model->voxtype[i].rotation = k; + if(!*ptr) goto asciiend; + if(*ptr == '/') { + ptr = _m3d_gethex(ptr, &k); + model->voxtype[i].voxshape = k; + if(!*ptr) goto asciiend; + } + } + while(*ptr == ' ' || *ptr == '\t') ptr++; + if(*ptr == '\r' || *ptr == '\n') { ptr = _m3d_findnl(ptr); continue; } + /* name */ + if(*ptr != '-') { + pe = _m3d_safestr(ptr, 0); + if(!pe || !*pe) goto asciiend; + model->voxtype[i].name = pe; + for(j = 0; j < model->nummaterial; j++) + if(!strcmp(pe, model->material[j].name)) { model->voxtype[i].materialid = (M3D_INDEX)j; break; } + } + ptr = _m3d_findarg(ptr); + /* parse skin */ + memset(&s, 0, sizeof(m3ds_t)); + for(j = 0, w = (M3D_FLOAT)0.0; j < M3D_NUMBONE && *ptr && *ptr != '{' && *ptr != '\r' && *ptr != '\n'; j++) { + ptr = _m3d_getint(ptr, &k); + s.boneid[j] = (M3D_INDEX)k; + if(*ptr == ':') { + ptr++; + ptr = _m3d_getfloat(ptr, &s.weight[j]); + w += s.weight[j]; + } else if(!j) + s.weight[j] = (M3D_FLOAT)1.0; + if(!*ptr) goto asciiend; + ptr = _m3d_findarg(ptr); + } + if(s.boneid[0] != M3D_UNDEF && s.weight[0] > (M3D_FLOAT)0.0) { + if(w != (M3D_FLOAT)1.0 && w != (M3D_FLOAT)0.0) + for(j = 0; j < M3D_NUMBONE && s.weight[j] > (M3D_FLOAT)0.0; j++) + s.weight[j] /= w; + k = M3D_NOTDEFINED; + if(model->skin) { + for(j = 0; j < model->numskin; j++) + if(!memcmp(&model->skin[j], &s, sizeof(m3ds_t))) { k = j; break; } + } + if(k == M3D_NOTDEFINED) { + k = model->numskin++; + model->skin = (m3ds_t*)M3D_REALLOC(model->skin, model->numskin * sizeof(m3ds_t)); + if(!model->skin) goto memerr; + memcpy(&model->skin[k], &s, sizeof(m3ds_t)); + } + model->voxtype[i].skinid = (M3D_INDEX)k; + } + /* parse item list */ + if(*ptr == '{') { + while(*ptr == '{' || *ptr == ' ' || *ptr == '\t') ptr++; + while(*ptr && *ptr != '}' && *ptr != '\r' && *ptr != '\n') { + ptr = _m3d_getint(ptr, &k); + ptr = _m3d_findarg(ptr); + if(!*ptr || *ptr == '}' || *ptr == '\r' || *ptr == '\n') goto asciiend; + pe = _m3d_safestr(ptr, 0); + if(!pe || !*pe) goto asciiend; + ptr = _m3d_findarg(ptr); + j = model->voxtype[i].numitem++; + model->voxtype[i].item = 
(m3dvi_t*)M3D_REALLOC(model->voxtype[i].item, + model->voxtype[i].numitem * sizeof(m3dvi_t)); + if(!model->voxtype[i].item) goto memerr; + model->voxtype[i].item[j].count = k; + model->voxtype[i].item[j].name = pe; + } + if(*ptr != '}') goto asciiend; + } + ptr = _m3d_findnl(ptr); + } + } else + /* voxel data */ + if(!memcmp(pe, "Voxel", 5)) { + if(!model->voxtype) { M3D_LOG("No voxel type chunk before voxel data"); goto asciiend; } + pe = _m3d_findarg(pe); + if(!*pe) goto asciiend; + if(*pe == '\r' || *pe == '\n') pe = NULL; + else pe = _m3d_safestr(pe, 0); + i = model->numvoxel++; + model->voxel = (m3dvx_t*)M3D_REALLOC(model->voxel, model->numvoxel * sizeof(m3dvx_t)); + if(!model->voxel) goto memerr; + memset(&model->voxel[i], 0, sizeof(m3dvx_t)); + model->voxel[i].name = pe; + k = l = 0; + while(*ptr && *ptr != '\r' && *ptr != '\n') { + switch(*ptr) { + case 'u': + ptr = _m3d_findarg(ptr); + if(!*ptr || *ptr == '\r' || *ptr == '\n') goto asciiend; + ptr = _m3d_getint(ptr, &n); + model->voxel[i].uncertain = ((n > 0 && n < 256 ? n : 0) * 255) / 100; + ptr = _m3d_findarg(ptr); + if(*ptr && *ptr != '\r' && *ptr != '\n') { + ptr = _m3d_getint(ptr, &n); + model->voxel[i].groupid = n > 0 && n < 256 ? n : 0; + } + break; + case 'p': + ptr = _m3d_findarg(ptr); + if(!*ptr || *ptr == '\r' || *ptr == '\n') goto asciiend; + ptr = _m3d_getint(ptr, &n); + model->voxel[i].x = n; + ptr = _m3d_findarg(ptr); + if(!*ptr || *ptr == '\r' || *ptr == '\n') goto asciiend; + ptr = _m3d_getint(ptr, &n); + model->voxel[i].y = n; + ptr = _m3d_findarg(ptr); + if(!*ptr || *ptr == '\r' || *ptr == '\n') goto asciiend; + ptr = _m3d_getint(ptr, &n); + model->voxel[i].z = n; + break; + case 'd': + ptr = _m3d_findarg(ptr); + if(!*ptr || *ptr == '\r' || *ptr == '\n') goto asciiend; + ptr = _m3d_getint(ptr, &n); + model->voxel[i].w = n; + ptr = _m3d_findarg(ptr); + if(!*ptr || *ptr == '\r' || *ptr == '\n') goto asciiend; + ptr = _m3d_getint(ptr, &n); + model->voxel[i].h = n; + ptr = _m3d_findarg(ptr); + if(!*ptr || *ptr == '\r' || *ptr == '\n') goto asciiend; + ptr = _m3d_getint(ptr, &n); + model->voxel[i].d = n; + break; + case 'l': + if(model->voxel[i].data) { l++; k = 0; } + else { + if(!model->voxel[i].w || !model->voxel[i].h || !model->voxel[i].d) { + M3D_LOG("No voxel dimension before layer data"); + goto asciiend; + } + model->voxel[i].data = (M3D_VOXEL*)M3D_MALLOC( + model->voxel[i].w * model->voxel[i].h * model->voxel[i].d * sizeof(M3D_VOXEL)); + if(!model->voxel[i].data) goto memerr; + } + break; + default: + if(!model->voxel[i].data || l >= model->voxel[i].h || k >= model->voxel[i].d) { + M3D_LOG("Missing voxel attributes or out of bound data"); + goto asciiend; + } + for(n = l * model->voxel[i].w * model->voxel[i].d + k * model->voxel[i].w; + j < model->voxel[i].w && *ptr && *ptr != '\r' && *ptr != '\n'; j++) { + ptr = _m3d_getint(ptr, &am); + if(am >= model->numvoxtype) goto asciiend; + model->voxel[i].data[n + j] = am; + } + k++; + break; + } + ptr = _m3d_findnl(ptr); + } + } else + /* mathematical shape */ + if(!memcmp(pe, "Shape", 5)) { + pe = _m3d_findarg(pe); + if(!*pe || *pe == '\r' || *pe == '\n') goto asciiend; + pe = _m3d_safestr(pe, 0); + if(!pe || !*pe) goto asciiend; + i = model->numshape++; + model->shape = (m3dh_t*)M3D_REALLOC(model->shape, model->numshape * sizeof(m3ds_t)); + if(!model->shape) goto memerr; + h = &model->shape[i]; + h->name = pe; + h->group = M3D_UNDEF; + h->numcmd = 0; + h->cmd = NULL; + while(*ptr && *ptr != '\r' && *ptr != '\n') { + if(!memcmp(ptr, "group", 5)) { + ptr = 
_m3d_findarg(ptr); + ptr = _m3d_getint(ptr, &h->group); + ptr = _m3d_findnl(ptr); + if(h->group != M3D_UNDEF && h->group >= model->numbone) { + M3D_LOG("Unknown bone id as shape group in shape"); + M3D_LOG(pe); + h->group = M3D_UNDEF; + model->errcode = M3D_ERR_SHPE; + } + continue; + } + for(cd = NULL, k = 0; k < (unsigned int)(sizeof(m3d_commandtypes)/sizeof(m3d_commandtypes[0])); k++) { + j = (unsigned int)strlen(m3d_commandtypes[k].key); + if(!memcmp(ptr, m3d_commandtypes[k].key, j) && (ptr[j] == ' ' || ptr[j] == '\r' || ptr[j] == '\n')) + { cd = &m3d_commandtypes[k]; break; } + } + if(cd) { + j = h->numcmd++; + h->cmd = (m3dc_t*)M3D_REALLOC(h->cmd, h->numcmd * sizeof(m3dc_t)); + if(!h->cmd) goto memerr; + h->cmd[j].type = k; + h->cmd[j].arg = (uint32_t*)M3D_MALLOC(cd->p * sizeof(uint32_t)); + if(!h->cmd[j].arg) goto memerr; + memset(h->cmd[j].arg, 0, cd->p * sizeof(uint32_t)); + for(k = n = 0, l = cd->p; k < l; k++) { + ptr = _m3d_findarg(ptr); + if(!*ptr) goto asciiend; + if(*ptr == '[') { + ptr = _m3d_findarg(ptr + 1); + if(!*ptr) goto asciiend; + } + if(*ptr == ']' || *ptr == '\r' || *ptr == '\n') break; + switch(cd->a[((k - n) % (cd->p - n)) + n]) { + case m3dcp_mi_t: + mi = M3D_UNDEF; + if(*ptr != '\r' && *ptr != '\n') { + pe = _m3d_safestr(ptr, 0); + if(!pe || !*pe) goto asciiend; + for(n = 0; n < model->nummaterial; n++) + if(!strcmp(pe, model->material[n].name)) { mi = (M3D_INDEX)n; break; } + if(mi == M3D_UNDEF && !(model->flags & M3D_FLG_MTLLIB)) { + mi = model->nummaterial++; + model->material = (m3dm_t*)M3D_REALLOC(model->material, + model->nummaterial * sizeof(m3dm_t)); + if(!model->material) goto memerr; + model->material[mi].name = pe; + model->material[mi].numprop = 1; + model->material[mi].prop = NULL; + } else + M3D_FREE(pe); + } + h->cmd[j].arg[k] = mi; + break; + case m3dcp_vc_t: +#ifdef M3D_DOUBLE + _m3d_getfloat(ptr, &w); f = w; + memcpy(&h->cmd[j].arg[k], &f, 4); +#else + _m3d_getfloat(ptr, (float*)&h->cmd[j].arg[k]); +#endif + break; + case m3dcp_va_t: + ptr = _m3d_getint(ptr, &h->cmd[j].arg[k]); + n = k + 1; l += (h->cmd[j].arg[k] - 1) * (cd->p - k - 1); + h->cmd[j].arg = (uint32_t*)M3D_REALLOC(h->cmd[j].arg, l * sizeof(uint32_t)); + if(!h->cmd[j].arg) goto memerr; + memset(&h->cmd[j].arg[k + 1], 0, (l - k - 1) * sizeof(uint32_t)); + break; + case m3dcp_qi_t: + ptr = _m3d_getint(ptr, &h->cmd[j].arg[k]); + model->vertex[h->cmd[i].arg[k]].skinid = M3D_INDEXMAX; + break; + default: + ptr = _m3d_getint(ptr, &h->cmd[j].arg[k]); + break; + } + } + } else { + M3D_LOG("Unknown shape command in"); + M3D_LOG(h->name); + model->errcode = M3D_ERR_UNKCMD; + } + ptr = _m3d_findnl(ptr); + } + if(!h->numcmd) model->numshape--; + } else + /* annotation labels */ + if(!memcmp(pe, "Labels", 6)) { + pe = _m3d_findarg(pe); + if(!*pe) goto asciiend; + if(*pe == '\r' || *pe == '\n') pe = NULL; + else pe = _m3d_safestr(pe, 0); + k = 0; fn = NULL; + while(*ptr && *ptr != '\r' && *ptr != '\n') { + if(*ptr == 'c') { + ptr = _m3d_findarg(ptr); + if(!*pe || *pe == '\r' || *pe == '\n') goto asciiend; + ptr = _m3d_gethex(ptr, &k); + } else + if(*ptr == 'l') { + ptr = _m3d_findarg(ptr); + if(!*pe || *pe == '\r' || *pe == '\n') goto asciiend; + fn = _m3d_safestr(ptr, 2); + } else { + i = model->numlabel++; + model->label = (m3dl_t*)M3D_REALLOC(model->label, model->numlabel * sizeof(m3dl_t)); + if(!model->label) goto memerr; + model->label[i].name = pe; + model->label[i].lang = fn; + model->label[i].color = k; + ptr = _m3d_getint(ptr, &j); + model->label[i].vertexid = (M3D_INDEX)j; + ptr = 
_m3d_findarg(ptr); + if(!*pe || *pe == '\r' || *pe == '\n') goto asciiend; + model->label[i].text = _m3d_safestr(ptr, 2); + } + ptr = _m3d_findnl(ptr); + } + } else + /* action */ + if(!memcmp(pe, "Action", 6)) { + pe = _m3d_findarg(pe); + if(!*pe || *pe == '\r' || *pe == '\n') goto asciiend; + pe = _m3d_getint(pe, &k); + pe = _m3d_findarg(pe); + if(!*pe || *pe == '\r' || *pe == '\n') goto asciiend; + pe = _m3d_safestr(pe, 0); + if(!pe || !*pe) goto asciiend; + i = model->numaction++; + model->action = (m3da_t*)M3D_REALLOC(model->action, model->numaction * sizeof(m3da_t)); + if(!model->action) goto memerr; + a = &model->action[i]; + a->name = pe; + a->durationmsec = k; + /* skip the first frame marker as there's always at least one frame */ + a->numframe = 1; + a->frame = (m3dfr_t*)M3D_MALLOC(sizeof(m3dfr_t)); + if(!a->frame) goto memerr; + a->frame[0].msec = 0; + a->frame[0].numtransform = 0; + a->frame[0].transform = NULL; + i = 0; + if(*ptr == 'f') + ptr = _m3d_findnl(ptr); + while(*ptr && *ptr != '\r' && *ptr != '\n') { + if(*ptr == 'f') { + i = a->numframe++; + a->frame = (m3dfr_t*)M3D_REALLOC(a->frame, a->numframe * sizeof(m3dfr_t)); + if(!a->frame) goto memerr; + ptr = _m3d_findarg(ptr); + ptr = _m3d_getint(ptr, &a->frame[i].msec); + a->frame[i].numtransform = 0; + a->frame[i].transform = NULL; + } else { + j = a->frame[i].numtransform++; + a->frame[i].transform = (m3dtr_t*)M3D_REALLOC(a->frame[i].transform, + a->frame[i].numtransform * sizeof(m3dtr_t)); + if(!a->frame[i].transform) goto memerr; + ptr = _m3d_getint(ptr, &k); + ptr = _m3d_findarg(ptr); + if(!*ptr || *ptr == '\r' || *ptr == '\n') goto asciiend; + a->frame[i].transform[j].boneid = (M3D_INDEX)k; + ptr = _m3d_getint(ptr, &k); + ptr = _m3d_findarg(ptr); + if(!*ptr || *ptr == '\r' || *ptr == '\n') goto asciiend; + a->frame[i].transform[j].pos = (M3D_INDEX)k; + ptr = _m3d_getint(ptr, &k); + if(!*ptr || *ptr == '\r' || *ptr == '\n') goto asciiend; + a->frame[i].transform[j].ori = (M3D_INDEX)k; + model->vertex[k].skinid = M3D_INDEXMAX; + } + ptr = _m3d_findnl(ptr); + } + } else + /* inlined assets chunk */ + if(!memcmp(pe, "Assets", 6)) { + while(*ptr && *ptr != '\r' && *ptr != '\n') { + if(readfilecb) { + pe = _m3d_safestr(ptr, 2); + if(!pe || !*pe) goto asciiend; + i = model->numinlined++; + model->inlined = (m3di_t*)M3D_REALLOC(model->inlined, model->numinlined * sizeof(m3di_t)); + if(!model->inlined) goto memerr; + t = &model->inlined[i]; + model->inlined[i].data = (*readfilecb)(pe, &model->inlined[i].length); + if(model->inlined[i].data) { + fn = strrchr(pe, '.'); + if(fn && (fn[1] == 'p' || fn[1] == 'P') && (fn[2] == 'n' || fn[2] == 'N') && + (fn[3] == 'g' || fn[3] == 'G')) *fn = 0; + fn = strrchr(pe, '/'); + if(!fn) fn = strrchr(pe, '\\'); + if(!fn) fn = pe; else fn++; + model->inlined[i].name = _m3d_safestr(fn, 0); + } else + model->numinlined--; + M3D_FREE(pe); + } + ptr = _m3d_findnl(ptr); + } + } else + /* extra chunks */ + if(!memcmp(pe, "Extra", 5)) { + pe = _m3d_findarg(pe); + if(!*pe || *pe == '\r' || *pe == '\n') goto asciiend; + buff = (unsigned char*)_m3d_findnl(ptr); + k = ((uint32_t)((uintptr_t)buff - (uintptr_t)ptr) / 3) + 1; + i = model->numextra++; + model->extra = (m3dchunk_t**)M3D_REALLOC(model->extra, model->numextra * sizeof(m3dchunk_t*)); + if(!model->extra) goto memerr; + model->extra[i] = (m3dchunk_t*)M3D_MALLOC(k + sizeof(m3dchunk_t)); + if(!model->extra[i]) goto memerr; + memcpy(&model->extra[i]->magic, pe, 4); + model->extra[i]->length = sizeof(m3dchunk_t); + pe = (char*)model->extra[i] + 
sizeof(m3dchunk_t); + while(*ptr && *ptr != '\r' && *ptr != '\n') { + ptr = _m3d_gethex(ptr, &k); + *pe++ = (uint8_t)k; + model->extra[i]->length++; + } + } else + goto asciiend; + } + model->errcode = M3D_SUCCESS; +asciiend: + setlocale(LC_NUMERIC, ol); + goto postprocess; + } +#endif + /* Binary variant */ + len = ((m3dhdr_t*)data)->length - 8; + data += 8; + if(M3D_CHUNKMAGIC(data, 'P','R','V','W')) { + /* optional preview chunk */ + model->preview.length = ((m3dchunk_t*)data)->length; + model->preview.data = data + sizeof(m3dchunk_t); + data += model->preview.length; + len -= model->preview.length; + } + if(!M3D_CHUNKMAGIC(data, 'H','E','A','D')) { + buff = (unsigned char *)stbi_zlib_decode_malloc_guesssize_headerflag((const char*)data, len, 4096, (int*)&len, 1); + if(!buff || !len || !M3D_CHUNKMAGIC(buff, 'H','E','A','D')) { + if(buff) M3D_FREE(buff); + M3D_FREE(model); + return NULL; + } + buff = (unsigned char*)M3D_REALLOC(buff, len); + model->flags |= M3D_FLG_FREERAW; /* mark that we have to free the raw buffer */ + data = buff; +#ifdef M3D_PROFILING + gettimeofday(&tv1, NULL); + tvd.tv_sec = tv1.tv_sec - tv0.tv_sec; + tvd.tv_usec = tv1.tv_usec - tv0.tv_usec; + if(tvd.tv_usec < 0) { tvd.tv_sec--; tvd.tv_usec += 1000000L; } + printf(" Deflate model %ld.%06ld sec\n", tvd.tv_sec, tvd.tv_usec); + memcpy(&tv0, &tv1, sizeof(struct timeval)); +#endif + } + model->raw = (m3dhdr_t*)data; + end = data + len; + + /* parse header */ + data += sizeof(m3dhdr_t); + M3D_LOG((char*)data); + model->name = (char*)data; + for(; data < end && *data; data++) {}; data++; + model->license = (char*)data; + for(; data < end && *data; data++) {}; data++; + model->author = (char*)data; + for(; data < end && *data; data++) {}; data++; + model->desc = (char*)data; + chunk = (unsigned char*)model->raw + model->raw->length; + model->scale = (M3D_FLOAT)model->raw->scale; + if(model->scale <= (M3D_FLOAT)0.0) model->scale = (M3D_FLOAT)1.0; + model->vc_s = 1 << ((model->raw->types >> 0) & 3); /* vertex coordinate size */ + model->vi_s = 1 << ((model->raw->types >> 2) & 3); /* vertex index size */ + model->si_s = 1 << ((model->raw->types >> 4) & 3); /* string offset size */ + model->ci_s = 1 << ((model->raw->types >> 6) & 3); /* color index size */ + model->ti_s = 1 << ((model->raw->types >> 8) & 3); /* tmap index size */ + model->bi_s = 1 << ((model->raw->types >>10) & 3); /* bone index size */ + model->nb_s = 1 << ((model->raw->types >>12) & 3); /* number of bones per vertex */ + model->sk_s = 1 << ((model->raw->types >>14) & 3); /* skin index size */ + model->fc_s = 1 << ((model->raw->types >>16) & 3); /* frame counter size */ + model->hi_s = 1 << ((model->raw->types >>18) & 3); /* shape index size */ + model->fi_s = 1 << ((model->raw->types >>20) & 3); /* face index size */ + model->vd_s = 1 << ((model->raw->types >>22) & 3); /* voxel dimension size */ + model->vp_s = 1 << ((model->raw->types >>24) & 3); /* voxel pixel size */ + if(model->ci_s == 8) model->ci_s = 0; /* optional indices */ + if(model->ti_s == 8) model->ti_s = 0; + if(model->bi_s == 8) model->bi_s = 0; + if(model->sk_s == 8) model->sk_s = 0; + if(model->fc_s == 8) model->fc_s = 0; + if(model->hi_s == 8) model->hi_s = 0; + if(model->fi_s == 8) model->fi_s = 0; + + /* variable limit checks */ + if(sizeof(M3D_FLOAT) == 4 && model->vc_s > 4) { + M3D_LOG("Double precision coordinates not supported, truncating to float..."); + model->errcode = M3D_ERR_TRUNC; + } + if((sizeof(M3D_INDEX) == 2 && (model->vi_s > 2 || model->si_s > 2 || model->ci_s > 2 || 
model->ti_s > 2 || + model->bi_s > 2 || model->sk_s > 2 || model->fc_s > 2 || model->hi_s > 2 || model->fi_s > 2)) || + (sizeof(M3D_VOXEL) < (size_t)model->vp_s && model->vp_s != 8)) { + M3D_LOG("32 bit indices not supported, unable to load model"); + M3D_FREE(model); + return NULL; + } + if(model->vi_s > 4 || model->si_s > 4 || model->vp_s == 4) { + M3D_LOG("Invalid index size, unable to load model"); + M3D_FREE(model); + return NULL; + } + if(!M3D_CHUNKMAGIC(end - 4, 'O','M','D','3')) { + M3D_LOG("Missing end chunk"); + M3D_FREE(model); + return NULL; + } + if(model->nb_s > M3D_NUMBONE) { + M3D_LOG("Model has more bones per vertex than what importer was configured to support"); + model->errcode = M3D_ERR_TRUNC; + } + + /* look for inlined assets in advance, material and procedural chunks may need them */ + buff = chunk; + while(buff < end && !M3D_CHUNKMAGIC(buff, 'O','M','D','3')) { + data = buff; + len = ((m3dchunk_t*)data)->length; + buff += len; + if(len < sizeof(m3dchunk_t) || buff >= end) { + M3D_LOG("Invalid chunk size"); + break; + } + len -= sizeof(m3dchunk_t) + model->si_s; + + /* inlined assets */ + if(M3D_CHUNKMAGIC(data, 'A','S','E','T') && len > 0) { + M3D_LOG("Inlined asset"); + i = model->numinlined++; + model->inlined = (m3di_t*)M3D_REALLOC(model->inlined, model->numinlined * sizeof(m3di_t)); + if(!model->inlined) { +memerr: M3D_LOG("Out of memory"); + model->errcode = M3D_ERR_ALLOC; + return model; + } + data += sizeof(m3dchunk_t); + t = &model->inlined[i]; + M3D_GETSTR(t->name); + M3D_LOG(t->name); + t->data = (uint8_t*)data; + t->length = len; + } + } + + /* parse chunks */ + while(chunk < end && !M3D_CHUNKMAGIC(chunk, 'O','M','D','3')) { + data = chunk; + len = ((m3dchunk_t*)chunk)->length; + chunk += len; + if(len < sizeof(m3dchunk_t) || chunk >= end) { + M3D_LOG("Invalid chunk size"); + break; + } + len -= sizeof(m3dchunk_t); + + /* color map */ + if(M3D_CHUNKMAGIC(data, 'C','M','A','P')) { + M3D_LOG("Color map"); + if(model->cmap) { M3D_LOG("More color map chunks, should be unique"); model->errcode = M3D_ERR_CMAP; continue; } + if(!model->ci_s) { M3D_LOG("Color map chunk, shouldn't be any"); model->errcode = M3D_ERR_CMAP; continue; } + model->numcmap = len / sizeof(uint32_t); + model->cmap = (uint32_t*)(data + sizeof(m3dchunk_t)); + } else + /* texture map */ + if(M3D_CHUNKMAGIC(data, 'T','M','A','P')) { + M3D_LOG("Texture map"); + if(model->tmap) { M3D_LOG("More texture map chunks, should be unique"); model->errcode = M3D_ERR_TMAP; continue; } + if(!model->ti_s) { M3D_LOG("Texture map chunk, shouldn't be any"); model->errcode = M3D_ERR_TMAP; continue; } + reclen = model->vc_s + model->vc_s; + model->numtmap = len / reclen; + model->tmap = (m3dti_t*)M3D_MALLOC(model->numtmap * sizeof(m3dti_t)); + if(!model->tmap) goto memerr; + for(i = 0, data += sizeof(m3dchunk_t); data < chunk; i++) { + switch(model->vc_s) { + case 1: + model->tmap[i].u = (M3D_FLOAT)((uint8_t)data[0]) / (M3D_FLOAT)255.0; + model->tmap[i].v = (M3D_FLOAT)((uint8_t)data[1]) / (M3D_FLOAT)255.0; + break; + case 2: + model->tmap[i].u = (M3D_FLOAT)(*((uint16_t*)(data+0))) / (M3D_FLOAT)65535.0; + model->tmap[i].v = (M3D_FLOAT)(*((uint16_t*)(data+2))) / (M3D_FLOAT)65535.0; + break; + case 4: + model->tmap[i].u = (M3D_FLOAT)(*((float*)(data+0))); + model->tmap[i].v = (M3D_FLOAT)(*((float*)(data+4))); + break; + case 8: + model->tmap[i].u = (M3D_FLOAT)(*((double*)(data+0))); + model->tmap[i].v = (M3D_FLOAT)(*((double*)(data+8))); + break; + } + data += reclen; + } + } else + /* vertex list */ + 
if(M3D_CHUNKMAGIC(data, 'V','R','T','S')) { + M3D_LOG("Vertex list"); + if(model->vertex) { M3D_LOG("More vertex chunks, should be unique"); model->errcode = M3D_ERR_VRTS; continue; } + if(model->ci_s && model->ci_s < 4 && !model->cmap) model->errcode = M3D_ERR_CMAP; + reclen = model->ci_s + model->sk_s + 4 * model->vc_s; + model->numvertex = len / reclen; + model->vertex = (m3dv_t*)M3D_MALLOC(model->numvertex * sizeof(m3dv_t)); + if(!model->vertex) goto memerr; + memset(model->vertex, 0, model->numvertex * sizeof(m3dv_t)); + for(i = 0, data += sizeof(m3dchunk_t); data < chunk && i < model->numvertex; i++) { + switch(model->vc_s) { + case 1: + model->vertex[i].x = (M3D_FLOAT)((int8_t)data[0]) / (M3D_FLOAT)127.0; + model->vertex[i].y = (M3D_FLOAT)((int8_t)data[1]) / (M3D_FLOAT)127.0; + model->vertex[i].z = (M3D_FLOAT)((int8_t)data[2]) / (M3D_FLOAT)127.0; + model->vertex[i].w = (M3D_FLOAT)((int8_t)data[3]) / (M3D_FLOAT)127.0; + data += 4; + break; + case 2: + model->vertex[i].x = (M3D_FLOAT)(*((int16_t*)(data+0))) / (M3D_FLOAT)32767.0; + model->vertex[i].y = (M3D_FLOAT)(*((int16_t*)(data+2))) / (M3D_FLOAT)32767.0; + model->vertex[i].z = (M3D_FLOAT)(*((int16_t*)(data+4))) / (M3D_FLOAT)32767.0; + model->vertex[i].w = (M3D_FLOAT)(*((int16_t*)(data+6))) / (M3D_FLOAT)32767.0; + data += 8; + break; + case 4: + model->vertex[i].x = (M3D_FLOAT)(*((float*)(data+0))); + model->vertex[i].y = (M3D_FLOAT)(*((float*)(data+4))); + model->vertex[i].z = (M3D_FLOAT)(*((float*)(data+8))); + model->vertex[i].w = (M3D_FLOAT)(*((float*)(data+12))); + data += 16; + break; + case 8: + model->vertex[i].x = (M3D_FLOAT)(*((double*)(data+0))); + model->vertex[i].y = (M3D_FLOAT)(*((double*)(data+8))); + model->vertex[i].z = (M3D_FLOAT)(*((double*)(data+16))); + model->vertex[i].w = (M3D_FLOAT)(*((double*)(data+24))); + data += 32; + break; + } + switch(model->ci_s) { + case 1: model->vertex[i].color = model->cmap ? model->cmap[data[0]] : 0; data++; break; + case 2: model->vertex[i].color = model->cmap ? 
model->cmap[*((uint16_t*)data)] : 0; data += 2; break; + case 4: model->vertex[i].color = *((uint32_t*)data); data += 4; break; + /* case 8: break; */ + } + model->vertex[i].skinid = M3D_UNDEF; + data = _m3d_getidx(data, model->sk_s, &model->vertex[i].skinid); + } + } else + /* skeleton: bone hierarchy and skin */ + if(M3D_CHUNKMAGIC(data, 'B','O','N','E')) { + M3D_LOG("Skeleton"); + if(model->bone) { M3D_LOG("More bone chunks, should be unique"); model->errcode = M3D_ERR_BONE; continue; } + if(!model->bi_s) { M3D_LOG("Bone chunk, shouldn't be any"); model->errcode=M3D_ERR_BONE; continue; } + if(!model->vertex) { M3D_LOG("No vertex chunk before bones"); model->errcode = M3D_ERR_VRTS; break; } + data += sizeof(m3dchunk_t); + model->numbone = 0; + data = _m3d_getidx(data, model->bi_s, &model->numbone); + if(model->numbone) { + model->bone = (m3db_t*)M3D_MALLOC(model->numbone * sizeof(m3db_t)); + if(!model->bone) goto memerr; + } + model->numskin = 0; + data = _m3d_getidx(data, model->sk_s, &model->numskin); + /* read bone hierarchy */ + for(i = 0; data < chunk && i < model->numbone; i++) { + data = _m3d_getidx(data, model->bi_s, &model->bone[i].parent); + M3D_GETSTR(model->bone[i].name); + data = _m3d_getidx(data, model->vi_s, &model->bone[i].pos); + data = _m3d_getidx(data, model->vi_s, &model->bone[i].ori); + model->bone[i].numweight = 0; + model->bone[i].weight = NULL; + } + if(i != model->numbone) { M3D_LOG("Truncated bone chunk"); model->numbone = i; model->numskin = 0; model->errcode = M3D_ERR_BONE; } + /* read skin definitions */ + if(model->numskin) { + model->skin = (m3ds_t*)M3D_MALLOC(model->numskin * sizeof(m3ds_t)); + if(!model->skin) goto memerr; + for(i = 0; data < chunk && i < model->numskin; i++) { + for(j = 0; j < M3D_NUMBONE; j++) { + model->skin[i].boneid[j] = M3D_UNDEF; + model->skin[i].weight[j] = (M3D_FLOAT)0.0; + } + memset(&weights, 0, sizeof(weights)); + if(model->nb_s == 1) weights[0] = 255; + else { + memcpy(&weights, data, model->nb_s); + data += model->nb_s; + } + for(j = 0, w = (M3D_FLOAT)0.0; j < (unsigned int)model->nb_s; j++) { + if(weights[j]) { + if(j >= M3D_NUMBONE) + data += model->bi_s; + else { + model->skin[i].weight[j] = (M3D_FLOAT)(weights[j]) / (M3D_FLOAT)255.0; + w += model->skin[i].weight[j]; + data = _m3d_getidx(data, model->bi_s, &model->skin[i].boneid[j]); + } + } + } + /* this can occur if model has more bones than what the importer is configured to handle */ + if(w != (M3D_FLOAT)1.0 && w != (M3D_FLOAT)0.0) { + for(j = 0; j < M3D_NUMBONE; j++) + model->skin[i].weight[j] /= w; + } + } + if(i != model->numskin) { M3D_LOG("Truncated skin in bone chunk"); model->numskin = i; model->errcode = M3D_ERR_BONE; } + } + } else + /* material */ + if(M3D_CHUNKMAGIC(data, 'M','T','R','L')) { + data += sizeof(m3dchunk_t); + M3D_GETSTR(name); + M3D_LOG("Material"); + M3D_LOG(name); + if(model->ci_s < 4 && !model->numcmap) model->errcode = M3D_ERR_CMAP; + for(i = 0; i < model->nummaterial; i++) + if(!strcmp(name, model->material[i].name)) { + model->errcode = M3D_ERR_MTRL; + M3D_LOG("Multiple definitions for material"); + M3D_LOG(name); + name = NULL; + break; + } + if(name) { + i = model->nummaterial++; + if(model->flags & M3D_FLG_MTLLIB) { + m = model->material; + model->material = (m3dm_t*)M3D_MALLOC(model->nummaterial * sizeof(m3dm_t)); + if(!model->material) goto memerr; + memcpy(model->material, m, (model->nummaterial - 1) * sizeof(m3dm_t)); + if(model->texture) { + tx = model->texture; + model->texture = (m3dtx_t*)M3D_MALLOC(model->numtexture * 
sizeof(m3dtx_t)); + if(!model->texture) goto memerr; + memcpy(model->texture, tx, model->numtexture * sizeof(m3dm_t)); + } + model->flags &= ~M3D_FLG_MTLLIB; + } else { + model->material = (m3dm_t*)M3D_REALLOC(model->material, model->nummaterial * sizeof(m3dm_t)); + if(!model->material) goto memerr; + } + m = &model->material[i]; + m->numprop = 0; + m->name = name; + m->prop = (m3dp_t*)M3D_MALLOC((len / 2) * sizeof(m3dp_t)); + if(!m->prop) goto memerr; + while(data < chunk) { + i = m->numprop++; + m->prop[i].type = *data++; + m->prop[i].value.num = 0; + if(m->prop[i].type >= 128) + k = m3dpf_map; + else { + for(k = 256, j = 0; j < sizeof(m3d_propertytypes)/sizeof(m3d_propertytypes[0]); j++) + if(m->prop[i].type == m3d_propertytypes[j].id) { k = m3d_propertytypes[j].format; break; } + } + switch(k) { + case m3dpf_color: + switch(model->ci_s) { + case 1: m->prop[i].value.color = model->cmap ? model->cmap[data[0]] : 0; data++; break; + case 2: m->prop[i].value.color = model->cmap ? model->cmap[*((uint16_t*)data)] : 0; data += 2; break; + case 4: m->prop[i].value.color = *((uint32_t*)data); data += 4; break; + } + break; + + case m3dpf_uint8: m->prop[i].value.num = *data++; break; + case m3dpf_uint16:m->prop[i].value.num = *((uint16_t*)data); data += 2; break; + case m3dpf_uint32:m->prop[i].value.num = *((uint32_t*)data); data += 4; break; + case m3dpf_float: m->prop[i].value.fnum = *((float*)data); data += 4; break; + + case m3dpf_map: + M3D_GETSTR(name); + m->prop[i].value.textureid = _m3d_gettx(model, readfilecb, freecb, name); + if(model->errcode == M3D_ERR_ALLOC) goto memerr; + /* this error code only returned if readfilecb was specified */ + if(m->prop[i].value.textureid == M3D_UNDEF) { + M3D_LOG("Texture not found"); + M3D_LOG(m->name); + m->numprop--; + } + break; + + default: + M3D_LOG("Unknown material property in"); + M3D_LOG(m->name); + model->errcode = M3D_ERR_UNKPROP; + data = chunk; + break; + } + } + m->prop = (m3dp_t*)M3D_REALLOC(m->prop, m->numprop * sizeof(m3dp_t)); + if(!m->prop) goto memerr; + } + } else + /* face */ + if(M3D_CHUNKMAGIC(data, 'P','R','O','C')) { + /* procedural surface */ + M3D_GETSTR(name); + M3D_LOG("Procedural surface"); + M3D_LOG(name); + _m3d_getpr(model, readfilecb, freecb, name); + } else + if(M3D_CHUNKMAGIC(data, 'M','E','S','H')) { + M3D_LOG("Mesh data"); + if(!model->vertex) { M3D_LOG("No vertex chunk before mesh"); model->errcode = M3D_ERR_VRTS; } + /* mesh */ + data += sizeof(m3dchunk_t); + mi = M3D_UNDEF; +#ifdef M3D_VERTEXMAX + pi = M3D_UNDEF; +#endif + am = model->numface; + while(data < chunk) { + k = *data++; + n = k >> 4; + k &= 15; + if(!n) { + if(!k) { + /* use material */ + mi = M3D_UNDEF; + M3D_GETSTR(name); + if(name) { + for(j = 0; j < model->nummaterial; j++) + if(!strcmp(name, model->material[j].name)) { + mi = (M3D_INDEX)j; + break; + } + if(mi == M3D_UNDEF) model->errcode = M3D_ERR_MTRL; + } + } else { + /* use parameter */ + M3D_GETSTR(name); +#ifdef M3D_VERTEXMAX + pi = M3D_UNDEF; + if(name) { + for(j = 0; j < model->numparam; j++) + if(!strcmp(name, model->param[j].name)) { + pi = (M3D_INDEX)j; + break; + } + if(pi == M3D_UNDEF) { + pi = model->numparam++; + model->param = (m3dvi_t*)M3D_REALLOC(model->param, model->numparam * sizeof(m3dvi_t)); + if(!model->param) goto memerr; + model->param[pi].name = name; + model->param[pi].count = 0; + } + } +#endif + } + continue; + } + if(n != 3) { M3D_LOG("Only triangle mesh supported for now"); model->errcode = M3D_ERR_UNKMESH; return model; } + i = model->numface++; + 
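/* grow the face array in 4096-face steps so it isn't reallocated for every triangle */ +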
if(model->numface > am) { + am = model->numface + 4095; + model->face = (m3df_t*)M3D_REALLOC(model->face, am * sizeof(m3df_t)); + if(!model->face) goto memerr; + } + memset(&model->face[i], 255, sizeof(m3df_t)); /* set all index to -1 by default */ + model->face[i].materialid = mi; +#ifdef M3D_VERTEXMAX + model->face[i].paramid = pi; +#endif + for(j = 0; data < chunk && j < n; j++) { + /* vertex */ + data = _m3d_getidx(data, model->vi_s, &model->face[i].vertex[j]); + /* texcoord */ + if(k & 1) + data = _m3d_getidx(data, model->ti_s, &model->face[i].texcoord[j]); + /* normal */ + if(k & 2) + data = _m3d_getidx(data, model->vi_s, &model->face[i].normal[j]); +#ifndef M3D_NONORMALS + if(model->face[i].normal[j] == M3D_UNDEF) neednorm = 1; +#endif + /* maximum */ + if(k & 4) +#ifdef M3D_VERTEXMAX + data = _m3d_getidx(data, model->vi_s, &model->face[i].vertmax[j]); +#else + data += model->vi_s; +#endif + } + if(j != n) { M3D_LOG("Invalid mesh"); model->numface = 0; model->errcode = M3D_ERR_UNKMESH; return model; } + } + model->face = (m3df_t*)M3D_REALLOC(model->face, model->numface * sizeof(m3df_t)); + } else + if(M3D_CHUNKMAGIC(data, 'V','O','X','T')) { + /* voxel types */ + M3D_LOG("Voxel types list"); + if(model->voxtype) { M3D_LOG("More voxel type chunks, should be unique"); model->errcode = M3D_ERR_VOXT; continue; } + if(model->ci_s && model->ci_s < 4 && !model->cmap) model->errcode = M3D_ERR_CMAP; + reclen = model->ci_s + model->si_s + 3 + model->sk_s; + k = len / reclen; + model->voxtype = (m3dvt_t*)M3D_MALLOC(k * sizeof(m3dvt_t)); + if(!model->voxtype) goto memerr; + memset(model->voxtype, 0, k * sizeof(m3dvt_t)); + model->numvoxtype = 0; + for(i = 0, data += sizeof(m3dchunk_t); data < chunk && i < k; i++) { + switch(model->ci_s) { + case 1: model->voxtype[i].color = model->cmap ? model->cmap[data[0]] : 0; data++; break; + case 2: model->voxtype[i].color = model->cmap ? 
model->cmap[*((uint16_t*)data)] : 0; data += 2; break; + case 4: model->voxtype[i].color = *((uint32_t*)data); data += 4; break; + /* case 8: break; */ + } + M3D_GETSTR(name); + model->voxtype[i].materialid = M3D_UNDEF; + if(name) { + model->voxtype[i].name = name; +/* + for(j = 0; j < model->nummaterial; j++) + if(!strcmp(name, model->material[j].name)) { + model->voxtype[i].materialid = (M3D_INDEX)j; + break; + } +*/ + } + j = *data++; + model->voxtype[i].rotation = j & 0xBF; + model->voxtype[i].voxshape = ((j & 0x40) << 2) | *data++; + model->voxtype[i].numitem = *data++; + model->voxtype[i].skinid = M3D_UNDEF; + data = _m3d_getidx(data, model->sk_s, &model->voxtype[i].skinid); + if(model->voxtype[i].numitem) { + model->voxtype[i].item = (m3dvi_t*)M3D_MALLOC(model->voxtype[i].numitem * sizeof(m3dvi_t)); + if(!model->voxtype[i].item) goto memerr; + memset(model->voxtype[i].item, 0, model->voxtype[i].numitem * sizeof(m3dvi_t)); + for(j = 0; j < model->voxtype[i].numitem; j++) { + model->voxtype[i].item[j].count = *data++; + model->voxtype[i].item[j].count |= (*data++) << 8; + M3D_GETSTR(model->voxtype[i].item[j].name); + } + } + } + model->numvoxtype = i; + if(k != model->numvoxtype) { + model->voxtype = (m3dvt_t*)M3D_REALLOC(model->voxtype, model->numvoxtype * sizeof(m3dvt_t)); + if(!model->voxtype) goto memerr; + } + } else + if(M3D_CHUNKMAGIC(data, 'V','O','X','D')) { + /* voxel data */ + data += sizeof(m3dchunk_t); + M3D_GETSTR(name); + M3D_LOG("Voxel Data Layer"); + M3D_LOG(name); + if(model->vd_s > 4 || model->vp_s > 2) { M3D_LOG("No voxel index size"); model->errcode = M3D_ERR_UNKVOX; continue; } + if(!model->voxtype) { M3D_LOG("No voxel type chunk before voxel data"); model->errcode = M3D_ERR_VOXT; } + i = model->numvoxel++; + model->voxel = (m3dvx_t*)M3D_REALLOC(model->voxel, model->numvoxel * sizeof(m3dvx_t)); + if(!model->voxel) goto memerr; + memset(&model->voxel[i], 0, sizeof(m3dvx_t)); + model->voxel[i].name = name; + switch(model->vd_s) { + case 1: + model->voxel[i].x = (int32_t)((int8_t)data[0]); + model->voxel[i].y = (int32_t)((int8_t)data[1]); + model->voxel[i].z = (int32_t)((int8_t)data[2]); + model->voxel[i].w = (uint32_t)(data[3]); + model->voxel[i].h = (uint32_t)(data[4]); + model->voxel[i].d = (uint32_t)(data[5]); + data += 6; + break; + case 2: + model->voxel[i].x = (int32_t)(*((int16_t*)(data+0))); + model->voxel[i].y = (int32_t)(*((int16_t*)(data+2))); + model->voxel[i].z = (int32_t)(*((int16_t*)(data+4))); + model->voxel[i].w = (uint32_t)(*((uint16_t*)(data+6))); + model->voxel[i].h = (uint32_t)(*((uint16_t*)(data+8))); + model->voxel[i].d = (uint32_t)(*((uint16_t*)(data+10))); + data += 12; + break; + case 4: + model->voxel[i].x = *((int32_t*)(data+0)); + model->voxel[i].y = *((int32_t*)(data+4)); + model->voxel[i].z = *((int32_t*)(data+8)); + model->voxel[i].w = *((uint32_t*)(data+12)); + model->voxel[i].h = *((uint32_t*)(data+16)); + model->voxel[i].d = *((uint32_t*)(data+20)); + data += 24; + break; + } + model->voxel[i].uncertain = *data++; + model->voxel[i].groupid = *data++; + k = model->voxel[i].w * model->voxel[i].h * model->voxel[i].d; + model->voxel[i].data = (M3D_VOXEL*)M3D_MALLOC(k * sizeof(M3D_VOXEL)); + if(!model->voxel[i].data) goto memerr; + memset(model->voxel[i].data, 0xff, k * sizeof(M3D_VOXEL)); + for(j = 0; data < chunk && j < k;) { + l = ((*data++) & 0x7F) + 1; + if(data[-1] & 0x80) { + data = _m3d_getidx(data, model->vp_s, &mi); + while(l-- && j < k) model->voxel[i].data[j++] = (M3D_VOXEL)mi; + } else + while(l-- && j < k) { + data = 
_m3d_getidx(data, model->vp_s, &mi); + model->voxel[i].data[j++] = (M3D_VOXEL)mi; + } + } + } else + if(M3D_CHUNKMAGIC(data, 'S','H','P','E')) { + /* mathematical shape */ + data += sizeof(m3dchunk_t); + M3D_GETSTR(name); + M3D_LOG("Mathematical Shape"); + M3D_LOG(name); + i = model->numshape++; + model->shape = (m3dh_t*)M3D_REALLOC(model->shape, model->numshape * sizeof(m3dh_t)); + if(!model->shape) goto memerr; + h = &model->shape[i]; + h->numcmd = 0; + h->cmd = NULL; + h->name = name; + h->group = M3D_UNDEF; + data = _m3d_getidx(data, model->bi_s, &h->group); + if(h->group != M3D_UNDEF && h->group >= model->numbone) { + M3D_LOG("Unknown bone id as shape group in shape"); + M3D_LOG(name); + h->group = M3D_UNDEF; + model->errcode = M3D_ERR_SHPE; + } + while(data < chunk) { + i = h->numcmd++; + h->cmd = (m3dc_t*)M3D_REALLOC(h->cmd, h->numcmd * sizeof(m3dc_t)); + if(!h->cmd) goto memerr; + h->cmd[i].type = *data++; + if(h->cmd[i].type & 0x80) { + h->cmd[i].type &= 0x7F; + h->cmd[i].type |= (*data++ << 7); + } + if(h->cmd[i].type >= (unsigned int)(sizeof(m3d_commandtypes)/sizeof(m3d_commandtypes[0]))) { + M3D_LOG("Unknown shape command in"); + M3D_LOG(h->name); + model->errcode = M3D_ERR_UNKCMD; + break; + } + cd = &m3d_commandtypes[h->cmd[i].type]; + h->cmd[i].arg = (uint32_t*)M3D_MALLOC(cd->p * sizeof(uint32_t)); + if(!h->cmd[i].arg) goto memerr; + memset(h->cmd[i].arg, 0, cd->p * sizeof(uint32_t)); + for(k = n = 0, l = cd->p; k < l; k++) + switch(cd->a[((k - n) % (cd->p - n)) + n]) { + case m3dcp_mi_t: + h->cmd[i].arg[k] = M3D_NOTDEFINED; + M3D_GETSTR(name); + if(name) { + for(n = 0; n < model->nummaterial; n++) + if(!strcmp(name, model->material[n].name)) { + h->cmd[i].arg[k] = n; + break; + } + if(h->cmd[i].arg[k] == M3D_NOTDEFINED) model->errcode = M3D_ERR_MTRL; + } + break; + case m3dcp_vc_t: + f = 0.0f; + switch(model->vc_s) { + case 1: f = (float)((int8_t)data[0]) / 127; break; + case 2: f = (float)(*((int16_t*)(data+0))) / 32767; break; + case 4: f = (float)(*((float*)(data+0))); break; + case 8: f = (float)(*((double*)(data+0))); break; + } + memcpy(&h->cmd[i].arg[k], &f, 4); + data += model->vc_s; + break; + case m3dcp_hi_t: data = _m3d_getidx(data, model->hi_s, &h->cmd[i].arg[k]); break; + case m3dcp_fi_t: data = _m3d_getidx(data, model->fi_s, &h->cmd[i].arg[k]); break; + case m3dcp_ti_t: data = _m3d_getidx(data, model->ti_s, &h->cmd[i].arg[k]); break; + case m3dcp_qi_t: + case m3dcp_vi_t: data = _m3d_getidx(data, model->vi_s, &h->cmd[i].arg[k]); break; + case m3dcp_i1_t: data = _m3d_getidx(data, 1, &h->cmd[i].arg[k]); break; + case m3dcp_i2_t: data = _m3d_getidx(data, 2, &h->cmd[i].arg[k]); break; + case m3dcp_i4_t: data = _m3d_getidx(data, 4, &h->cmd[i].arg[k]); break; + case m3dcp_va_t: data = _m3d_getidx(data, 4, &h->cmd[i].arg[k]); + n = k + 1; l += (h->cmd[i].arg[k] - 1) * (cd->p - k - 1); + h->cmd[i].arg = (uint32_t*)M3D_REALLOC(h->cmd[i].arg, l * sizeof(uint32_t)); + if(!h->cmd[i].arg) goto memerr; + memset(&h->cmd[i].arg[k + 1], 0, (l - k - 1) * sizeof(uint32_t)); + break; + } + } + } else + /* annotation label list */ + if(M3D_CHUNKMAGIC(data, 'L','B','L','S')) { + data += sizeof(m3dchunk_t); + M3D_GETSTR(name); + M3D_GETSTR(lang); + M3D_LOG("Label list"); + if(name) { M3D_LOG(name); } + if(lang) { M3D_LOG(lang); } + if(model->ci_s && model->ci_s < 4 && !model->cmap) model->errcode = M3D_ERR_CMAP; + k = 0; + switch(model->ci_s) { + case 1: k = model->cmap ? model->cmap[data[0]] : 0; data++; break; + case 2: k = model->cmap ? 
model->cmap[*((uint16_t*)data)] : 0; data += 2; break; + case 4: k = *((uint32_t*)data); data += 4; break; + /* case 8: break; */ + } + reclen = model->vi_s + model->si_s; + i = model->numlabel; model->numlabel += len / reclen; + model->label = (m3dl_t*)M3D_REALLOC(model->label, model->numlabel * sizeof(m3dl_t)); + if(!model->label) goto memerr; + memset(&model->label[i], 0, (model->numlabel - i) * sizeof(m3dl_t)); + for(; data < chunk && i < model->numlabel; i++) { + model->label[i].name = name; + model->label[i].lang = lang; + model->label[i].color = k; + data = _m3d_getidx(data, model->vi_s, &model->label[i].vertexid); + M3D_GETSTR(model->label[i].text); + } + } else + /* action */ + if(M3D_CHUNKMAGIC(data, 'A','C','T','N')) { + M3D_LOG("Action"); + i = model->numaction++; + model->action = (m3da_t*)M3D_REALLOC(model->action, model->numaction * sizeof(m3da_t)); + if(!model->action) goto memerr; + a = &model->action[i]; + data += sizeof(m3dchunk_t); + M3D_GETSTR(a->name); + M3D_LOG(a->name); + a->numframe = *((uint16_t*)data); data += 2; + if(a->numframe < 1) { + model->numaction--; + } else { + a->durationmsec = *((uint32_t*)data); data += 4; + a->frame = (m3dfr_t*)M3D_MALLOC(a->numframe * sizeof(m3dfr_t)); + if(!a->frame) goto memerr; + for(i = 0; data < chunk && i < a->numframe; i++) { + a->frame[i].msec = *((uint32_t*)data); data += 4; + a->frame[i].numtransform = 0; a->frame[i].transform = NULL; + data = _m3d_getidx(data, model->fc_s, &a->frame[i].numtransform); + if(a->frame[i].numtransform > 0) { + a->frame[i].transform = (m3dtr_t*)M3D_MALLOC(a->frame[i].numtransform * sizeof(m3dtr_t)); + for(j = 0; j < a->frame[i].numtransform; j++) { + data = _m3d_getidx(data, model->bi_s, &a->frame[i].transform[j].boneid); + data = _m3d_getidx(data, model->vi_s, &a->frame[i].transform[j].pos); + data = _m3d_getidx(data, model->vi_s, &a->frame[i].transform[j].ori); + } + } + } + } + } else { + i = model->numextra++; + model->extra = (m3dchunk_t**)M3D_REALLOC(model->extra, model->numextra * sizeof(m3dchunk_t*)); + if(!model->extra) goto memerr; + model->extra[i] = (m3dchunk_t*)data; + } + } + /* calculate normals, normalize skin weights, create bone/vertex cross-references and calculate transform matrices */ +#ifdef M3D_ASCII +postprocess: +#endif + if(model) { + M3D_LOG("Post-process"); +#ifdef M3D_PROFILING + gettimeofday(&tv1, NULL); + tvd.tv_sec = tv1.tv_sec - tv0.tv_sec; + tvd.tv_usec = tv1.tv_usec - tv0.tv_usec; + if(tvd.tv_usec < 0) { tvd.tv_sec--; tvd.tv_usec += 1000000L; } + printf(" Parsing chunks %ld.%06ld sec\n", tvd.tv_sec, tvd.tv_usec); +#endif +#ifndef M3D_NOVOXELS + if(model->numvoxel && model->voxel) { + M3D_LOG("Converting voxels into vertices and mesh"); + /* add normals */ + enorm = model->numvertex; model->numvertex += 6; + model->vertex = (m3dv_t*)M3D_REALLOC(model->vertex, model->numvertex * sizeof(m3dv_t)); + if(!model->vertex) goto memerr; + memset(&model->vertex[enorm], 0, 6 * sizeof(m3dv_t)); + for(l = 0; l < 6; l++) + model->vertex[enorm+l].skinid = M3D_UNDEF; + model->vertex[enorm+0].y = (M3D_FLOAT)-1.0; + model->vertex[enorm+1].z = (M3D_FLOAT)-1.0; + model->vertex[enorm+2].x = (M3D_FLOAT)-1.0; + model->vertex[enorm+3].y = (M3D_FLOAT)1.0; + model->vertex[enorm+4].z = (M3D_FLOAT)1.0; + model->vertex[enorm+5].x = (M3D_FLOAT)1.0; + /* this is a fast, not so memory efficient version, only basic face culling used */ + min_x = min_y = min_z = 2147483647L; + max_x = max_y = max_z = -2147483647L; + for(i = 0; i < model->numvoxel; i++) { + if(model->voxel[i].x + 
(int32_t)model->voxel[i].w > max_x) max_x = model->voxel[i].x + (int32_t)model->voxel[i].w; + if(model->voxel[i].x < min_x) min_x = model->voxel[i].x; + if(model->voxel[i].y + (int32_t)model->voxel[i].h > max_y) max_y = model->voxel[i].y + (int32_t)model->voxel[i].h; + if(model->voxel[i].y < min_y) min_y = model->voxel[i].y; + if(model->voxel[i].z + (int32_t)model->voxel[i].d > max_z) max_z = model->voxel[i].z + (int32_t)model->voxel[i].d; + if(model->voxel[i].z < min_z) min_z = model->voxel[i].z; + } + i = (-min_x > max_x ? -min_x : max_x); + j = (-min_y > max_y ? -min_y : max_y); + k = (-min_z > max_z ? -min_z : max_z); + if(j > i) i = j; + if(k > i) i = k; + if(i <= 1) i = 1; + w = (M3D_FLOAT)1.0 / (M3D_FLOAT)i; + if(i >= 254) model->vc_s = 2; + if(i >= 65534) model->vc_s = 4; + for(i = 0; i < model->numvoxel; i++) { + sx = model->voxel[i].w; sz = model->voxel[i].d; sy = model->voxel[i].h; + for(y = 0, j = 0; y < sy; y++) + for(z = 0; z < sz; z++) + for(x = 0; x < sx; x++, j++) + if(model->voxel[i].data[j] < model->numvoxtype) { + k = 0; + /* 16__32 ____ + * /| /| /|2 /| + *64_128 | /_8_/ 32 + * | 1_|_2 |4|_|_| + * |/ |/ |/ 1|/ + * 4___8 |16_| */ + k = n = am = 0; + if(!y || model->voxel[i].data[j - sx*sz] >= model->numvoxtype) { n++; am |= 1; k |= 1|2|4|8; } + if(!z || model->voxel[i].data[j - sx] >= model->numvoxtype) { n++; am |= 2; k |= 1|2|16|32; } + if(!x || model->voxel[i].data[j - 1] >= model->numvoxtype) { n++; am |= 4; k |= 1|4|16|64; } + if(y == sy-1 || model->voxel[i].data[j + sx*sz] >= model->numvoxtype) { n++; am |= 8; k |= 16|32|64|128; } + if(z == sz-1 || model->voxel[i].data[j + sx] >= model->numvoxtype) { n++; am |= 16; k |= 4|8|64|128; } + if(x == sx-1 || model->voxel[i].data[j + 1] >= model->numvoxtype) { n++; am |= 32; k |= 2|8|32|128; } + if(k) { + memset(edge, 255, sizeof(edge)); + for(l = 0, len = 1, reclen = model->numvertex; l < 8; l++, len <<= 1) + if(k & len) edge[l] = model->numvertex++; + model->vertex = (m3dv_t*)M3D_REALLOC(model->vertex, model->numvertex * sizeof(m3dv_t)); + if(!model->vertex) goto memerr; + memset(&model->vertex[reclen], 0, (model->numvertex-reclen) * sizeof(m3dv_t)); + for(l = reclen; l < model->numvertex; l++) { + model->vertex[l].skinid = model->voxtype[model->voxel[i].data[j]].skinid; + model->vertex[l].color = model->voxtype[model->voxel[i].data[j]].color; + } + l = reclen; + if(k & 1) { + model->vertex[l].x = (model->voxel[i].x + x) * w; + model->vertex[l].y = (model->voxel[i].y + y) * w; + model->vertex[l].z = (model->voxel[i].z + z) * w; + l++; + } + if(k & 2) { + model->vertex[l].x = (model->voxel[i].x + x + 1) * w; + model->vertex[l].y = (model->voxel[i].y + y) * w; + model->vertex[l].z = (model->voxel[i].z + z) * w; + l++; + } + if(k & 4) { + model->vertex[l].x = (model->voxel[i].x + x) * w; + model->vertex[l].y = (model->voxel[i].y + y) * w; + model->vertex[l].z = (model->voxel[i].z + z + 1) * w; + l++; + } + if(k & 8) { + model->vertex[l].x = (model->voxel[i].x + x + 1) * w; + model->vertex[l].y = (model->voxel[i].y + y) * w; + model->vertex[l].z = (model->voxel[i].z + z + 1) * w; + l++; + } + if(k & 16) { + model->vertex[l].x = (model->voxel[i].x + x) * w; + model->vertex[l].y = (model->voxel[i].y + y + 1) * w; + model->vertex[l].z = (model->voxel[i].z + z) * w; + l++; + } + if(k & 32) { + model->vertex[l].x = (model->voxel[i].x + x + 1) * w; + model->vertex[l].y = (model->voxel[i].y + y + 1) * w; + model->vertex[l].z = (model->voxel[i].z + z) * w; + l++; + } + if(k & 64) { + model->vertex[l].x = (model->voxel[i].x + x) * 
w; + model->vertex[l].y = (model->voxel[i].y + y + 1) * w; + model->vertex[l].z = (model->voxel[i].z + z + 1) * w; + l++; + } + if(k & 128) { + model->vertex[l].x = (model->voxel[i].x + x + 1) * w; + model->vertex[l].y = (model->voxel[i].y + y + 1) * w; + model->vertex[l].z = (model->voxel[i].z + z + 1) * w; + l++; + } + n <<= 1; + l = model->numface; model->numface += n; + model->face = (m3df_t*)M3D_REALLOC(model->face, model->numface * sizeof(m3df_t)); + if(!model->face) goto memerr; + memset(&model->face[l], 255, n * sizeof(m3df_t)); + for(reclen = l; reclen < model->numface; reclen++) + model->face[reclen].materialid = model->voxtype[model->voxel[i].data[j]].materialid; + if(am & 1) { /* bottom */ + model->face[l].vertex[0] = edge[0]; model->face[l].vertex[1] = edge[1]; model->face[l].vertex[2] = edge[2]; + model->face[l+1].vertex[0] = edge[2]; model->face[l+1].vertex[1] = edge[1]; model->face[l+1].vertex[2] = edge[3]; + model->face[l].normal[0] = model->face[l].normal[1] = model->face[l].normal[2] = + model->face[l+1].normal[0] = model->face[l+1].normal[1] = model->face[l+1].normal[2] = enorm; + l += 2; + } + if(am & 2) { /* north */ + model->face[l].vertex[0] = edge[0]; model->face[l].vertex[1] = edge[4]; model->face[l].vertex[2] = edge[1]; + model->face[l+1].vertex[0] = edge[1]; model->face[l+1].vertex[1] = edge[4]; model->face[l+1].vertex[2] = edge[5]; + model->face[l].normal[0] = model->face[l].normal[1] = model->face[l].normal[2] = + model->face[l+1].normal[0] = model->face[l+1].normal[1] = model->face[l+1].normal[2] = enorm+1; + l += 2; + } + if(am & 4) { /* west */ + model->face[l].vertex[0] = edge[0]; model->face[l].vertex[1] = edge[2]; model->face[l].vertex[2] = edge[4]; + model->face[l+1].vertex[0] = edge[2]; model->face[l+1].vertex[1] = edge[6]; model->face[l+1].vertex[2] = edge[4]; + model->face[l].normal[0] = model->face[l].normal[1] = model->face[l].normal[2] = + model->face[l+1].normal[0] = model->face[l+1].normal[1] = model->face[l+1].normal[2] = enorm+2; + l += 2; + } + if(am & 8) { /* top */ + model->face[l].vertex[0] = edge[4]; model->face[l].vertex[1] = edge[6]; model->face[l].vertex[2] = edge[5]; + model->face[l+1].vertex[0] = edge[5]; model->face[l+1].vertex[1] = edge[6]; model->face[l+1].vertex[2] = edge[7]; + model->face[l].normal[0] = model->face[l].normal[1] = model->face[l].normal[2] = + model->face[l+1].normal[0] = model->face[l+1].normal[1] = model->face[l+1].normal[2] = enorm+3; + l += 2; + } + if(am & 16) { /* south */ + model->face[l].vertex[0] = edge[2]; model->face[l].vertex[1] = edge[7]; model->face[l].vertex[2] = edge[6]; + model->face[l+1].vertex[0] = edge[7]; model->face[l+1].vertex[1] = edge[2]; model->face[l+1].vertex[2] = edge[3]; + model->face[l].normal[0] = model->face[l].normal[1] = model->face[l].normal[2] = + model->face[l+1].normal[0] = model->face[l+1].normal[1] = model->face[l+1].normal[2] = enorm+4; + l += 2; + } + if(am & 32) { /* east */ + model->face[l].vertex[0] = edge[1]; model->face[l].vertex[1] = edge[5]; model->face[l].vertex[2] = edge[7]; + model->face[l+1].vertex[0] = edge[1]; model->face[l+1].vertex[1] = edge[7]; model->face[l+1].vertex[2] = edge[3]; + model->face[l].normal[0] = model->face[l].normal[1] = model->face[l].normal[2] = + model->face[l+1].normal[0] = model->face[l+1].normal[1] = model->face[l+1].normal[2] = enorm+5; + l += 2; + } + } + } + } + } +#endif +#ifndef M3D_NONORMALS + if(model->numface && model->face && neednorm) { + /* if they are missing, calculate triangle normals into a temporary buffer */ + norm = 
(m3dv_t*)M3D_MALLOC(model->numface * sizeof(m3dv_t)); + if(!norm) goto memerr; + for(i = 0, n = model->numvertex; i < model->numface; i++) + if(model->face[i].normal[0] == M3D_UNDEF) { + v0 = &model->vertex[model->face[i].vertex[0]]; + v1 = &model->vertex[model->face[i].vertex[1]]; + v2 = &model->vertex[model->face[i].vertex[2]]; + va.x = v1->x - v0->x; va.y = v1->y - v0->y; va.z = v1->z - v0->z; + vb.x = v2->x - v0->x; vb.y = v2->y - v0->y; vb.z = v2->z - v0->z; + v0 = &norm[i]; + v0->x = (va.y * vb.z) - (va.z * vb.y); + v0->y = (va.z * vb.x) - (va.x * vb.z); + v0->z = (va.x * vb.y) - (va.y * vb.x); + w = _m3d_rsq((v0->x * v0->x) + (v0->y * v0->y) + (v0->z * v0->z)); + v0->x *= w; v0->y *= w; v0->z *= w; + model->face[i].normal[0] = model->face[i].vertex[0] + n; + model->face[i].normal[1] = model->face[i].vertex[1] + n; + model->face[i].normal[2] = model->face[i].vertex[2] + n; + } + /* this is the fast way, we don't care if a normal is repeated in model->vertex */ + M3D_LOG("Generating normals"); + model->flags |= M3D_FLG_GENNORM; + model->numvertex <<= 1; + model->vertex = (m3dv_t*)M3D_REALLOC(model->vertex, model->numvertex * sizeof(m3dv_t)); + if(!model->vertex) goto memerr; + memset(&model->vertex[n], 0, n * sizeof(m3dv_t)); + for(i = 0; i < model->numface; i++) + for(j = 0; j < 3; j++) { + v0 = &model->vertex[model->face[i].vertex[j] + n]; + v0->x += norm[i].x; + v0->y += norm[i].y; + v0->z += norm[i].z; + } + /* for each vertex, take the average of the temporary normals and use that */ + for(i = 0, v0 = &model->vertex[n]; i < n; i++, v0++) { + w = _m3d_rsq((v0->x * v0->x) + (v0->y * v0->y) + (v0->z * v0->z)); + v0->x *= w; v0->y *= w; v0->z *= w; + v0->skinid = M3D_UNDEF; + } + M3D_FREE(norm); + } +#endif + if(model->numbone && model->bone && model->numskin && model->skin && model->numvertex && model->vertex) { +#ifndef M3D_NOWEIGHTS + M3D_LOG("Generating weight cross-reference"); + for(i = 0; i < model->numvertex; i++) { + if(model->vertex[i].skinid < model->numskin) { + sk = &model->skin[model->vertex[i].skinid]; + w = (M3D_FLOAT)0.0; + for(j = 0; j < M3D_NUMBONE && sk->boneid[j] != M3D_UNDEF && sk->weight[j] > (M3D_FLOAT)0.0; j++) + w += sk->weight[j]; + for(j = 0; j < M3D_NUMBONE && sk->boneid[j] != M3D_UNDEF && sk->weight[j] > (M3D_FLOAT)0.0; j++) { + sk->weight[j] /= w; + b = &model->bone[sk->boneid[j]]; + k = b->numweight++; + b->weight = (m3dw_t*)M3D_REALLOC(b->weight, b->numweight * sizeof(m3da_t)); + if(!b->weight) goto memerr; + b->weight[k].vertexid = i; + b->weight[k].weight = sk->weight[j]; + } + } + } +#endif +#ifndef M3D_NOANIMATION + M3D_LOG("Calculating bone transformation matrices"); + for(i = 0; i < model->numbone; i++) { + b = &model->bone[i]; + if(model->bone[i].parent == M3D_UNDEF) { + _m3d_mat((M3D_FLOAT*)&b->mat4, &model->vertex[b->pos], &model->vertex[b->ori]); + } else { + _m3d_mat((M3D_FLOAT*)&r, &model->vertex[b->pos], &model->vertex[b->ori]); + _m3d_mul((M3D_FLOAT*)&b->mat4, (M3D_FLOAT*)&model->bone[b->parent].mat4, (M3D_FLOAT*)&r); + } + } + for(i = 0; i < model->numbone; i++) + _m3d_inv((M3D_FLOAT*)&model->bone[i].mat4); +#endif + } +#ifdef M3D_PROFILING + gettimeofday(&tv0, NULL); + tvd.tv_sec = tv0.tv_sec - tv1.tv_sec; + tvd.tv_usec = tv0.tv_usec - tv1.tv_usec; + if(tvd.tv_usec < 0) { tvd.tv_sec--; tvd.tv_usec += 1000000L; } + printf(" Post-process %ld.%06ld sec\n", tvd.tv_sec, tvd.tv_usec); +#endif + } + return model; +} + +/** + * Calculates skeletons for animation frames, returns a working copy (should be freed after use) + */ +m3dtr_t 
*m3d_frame(m3d_t *model, M3D_INDEX actionid, M3D_INDEX frameid, m3dtr_t *skeleton) +{ + unsigned int i; + M3D_INDEX s = frameid; + m3dfr_t *fr; + + if(!model || !model->numbone || !model->bone || (actionid != M3D_UNDEF && (!model->action || + actionid >= model->numaction || frameid >= model->action[actionid].numframe))) { + model->errcode = M3D_ERR_UNKFRAME; + return skeleton; + } + model->errcode = M3D_SUCCESS; + if(!skeleton) { + skeleton = (m3dtr_t*)M3D_MALLOC(model->numbone * sizeof(m3dtr_t)); + if(!skeleton) { + model->errcode = M3D_ERR_ALLOC; + return NULL; + } + goto gen; + } + if(actionid == M3D_UNDEF || !frameid) { +gen: s = 0; + for(i = 0; i < model->numbone; i++) { + skeleton[i].boneid = i; + skeleton[i].pos = model->bone[i].pos; + skeleton[i].ori = model->bone[i].ori; + } + } + if(actionid < model->numaction && (frameid || !model->action[actionid].frame[0].msec)) { + for(; s <= frameid; s++) { + fr = &model->action[actionid].frame[s]; + for(i = 0; i < fr->numtransform; i++) { + skeleton[fr->transform[i].boneid].pos = fr->transform[i].pos; + skeleton[fr->transform[i].boneid].ori = fr->transform[i].ori; + } + } + } + return skeleton; +} + +#ifndef M3D_NOANIMATION +/** + * Returns interpolated animation-pose, a working copy (should be freed after use) + */ +m3db_t *m3d_pose(m3d_t *model, M3D_INDEX actionid, uint32_t msec) +{ + unsigned int i, j, l; + M3D_FLOAT r[16], t, c, d, s; + m3db_t *ret; + m3dv_t *v, *p, *f; + m3dtr_t *tmp; + m3dfr_t *fr; + + if(!model || !model->numbone || !model->bone) { + model->errcode = M3D_ERR_UNKFRAME; + return NULL; + } + ret = (m3db_t*)M3D_MALLOC(model->numbone * sizeof(m3db_t)); + if(!ret) { + model->errcode = M3D_ERR_ALLOC; + return NULL; + } + memcpy(ret, model->bone, model->numbone * sizeof(m3db_t)); + for(i = 0; i < model->numbone; i++) + _m3d_inv((M3D_FLOAT*)&ret[i].mat4); + if(!model->action || actionid >= model->numaction) { + model->errcode = M3D_ERR_UNKFRAME; + return ret; + } + msec %= model->action[actionid].durationmsec; + model->errcode = M3D_SUCCESS; + fr = &model->action[actionid].frame[0]; + for(j = l = 0; j < model->action[actionid].numframe && model->action[actionid].frame[j].msec <= msec; j++) { + fr = &model->action[actionid].frame[j]; + l = fr->msec; + for(i = 0; i < fr->numtransform; i++) { + ret[fr->transform[i].boneid].pos = fr->transform[i].pos; + ret[fr->transform[i].boneid].ori = fr->transform[i].ori; + } + } + if(l != msec) { + model->vertex = (m3dv_t*)M3D_REALLOC(model->vertex, (model->numvertex + 2 * model->numbone) * sizeof(m3dv_t)); + if(!model->vertex) { + M3D_FREE(ret); + model->errcode = M3D_ERR_ALLOC; + return NULL; + } + tmp = (m3dtr_t*)M3D_MALLOC(model->numbone * sizeof(m3dtr_t)); + if(tmp) { + for(i = 0; i < model->numbone; i++) { + tmp[i].pos = ret[i].pos; + tmp[i].ori = ret[i].ori; + } + fr = &model->action[actionid].frame[j % model->action[actionid].numframe]; + t = l >= fr->msec ? 
(M3D_FLOAT)1.0 : (M3D_FLOAT)(msec - l) / (M3D_FLOAT)(fr->msec - l); + for(i = 0; i < fr->numtransform; i++) { + tmp[fr->transform[i].boneid].pos = fr->transform[i].pos; + tmp[fr->transform[i].boneid].ori = fr->transform[i].ori; + } + for(i = 0, j = model->numvertex; i < model->numbone; i++) { + /* interpolation of position */ + if(ret[i].pos != tmp[i].pos) { + p = &model->vertex[ret[i].pos]; + f = &model->vertex[tmp[i].pos]; + v = &model->vertex[j]; + v->x = p->x + t * (f->x - p->x); + v->y = p->y + t * (f->y - p->y); + v->z = p->z + t * (f->z - p->z); + ret[i].pos = j++; + } + /* interpolation of orientation */ + if(ret[i].ori != tmp[i].ori) { + p = &model->vertex[ret[i].ori]; + f = &model->vertex[tmp[i].ori]; + v = &model->vertex[j]; + d = p->w * f->w + p->x * f->x + p->y * f->y + p->z * f->z; + if(d < 0) { d = -d; s = (M3D_FLOAT)-1.0; } else s = (M3D_FLOAT)1.0; +#if 0 + /* don't use SLERP, requires two more variables, libm linkage and it is slow (but nice) */ + a = (M3D_FLOAT)1.0 - t; b = t; + if(d < (M3D_FLOAT)0.999999) { c = acosf(d); b = 1 / sinf(c); a = sinf(a * c) * b; b *= sinf(t * c) * s; } + v->x = p->x * a + f->x * b; + v->y = p->y * a + f->y * b; + v->z = p->z * a + f->z * b; + v->w = p->w * a + f->w * b; +#else + /* approximated NLERP, original approximation by Arseny Kapoulkine, heavily optimized by me */ + c = t - (M3D_FLOAT)0.5; t += t * c * (t - (M3D_FLOAT)1.0) * (((M3D_FLOAT)1.0904 + d * ((M3D_FLOAT)-3.2452 + + d * ((M3D_FLOAT)3.55645 - d * (M3D_FLOAT)1.43519))) * c * c + ((M3D_FLOAT)0.848013 + d * + ((M3D_FLOAT)-1.06021 + d * (M3D_FLOAT)0.215638))); + v->x = p->x + t * (s * f->x - p->x); + v->y = p->y + t * (s * f->y - p->y); + v->z = p->z + t * (s * f->z - p->z); + v->w = p->w + t * (s * f->w - p->w); + d = _m3d_rsq(v->w * v->w + v->x * v->x + v->y * v->y + v->z * v->z); + v->x *= d; v->y *= d; v->z *= d; v->w *= d; +#endif + ret[i].ori = j++; + } + } + M3D_FREE(tmp); + } + } + for(i = 0; i < model->numbone; i++) { + if(ret[i].parent == M3D_UNDEF) { + _m3d_mat((M3D_FLOAT*)&ret[i].mat4, &model->vertex[ret[i].pos], &model->vertex[ret[i].ori]); + } else { + _m3d_mat((M3D_FLOAT*)&r, &model->vertex[ret[i].pos], &model->vertex[ret[i].ori]); + _m3d_mul((M3D_FLOAT*)&ret[i].mat4, (M3D_FLOAT*)&ret[ret[i].parent].mat4, (M3D_FLOAT*)&r); + } + } + return ret; +} + +#endif /* M3D_NOANIMATION */ + +#endif /* M3D_IMPLEMENTATION */ + +#if !defined(M3D_NODUP) && (!defined(M3D_NOIMPORTER) || defined(M3D_EXPORTER)) +/** + * Free the in-memory model + */ +void m3d_free(m3d_t *model) +{ + unsigned int i, j; + + if(!model) return; +#ifdef M3D_ASCII + /* if model imported from ASCII, we have to free all strings as well */ + if(model->flags & M3D_FLG_FREESTR) { + if(model->name) M3D_FREE(model->name); + if(model->license) M3D_FREE(model->license); + if(model->author) M3D_FREE(model->author); + if(model->desc) M3D_FREE(model->desc); + if(model->bone) + for(i = 0; i < model->numbone; i++) + if(model->bone[i].name) + M3D_FREE(model->bone[i].name); + if(model->shape) + for(i = 0; i < model->numshape; i++) + if(model->shape[i].name) + M3D_FREE(model->shape[i].name); + if(model->numvoxtype) + for(i = 0; i < model->numvoxtype; i++) { + if(model->voxtype[i].name) + M3D_FREE(model->voxtype[i].name); + for(j = 0; j < model->voxtype[i].numitem; j++) + if(model->voxtype[i].item[j].name) + M3D_FREE(model->voxtype[i].item[j].name); + } + if(model->numvoxel) + for(i = 0; i < model->numvoxel; i++) + if(model->voxel[i].name) + M3D_FREE(model->voxel[i].name); + if(model->material) + for(i = 0; i < 
model->nummaterial; i++) + if(model->material[i].name) + M3D_FREE(model->material[i].name); + if(model->action) + for(i = 0; i < model->numaction; i++) + if(model->action[i].name) + M3D_FREE(model->action[i].name); + if(model->texture) + for(i = 0; i < model->numtexture; i++) + if(model->texture[i].name) + M3D_FREE(model->texture[i].name); + if(model->inlined) + for(i = 0; i < model->numinlined; i++) { + if(model->inlined[i].name) + M3D_FREE(model->inlined[i].name); + if(model->inlined[i].data) + M3D_FREE(model->inlined[i].data); + } + if(model->extra) + for(i = 0; i < model->numextra; i++) + if(model->extra[i]) + M3D_FREE(model->extra[i]); + if(model->label) + for(i = 0; i < model->numlabel; i++) { + if(model->label[i].name) { + for(j = i + 1; j < model->numlabel; j++) + if(model->label[j].name == model->label[i].name) + model->label[j].name = NULL; + M3D_FREE(model->label[i].name); + } + if(model->label[i].lang) { + for(j = i + 1; j < model->numlabel; j++) + if(model->label[j].lang == model->label[i].lang) + model->label[j].lang = NULL; + M3D_FREE(model->label[i].lang); + } + if(model->label[i].text) + M3D_FREE(model->label[i].text); + } + if(model->preview.data) + M3D_FREE(model->preview.data); + } +#endif + if(model->flags & M3D_FLG_FREERAW) M3D_FREE(model->raw); + + if(model->tmap) M3D_FREE(model->tmap); + if(model->bone) { + for(i = 0; i < model->numbone; i++) + if(model->bone[i].weight) + M3D_FREE(model->bone[i].weight); + M3D_FREE(model->bone); + } + if(model->skin) M3D_FREE(model->skin); + if(model->vertex) M3D_FREE(model->vertex); + if(model->face) M3D_FREE(model->face); + if(model->voxtype) { + for(i = 0; i < model->numvoxtype; i++) + if(model->voxtype[i].item) + M3D_FREE(model->voxtype[i].item); + M3D_FREE(model->voxtype); + } + if(model->voxel) { + for(i = 0; i < model->numvoxel; i++) + if(model->voxel[i].data) + M3D_FREE(model->voxel[i].data); + M3D_FREE(model->voxel); + } + if(model->shape) { + for(i = 0; i < model->numshape; i++) { + if(model->shape[i].cmd) { + for(j = 0; j < model->shape[i].numcmd; j++) + if(model->shape[i].cmd[j].arg) M3D_FREE(model->shape[i].cmd[j].arg); + M3D_FREE(model->shape[i].cmd); + } + } + M3D_FREE(model->shape); + } + if(model->material && !(model->flags & M3D_FLG_MTLLIB)) { + for(i = 0; i < model->nummaterial; i++) + if(model->material[i].prop) M3D_FREE(model->material[i].prop); + M3D_FREE(model->material); + } + if(model->texture) { + for(i = 0; i < model->numtexture; i++) + if(model->texture[i].d) M3D_FREE(model->texture[i].d); + M3D_FREE(model->texture); + } + if(model->action) { + for(i = 0; i < model->numaction; i++) { + if(model->action[i].frame) { + for(j = 0; j < model->action[i].numframe; j++) + if(model->action[i].frame[j].transform) M3D_FREE(model->action[i].frame[j].transform); + M3D_FREE(model->action[i].frame); + } + } + M3D_FREE(model->action); + } + if(model->label) M3D_FREE(model->label); + if(model->inlined) M3D_FREE(model->inlined); + if(model->extra) M3D_FREE(model->extra); + M3D_FREE(model); +} +#endif + +#ifdef M3D_EXPORTER +typedef struct { + char *str; + uint32_t offs; +} m3dstr_t; + +typedef struct { + m3dti_t data; + M3D_INDEX oldidx; + M3D_INDEX newidx; +} m3dtisave_t; + +typedef struct { + m3dv_t data; + M3D_INDEX oldidx; + M3D_INDEX newidx; + unsigned char norm; +} m3dvsave_t; + +typedef struct { + m3ds_t data; + M3D_INDEX oldidx; + M3D_INDEX newidx; +} m3dssave_t; + +typedef struct { + m3df_t data; + int group; + uint8_t opacity; +} m3dfsave_t; + +/* create unique list of strings */ +static m3dstr_t 
*_m3d_addstr(m3dstr_t *str, uint32_t *numstr, char *s) +{ + uint32_t i; + if(!s || !*s) return str; + if(str) { + for(i = 0; i < *numstr; i++) + if(str[i].str == s || !strcmp(str[i].str, s)) return str; + } + str = (m3dstr_t*)M3D_REALLOC(str, ((*numstr) + 1) * sizeof(m3dstr_t)); + str[*numstr].str = s; + str[*numstr].offs = 0; + (*numstr)++; + return str; +} + +/* add strings to header */ +m3dhdr_t *_m3d_addhdr(m3dhdr_t *h, m3dstr_t *s) +{ + int i; + char *safe = _m3d_safestr(s->str, 0); + i = (int)strlen(safe); + h = (m3dhdr_t*)M3D_REALLOC(h, h->length + i+1); + if(!h) { M3D_FREE(safe); return NULL; } + memcpy((uint8_t*)h + h->length, safe, i+1); + s->offs = h->length - 16; + h->length += i+1; + M3D_FREE(safe); + return h; +} + +/* return offset of string */ +static uint32_t _m3d_stridx(m3dstr_t *str, uint32_t numstr, char *s) +{ + uint32_t i; + char *safe; + if(!s || !*s) return 0; + if(str) { + safe = _m3d_safestr(s, 0); + if(!safe) return 0; + if(!*safe) { + M3D_FREE(safe); + return 0; + } + for(i = 0; i < numstr; i++) + if(!strcmp(str[i].str, s)) { + M3D_FREE(safe); + return str[i].offs; + } + M3D_FREE(safe); + } + return 0; +} + +/* compare to faces by their material */ +static int _m3d_facecmp(const void *a, const void *b) { + const m3dfsave_t *A = (const m3dfsave_t*)a, *B = (const m3dfsave_t*)b; + return A->group != B->group ? A->group - B->group : (A->opacity != B->opacity ? (int)B->opacity - (int)A->opacity : + (int)A->data.materialid - (int)B->data.materialid); +} +/* compare face groups */ +static int _m3d_grpcmp(const void *a, const void *b) { return *((uint32_t*)a) - *((uint32_t*)b); } +/* compare UVs */ +static int _m3d_ticmp(const void *a, const void *b) { return memcmp(a, b, sizeof(m3dti_t)); } +/* compare skin groups */ +static int _m3d_skincmp(const void *a, const void *b) { return memcmp(a, b, sizeof(m3ds_t)); } +/* compare vertices */ +static int _m3d_vrtxcmp(const void *a, const void *b) { + int c = memcmp(a, b, 3 * sizeof(M3D_FLOAT)); + if(c) return c; + c = ((m3dvsave_t*)a)->norm - ((m3dvsave_t*)b)->norm; + if(c) return c; + return memcmp(a, b, sizeof(m3dv_t)); +} +/* compare labels */ +static _inline int _m3d_strcmp(char *a, char *b) +{ + if(a == NULL && b != NULL) return -1; + if(a != NULL && b == NULL) return 1; + if(a == NULL && b == NULL) return 0; + return strcmp(a, b); +} +static int _m3d_lblcmp(const void *a, const void *b) { + const m3dl_t *A = (const m3dl_t*)a, *B = (const m3dl_t*)b; + int c = _m3d_strcmp(A->lang, B->lang); + if(!c) c = _m3d_strcmp(A->name, B->name); + if(!c) c = _m3d_strcmp(A->text, B->text); + return c; +} +/* compare two colors by HSV value */ +_inline static int _m3d_cmapcmp(const void *a, const void *b) +{ + uint8_t *A = (uint8_t*)a, *B = (uint8_t*)b; + _register int m, vA, vB; + /* get HSV value for A */ + m = A[2] < A[1]? A[2] : A[1]; if(A[0] < m) m = A[0]; + vA = A[2] > A[1]? A[2] : A[1]; if(A[0] > vA) vA = A[0]; + /* get HSV value for B */ + m = B[2] < B[1]? B[2] : B[1]; if(B[0] < m) m = B[0]; + vB = B[2] > B[1]? 
B[2] : B[1]; if(B[0] > vB) vB = B[0]; + return vA - vB; +} + +/* create sorted list of colors */ +static uint32_t *_m3d_addcmap(uint32_t *cmap, uint32_t *numcmap, uint32_t color) +{ + uint32_t i; + if(cmap) { + for(i = 0; i < *numcmap; i++) + if(cmap[i] == color) return cmap; + } + cmap = (uint32_t*)M3D_REALLOC(cmap, ((*numcmap) + 1) * sizeof(uint32_t)); + for(i = 0; i < *numcmap && _m3d_cmapcmp(&color, &cmap[i]) > 0; i++); + if(i < *numcmap) memmove(&cmap[i+1], &cmap[i], ((*numcmap) - i)*sizeof(uint32_t)); + cmap[i] = color; + (*numcmap)++; + return cmap; +} + +/* look up a color and return its index */ +static uint32_t _m3d_cmapidx(uint32_t *cmap, uint32_t numcmap, uint32_t color) +{ + uint32_t i; + if(numcmap >= 65536) + return color; + for(i = 0; i < numcmap; i++) + if(cmap[i] == color) return i; + return 0; +} + +/* add index to output */ +static unsigned char *_m3d_addidx(unsigned char *out, char type, uint32_t idx) { + switch(type) { + case 1: *out++ = (uint8_t)(idx); break; + case 2: *((uint16_t*)out) = (uint16_t)(idx); out += 2; break; + case 4: *((uint32_t*)out) = (uint32_t)(idx); out += 4; break; + /* case 0: case 8: break; */ + } + return out; +} + +/* round a vertex position */ +static void _m3d_round(int quality, m3dv_t *src, m3dv_t *dst) +{ + _register int t; + /* copy additional attributes */ + if(src != dst) memcpy(dst, src, sizeof(m3dv_t)); + /* round according to quality */ + switch(quality) { + case M3D_EXP_INT8: + t = (int)(src->x * 127 + (src->x >= 0 ? (M3D_FLOAT)0.5 : (M3D_FLOAT)-0.5)); dst->x = (M3D_FLOAT)t / (M3D_FLOAT)127.0; + t = (int)(src->y * 127 + (src->y >= 0 ? (M3D_FLOAT)0.5 : (M3D_FLOAT)-0.5)); dst->y = (M3D_FLOAT)t / (M3D_FLOAT)127.0; + t = (int)(src->z * 127 + (src->z >= 0 ? (M3D_FLOAT)0.5 : (M3D_FLOAT)-0.5)); dst->z = (M3D_FLOAT)t / (M3D_FLOAT)127.0; + t = (int)(src->w * 127 + (src->w >= 0 ? (M3D_FLOAT)0.5 : (M3D_FLOAT)-0.5)); dst->w = (M3D_FLOAT)t / (M3D_FLOAT)127.0; + break; + case M3D_EXP_INT16: + t = (int)(src->x * 32767 + (src->x >= 0 ? (M3D_FLOAT)0.5 : (M3D_FLOAT)-0.5)); dst->x = (M3D_FLOAT)t / (M3D_FLOAT)32767.0; + t = (int)(src->y * 32767 + (src->y >= 0 ? (M3D_FLOAT)0.5 : (M3D_FLOAT)-0.5)); dst->y = (M3D_FLOAT)t / (M3D_FLOAT)32767.0; + t = (int)(src->z * 32767 + (src->z >= 0 ? (M3D_FLOAT)0.5 : (M3D_FLOAT)-0.5)); dst->z = (M3D_FLOAT)t / (M3D_FLOAT)32767.0; + t = (int)(src->w * 32767 + (src->w >= 0 ? 
(M3D_FLOAT)0.5 : (M3D_FLOAT)-0.5)); dst->w = (M3D_FLOAT)t / (M3D_FLOAT)32767.0; + break; + } + if(dst->x == (M3D_FLOAT)-0.0) dst->x = (M3D_FLOAT)0.0; + if(dst->y == (M3D_FLOAT)-0.0) dst->y = (M3D_FLOAT)0.0; + if(dst->z == (M3D_FLOAT)-0.0) dst->z = (M3D_FLOAT)0.0; + if(dst->w == (M3D_FLOAT)-0.0) dst->w = (M3D_FLOAT)0.0; +} + +#ifdef M3D_ASCII +/* add a bone to ascii output */ +static char *_m3d_prtbone(char *ptr, m3db_t *bone, M3D_INDEX numbone, M3D_INDEX parent, uint32_t level, M3D_INDEX *vrtxidx) +{ + uint32_t i, j; + char *sn; + + if(level > M3D_BONEMAXLEVEL || !bone) return ptr; + for(i = 0; i < numbone; i++) { + if(bone[i].parent == parent) { + for(j = 0; j < level; j++) *ptr++ = '/'; + sn = _m3d_safestr(bone[i].name, 0); + ptr += sprintf(ptr, "%d %d %s\r\n", vrtxidx[bone[i].pos], vrtxidx[bone[i].ori], sn); + M3D_FREE(sn); + ptr = _m3d_prtbone(ptr, bone, numbone, i, level + 1, vrtxidx); + } + } + return ptr; +} +#endif + +/** + * Function to encode an in-memory model into on storage Model 3D format + */ +unsigned char *m3d_save(m3d_t *model, int quality, int flags, unsigned int *size) +{ +#ifdef M3D_ASCII + const char *ol; + char *ptr; +#endif + char vc_s, vi_s, si_s, ci_s, ti_s, bi_s, nb_s, sk_s, fc_s, hi_s, fi_s, vd_s, vp_s; + char *sn = NULL, *sl = NULL, *sa = NULL, *sd = NULL; + unsigned char *out = NULL, *z = NULL, weights[M3D_NUMBONE < 8 ? 8 : M3D_NUMBONE], *norm = NULL; + unsigned int i, j, k, l, n, o, len, chunklen, *length; + int maxvox = 0, minvox = 0; + M3D_FLOAT scale = (M3D_FLOAT)0.0, min_x, max_x, min_y, max_y, min_z, max_z, mw; + M3D_INDEX last, *vrtxidx = NULL, *mtrlidx = NULL, *tmapidx = NULL, *skinidx = NULL; +#ifdef M3D_VERTEXMAX + M3D_INDEX lastp; +#endif + uint32_t idx, numcmap = 0, *cmap = NULL, numvrtx = 0, maxvrtx = 0, numtmap = 0, maxtmap = 0, numproc = 0; + uint32_t numskin = 0, maxskin = 0, numstr = 0, maxt = 0, maxbone = 0, numgrp = 0, maxgrp = 0, *grpidx = NULL; + uint8_t *opa = NULL; + m3dcd_t *cd; + m3dc_t *cmd; + m3dstr_t *str = NULL; + m3dvsave_t *vrtx = NULL, vertex; + m3dtisave_t *tmap = NULL, tcoord; + m3dssave_t *skin = NULL, sk; + m3dfsave_t *face = NULL; + m3dhdr_t *h = NULL; + m3dm_t *m; + m3da_t *a; + + if(!model) { + if(size) *size = 0; + return NULL; + } + model->errcode = M3D_SUCCESS; +#ifdef M3D_ASCII + if(flags & M3D_EXP_ASCII) quality = M3D_EXP_DOUBLE; +#endif + vrtxidx = (M3D_INDEX*)M3D_MALLOC(model->numvertex * sizeof(M3D_INDEX)); + if(!vrtxidx) goto memerr; + memset(vrtxidx, 255, model->numvertex * sizeof(M3D_INDEX)); + if(model->numvertex && !(flags & M3D_EXP_NONORMAL)){ + norm = (unsigned char*)M3D_MALLOC(model->numvertex * sizeof(unsigned char)); + if(!norm) goto memerr; + memset(norm, 0, model->numvertex * sizeof(unsigned char)); + } + if(model->nummaterial && !(flags & M3D_EXP_NOMATERIAL)) { + mtrlidx = (M3D_INDEX*)M3D_MALLOC(model->nummaterial * sizeof(M3D_INDEX)); + if(!mtrlidx) goto memerr; + memset(mtrlidx, 255, model->nummaterial * sizeof(M3D_INDEX)); + opa = (uint8_t*)M3D_MALLOC(model->nummaterial * 2 * sizeof(M3D_INDEX)); + if(!opa) goto memerr; + memset(opa, 255, model->nummaterial * 2 * sizeof(M3D_INDEX)); + } + if(model->numtmap && !(flags & M3D_EXP_NOTXTCRD)) { + tmapidx = (M3D_INDEX*)M3D_MALLOC(model->numtmap * sizeof(M3D_INDEX)); + if(!tmapidx) goto memerr; + memset(tmapidx, 255, model->numtmap * sizeof(M3D_INDEX)); + } + /** collect array elements that are actually referenced **/ + if(!(flags & M3D_EXP_NOFACE)) { + /* face */ + if(model->numface && model->face) { + M3D_LOG("Processing mesh face"); + face = 
(m3dfsave_t*)M3D_MALLOC(model->numface * sizeof(m3dfsave_t)); + if(!face) goto memerr; + for(i = 0; i < model->numface; i++) { + memcpy(&face[i].data, &model->face[i], sizeof(m3df_t)); + face[i].group = 0; + face[i].opacity = 255; + if(!(flags & M3D_EXP_NOMATERIAL) && model->face[i].materialid < model->nummaterial) { + if(model->material[model->face[i].materialid].numprop) { + mtrlidx[model->face[i].materialid] = 0; + if(opa[model->face[i].materialid * 2]) { + m = &model->material[model->face[i].materialid]; + for(j = 0; j < m->numprop; j++) + if(m->prop[j].type == m3dp_Kd) { + opa[model->face[i].materialid * 2 + 1] = ((uint8_t*)&m->prop[j].value.color)[3]; + break; + } + for(j = 0; j < m->numprop; j++) + if(m->prop[j].type == m3dp_d) { + opa[model->face[i].materialid * 2 + 1] = (uint8_t)(m->prop[j].value.fnum * 255); + break; + } + opa[model->face[i].materialid * 2] = 0; + } + face[i].opacity = opa[model->face[i].materialid * 2 + 1]; + } else + face[i].data.materialid = M3D_UNDEF; + } + for(j = 0; j < 3; j++) { + k = model->face[i].vertex[j]; + if(k < model->numvertex) + vrtxidx[k] = 0; + if(!(flags & M3D_EXP_NOCMAP)) { + cmap = _m3d_addcmap(cmap, &numcmap, model->vertex[k].color); + if(!cmap) goto memerr; + } + k = model->face[i].normal[j]; + if(k < model->numvertex && !(flags & M3D_EXP_NONORMAL)) { + vrtxidx[k] = 0; + norm[k] = 1; + } + k = model->face[i].texcoord[j]; + if(k < model->numtmap && !(flags & M3D_EXP_NOTXTCRD)) + tmapidx[k] = 0; +#ifdef M3D_VERTEXMAX + k = model->face[i].vertmax[j]; + if(k < model->numvertex && !(flags & M3D_EXP_NOVRTMAX)) + vrtxidx[k] = 0; +#endif + } + /* convert from CW to CCW */ + if(flags & M3D_EXP_IDOSUCK) { + j = face[i].data.vertex[1]; + face[i].data.vertex[1] = face[i].data.vertex[2]; + face[i].data.vertex[2] = j; + j = face[i].data.normal[1]; + face[i].data.normal[1] = face[i].data.normal[2]; + face[i].data.normal[2] = j; + j = face[i].data.texcoord[1]; + face[i].data.texcoord[1] = face[i].data.texcoord[2]; + face[i].data.texcoord[2] = j; +#ifdef M3D_VERTEXMAX + j = face[i].data.vertmax[1]; + face[i].data.vertmax[1] = face[i].data.vertmax[2]; + face[i].data.vertmax[2] = j; +#endif + } + } + } + if((model->numvoxtype && model->voxtype) || (model->numvoxel && model->voxel)) { + M3D_LOG("Processing voxel face"); + for(i = 0; i < model->numvoxtype; i++) { + str = _m3d_addstr(str, &numstr, model->voxtype[i].name); + if(model->voxtype[i].name && !str) goto memerr; + if(!(flags & M3D_EXP_NOCMAP)) { + cmap = _m3d_addcmap(cmap, &numcmap, model->voxtype[i].color); + if(!cmap) goto memerr; + } + for(j = 0; j < model->voxtype[i].numitem; j++) { + str = _m3d_addstr(str, &numstr, model->voxtype[i].item[j].name); + if(model->voxtype[i].item[j].name && !str) goto memerr; + } + } + for(i = 0; i < model->numvoxel; i++) { + str = _m3d_addstr(str, &numstr, model->voxel[i].name); + if(model->voxel[i].name && !str) goto memerr; + if(model->voxel[i].x < minvox) minvox = model->voxel[i].x; + if(model->voxel[i].x + (int)model->voxel[i].w > maxvox) maxvox = model->voxel[i].x + model->voxel[i].w; + if(model->voxel[i].y < minvox) minvox = model->voxel[i].y; + if(model->voxel[i].y + (int)model->voxel[i].h > maxvox) maxvox = model->voxel[i].y + model->voxel[i].h; + if(model->voxel[i].z < minvox) minvox = model->voxel[i].z; + if(model->voxel[i].z + (int)model->voxel[i].d > maxvox) maxvox = model->voxel[i].z + model->voxel[i].d; + } + } + if(model->numshape && model->shape) { + M3D_LOG("Processing shape face"); + for(i = 0; i < model->numshape; i++) { + 
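/* register the shape's name, record mesh-command face group ranges, and mark the materials, texture coordinates and vertices its commands reference */ +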
if(!model->shape[i].numcmd) continue; + str = _m3d_addstr(str, &numstr, model->shape[i].name); + if(model->shape[i].name && !str) goto memerr; + for(j = 0; j < model->shape[i].numcmd; j++) { + cmd = &model->shape[i].cmd[j]; + if(cmd->type >= (unsigned int)(sizeof(m3d_commandtypes)/sizeof(m3d_commandtypes[0])) || !cmd->arg) + continue; + if(cmd->type == m3dc_mesh) { + if(numgrp + 2 < maxgrp) { + maxgrp += 1024; + grpidx = (uint32_t*)M3D_REALLOC(grpidx, maxgrp * sizeof(uint32_t)); + if(!grpidx) goto memerr; + if(!numgrp) { + grpidx[0] = 0; + grpidx[1] = model->numface; + numgrp += 2; + } + } + grpidx[numgrp + 0] = cmd->arg[0]; + grpidx[numgrp + 1] = cmd->arg[0] + cmd->arg[1]; + numgrp += 2; + } + cd = &m3d_commandtypes[cmd->type]; + for(k = n = 0, l = cd->p; k < l; k++) + switch(cd->a[((k - n) % (cd->p - n)) + n]) { + case m3dcp_mi_t: + if(!(flags & M3D_EXP_NOMATERIAL) && cmd->arg[k] < model->nummaterial) + mtrlidx[cmd->arg[k]] = 0; + break; + case m3dcp_ti_t: + if(!(flags & M3D_EXP_NOTXTCRD) && cmd->arg[k] < model->numtmap) + tmapidx[cmd->arg[k]] = 0; + break; + case m3dcp_qi_t: + case m3dcp_vi_t: + if(cmd->arg[k] < model->numvertex) + vrtxidx[cmd->arg[k]] = 0; + break; + case m3dcp_va_t: + n = k + 1; l += (cmd->arg[k] - 1) * (cd->p - k - 1); + break; + } + } + } + } + if(model->numface && face) { + if(numgrp && grpidx) { + qsort(grpidx, numgrp, sizeof(uint32_t), _m3d_grpcmp); + for(i = j = 0; i < model->numface && j < numgrp; i++) { + while(j < numgrp && grpidx[j] < i) j++; + face[i].group = j; + } + } + qsort(face, model->numface, sizeof(m3dfsave_t), _m3d_facecmp); + } + if(grpidx) { M3D_FREE(grpidx); grpidx = NULL; } + if(model->numlabel && model->label) { + M3D_LOG("Processing annotation labels"); + for(i = 0; i < model->numlabel; i++) { + str = _m3d_addstr(str, &numstr, model->label[i].name); + str = _m3d_addstr(str, &numstr, model->label[i].lang); + str = _m3d_addstr(str, &numstr, model->label[i].text); + if(!(flags & M3D_EXP_NOCMAP)) { + cmap = _m3d_addcmap(cmap, &numcmap, model->label[i].color); + if(!cmap) goto memerr; + } + if(model->label[i].vertexid < model->numvertex) + vrtxidx[model->label[i].vertexid] = 0; + } + qsort(model->label, model->numlabel, sizeof(m3dl_t), _m3d_lblcmp); + } + } else if(!(flags & M3D_EXP_NOMATERIAL)) { + /* without a face, simply add all materials, because it can be an mtllib */ + for(i = 0; i < model->nummaterial; i++) + mtrlidx[i] = i; + } + /* bind-pose skeleton */ + if(model->numbone && model->bone && !(flags & M3D_EXP_NOBONE)) { + M3D_LOG("Processing bones"); + for(i = 0; i < model->numbone; i++) { + str = _m3d_addstr(str, &numstr, model->bone[i].name); + if(!str) goto memerr; + k = model->bone[i].pos; + if(k < model->numvertex) + vrtxidx[k] = 0; + k = model->bone[i].ori; + if(k < model->numvertex) + vrtxidx[k] = 0; + } + } + /* actions, animated skeleton poses */ + if(model->numaction && model->action && !(flags & M3D_EXP_NOACTION)) { + M3D_LOG("Processing action list"); + for(j = 0; j < model->numaction; j++) { + a = &model->action[j]; + str = _m3d_addstr(str, &numstr, a->name); + if(!str) goto memerr; + if(a->numframe > 65535) a->numframe = 65535; + for(i = 0; i < a->numframe; i++) { + for(l = 0; l < a->frame[i].numtransform; l++) { + k = a->frame[i].transform[l].pos; + if(k < model->numvertex) + vrtxidx[k] = 0; + k = a->frame[i].transform[l].ori; + if(k < model->numvertex) + vrtxidx[k] = 0; + } + if(l > maxt) maxt = l; + } + } + } + /* add colors to color map and texture names to string table */ + if(!(flags & M3D_EXP_NOMATERIAL)) { + 
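/* renumber only the materials actually referenced by faces, collecting their names, color-map entries and texture names */ +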
M3D_LOG("Processing materials"); + for(i = k = 0; i < model->nummaterial; i++) { + if(mtrlidx[i] == M3D_UNDEF || !model->material[i].numprop) continue; + mtrlidx[i] = k++; + m = &model->material[i]; + str = _m3d_addstr(str, &numstr, m->name); + if(!str) goto memerr; + if(m->prop) + for(j = 0; j < m->numprop; j++) { + if(!(flags & M3D_EXP_NOCMAP) && m->prop[j].type < 128) { + for(l = 0; l < sizeof(m3d_propertytypes)/sizeof(m3d_propertytypes[0]); l++) { + if(m->prop[j].type == m3d_propertytypes[l].id && m3d_propertytypes[l].format == m3dpf_color) { + ((uint8_t*)&m->prop[j].value.color)[3] = opa[i * 2 + 1]; + cmap = _m3d_addcmap(cmap, &numcmap, m->prop[j].value.color); + if(!cmap) goto memerr; + break; + } + } + } + if(m->prop[j].type >= 128 && m->prop[j].value.textureid < model->numtexture && + model->texture[m->prop[j].value.textureid].name) { + str = _m3d_addstr(str, &numstr, model->texture[m->prop[j].value.textureid].name); + if(!str) goto memerr; + } + } + } + } + /* if there's only one black color, don't store it */ + if(numcmap == 1 && cmap && !cmap[0]) numcmap = 0; + + /** compress lists **/ + if(model->numtmap && !(flags & M3D_EXP_NOTXTCRD)) { + M3D_LOG("Compressing tmap"); + tmap = (m3dtisave_t*)M3D_MALLOC(model->numtmap * sizeof(m3dtisave_t)); + if(!tmap) goto memerr; + for(i = 0; i < model->numtmap; i++) { + if(tmapidx[i] == M3D_UNDEF) continue; + switch(quality) { + case M3D_EXP_INT8: + l = (unsigned int)(model->tmap[i].u * 255); tcoord.data.u = (M3D_FLOAT)l / (M3D_FLOAT)255.0; + l = (unsigned int)(model->tmap[i].v * 255); tcoord.data.v = (M3D_FLOAT)l / (M3D_FLOAT)255.0; + break; + case M3D_EXP_INT16: + l = (unsigned int)(model->tmap[i].u * 65535); tcoord.data.u = (M3D_FLOAT)l / (M3D_FLOAT)65535.0; + l = (unsigned int)(model->tmap[i].v * 65535); tcoord.data.v = (M3D_FLOAT)l / (M3D_FLOAT)65535.0; + break; + default: + tcoord.data.u = model->tmap[i].u; + tcoord.data.v = model->tmap[i].v; + break; + } + if(flags & M3D_EXP_FLIPTXTCRD) + tcoord.data.v = (M3D_FLOAT)1.0 - tcoord.data.v; + tcoord.oldidx = i; + memcpy(&tmap[numtmap++], &tcoord, sizeof(m3dtisave_t)); + } + if(numtmap) { + qsort(tmap, numtmap, sizeof(m3dtisave_t), _m3d_ticmp); + memcpy(&tcoord.data, &tmap[0], sizeof(m3dti_t)); + for(i = 0; i < numtmap; i++) { + if(memcmp(&tcoord.data, &tmap[i].data, sizeof(m3dti_t))) { + memcpy(&tcoord.data, &tmap[i].data, sizeof(m3dti_t)); + maxtmap++; + } + tmap[i].newidx = maxtmap; + tmapidx[tmap[i].oldidx] = maxtmap; + } + maxtmap++; + } + } + if(model->numskin && model->skin && !(flags & M3D_EXP_NOBONE)) { + M3D_LOG("Compressing skin"); + skinidx = (M3D_INDEX*)M3D_MALLOC(model->numskin * sizeof(M3D_INDEX)); + if(!skinidx) goto memerr; + skin = (m3dssave_t*)M3D_MALLOC(model->numskin * sizeof(m3dssave_t)); + if(!skin) goto memerr; + memset(skinidx, 255, model->numskin * sizeof(M3D_INDEX)); + for(i = 0; i < model->numvertex; i++) { + if(vrtxidx[i] != M3D_UNDEF && model->vertex[i].skinid < model->numskin) + skinidx[model->vertex[i].skinid] = 0; + } + for(i = 0; i < model->numskin; i++) { + if(skinidx[i] == M3D_UNDEF) continue; + memset(&sk, 0, sizeof(m3dssave_t)); + for(j = 0, min_x = (M3D_FLOAT)0.0; j < M3D_NUMBONE && model->skin[i].boneid[j] != M3D_UNDEF; j++) { + sk.data.boneid[j] = model->skin[i].boneid[j]; + sk.data.weight[j] = model->skin[i].weight[j] > (M3D_FLOAT)0.0 ? 
model->skin[i].weight[j] : (M3D_FLOAT)0.01; + min_x += sk.data.weight[j]; + } + if(j > maxbone) maxbone = j; + if(min_x != (M3D_FLOAT)1.0 && min_x != (M3D_FLOAT)0.0) + for(j = 0; j < M3D_NUMBONE && sk.data.weight[j] > (M3D_FLOAT)0.0; j++) + sk.data.weight[j] /= min_x; + sk.oldidx = i; + memcpy(&skin[numskin++], &sk, sizeof(m3dssave_t)); + } + if(numskin) { + qsort(skin, numskin, sizeof(m3dssave_t), _m3d_skincmp); + memcpy(&sk.data, &skin[0].data, sizeof(m3ds_t)); + for(i = 0; i < numskin; i++) { + if(memcmp(&sk.data, &skin[i].data, sizeof(m3ds_t))) { + memcpy(&sk.data, &skin[i].data, sizeof(m3ds_t)); + maxskin++; + } + skin[i].newidx = maxskin; + skinidx[skin[i].oldidx] = maxskin; + } + maxskin++; + } + } + + M3D_LOG("Compressing vertex list"); + min_x = min_y = min_z = (M3D_FLOAT)1e10; + max_x = max_y = max_z = (M3D_FLOAT)-1e10; + if(vrtxidx) { + vrtx = (m3dvsave_t*)M3D_MALLOC(model->numvertex * sizeof(m3dvsave_t)); + if(!vrtx) goto memerr; + for(i = numvrtx = 0; i < model->numvertex; i++) { + if(vrtxidx[i] == M3D_UNDEF) continue; + _m3d_round(quality, &model->vertex[i], &vertex.data); + vertex.norm = norm ? norm[i] : 0; + if(vertex.data.skinid != M3D_INDEXMAX && !vertex.norm) { + vertex.data.skinid = vertex.data.skinid != M3D_UNDEF && skinidx ? skinidx[vertex.data.skinid] : M3D_UNDEF; + if(vertex.data.x > max_x) max_x = vertex.data.x; + if(vertex.data.x < min_x) min_x = vertex.data.x; + if(vertex.data.y > max_y) max_y = vertex.data.y; + if(vertex.data.y < min_y) min_y = vertex.data.y; + if(vertex.data.z > max_z) max_z = vertex.data.z; + if(vertex.data.z < min_z) min_z = vertex.data.z; + } +#ifdef M3D_VERTEXTYPE + vertex.data.type = 0; +#endif + vertex.oldidx = i; + memcpy(&vrtx[numvrtx++], &vertex, sizeof(m3dvsave_t)); + } + if(numvrtx) { + qsort(vrtx, numvrtx, sizeof(m3dvsave_t), _m3d_vrtxcmp); + memcpy(&vertex.data, &vrtx[0].data, sizeof(m3dv_t)); + for(i = 0; i < numvrtx; i++) { + if(memcmp(&vertex.data, &vrtx[i].data, vrtx[i].norm ? 3 * sizeof(M3D_FLOAT) : sizeof(m3dv_t))) { + memcpy(&vertex.data, &vrtx[i].data, sizeof(m3dv_t)); + maxvrtx++; + } + vrtx[i].newidx = maxvrtx; + vrtxidx[vrtx[i].oldidx] = maxvrtx; + } + maxvrtx++; + } + } + if(norm) { M3D_FREE(norm); norm = NULL; } + + /* normalize to bounding cube */ + if(numvrtx && !(flags & M3D_EXP_NORECALC)) { + M3D_LOG("Normalizing coordinates"); + if(min_x < (M3D_FLOAT)0.0) min_x = -min_x; + if(max_x < (M3D_FLOAT)0.0) max_x = -max_x; + if(min_y < (M3D_FLOAT)0.0) min_y = -min_y; + if(max_y < (M3D_FLOAT)0.0) max_y = -max_y; + if(min_z < (M3D_FLOAT)0.0) min_z = -min_z; + if(max_z < (M3D_FLOAT)0.0) max_z = -max_z; + scale = min_x; + if(max_x > scale) scale = max_x; + if(min_y > scale) scale = min_y; + if(max_y > scale) scale = max_y; + if(min_z > scale) scale = min_z; + if(max_z > scale) scale = max_z; + if(scale <= (M3D_FLOAT)0.0) scale = (M3D_FLOAT)1.0; + if(scale != (M3D_FLOAT)1.0) { + for(i = 0; i < numvrtx; i++) { + if(vrtx[i].data.skinid == M3D_INDEXMAX) continue; + vrtx[i].data.x /= scale; + vrtx[i].data.y /= scale; + vrtx[i].data.z /= scale; + } + } + } + if(model->scale > (M3D_FLOAT)0.0) scale = model->scale; + if(scale <= (M3D_FLOAT)0.0) scale = (M3D_FLOAT)1.0; + + /* meta info */ + sn = _m3d_safestr(model->name && *model->name ? model->name : (char*)"(noname)", 2); + sl = _m3d_safestr(model->license ? model->license : (char*)"MIT", 2); + sa = _m3d_safestr(model->author ? 
model->author : getenv("LOGNAME"), 2); + if(!sn || !sl || !sa) { +memerr: if(vrtxidx) M3D_FREE(vrtxidx); + if(mtrlidx) M3D_FREE(mtrlidx); + if(tmapidx) M3D_FREE(tmapidx); + if(skinidx) M3D_FREE(skinidx); + if(grpidx) M3D_FREE(grpidx); + if(norm) M3D_FREE(norm); + if(face) M3D_FREE(face); + if(cmap) M3D_FREE(cmap); + if(tmap) M3D_FREE(tmap); + if(skin) M3D_FREE(skin); + if(str) M3D_FREE(str); + if(vrtx) M3D_FREE(vrtx); + if(sn) M3D_FREE(sn); + if(sl) M3D_FREE(sl); + if(sa) M3D_FREE(sa); + if(sd) M3D_FREE(sd); + if(out) M3D_FREE(out); + if(opa) M3D_FREE(opa); + if(h) M3D_FREE(h); + M3D_LOG("Out of memory"); + model->errcode = M3D_ERR_ALLOC; + return NULL; + } + + M3D_LOG("Serializing model"); +#ifdef M3D_ASCII + if(flags & M3D_EXP_ASCII) { + /* use CRLF to make model creators on Win happy... */ + sd = _m3d_safestr(model->desc, 1); + if(!sd) goto memerr; + ol = setlocale(LC_NUMERIC, NULL); + setlocale(LC_NUMERIC, "C"); + /* header */ + len = 64 + (unsigned int)(strlen(sn) + strlen(sl) + strlen(sa) + strlen(sd)); + out = (unsigned char*)M3D_MALLOC(len); + if(!out) { setlocale(LC_NUMERIC, ol); goto memerr; } + ptr = (char*)out; + ptr += sprintf(ptr, "3dmodel %g\r\n%s\r\n%s\r\n%s\r\n%s\r\n\r\n", scale, + sn, sl, sa, sd); + M3D_FREE(sl); M3D_FREE(sa); M3D_FREE(sd); + sl = sa = sd = NULL; + /* preview chunk */ + if(model->preview.data && model->preview.length) { + sl = _m3d_safestr(sn, 0); + if(sl) { +/* gcc thinks that "ptr is used after free", well, gcc is simply wrong. */ +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuse-after-free" +#endif + ptr -= (uintptr_t)out; len = (unsigned int)((uintptr_t)ptr + (uintptr_t)20 + strlen(sl)); + out = (unsigned char*)M3D_REALLOC(out, len); ptr += (uintptr_t)out; +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + if(!out) { setlocale(LC_NUMERIC, ol); goto memerr; } + ptr += sprintf(ptr, "Preview\r\n%s.png\r\n\r\n", sl); + M3D_FREE(sl); sl = NULL; + } + } + M3D_FREE(sn); sn = NULL; + /* texture map */ + if(numtmap && tmap && !(flags & M3D_EXP_NOTXTCRD) && !(flags & M3D_EXP_NOFACE)) { +/* interestingly gcc does not complain about "ptr is used after free" here, although the code is 100% the same */ + ptr -= (uintptr_t)out; len = (unsigned int)((uintptr_t)ptr + (uintptr_t)(maxtmap * 32) + (uintptr_t)12); + out = (unsigned char*)M3D_REALLOC(out, len); ptr += (uintptr_t)out; + if(!out) { setlocale(LC_NUMERIC, ol); goto memerr; } + ptr += sprintf(ptr, "Textmap\r\n"); + last = M3D_UNDEF; + for(i = 0; i < numtmap; i++) { + if(tmap[i].newidx == last) continue; + last = tmap[i].newidx; + ptr += sprintf(ptr, "%g %g\r\n", tmap[i].data.u, tmap[i].data.v); + } + ptr += sprintf(ptr, "\r\n"); + } + /* vertex chunk */ + if(numvrtx && vrtx && !(flags & M3D_EXP_NOFACE)) { + ptr -= (uintptr_t)out; len = (unsigned int)((uintptr_t)ptr + (uintptr_t)(maxvrtx * 128) + (uintptr_t)10); + out = (unsigned char*)M3D_REALLOC(out, len); ptr += (uintptr_t)out; + if(!out) { setlocale(LC_NUMERIC, ol); goto memerr; } + ptr += sprintf(ptr, "Vertex\r\n"); + last = M3D_UNDEF; + for(i = 0; i < numvrtx; i++) { + if(vrtx[i].newidx == last) continue; + last = vrtx[i].newidx; + ptr += sprintf(ptr, "%g %g %g %g", vrtx[i].data.x, vrtx[i].data.y, vrtx[i].data.z, vrtx[i].data.w); + if(!(flags & M3D_EXP_NOCMAP) && vrtx[i].data.color) + ptr += sprintf(ptr, " #%08x", vrtx[i].data.color); + if(!(flags & M3D_EXP_NOBONE) && model->numbone && maxskin && vrtx[i].data.skinid < M3D_INDEXMAX) { + if(skin[vrtx[i].data.skinid].data.weight[0] == (M3D_FLOAT)1.0) + ptr += 
sprintf(ptr, " %d", skin[vrtx[i].data.skinid].data.boneid[0]); + else + for(j = 0; j < M3D_NUMBONE && skin[vrtx[i].data.skinid].data.boneid[j] != M3D_UNDEF && + skin[vrtx[i].data.skinid].data.weight[j] > (M3D_FLOAT)0.0; j++) + ptr += sprintf(ptr, " %d:%g", skin[vrtx[i].data.skinid].data.boneid[j], + skin[vrtx[i].data.skinid].data.weight[j]); + } + ptr += sprintf(ptr, "\r\n"); + } + ptr += sprintf(ptr, "\r\n"); + } + /* bones chunk */ + if(model->numbone && model->bone && !(flags & M3D_EXP_NOBONE)) { + ptr -= (uintptr_t)out; len = (unsigned int)((uintptr_t)ptr + (uintptr_t)9); + for(i = 0; i < model->numbone; i++) { + len += (unsigned int)strlen(model->bone[i].name) + 128; + } + out = (unsigned char*)M3D_REALLOC(out, len); ptr += (uintptr_t)out; + if(!out) { setlocale(LC_NUMERIC, ol); goto memerr; } + ptr += sprintf(ptr, "Bones\r\n"); + ptr = _m3d_prtbone(ptr, model->bone, model->numbone, M3D_UNDEF, 0, vrtxidx); + ptr += sprintf(ptr, "\r\n"); + } + /* materials */ + if(model->nummaterial && !(flags & M3D_EXP_NOMATERIAL)) { + for(j = 0; j < model->nummaterial; j++) { + if(mtrlidx[j] == M3D_UNDEF || !model->material[j].numprop || !model->material[j].prop) continue; + m = &model->material[j]; + sn = _m3d_safestr(m->name, 0); + if(!sn) { setlocale(LC_NUMERIC, ol); goto memerr; } + ptr -= (uintptr_t)out; len = (unsigned int)((uintptr_t)ptr + (uintptr_t)strlen(sn) + (uintptr_t)12); + for(i = 0; i < m->numprop; i++) { + if(m->prop[i].type < 128) + len += 32; + else if(m->prop[i].value.textureid < model->numtexture && model->texture[m->prop[i].value.textureid].name) + len += (unsigned int)strlen(model->texture[m->prop[i].value.textureid].name) + 16; + } + out = (unsigned char*)M3D_REALLOC(out, len); ptr += (uintptr_t)out; + if(!out) { setlocale(LC_NUMERIC, ol); goto memerr; } + ptr += sprintf(ptr, "Material %s\r\n", sn); + M3D_FREE(sn); sn = NULL; + for(i = 0; i < m->numprop; i++) { + k = 256; + if(m->prop[i].type >= 128) { + for(l = 0; l < sizeof(m3d_propertytypes)/sizeof(m3d_propertytypes[0]); l++) + if(m->prop[i].type == m3d_propertytypes[l].id) { + sn = m3d_propertytypes[l].key; + break; + } + if(!sn) + for(l = 0; l < sizeof(m3d_propertytypes)/sizeof(m3d_propertytypes[0]); l++) + if(m->prop[i].type - 128 == m3d_propertytypes[l].id) { + sn = m3d_propertytypes[l].key; + break; + } + k = sn ? 
m3dpf_map : 256; + } else { + for(l = 0; l < sizeof(m3d_propertytypes)/sizeof(m3d_propertytypes[0]); l++) + if(m->prop[i].type == m3d_propertytypes[l].id) { + sn = m3d_propertytypes[l].key; + k = m3d_propertytypes[l].format; + break; + } + } + switch(k) { + case m3dpf_color: ptr += sprintf(ptr, "%s #%08x\r\n", sn, m->prop[i].value.color); break; + case m3dpf_uint8: + case m3dpf_uint16: + case m3dpf_uint32: ptr += sprintf(ptr, "%s %d\r\n", sn, m->prop[i].value.num); break; + case m3dpf_float: ptr += sprintf(ptr, "%s %g\r\n", sn, m->prop[i].value.fnum); break; + case m3dpf_map: + if(m->prop[i].value.textureid < model->numtexture && + model->texture[m->prop[i].value.textureid].name) { + sl = _m3d_safestr(model->texture[m->prop[i].value.textureid].name, 0); + if(!sl) { setlocale(LC_NUMERIC, ol); goto memerr; } + if(*sl) + ptr += sprintf(ptr, "map_%s %s\r\n", sn, sl); + M3D_FREE(sn); M3D_FREE(sl); sl = NULL; + } + break; + } + sn = NULL; + } + ptr += sprintf(ptr, "\r\n"); + } + } + /* procedural face */ + if(model->numinlined && model->inlined && !(flags & M3D_EXP_NOFACE)) { + /* all inlined assets which are not textures should be procedural surfaces */ + for(j = 0; j < model->numinlined; j++) { + if(!model->inlined[j].name || !*model->inlined[j].name || !model->inlined[j].length || !model->inlined[j].data || + (model->inlined[j].data[1] == 'P' && model->inlined[j].data[2] == 'N' && model->inlined[j].data[3] == 'G')) + continue; + for(i = k = 0; i < model->numtexture; i++) { + if(!strcmp(model->inlined[j].name, model->texture[i].name)) { k = 1; break; } + } + if(k) continue; + sn = _m3d_safestr(model->inlined[j].name, 0); + if(!sn) { setlocale(LC_NUMERIC, ol); goto memerr; } + ptr -= (uintptr_t)out; len = (unsigned int)((uintptr_t)ptr + (uintptr_t)strlen(sn) + (uintptr_t)18); + out = (unsigned char*)M3D_REALLOC(out, len); ptr += (uintptr_t)out; + if(!out) { setlocale(LC_NUMERIC, ol); goto memerr; } + ptr += sprintf(ptr, "Procedural\r\n%s\r\n\r\n", sn); + M3D_FREE(sn); sn = NULL; + } + } + /* mesh face */ + if(model->numface && face && !(flags & M3D_EXP_NOFACE)) { + ptr -= (uintptr_t)out; len = (unsigned int)((uintptr_t)ptr + (uintptr_t)(model->numface * 128) + (uintptr_t)6); + last = M3D_UNDEF; +#ifdef M3D_VERTEXMAX + lastp = M3D_UNDEF; +#endif + if(!(flags & M3D_EXP_NOMATERIAL)) + for(i = 0; i < model->numface; i++) { + j = face[i].data.materialid < model->nummaterial ? face[i].data.materialid : M3D_UNDEF; + if(j != last) { + last = j; + if(last < model->nummaterial) + len += (unsigned int)strlen(model->material[last].name); + len += 6; + } +#ifdef M3D_VERTEXMAX + j = face[i].data.paramid < model->numparam ? face[i].data.paramid : M3D_UNDEF; + if(j != lastp) { + lastp = j; + if(lastp < model->numparam) + len += (unsigned int)strlen(model->param[lastp].name); + len += 6; + } +#endif + } + out = (unsigned char*)M3D_REALLOC(out, len); ptr += (uintptr_t)out; + if(!out) { setlocale(LC_NUMERIC, ol); goto memerr; } + ptr += sprintf(ptr, "Mesh\r\n"); + last = M3D_UNDEF; +#ifdef M3D_VERTEXMAX + lastp = M3D_UNDEF; +#endif + for(i = 0; i < model->numface; i++) { + j = face[i].data.materialid < model->nummaterial ? 
face[i].data.materialid : M3D_UNDEF; + if(!(flags & M3D_EXP_NOMATERIAL) && j != last) { + last = j; + if(last < model->nummaterial) { + sn = _m3d_safestr(model->material[last].name, 0); + if(!sn) { setlocale(LC_NUMERIC, ol); goto memerr; } + ptr += sprintf(ptr, "use %s\r\n", sn); + M3D_FREE(sn); sn = NULL; + } else + ptr += sprintf(ptr, "use\r\n"); + } +#ifdef M3D_VERTEXMAX + j = face[i].data.paramid < model->numparam ? face[i].data.paramid : M3D_UNDEF; + if(!(flags & M3D_EXP_NOVRTMAX) && j != lastp) { + lastp = j; + if(lastp < model->numparam) { + sn = _m3d_safestr(model->param[lastp].name, 0); + if(!sn) { setlocale(LC_NUMERIC, ol); goto memerr; } + ptr += sprintf(ptr, "par %s\r\n", sn); + M3D_FREE(sn); sn = NULL; + } else + ptr += sprintf(ptr, "par\r\n"); + } +#endif + /* hardcoded triangles. Should be repeated as many times as the number of edges in polygon */ + for(j = 0; j < 3; j++) { + ptr += sprintf(ptr, "%s%d", j?" ":"", vrtxidx[face[i].data.vertex[j]]); + k = l = M3D_NOTDEFINED; + if(!(flags & M3D_EXP_NOTXTCRD) && (face[i].data.texcoord[j] != M3D_UNDEF) && + (tmapidx[face[i].data.texcoord[j]] != M3D_UNDEF)) { + k = tmapidx[face[i].data.texcoord[j]]; + ptr += sprintf(ptr, "/%d", k); + } + if(!(flags & M3D_EXP_NONORMAL) && (face[i].data.normal[j] != M3D_UNDEF)) { + l = vrtxidx[face[i].data.normal[j]]; + ptr += sprintf(ptr, "%s/%d", k == M3D_NOTDEFINED? "/" : "", l); + } +#ifdef M3D_VERTEXMAX + if(!(flags & M3D_EXP_NOVRTMAX) && (face[i].data.vertmax[j] != M3D_UNDEF)) { + ptr += sprintf(ptr, "%s%s/%d", k == M3D_NOTDEFINED? "/" : "", l == M3D_NOTDEFINED? "/" : "", + vrtxidx[face[i].data.vertmax[j]]); + } +#endif + } + ptr += sprintf(ptr, "\r\n"); + } + ptr += sprintf(ptr, "\r\n"); + } + /* voxel face */ + if(model->numvoxtype && model->voxtype && !(flags & M3D_EXP_NOFACE)) { + ptr -= (uintptr_t)out; len = (unsigned int)((uintptr_t)ptr + (uintptr_t)(model->numvoxtype * 128) + (uintptr_t)10); + for(i = 0; i < model->numvoxtype; i++) { + if(model->voxtype[i].name) len += (unsigned int)strlen(model->voxtype[i].name); + for(j = 0; j < model->voxtype[i].numitem; j++) + if(model->voxtype[i].item[j].name) + len += (unsigned int)strlen(model->voxtype[i].item[j].name) + 6; + } + out = (unsigned char*)M3D_REALLOC(out, len); ptr += (uintptr_t)out; + if(!out) { setlocale(LC_NUMERIC, ol); goto memerr; } + ptr += sprintf(ptr, "VoxTypes\r\n"); + for(i = 0; i < model->numvoxtype; i++) { + ptr += sprintf(ptr, "#%08x", model->voxtype[i].color); + if(model->voxtype[i].rotation) + ptr += sprintf(ptr, "/%02x", model->voxtype[i].rotation); + if(model->voxtype[i].voxshape) + ptr += sprintf(ptr, "%s/%03x", model->voxtype[i].rotation ? "" : "/", model->voxtype[i].voxshape); + sn = _m3d_safestr(model->voxtype[i].name, 0); + if(!sn) { setlocale(LC_NUMERIC, ol); goto memerr; } + ptr += sprintf(ptr, " %s", sn && sn[0] ? 
sn : "-"); + M3D_FREE(sn); sn = NULL; + if(!(flags & M3D_EXP_NOBONE) && model->numbone && maxskin && model->voxtype[i].skinid < M3D_INDEXMAX) { + if(skin[skinidx[model->voxtype[i].skinid]].data.weight[0] == (M3D_FLOAT)1.0) + ptr += sprintf(ptr, " %d", skin[skinidx[model->voxtype[i].skinid]].data.boneid[0]); + else + for(j = 0; j < M3D_NUMBONE && skin[skinidx[model->voxtype[i].skinid]].data.boneid[j] != M3D_UNDEF && + skin[skinidx[model->voxtype[i].skinid]].data.weight[j] > (M3D_FLOAT)0.0; j++) + ptr += sprintf(ptr, " %d:%g", skin[skinidx[model->voxtype[i].skinid]].data.boneid[j], + skin[skinidx[model->voxtype[i].skinid]].data.weight[j]); + } + if(model->voxtype[i].numitem && model->voxtype[i].item) { + for(j = k = 0; j < model->voxtype[i].numitem; j++) { + if(!model->voxtype[i].item[j].count || !model->voxtype[i].item[j].name || + !model->voxtype[i].item[j].name[0]) continue; + if(!k) { ptr += sprintf(ptr, " {"); k = 1; } + sn = _m3d_safestr(model->voxtype[i].item[j].name, 0); + if(!sn) { setlocale(LC_NUMERIC, ol); goto memerr; } + ptr += sprintf(ptr, " %d %s", model->voxtype[i].item[j].count, sn); + M3D_FREE(sn); sn = NULL; + } + if(k) ptr += sprintf(ptr, " }"); + } + while(ptr[-1] == '-' || ptr[-1] == ' ') ptr--; + ptr += sprintf(ptr, "\r\n"); + } + ptr += sprintf(ptr, "\r\n"); + } + if(model->numvoxel && model->voxel && !(flags & M3D_EXP_NOFACE)) { + for(i = 0; i < model->numvoxel; i++) { + ptr -= (uintptr_t)out; len = (unsigned int)((uintptr_t)ptr + (uintptr_t)128); + if(model->voxel[i].name) len += (unsigned int)strlen(model->voxel[i].name); + len += model->voxel[i].h * ((model->voxel[i].w * 6 + 2) * model->voxel[i].d + 9); + out = (unsigned char*)M3D_REALLOC(out, len); ptr += (uintptr_t)out; + if(!out) { setlocale(LC_NUMERIC, ol); goto memerr; } + ptr += sprintf(ptr, "Voxel"); + sn = _m3d_safestr(model->voxel[i].name, 0); + if(!sn) { setlocale(LC_NUMERIC, ol); goto memerr; } + if(sn && sn[0]) + ptr += sprintf(ptr, " %s", sn); + M3D_FREE(sn); sn = NULL; + ptr += sprintf(ptr, "\r\n"); + if(model->voxel[i].uncertain) + ptr += sprintf(ptr, "uncertain %d %d\r\n", (model->voxel[i].uncertain * 100) / 255, model->voxel[i].groupid); + if(model->voxel[i].x || model->voxel[i].y || model->voxel[i].z) + ptr += sprintf(ptr, "pos %d %d %d\r\n", model->voxel[i].x, model->voxel[i].y, model->voxel[i].z); + ptr += sprintf(ptr, "dim %d %d %d\r\n", model->voxel[i].w, model->voxel[i].h, model->voxel[i].d); + for(j = n = 0; j < model->voxel[i].h; j++) { + ptr += sprintf(ptr, "layer\r\n"); + for(k = 0; k < model->voxel[i].d; k++) { + for(l = 0; l < model->voxel[i].w; l++, n++) { + switch(model->voxel[i].data[n]) { + case M3D_VOXCLEAR: *ptr++ = '-'; break; + case M3D_VOXUNDEF: *ptr++ = '.'; break; + default: ptr += sprintf(ptr, "%d", model->voxel[i].data[n]); break; + } + *ptr++ = ' '; + } + ptr--; + ptr += sprintf(ptr, "\r\n"); + } + } + ptr += sprintf(ptr, "\r\n"); + } + } + /* mathematical shapes face */ + if(model->numshape && model->numshape && !(flags & M3D_EXP_NOFACE)) { + for(j = 0; j < model->numshape; j++) { + sn = _m3d_safestr(model->shape[j].name, 0); + if(!sn) { setlocale(LC_NUMERIC, ol); goto memerr; } + ptr -= (uintptr_t)out; len = (unsigned int)((uintptr_t)ptr + (uintptr_t)strlen(sn) + (uintptr_t)33); + out = (unsigned char*)M3D_REALLOC(out, len); ptr += (uintptr_t)out; + if(!out) { setlocale(LC_NUMERIC, ol); goto memerr; } + ptr += sprintf(ptr, "Shape %s\r\n", sn); + M3D_FREE(sn); sn = NULL; + if(model->shape[j].group != M3D_UNDEF && !(flags & M3D_EXP_NOBONE)) + ptr += sprintf(ptr, "group 
%d\r\n", model->shape[j].group); + for(i = 0; i < model->shape[j].numcmd; i++) { + cmd = &model->shape[j].cmd[i]; + if(cmd->type >= (unsigned int)(sizeof(m3d_commandtypes)/sizeof(m3d_commandtypes[0])) || !cmd->arg) + continue; + cd = &m3d_commandtypes[cmd->type]; + ptr -= (uintptr_t)out; len = (unsigned int)((uintptr_t)ptr + (uintptr_t)strlen(cd->key) + (uintptr_t)3); + for(k = 0; k < cd->p; k++) + switch(cd->a[k]) { + case m3dcp_mi_t: if(cmd->arg[k] != M3D_NOTDEFINED) { len += (unsigned int)strlen(model->material[cmd->arg[k]].name) + 1; } break; + case m3dcp_va_t: len += cmd->arg[k] * (cd->p - k - 1) * 16; k = cd->p; break; + default: len += 16; break; + } + out = (unsigned char*)M3D_REALLOC(out, len); ptr += (uintptr_t)out; + if(!out) { setlocale(LC_NUMERIC, ol); goto memerr; } + ptr += sprintf(ptr, "%s", cd->key); + for(k = n = 0, l = cd->p; k < l; k++) { + switch(cd->a[((k - n) % (cd->p - n)) + n]) { + case m3dcp_mi_t: + if(cmd->arg[k] != M3D_NOTDEFINED) { + sn = _m3d_safestr(model->material[cmd->arg[k]].name, 0); + if(!sn) { setlocale(LC_NUMERIC, ol); goto memerr; } + ptr += sprintf(ptr, " %s", sn); + M3D_FREE(sn); sn = NULL; + } + break; + case m3dcp_vc_t: ptr += sprintf(ptr, " %g", *((float*)&cmd->arg[k])); break; + case m3dcp_va_t: ptr += sprintf(ptr, " %d[", cmd->arg[k]); + n = k + 1; l += (cmd->arg[k] - 1) * (cd->p - k - 1); + break; + default: ptr += sprintf(ptr, " %d", cmd->arg[k]); break; + } + } + ptr += sprintf(ptr, "%s\r\n", l > cd->p ? " ]" : ""); + } + ptr += sprintf(ptr, "\r\n"); + } + } + /* annotation labels */ + if(model->numlabel && model->label && !(flags & M3D_EXP_NOFACE)) { + for(i = 0, j = 3, length = NULL; i < model->numlabel; i++) { + if(model->label[i].name) j += (unsigned int)strlen(model->label[i].name); + if(model->label[i].lang) j += (unsigned int)strlen(model->label[i].lang); + if(model->label[i].text) j += (unsigned int)strlen(model->label[i].text); + j += 40; + } + ptr -= (uintptr_t)out; len = (unsigned int)((uintptr_t)ptr + (uintptr_t)j); + out = (unsigned char*)M3D_REALLOC(out, len); ptr += (uintptr_t)out; + if(!out) { setlocale(LC_NUMERIC, ol); goto memerr; } + for(i = 0; i < model->numlabel; i++) { + if(!i || _m3d_strcmp(sl, model->label[i].lang) || _m3d_strcmp(sn, model->label[i].name)) { + sl = model->label[i].lang; + sn = model->label[i].name; + sd = _m3d_safestr(sn, 0); + if(!sd) { setlocale(LC_NUMERIC, ol); sn = sl = NULL; goto memerr; } + if(i) ptr += sprintf(ptr, "\r\n"); + ptr += sprintf(ptr, "Labels %s\r\n", sd); + M3D_FREE(sd); sd = NULL; + if(model->label[i].color) + ptr += sprintf(ptr, "color #0x%08x\r\n", model->label[i].color); + if(sl && *sl) { + sd = _m3d_safestr(sl, 0); + if(!sd) { setlocale(LC_NUMERIC, ol); sn = sl = NULL; goto memerr; } + ptr += sprintf(ptr, "lang %s\r\n", sd); + M3D_FREE(sd); sd = NULL; + } + } + sd = _m3d_safestr(model->label[i].text, 2); + if(!sd) { setlocale(LC_NUMERIC, ol); sn = sl = NULL; goto memerr; } + ptr += sprintf(ptr, "%d %s\r\n", model->label[i].vertexid, sd); + M3D_FREE(sd); sd = NULL; + } + ptr += sprintf(ptr, "\r\n"); + sn = sl = NULL; + } + /* actions */ + if(model->numaction && model->action && !(flags & M3D_EXP_NOACTION)) { + for(j = 0; j < model->numaction; j++) { + a = &model->action[j]; + sn = _m3d_safestr(a->name, 0); + if(!sn) { setlocale(LC_NUMERIC, ol); goto memerr; } + ptr -= (uintptr_t)out; len = (unsigned int)((uintptr_t)ptr + (uintptr_t)strlen(sn) + (uintptr_t)48); + for(i = 0; i < a->numframe; i++) + len += a->frame[i].numtransform * 128 + 8; + out = (unsigned char*)M3D_REALLOC(out, 
len); ptr += (uintptr_t)out; + if(!out) { setlocale(LC_NUMERIC, ol); goto memerr; } + ptr += sprintf(ptr, "Action %d %s\r\n", a->durationmsec, sn); + M3D_FREE(sn); sn = NULL; + for(i = 0; i < a->numframe; i++) { + ptr += sprintf(ptr, "frame %d\r\n", a->frame[i].msec); + for(k = 0; k < a->frame[i].numtransform; k++) { + ptr += sprintf(ptr, "%d %d %d\r\n", a->frame[i].transform[k].boneid, + vrtxidx[a->frame[i].transform[k].pos], vrtxidx[a->frame[i].transform[k].ori]); + } + } + ptr += sprintf(ptr, "\r\n"); + } + } + /* inlined assets */ + if(model->numinlined && model->inlined) { + for(i = j = 0; i < model->numinlined; i++) + if(model->inlined[i].name) + j += (unsigned int)strlen(model->inlined[i].name) + 6; + if(j > 0) { + ptr -= (uintptr_t)out; len = (unsigned int)((uintptr_t)ptr + (uintptr_t)j + (uintptr_t)16); + out = (unsigned char*)M3D_REALLOC(out, len); ptr += (uintptr_t)out; + if(!out) { setlocale(LC_NUMERIC, ol); goto memerr; } + ptr += sprintf(ptr, "Assets\r\n"); + for(i = 0; i < model->numinlined; i++) + if(model->inlined[i].name) + ptr += sprintf(ptr, "%s%s\r\n", model->inlined[i].name, strrchr(model->inlined[i].name, '.') ? "" : ".png"); + ptr += sprintf(ptr, "\r\n"); + } + } + /* extra info */ + if(model->numextra && (flags & M3D_EXP_EXTRA)) { + for(i = 0; i < model->numextra; i++) { + if(model->extra[i]->length < 9) continue; + ptr -= (uintptr_t)out; len = (unsigned int)((uintptr_t)ptr + (uintptr_t)17 + (uintptr_t)(model->extra[i]->length * 3)); + out = (unsigned char*)M3D_REALLOC(out, len); ptr += (uintptr_t)out; + if(!out) { setlocale(LC_NUMERIC, ol); goto memerr; } + ptr += sprintf(ptr, "Extra %c%c%c%c\r\n", + model->extra[i]->magic[0] > ' ' ? model->extra[i]->magic[0] : '_', + model->extra[i]->magic[1] > ' ' ? model->extra[i]->magic[1] : '_', + model->extra[i]->magic[2] > ' ' ? model->extra[i]->magic[2] : '_', + model->extra[i]->magic[3] > ' ' ? model->extra[i]->magic[3] : '_'); + for(j = 0; j < model->extra[i]->length; j++) + ptr += sprintf(ptr, "%02x ", *((unsigned char *)model->extra + sizeof(m3dchunk_t) + j)); + ptr--; + ptr += sprintf(ptr, "\r\n\r\n"); + } + } + setlocale(LC_NUMERIC, ol); + len = (unsigned int)((uintptr_t)ptr - (uintptr_t)out); + out = (unsigned char*)M3D_REALLOC(out, len + 1); + if(!out) goto memerr; + out[len] = 0; + } else +#endif + { + /* stricly only use LF (newline) in binary */ + sd = _m3d_safestr(model->desc, 3); + if(!sd) goto memerr; + /* header */ + h = (m3dhdr_t*)M3D_MALLOC(sizeof(m3dhdr_t) + strlen(sn) + strlen(sl) + strlen(sa) + strlen(sd) + 4); + if(!h) goto memerr; + memcpy((uint8_t*)h, "HEAD", 4); + h->length = sizeof(m3dhdr_t); + h->scale = scale; + i = (unsigned int)strlen(sn); memcpy((uint8_t*)h + h->length, sn, i+1); h->length += i+1; M3D_FREE(sn); + i = (unsigned int)strlen(sl); memcpy((uint8_t*)h + h->length, sl, i+1); h->length += i+1; M3D_FREE(sl); + i = (unsigned int)strlen(sa); memcpy((uint8_t*)h + h->length, sa, i+1); h->length += i+1; M3D_FREE(sa); + i = (unsigned int)strlen(sd); memcpy((uint8_t*)h + h->length, sd, i+1); h->length += i+1; M3D_FREE(sd); + sn = sl = sa = sd = NULL; + if(model->inlined) + for(i = 0; i < model->numinlined; i++) { + if(model->inlined[i].name && *model->inlined[i].name && model->inlined[i].length > 0) { + str = _m3d_addstr(str, &numstr, model->inlined[i].name); + if(!str) goto memerr; + } + } + if(str) + for(i = 0; i < numstr; i++) { + h = _m3d_addhdr(h, &str[i]); + if(!h) goto memerr; + } + vc_s = quality == M3D_EXP_INT8? 1 : (quality == M3D_EXP_INT16? 2 : (quality == M3D_EXP_DOUBLE? 
8 : 4)); + vi_s = maxvrtx < 254 ? 1 : (maxvrtx < 65534 ? 2 : 4); + si_s = h->length - 16 < 254 ? 1 : (h->length - 16 < 65534 ? 2 : 4); + ci_s = !numcmap || !cmap ? 0 : (numcmap < 254 ? 1 : (numcmap < 65534 ? 2 : 4)); + ti_s = !maxtmap || !tmap ? 0 : (maxtmap < 254 ? 1 : (maxtmap < 65534 ? 2 : 4)); + bi_s = !model->numbone || !model->bone || (flags & M3D_EXP_NOBONE)? 0 : (model->numbone < 254 ? 1 : + (model->numbone < 65534 ? 2 : 4)); + nb_s = maxbone < 2 ? 1 : (maxbone == 2 ? 2 : (maxbone <= 4 ? 4 : 8)); + sk_s = !bi_s || !maxskin || !skin ? 0 : (maxskin < 254 ? 1 : (maxskin < 65534 ? 2 : 4)); + fc_s = maxt < 254 ? 1 : (maxt < 65534 ? 2 : 4); + hi_s = !model->numshape || !model->shape || (flags & M3D_EXP_NOFACE)? 0 : (model->numshape < 254 ? 1 : + (model->numshape < 65534 ? 2 : 4)); + fi_s = !model->numface || !model->face || (flags & M3D_EXP_NOFACE)? 0 : (model->numface < 254 ? 1 : + (model->numface < 65534 ? 2 : 4)); + vd_s = !model->numvoxel || !model->voxel || (flags & M3D_EXP_NOFACE)? 0 : (minvox >= -128 && maxvox <= 127 ? 1 : + (minvox >= -32768 && maxvox <= 32767 ? 2 : 4)); + vp_s = !model->numvoxtype || !model->voxtype || (flags & M3D_EXP_NOFACE)? 0 : (model->numvoxtype < 254 ? 1 : + (model->numvoxtype < 65534 ? 2 : 4)); + h->types = (vc_s == 8 ? (3<<0) : (vc_s == 2 ? (1<<0) : (vc_s == 1 ? (0<<0) : (2<<0)))) | + (vi_s == 2 ? (1<<2) : (vi_s == 1 ? (0<<2) : (2<<2))) | + (si_s == 2 ? (1<<4) : (si_s == 1 ? (0<<4) : (2<<4))) | + (ci_s == 2 ? (1<<6) : (ci_s == 1 ? (0<<6) : (ci_s == 4 ? (2<<6) : (3<<6)))) | + (ti_s == 2 ? (1<<8) : (ti_s == 1 ? (0<<8) : (ti_s == 4 ? (2<<8) : (3<<8)))) | + (bi_s == 2 ? (1<<10): (bi_s == 1 ? (0<<10): (bi_s == 4 ? (2<<10) : (3<<10)))) | + (nb_s == 2 ? (1<<12): (nb_s == 1 ? (0<<12): (2<<12))) | + (sk_s == 2 ? (1<<14): (sk_s == 1 ? (0<<14): (sk_s == 4 ? (2<<14) : (3<<14)))) | + (fc_s == 2 ? (1<<16): (fc_s == 1 ? (0<<16): (2<<16))) | + (hi_s == 2 ? (1<<18): (hi_s == 1 ? (0<<18): (hi_s == 4 ? (2<<18) : (3<<18)))) | + (fi_s == 2 ? (1<<20): (fi_s == 1 ? (0<<20): (fi_s == 4 ? (2<<20) : (3<<20)))) | + (vd_s == 2 ? (1<<22): (vd_s == 1 ? (0<<22): (vd_s == 4 ? (2<<22) : (3<<22)))) | + (vp_s == 2 ? (1<<24): (vp_s == 1 ? (0<<24): (vp_s == 4 ? 
(2<<24) : (3<<24)))); + len = h->length; + /* color map */ + if(numcmap && cmap && ci_s < 4 && !(flags & M3D_EXP_NOCMAP)) { + chunklen = 8 + numcmap * sizeof(uint32_t); + h = (m3dhdr_t*)M3D_REALLOC(h, len + chunklen); + if(!h) goto memerr; + memcpy((uint8_t*)h + len, "CMAP", 4); + *((uint32_t*)((uint8_t*)h + len + 4)) = chunklen; + memcpy((uint8_t*)h + len + 8, cmap, chunklen - 8); + len += chunklen; + } else numcmap = 0; + /* texture map */ + if(numtmap && tmap && !(flags & M3D_EXP_NOTXTCRD) && !(flags & M3D_EXP_NOFACE)) { + chunklen = 8 + maxtmap * vc_s * 2; + h = (m3dhdr_t*)M3D_REALLOC(h, len + chunklen); + if(!h) goto memerr; + memcpy((uint8_t*)h + len, "TMAP", 4); + length = (uint32_t*)((uint8_t*)h + len + 4); + out = (uint8_t*)h + len + 8; + last = M3D_UNDEF; + for(i = 0; i < numtmap; i++) { + if(tmap[i].newidx == last) continue; + last = tmap[i].newidx; + switch(vc_s) { + case 1: *out++ = (uint8_t)(tmap[i].data.u * 255); *out++ = (uint8_t)(tmap[i].data.v * 255); break; + case 2: + *((uint16_t*)out) = (uint16_t)(tmap[i].data.u * 65535); out += 2; + *((uint16_t*)out) = (uint16_t)(tmap[i].data.v * 65535); out += 2; + break; + case 4: *((float*)out) = tmap[i].data.u; out += 4; *((float*)out) = tmap[i].data.v; out += 4; break; + case 8: *((double*)out) = tmap[i].data.u; out += 8; *((double*)out) = tmap[i].data.v; out += 8; break; + } + } + *length = (uint32_t)((uintptr_t)out - (uintptr_t)((uint8_t*)h + len)); + out = NULL; + len += *length; + } + /* vertex */ + if(numvrtx && vrtx) { + chunklen = 8 + maxvrtx * (ci_s + sk_s + 4 * vc_s); + h = (m3dhdr_t*)M3D_REALLOC(h, len + chunklen); + if(!h) goto memerr; + memcpy((uint8_t*)h + len, "VRTS", 4); + length = (uint32_t*)((uint8_t*)h + len + 4); + out = (uint8_t*)h + len + 8; + last = M3D_UNDEF; + for(i = 0; i < numvrtx; i++) { + if(vrtx[i].newidx == last) continue; + last = vrtx[i].newidx; + switch(vc_s) { + case 1: + *out++ = (int8_t)(vrtx[i].data.x * 127); + *out++ = (int8_t)(vrtx[i].data.y * 127); + *out++ = (int8_t)(vrtx[i].data.z * 127); + *out++ = (int8_t)(vrtx[i].data.w * 127); + break; + case 2: + *((int16_t*)out) = (int16_t)(vrtx[i].data.x * 32767); out += 2; + *((int16_t*)out) = (int16_t)(vrtx[i].data.y * 32767); out += 2; + *((int16_t*)out) = (int16_t)(vrtx[i].data.z * 32767); out += 2; + *((int16_t*)out) = (int16_t)(vrtx[i].data.w * 32767); out += 2; + break; + case 4: + *((float*)out) = vrtx[i].data.x; out += 4; + *((float*)out) = vrtx[i].data.y; out += 4; + *((float*)out) = vrtx[i].data.z; out += 4; + *((float*)out) = vrtx[i].data.w; out += 4; + break; + case 8: + *((double*)out) = vrtx[i].data.x; out += 8; + *((double*)out) = vrtx[i].data.y; out += 8; + *((double*)out) = vrtx[i].data.z; out += 8; + *((double*)out) = vrtx[i].data.w; out += 8; + break; + } + idx = _m3d_cmapidx(cmap, numcmap, vrtx[i].data.color); + switch(ci_s) { + case 1: *out++ = (uint8_t)(idx); break; + case 2: *((uint16_t*)out) = (uint16_t)(idx); out += 2; break; + case 4: *((uint32_t*)out) = vrtx[i].data.color; out += 4; break; + } + out = _m3d_addidx(out, sk_s, vrtx[i].data.skinid); + } + *length = (uint32_t)((uintptr_t)out - (uintptr_t)((uint8_t*)h + len)); + out = NULL; + len += *length; + } + /* bones chunk */ + if(model->numbone && model->bone && !(flags & M3D_EXP_NOBONE)) { + i = 8 + bi_s + sk_s + model->numbone * (bi_s + si_s + 2*vi_s); + chunklen = i + numskin * nb_s * (bi_s + 1); + h = (m3dhdr_t*)M3D_REALLOC(h, len + chunklen); + if(!h) goto memerr; + memcpy((uint8_t*)h + len, "BONE", 4); + length = (uint32_t*)((uint8_t*)h + len + 4); + out = 
(uint8_t*)h + len + 8; + out = _m3d_addidx(out, bi_s, model->numbone); + out = _m3d_addidx(out, sk_s, maxskin); + for(i = 0; i < model->numbone; i++) { + out = _m3d_addidx(out, bi_s, model->bone[i].parent); + out = _m3d_addidx(out, si_s, _m3d_stridx(str, numstr, model->bone[i].name)); + out = _m3d_addidx(out, vi_s, vrtxidx[model->bone[i].pos]); + out = _m3d_addidx(out, vi_s, vrtxidx[model->bone[i].ori]); + } + if(numskin && skin && sk_s) { + last = M3D_UNDEF; + for(i = 0; i < numskin; i++) { + if(skin[i].newidx == last) continue; + last = skin[i].newidx; + memset(&weights, 0, nb_s); + for(j = k = l = 0, mw = 0.0; j < (uint32_t)nb_s && skin[i].data.boneid[j] != M3D_UNDEF && + skin[i].data.weight[j] > (M3D_FLOAT)0.0; j++) { + if(mw < skin[i].data.weight[j]) { mw = skin[i].data.weight[j]; k = j; } + weights[j] = (uint8_t)(skin[i].data.weight[j] * 255); + if(!weights[j]) { weights[j]++; l--; } + } + weights[k] += l; + switch(nb_s) { + case 1: weights[0] = 255; break; + case 2: memcpy(out, weights, 2); out += 2; break; + case 4: memcpy(out, weights, 4); out += 4; break; + case 8: memcpy(out, weights, 8); out += 8; break; + } + for(j = 0; j < (uint32_t)nb_s && skin[i].data.boneid[j] != M3D_UNDEF && weights[j]; j++) { + out = _m3d_addidx(out, bi_s, skin[i].data.boneid[j]); + *length += bi_s; + } + } + } + *length = (uint32_t)((uintptr_t)out - (uintptr_t)((uint8_t*)h + len)); + out = NULL; + len += *length; + } + /* materials */ + if(model->nummaterial && !(flags & M3D_EXP_NOMATERIAL)) { + for(j = 0; j < model->nummaterial; j++) { + if(mtrlidx[j] == M3D_UNDEF || !model->material[j].numprop || !model->material[j].prop) continue; + m = &model->material[j]; + chunklen = 12 + si_s + m->numprop * 5; + h = (m3dhdr_t*)M3D_REALLOC(h, len + chunklen); + if(!h) goto memerr; + memcpy((uint8_t*)h + len, "MTRL", 4); + length = (uint32_t*)((uint8_t*)h + len + 4); + out = (uint8_t*)h + len + 8; + out = _m3d_addidx(out, si_s, _m3d_stridx(str, numstr, m->name)); + for(i = 0; i < m->numprop; i++) { + if(m->prop[i].type >= 128) { + if(m->prop[i].value.textureid >= model->numtexture || + !model->texture[m->prop[i].value.textureid].name) continue; + k = m3dpf_map; + } else { + for(k = 256, l = 0; l < sizeof(m3d_propertytypes)/sizeof(m3d_propertytypes[0]); l++) + if(m->prop[i].type == m3d_propertytypes[l].id) { k = m3d_propertytypes[l].format; break; } + } + if(k == 256) continue; + *out++ = m->prop[i].type; + switch(k) { + case m3dpf_color: + if(!(flags & M3D_EXP_NOCMAP)) { + idx = _m3d_cmapidx(cmap, numcmap, m->prop[i].value.color); + switch(ci_s) { + case 1: *out++ = (uint8_t)(idx); break; + case 2: *((uint16_t*)out) = (uint16_t)(idx); out += 2; break; + case 4: *((uint32_t*)out) = (uint32_t)(m->prop[i].value.color); out += 4; break; + } + } else out--; + break; + case m3dpf_uint8: *out++ = m->prop[i].value.num; break; + case m3dpf_uint16: *((uint16_t*)out) = m->prop[i].value.num; out += 2; break; + case m3dpf_uint32: *((uint32_t*)out) = m->prop[i].value.num; out += 4; break; + case m3dpf_float: *((float*)out) = m->prop[i].value.fnum; out += 4; break; + + case m3dpf_map: + idx = _m3d_stridx(str, numstr, model->texture[m->prop[i].value.textureid].name); + out = _m3d_addidx(out, si_s, idx); + break; + } + } + *length = (uint32_t)((uintptr_t)out - (uintptr_t)((uint8_t*)h + len)); + len += *length; + out = NULL; + } + } + /* procedural face */ + if(model->numinlined && model->inlined && !(flags & M3D_EXP_NOFACE)) { + /* all inlined assets which are not textures should be procedural surfaces */ + for(j = 0; j < 
model->numinlined; j++) { + if(!model->inlined[j].name || !model->inlined[j].name[0] || model->inlined[j].length < 4 || + !model->inlined[j].data || (model->inlined[j].data[1] == 'P' && model->inlined[j].data[2] == 'N' && + model->inlined[j].data[3] == 'G')) + continue; + for(i = k = 0; i < model->numtexture; i++) { + if(!strcmp(model->inlined[j].name, model->texture[i].name)) { k = 1; break; } + } + if(k) continue; + numproc++; + chunklen = 8 + si_s; + h = (m3dhdr_t*)M3D_REALLOC(h, len + chunklen); + if(!h) goto memerr; + memcpy((uint8_t*)h + len, "PROC", 4); + *((uint32_t*)((uint8_t*)h + len + 4)) = chunklen; + out = (uint8_t*)h + len + 8; + out = _m3d_addidx(out, si_s, _m3d_stridx(str, numstr, model->inlined[j].name)); + out = NULL; + len += chunklen; + } + } + /* mesh face */ + if(model->numface && face && !(flags & M3D_EXP_NOFACE)) { + chunklen = 8 + si_s + model->numface * (9 * vi_s + 3 * ti_s + si_s + 1); + h = (m3dhdr_t*)M3D_REALLOC(h, len + chunklen); + if(!h) goto memerr; + memcpy((uint8_t*)h + len, "MESH", 4); + length = (uint32_t*)((uint8_t*)h + len + 4); + out = (uint8_t*)h + len + 8; + last = M3D_UNDEF; +#ifdef M3D_VERTEXMAX + lastp = M3D_UNDEF; +#endif + for(i = 0; i < model->numface; i++) { + if(!(flags & M3D_EXP_NOMATERIAL) && face[i].data.materialid != last) { + last = face[i].data.materialid; + idx = last < model->nummaterial ? _m3d_stridx(str, numstr, model->material[last].name) : 0; + *out++ = 0; + out = _m3d_addidx(out, si_s, idx); + } +#ifdef M3D_VERTEXMAX + if(!(flags & M3D_EXP_NOVRTMAX) && face[i].data.paramid != lastp) { + lastp = face[i].data.paramid; + idx = lastp < model->numparam ? _m3d_stridx(str, numstr, model->param[lastp].name) : 0; + *out++ = 0; + out = _m3d_addidx(out, si_s, idx); + } +#endif + /* hardcoded triangles. */ + k = (3 << 4) | + (((flags & M3D_EXP_NOTXTCRD) || !ti_s || face[i].data.texcoord[0] == M3D_UNDEF || + face[i].data.texcoord[1] == M3D_UNDEF || face[i].data.texcoord[2] == M3D_UNDEF) ? 0 : 1) | + (((flags & M3D_EXP_NONORMAL) || face[i].data.normal[0] == M3D_UNDEF || + face[i].data.normal[1] == M3D_UNDEF || face[i].data.normal[2] == M3D_UNDEF) ? 0 : 2) +#ifdef M3D_VERTEXMAX + | (((flags & M3D_EXP_NOVRTMAX) || face[i].data.vertmax[0] == M3D_UNDEF || + face[i].data.vertmax[1] == M3D_UNDEF || face[i].data.vertmax[2] == M3D_UNDEF) ? 
0 : 4) +#endif + ; + *out++ = k; + for(j = 0; j < 3; j++) { + out = _m3d_addidx(out, vi_s, vrtxidx[face[i].data.vertex[j]]); + if(k & 1) + out = _m3d_addidx(out, ti_s, tmapidx[face[i].data.texcoord[j]]); + if(k & 2) + out = _m3d_addidx(out, vi_s, vrtxidx[face[i].data.normal[j]]); +#ifdef M3D_VERTEXMAX + if(k & 4) + out = _m3d_addidx(out, vi_s, vrtxidx[face[i].data.vertmax[j]]); +#endif + } + } + *length = (uint32_t)((uintptr_t)out - (uintptr_t)((uint8_t*)h + len)); + len += *length; + out = NULL; + } + /* voxel face */ + if(model->numvoxtype && model->voxtype && !(flags & M3D_EXP_NOFACE)) { + chunklen = 8 + si_s + model->numvoxtype * (ci_s + si_s + 3 + sk_s); + for(i = 0; i < model->numvoxtype; i++) + chunklen += model->voxtype[i].numitem * (2 + si_s); + h = (m3dhdr_t*)M3D_REALLOC(h, len + chunklen); + if(!h) goto memerr; + memcpy((uint8_t*)h + len, "VOXT", 4); + length = (uint32_t*)((uint8_t*)h + len + 4); + out = (uint8_t*)h + len + 8; + for(i = 0; i < model->numvoxtype; i++) { + if(!(flags & M3D_EXP_NOCMAP)) { + idx = _m3d_cmapidx(cmap, numcmap, model->voxtype[i].color); + switch(ci_s) { + case 1: *out++ = (uint8_t)(idx); break; + case 2: *((uint16_t*)out) = (uint16_t)(idx); out += 2; break; + case 4: *((uint32_t*)out) = (uint32_t)(model->voxtype[i].color); out += 4; break; + } + } + out = _m3d_addidx(out, si_s, _m3d_stridx(str, numstr, model->voxtype[i].name)); + *out++ = (model->voxtype[i].rotation & 0xBF) | (((model->voxtype[i].voxshape >> 8) & 1) << 6); + *out++ = model->voxtype[i].voxshape; + *out++ = model->voxtype[i].numitem; + if(!(flags & M3D_EXP_NOBONE) && model->numbone && maxskin) + out = _m3d_addidx(out, sk_s, skinidx[model->voxtype[i].skinid]); + for(j = 0; j < model->voxtype[i].numitem; j++) { + out = _m3d_addidx(out, 2, model->voxtype[i].item[j].count); + out = _m3d_addidx(out, si_s, _m3d_stridx(str, numstr, model->voxtype[i].item[j].name)); + } + } + *length = (uint32_t)((uintptr_t)out - (uintptr_t)((uint8_t*)h + len)); + len += *length; + out = NULL; + } + if(model->numvoxel && model->voxel && !(flags & M3D_EXP_NOFACE)) { + for(j = 0; j < model->numvoxel; j++) { + chunklen = 8 + si_s + 6 * vd_s + 2 + model->voxel[j].w * model->voxel[j].h * model->voxel[j].d * 3; + h = (m3dhdr_t*)M3D_REALLOC(h, len + chunklen); + if(!h) goto memerr; + memcpy((uint8_t*)h + len, "VOXD", 4); + length = (uint32_t*)((uint8_t*)h + len + 4); + out = (uint8_t*)h + len + 8; + out = _m3d_addidx(out, si_s, _m3d_stridx(str, numstr, model->voxel[j].name)); + out = _m3d_addidx(out, vd_s, model->voxel[j].x); + out = _m3d_addidx(out, vd_s, model->voxel[j].y); + out = _m3d_addidx(out, vd_s, model->voxel[j].z); + out = _m3d_addidx(out, vd_s, model->voxel[j].w); + out = _m3d_addidx(out, vd_s, model->voxel[j].h); + out = _m3d_addidx(out, vd_s, model->voxel[j].d); + *out++ = model->voxel[j].uncertain; + *out++ = model->voxel[j].groupid; + /* RLE compress voxel data */ + n = model->voxel[j].w * model->voxel[j].h * model->voxel[j].d; + k = o = 0; out[o++] = 0; + for(i = 0; i < n; i++) { + for(l = 1; l < 128 && i + l < n && model->voxel[j].data[i] == model->voxel[j].data[i + l]; l++); + if(l > 1) { + l--; + if(out[k]) { out[k]--; out[o++] = 0x80 | l; } + else out[k] = 0x80 | l; + switch(vp_s) { + case 1: out[o++] = model->voxel[j].data[i]; break; + default: *((uint16_t*)(out + o)) = model->voxel[j].data[i]; o += 2; break; + } + k = o; out[o++] = 0; + i += l; + continue; + } + out[k]++; + switch(vp_s) { + case 1: out[o++] = model->voxel[j].data[i]; break; + default: *((uint16_t*)(out + o)) = 
model->voxel[j].data[i]; o += 2; break; + } + if(out[k] > 127) { out[k]--; k = o; out[o++] = 0; } + } + if(!(out[k] & 0x80)) { if(out[k]) out[k]--; else o--; } + *length = (uint32_t)((uintptr_t)out + (uintptr_t)o - (uintptr_t)((uint8_t*)h + len)); + len += *length; + out = NULL; + } + } + /* mathematical shapes face */ + if(model->numshape && model->shape && !(flags & M3D_EXP_NOFACE)) { + for(j = 0; j < model->numshape; j++) { + chunklen = 12 + si_s + model->shape[j].numcmd * (M3D_CMDMAXARG + 1) * 4; + h = (m3dhdr_t*)M3D_REALLOC(h, len + chunklen); + if(!h) goto memerr; + memcpy((uint8_t*)h + len, "SHPE", 4); + length = (uint32_t*)((uint8_t*)h + len + 4); + out = (uint8_t*)h + len + 8; + out = _m3d_addidx(out, si_s, _m3d_stridx(str, numstr, model->shape[j].name)); + out = _m3d_addidx(out, bi_s, model->shape[j].group); + for(i = 0; i < model->shape[j].numcmd; i++) { + cmd = &model->shape[j].cmd[i]; + if(cmd->type >= (unsigned int)(sizeof(m3d_commandtypes)/sizeof(m3d_commandtypes[0])) || !cmd->arg) + continue; + cd = &m3d_commandtypes[cmd->type]; + *out++ = (cmd->type & 0x7F) | (cmd->type > 127 ? 0x80 : 0); + if(cmd->type > 127) *out++ = (cmd->type >> 7) & 0xff; + for(k = n = 0, l = cd->p; k < l; k++) { + switch(cd->a[((k - n) % (cd->p - n)) + n]) { + case m3dcp_mi_t: + out = _m3d_addidx(out, si_s, cmd->arg[k] < model->nummaterial ? + _m3d_stridx(str, numstr, model->material[cmd->arg[k]].name) : 0); + break; + case m3dcp_vc_t: + min_x = *((float*)&cmd->arg[k]); + switch(vc_s) { + case 1: *out++ = (int8_t)(min_x * 127); break; + case 2: *((int16_t*)out) = (int16_t)(min_x * 32767); out += 2; break; + case 4: *((float*)out) = min_x; out += 4; break; + case 8: *((double*)out) = min_x; out += 8; break; + } + break; + case m3dcp_hi_t: out = _m3d_addidx(out, hi_s, cmd->arg[k]); break; + case m3dcp_fi_t: out = _m3d_addidx(out, fi_s, cmd->arg[k]); break; + case m3dcp_ti_t: out = _m3d_addidx(out, ti_s, cmd->arg[k]); break; + case m3dcp_qi_t: + case m3dcp_vi_t: out = _m3d_addidx(out, vi_s, cmd->arg[k]); break; + case m3dcp_i1_t: out = _m3d_addidx(out, 1, cmd->arg[k]); break; + case m3dcp_i2_t: out = _m3d_addidx(out, 2, cmd->arg[k]); break; + case m3dcp_i4_t: out = _m3d_addidx(out, 4, cmd->arg[k]); break; + case m3dcp_va_t: out = _m3d_addidx(out, 4, cmd->arg[k]); + n = k + 1; l += (cmd->arg[k] - 1) * (cd->p - k - 1); + break; + } + } + } + *length = (uint32_t)((uintptr_t)out - (uintptr_t)((uint8_t*)h + len)); + len += *length; + out = NULL; + } + } + /* annotation labels */ + if(model->numlabel && model->label) { + for(i = 0, length = NULL; i < model->numlabel; i++) { + if(!i || _m3d_strcmp(sl, model->label[i].lang) || _m3d_strcmp(sn, model->label[i].name)) { + sl = model->label[i].lang; + sn = model->label[i].name; + if(length) { + *length = (uint32_t)((uintptr_t)out - (uintptr_t)((uint8_t*)h + len)); + len += *length; + } + chunklen = 8 + 2 * si_s + ci_s + model->numlabel * (vi_s + si_s); + h = (m3dhdr_t*)M3D_REALLOC(h, len + chunklen); + if(!h) { sn = NULL; sl = NULL; goto memerr; } + memcpy((uint8_t*)h + len, "LBLS", 4); + length = (uint32_t*)((uint8_t*)h + len + 4); + out = (uint8_t*)h + len + 8; + out = _m3d_addidx(out, si_s, _m3d_stridx(str, numstr, model->label[l].name)); + out = _m3d_addidx(out, si_s, _m3d_stridx(str, numstr, model->label[l].lang)); + idx = _m3d_cmapidx(cmap, numcmap, model->label[i].color); + switch(ci_s) { + case 1: *out++ = (uint8_t)(idx); break; + case 2: *((uint16_t*)out) = (uint16_t)(idx); out += 2; break; + case 4: *((uint32_t*)out) = model->label[i].color; out += 4; 
break; + } + } + out = _m3d_addidx(out, vi_s, vrtxidx[model->label[i].vertexid]); + out = _m3d_addidx(out, si_s, _m3d_stridx(str, numstr, model->label[l].text)); + } + if(length) { + *length = (uint32_t)((uintptr_t)out - (uintptr_t)((uint8_t*)h + len)); + len += *length; + } + out = NULL; + sn = sl = NULL; + } + /* actions */ + if(model->numaction && model->action && model->numbone && model->bone && !(flags & M3D_EXP_NOACTION)) { + for(j = 0; j < model->numaction; j++) { + a = &model->action[j]; + chunklen = 14 + si_s + a->numframe * (4 + fc_s + maxt * (bi_s + 2 * vi_s)); + h = (m3dhdr_t*)M3D_REALLOC(h, len + chunklen); + if(!h) goto memerr; + memcpy((uint8_t*)h + len, "ACTN", 4); + length = (uint32_t*)((uint8_t*)h + len + 4); + out = (uint8_t*)h + len + 8; + out = _m3d_addidx(out, si_s, _m3d_stridx(str, numstr, a->name)); + *((uint16_t*)out) = (uint16_t)(a->numframe); out += 2; + *((uint32_t*)out) = (uint32_t)(a->durationmsec); out += 4; + for(i = 0; i < a->numframe; i++) { + *((uint32_t*)out) = (uint32_t)(a->frame[i].msec); out += 4; + out = _m3d_addidx(out, fc_s, a->frame[i].numtransform); + for(k = 0; k < a->frame[i].numtransform; k++) { + out = _m3d_addidx(out, bi_s, a->frame[i].transform[k].boneid); + out = _m3d_addidx(out, vi_s, vrtxidx[a->frame[i].transform[k].pos]); + out = _m3d_addidx(out, vi_s, vrtxidx[a->frame[i].transform[k].ori]); + } + } + *length = (uint32_t)((uintptr_t)out - (uintptr_t)((uint8_t*)h + len)); + len += *length; + out = NULL; + } + } + /* inlined assets */ + if(model->numinlined && model->inlined && (numproc || (flags & M3D_EXP_INLINE))) { + for(j = 0; j < model->numinlined; j++) { + if(!model->inlined[j].name || !model->inlined[j].name[0] || model->inlined[j].length<4 || !model->inlined[j].data) + continue; + if(!(flags & M3D_EXP_INLINE)) { + if(model->inlined[j].data[1] == 'P' && model->inlined[j].data[2] == 'N' && model->inlined[j].data[3] == 'G') + continue; + for(i = k = 0; i < model->numtexture; i++) { + if(!strcmp(model->inlined[j].name, model->texture[i].name)) { k = 1; break; } + } + if(k) continue; + } + chunklen = 8 + si_s + model->inlined[j].length; + h = (m3dhdr_t*)M3D_REALLOC(h, len + chunklen); + if(!h) goto memerr; + memcpy((uint8_t*)h + len, "ASET", 4); + *((uint32_t*)((uint8_t*)h + len + 4)) = chunklen; + out = (uint8_t*)h + len + 8; + out = _m3d_addidx(out, si_s, _m3d_stridx(str, numstr, model->inlined[j].name)); + memcpy(out, model->inlined[j].data, model->inlined[j].length); + out = NULL; + len += chunklen; + } + } + /* extra chunks */ + if(model->numextra && model->extra && (flags & M3D_EXP_EXTRA)) { + for(j = 0; j < model->numextra; j++) { + if(!model->extra[j] || model->extra[j]->length < 8) + continue; + chunklen = model->extra[j]->length; + h = (m3dhdr_t*)M3D_REALLOC(h, len + chunklen); + if(!h) goto memerr; + memcpy((uint8_t*)h + len, model->extra[j], chunklen); + len += chunklen; + } + } + /* add end chunk */ + h = (m3dhdr_t*)M3D_REALLOC(h, len + 4); + if(!h) goto memerr; + memcpy((uint8_t*)h + len, "OMD3", 4); + len += 4; + /* zlib compress */ + if(!(flags & M3D_EXP_NOZLIB)) { + M3D_LOG("Deflating chunks"); + z = stbi_zlib_compress((unsigned char *)h, len, (int*)&l, 9); + if(z && l > 0 && l < len) { len = l; M3D_FREE(h); h = (m3dhdr_t*)z; } + } + /* add file header at the begining */ + len += 8; + out = (unsigned char*)M3D_MALLOC(len); + if(!out) goto memerr; + memcpy(out, "3DMO", 4); + *((uint32_t*)(out + 4)) = len; + /* preview image chunk, must be the first if exists */ + if(model->preview.data && model->preview.length) { + 
chunklen = 8 + model->preview.length; + out = (unsigned char*)M3D_REALLOC(out, len + chunklen); + if(!out) goto memerr; + memcpy((uint8_t*)out + 8, "PRVW", 4); + *((uint32_t*)((uint8_t*)out + 8 + 4)) = chunklen; + memcpy((uint8_t*)out + 8 + 8, model->preview.data, model->preview.length); + *((uint32_t*)(out + 4)) += chunklen; + } else + chunklen = 0; + memcpy(out + 8 + chunklen, h, len - 8); + } + if(size) *size = out ? len : 0; + if(vrtxidx) M3D_FREE(vrtxidx); + if(mtrlidx) M3D_FREE(mtrlidx); + if(tmapidx) M3D_FREE(tmapidx); + if(skinidx) M3D_FREE(skinidx); + if(norm) M3D_FREE(norm); + if(face) M3D_FREE(face); + if(cmap) M3D_FREE(cmap); + if(tmap) M3D_FREE(tmap); + if(skin) M3D_FREE(skin); + if(str) M3D_FREE(str); + if(vrtx) M3D_FREE(vrtx); + if(opa) M3D_FREE(opa); + if(h) M3D_FREE(h); + return out; +} +#endif + +#endif + +#ifdef __cplusplus +} +#ifdef M3D_CPPWRAPPER +#include +#include +#include + +/*** C++ wrapper class ***/ +namespace M3D { +#ifdef M3D_IMPLEMENTATION + + class Model { + public: + m3d_t *model; + + public: + Model() { + this->model = (m3d_t*)M3D_MALLOC(sizeof(m3d_t)); memset(this->model, 0, sizeof(m3d_t)); + } + Model(_unused const std::string &data, _unused m3dread_t ReadFileCB, + _unused m3dfree_t FreeCB, _unused M3D::Model mtllib) { +#ifndef M3D_NOIMPORTER + this->model = m3d_load((unsigned char *)data.data(), ReadFileCB, FreeCB, mtllib.model); +#else + Model(); +#endif + } + Model(_unused const std::vector data, _unused m3dread_t ReadFileCB, + _unused m3dfree_t FreeCB, _unused M3D::Model mtllib) { +#ifndef M3D_NOIMPORTER + this->model = m3d_load((unsigned char *)&data[0], ReadFileCB, FreeCB, mtllib.model); +#else + Model(); +#endif + } + Model(_unused const unsigned char *data, _unused m3dread_t ReadFileCB, + _unused m3dfree_t FreeCB, _unused M3D::Model mtllib) { +#ifndef M3D_NOIMPORTER + this->model = m3d_load((unsigned char*)data, ReadFileCB, FreeCB, mtllib.model); +#else + Model(); +#endif + } + ~Model() { m3d_free(this->model); } + + public: + m3d_t *getCStruct() { return this->model; } + std::string getName() { return std::string(this->model->name); } + void setName(std::string name) { this->model->name = (char*)name.c_str(); } + std::string getLicense() { return std::string(this->model->license); } + void setLicense(std::string license) { this->model->license = (char*)license.c_str(); } + std::string getAuthor() { return std::string(this->model->author); } + void setAuthor(std::string author) { this->model->author = (char*)author.c_str(); } + std::string getDescription() { return std::string(this->model->desc); } + void setDescription(std::string desc) { this->model->desc = (char*)desc.c_str(); } + float getScale() { return this->model->scale; } + void setScale(float scale) { this->model->scale = scale; } + std::vector getPreview() { return this->model->preview.data ? + std::vector(this->model->preview.data, this->model->preview.data + this->model->preview.length) : + std::vector(); } + std::vector getColorMap() { return this->model->cmap ? std::vector(this->model->cmap, + this->model->cmap + this->model->numcmap) : std::vector(); } + std::vector getTextureMap() { return this->model->tmap ? std::vector(this->model->tmap, + this->model->tmap + this->model->numtmap) : std::vector(); } + std::vector getTextures() { return this->model->texture ? std::vector(this->model->texture, + this->model->texture + this->model->numtexture) : std::vector(); } + std::string getTextureName(int idx) { return idx >= 0 && (unsigned int)idx < this->model->numtexture ? 
+ std::string(this->model->texture[idx].name) : nullptr; } + std::vector getBones() { return this->model->bone ? std::vector(this->model->bone, this->model->bone + + this->model->numbone) : std::vector(); } + std::string getBoneName(int idx) { return idx >= 0 && (unsigned int)idx < this->model->numbone ? + std::string(this->model->bone[idx].name) : nullptr; } + std::vector getMaterials() { return this->model->material ? std::vector(this->model->material, + this->model->material + this->model->nummaterial) : std::vector(); } + std::string getMaterialName(int idx) { return idx >= 0 && (unsigned int)idx < this->model->nummaterial ? + std::string(this->model->material[idx].name) : nullptr; } + int getMaterialPropertyInt(int idx, int type) { + if (idx < 0 || (unsigned int)idx >= this->model->nummaterial || type < 0 || type >= 127 || + !this->model->material[idx].prop) return -1; + for (int i = 0; i < this->model->material[idx].numprop; i++) { + if (this->model->material[idx].prop[i].type == type) + return this->model->material[idx].prop[i].value.num; + } + return -1; + } + uint32_t getMaterialPropertyColor(int idx, int type) { return this->getMaterialPropertyInt(idx, type); } + float getMaterialPropertyFloat(int idx, int type) { + if (idx < 0 || (unsigned int)idx >= this->model->nummaterial || type < 0 || type >= 127 || + !this->model->material[idx].prop) return -1.0f; + for (int i = 0; i < this->model->material[idx].numprop; i++) { + if (this->model->material[idx].prop[i].type == type) + return this->model->material[idx].prop[i].value.fnum; + } + return -1.0f; + } + m3dtx_t* getMaterialPropertyMap(int idx, int type) { + if (idx < 0 || (unsigned int)idx >= this->model->nummaterial || type < 128 || type > 255 || + !this->model->material[idx].prop) return nullptr; + for (int i = 0; i < this->model->material[idx].numprop; i++) { + if (this->model->material[idx].prop[i].type == type) + return this->model->material[idx].prop[i].value.textureid < this->model->numtexture ? + &this->model->texture[this->model->material[idx].prop[i].value.textureid] : nullptr; + } + return nullptr; + } + std::vector getVertices() { return this->model->vertex ? std::vector(this->model->vertex, + this->model->vertex + this->model->numvertex) : std::vector(); } + std::vector getFace() { return this->model->face ? std::vector(this->model->face, this->model->face + + this->model->numface) : std::vector(); } + std::vector getVoxelTypes() { return this->model->voxtype ? std::vector(this->model->voxtype, + this->model->voxtype + this->model->numvoxtype) : std::vector(); } + std::string getVoxelTypeName(int idx) { return idx >= 0 && (unsigned int)idx < this->model->numvoxtype && + this->model->voxtype[idx].name && this->model->voxtype[idx].name[0] ? + std::string(this->model->voxtype[idx].name) : nullptr; } + std::vector getVoxelTypeItems(int idx) { return idx >= 0 && (unsigned int)idx < this->model->numvoxtype && + this->model->voxtype[idx].item ? std::vector(this->model->voxtype[idx].item, + this->model->voxtype[idx].item + this->model->voxtype[idx].numitem) : std::vector(); } + std::vector getVoxelBlocks() { return this->model->voxel ? std::vector(this->model->voxel, + this->model->voxel + this->model->numvoxel) : std::vector(); } + std::string getVoxelBlockName(int idx) { return idx >= 0 && (unsigned int)idx < this->model->numvoxel && + this->model->voxel[idx].name && this->model->voxel[idx].name[0] ? 
+ std::string(this->model->voxel[idx].name) : nullptr; } + std::vector getVoxelBlockData(int idx) { return idx >= 0 && (unsigned int)idx < this->model->numvoxel && + this->model->voxel[idx].data ? std::vector(this->model->voxel[idx].data, + this->model->voxel[idx].data + this->model->voxel[idx].w*this->model->voxel[idx].h*this->model->voxel[idx].d) : + std::vector(); } + std::vector getShape() { return this->model->shape ? std::vector(this->model->shape, + this->model->shape + this->model->numshape) : std::vector(); } + std::string getShapeName(int idx) { return idx >= 0 && (unsigned int)idx < this->model->numshape && + this->model->shape[idx].name && this->model->shape[idx].name[0] ? + std::string(this->model->shape[idx].name) : nullptr; } + unsigned int getShapeGroup(int idx) { return idx >= 0 && (unsigned int)idx < this->model->numshape ? + this->model->shape[idx].group : 0xFFFFFFFF; } + std::vector getShapeCommands(int idx) { return idx >= 0 && (unsigned int)idx < this->model->numshape && + this->model->shape[idx].cmd ? std::vector(this->model->shape[idx].cmd, this->model->shape[idx].cmd + + this->model->shape[idx].numcmd) : std::vector(); } + std::vector getAnnotationLabels() { return this->model->label ? std::vector(this->model->label, + this->model->label + this->model->numlabel) : std::vector(); } + std::vector getSkin() { return this->model->skin ? std::vector(this->model->skin, this->model->skin + + this->model->numskin) : std::vector(); } + std::vector getActions() { return this->model->action ? std::vector(this->model->action, + this->model->action + this->model->numaction) : std::vector(); } + std::string getActionName(int aidx) { return aidx >= 0 && (unsigned int)aidx < this->model->numaction ? + std::string(this->model->action[aidx].name) : nullptr; } + unsigned int getActionDuration(int aidx) { return aidx >= 0 && (unsigned int)aidx < this->model->numaction ? + this->model->action[aidx].durationmsec : 0; } + std::vector getActionFrames(int aidx) { return aidx >= 0 && (unsigned int)aidx < this->model->numaction ? + std::vector(this->model->action[aidx].frame, this->model->action[aidx].frame + + this->model->action[aidx].numframe) : std::vector(); } + unsigned int getActionFrameTimestamp(int aidx, int fidx) { return aidx >= 0 && (unsigned int)aidx < this->model->numaction? + (fidx >= 0 && (unsigned int)fidx < this->model->action[aidx].numframe ? + this->model->action[aidx].frame[fidx].msec : 0) : 0; } + std::vector getActionFrameTransforms(int aidx, int fidx) { + return aidx >= 0 && (unsigned int)aidx < this->model->numaction ? ( + fidx >= 0 && (unsigned int)fidx < this->model->action[aidx].numframe ? + std::vector(this->model->action[aidx].frame[fidx].transform, + this->model->action[aidx].frame[fidx].transform + this->model->action[aidx].frame[fidx].numtransform) : + std::vector()) : std::vector(); } + std::vector getActionFrame(int aidx, int fidx, std::vector skeleton) { + m3dtr_t *pose = m3d_frame(this->model, (unsigned int)aidx, (unsigned int)fidx, + skeleton.size() ? &skeleton[0] : nullptr); + return std::vector(pose, pose + this->model->numbone); } + std::vector getActionPose(int aidx, unsigned int msec) { + m3db_t *pose = m3d_pose(this->model, (unsigned int)aidx, (unsigned int)msec); + return std::vector(pose, pose + this->model->numbone); } + std::vector getInlinedAssets() { return this->model->inlined ? std::vector(this->model->inlined, + this->model->inlined + this->model->numinlined) : std::vector(); } + std::vector> getExtras() { return this->model->extra ? 
+ std::vector>(this->model->extra, + this->model->extra + this->model->numextra) : std::vector>(); } + std::vector Save(_unused int quality, _unused int flags) { +#ifdef M3D_EXPORTER + unsigned int size; + unsigned char *ptr = m3d_save(this->model, quality, flags, &size); + return ptr && size ? std::vector(ptr, ptr + size) : std::vector(); +#else + return std::vector(); +#endif + } + }; + +#else + class Model { + private: + m3d_t *model; + + public: + Model(const std::string &data, m3dread_t ReadFileCB, m3dfree_t FreeCB); + Model(const std::vector data, m3dread_t ReadFileCB, m3dfree_t FreeCB); + Model(const unsigned char *data, m3dread_t ReadFileCB, m3dfree_t FreeCB); + Model(); + ~Model(); + + public: + m3d_t *getCStruct(); + std::string getName(); + void setName(std::string name); + std::string getLicense(); + void setLicense(std::string license); + std::string getAuthor(); + void setAuthor(std::string author); + std::string getDescription(); + void setDescription(std::string desc); + float getScale(); + void setScale(float scale); + std::vector getPreview(); + std::vector getColorMap(); + std::vector getTextureMap(); + std::vector getTextures(); + std::string getTextureName(int idx); + std::vector getBones(); + std::string getBoneName(int idx); + std::vector getMaterials(); + std::string getMaterialName(int idx); + int getMaterialPropertyInt(int idx, int type); + uint32_t getMaterialPropertyColor(int idx, int type); + float getMaterialPropertyFloat(int idx, int type); + m3dtx_t* getMaterialPropertyMap(int idx, int type); + std::vector getVertices(); + std::vector getFace(); + std::vector getVoxelTypes(); + std::string getVoxelTypeName(int idx); + std::vector getVoxelTypeItems(int idx); + std::vector getVoxelBlocks(); + std::string getVoxelBlockName(int idx); + std::vector getVoxelBlockData(int idx); + std::vector getShape(); + std::string getShapeName(int idx); + unsigned int getShapeGroup(int idx); + std::vector getShapeCommands(int idx); + std::vector getAnnotationLabels(); + std::vector getSkin(); + std::vector getActions(); + std::string getActionName(int aidx); + unsigned int getActionDuration(int aidx); + std::vector getActionFrames(int aidx); + unsigned int getActionFrameTimestamp(int aidx, int fidx); + std::vector getActionFrameTransforms(int aidx, int fidx); + std::vector getActionFrame(int aidx, int fidx, std::vector skeleton); + std::vector getActionPose(int aidx, unsigned int msec); + std::vector getInlinedAssets(); + std::vector> getExtras(); + std::vector Save(int quality, int flags); + }; + +#endif /* impl */ +} +#endif + +#endif /* __cplusplus */ + +#endif diff --git a/external/stb/stb.c b/external/stb/stb.c new file mode 100644 index 0000000..8ddfd1f --- /dev/null +++ b/external/stb/stb.c @@ -0,0 +1,2 @@ +#define STB_IMAGE_IMPLEMENTATION +#include "stb_image.h" diff --git a/external/stb/stb_image.h b/external/stb/stb_image.h new file mode 100644 index 0000000..9eedabe --- /dev/null +++ b/external/stb/stb_image.h @@ -0,0 +1,7988 @@ +/* stb_image - v2.30 - public domain image loader - http://nothings.org/stb + no warranty implied; use at your own risk + + Do this: + #define STB_IMAGE_IMPLEMENTATION + before you include this file in *one* C or C++ file to create the implementation. + + // i.e. it should look like this: + #include ... + #include ... + #include ... + #define STB_IMAGE_IMPLEMENTATION + #include "stb_image.h" + + You can #define STBI_ASSERT(x) before the #include to avoid using assert.h. 
+ And #define STBI_MALLOC, STBI_REALLOC, and STBI_FREE to avoid using malloc,realloc,free + + + QUICK NOTES: + Primarily of interest to game developers and other people who can + avoid problematic images and only need the trivial interface + + JPEG baseline & progressive (12 bpc/arithmetic not supported, same as stock IJG lib) + PNG 1/2/4/8/16-bit-per-channel + + TGA (not sure what subset, if a subset) + BMP non-1bpp, non-RLE + PSD (composited view only, no extra channels, 8/16 bit-per-channel) + + GIF (*comp always reports as 4-channel) + HDR (radiance rgbE format) + PIC (Softimage PIC) + PNM (PPM and PGM binary only) + + Animated GIF still needs a proper API, but here's one way to do it: + http://gist.github.com/urraka/685d9a6340b26b830d49 + + - decode from memory or through FILE (define STBI_NO_STDIO to remove code) + - decode from arbitrary I/O callbacks + - SIMD acceleration on x86/x64 (SSE2) and ARM (NEON) + + Full documentation under "DOCUMENTATION" below. + + +LICENSE + + See end of file for license information. + +RECENT REVISION HISTORY: + + 2.30 (2024-05-31) avoid erroneous gcc warning + 2.29 (2023-05-xx) optimizations + 2.28 (2023-01-29) many error fixes, security errors, just tons of stuff + 2.27 (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes + 2.26 (2020-07-13) many minor fixes + 2.25 (2020-02-02) fix warnings + 2.24 (2020-02-02) fix warnings; thread-local failure_reason and flip_vertically + 2.23 (2019-08-11) fix clang static analysis warning + 2.22 (2019-03-04) gif fixes, fix warnings + 2.21 (2019-02-25) fix typo in comment + 2.20 (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs + 2.19 (2018-02-11) fix warning + 2.18 (2018-01-30) fix warnings + 2.17 (2018-01-29) bugfix, 1-bit BMP, 16-bitness query, fix warnings + 2.16 (2017-07-23) all functions have 16-bit variants; optimizations; bugfixes + 2.15 (2017-03-18) fix png-1,2,4; all Imagenet JPGs; no runtime SSE detection on GCC + 2.14 (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs + 2.13 (2016-12-04) experimental 16-bit API, only for PNG so far; fixes + 2.12 (2016-04-02) fix typo in 2.11 PSD fix that caused crashes + 2.11 (2016-04-02) 16-bit PNGS; enable SSE2 in non-gcc x64 + RGB-format JPEG; remove white matting in PSD; + allocate large structures on the stack; + correct channel count for PNG & BMP + 2.10 (2016-01-22) avoid warning introduced in 2.09 + 2.09 (2016-01-16) 16-bit TGA; comments in PNM files; STBI_REALLOC_SIZED + + See end of file for full revision history. 
+ + + ============================ Contributors ========================= + + Image formats Extensions, features + Sean Barrett (jpeg, png, bmp) Jetro Lauha (stbi_info) + Nicolas Schulz (hdr, psd) Martin "SpartanJ" Golini (stbi_info) + Jonathan Dummer (tga) James "moose2000" Brown (iPhone PNG) + Jean-Marc Lienher (gif) Ben "Disch" Wenger (io callbacks) + Tom Seddon (pic) Omar Cornut (1/2/4-bit PNG) + Thatcher Ulrich (psd) Nicolas Guillemot (vertical flip) + Ken Miller (pgm, ppm) Richard Mitton (16-bit PSD) + github:urraka (animated gif) Junggon Kim (PNM comments) + Christopher Forseth (animated gif) Daniel Gibson (16-bit TGA) + socks-the-fox (16-bit PNG) + Jeremy Sawicki (handle all ImageNet JPGs) + Optimizations & bugfixes Mikhail Morozov (1-bit BMP) + Fabian "ryg" Giesen Anael Seghezzi (is-16-bit query) + Arseny Kapoulkine Simon Breuss (16-bit PNM) + John-Mark Allen + Carmelo J Fdez-Aguera + + Bug & warning fixes + Marc LeBlanc David Woo Guillaume George Martins Mozeiko + Christpher Lloyd Jerry Jansson Joseph Thomson Blazej Dariusz Roszkowski + Phil Jordan Dave Moore Roy Eltham + Hayaki Saito Nathan Reed Won Chun + Luke Graham Johan Duparc Nick Verigakis the Horde3D community + Thomas Ruf Ronny Chevalier github:rlyeh + Janez Zemva John Bartholomew Michal Cichon github:romigrou + Jonathan Blow Ken Hamada Tero Hanninen github:svdijk + Eugene Golushkov Laurent Gomila Cort Stratton github:snagar + Aruelien Pocheville Sergio Gonzalez Thibault Reuille github:Zelex + Cass Everitt Ryamond Barbiero github:grim210 + Paul Du Bois Engin Manap Aldo Culquicondor github:sammyhw + Philipp Wiesemann Dale Weiler Oriol Ferrer Mesia github:phprus + Josh Tobin Neil Bickford Matthew Gregan github:poppolopoppo + Julian Raschke Gregory Mullen Christian Floisand github:darealshinji + Baldur Karlsson Kevin Schmidt JR Smith github:Michaelangel007 + Brad Weinberger Matvey Cherevko github:mosra + Luca Sas Alexander Veselov Zack Middleton [reserved] + Ryan C. Gordon [reserved] [reserved] + DO NOT ADD YOUR NAME HERE + + Jacko Dirks + + To add your name to the credits, pick a random blank space in the middle and fill it. + 80% of merge conflicts on stb PRs are due to people adding their name at the end + of the credits. +*/ + +#ifndef STBI_INCLUDE_STB_IMAGE_H +#define STBI_INCLUDE_STB_IMAGE_H + +// DOCUMENTATION +// +// Limitations: +// - no 12-bit-per-channel JPEG +// - no JPEGs with arithmetic coding +// - GIF always returns *comp=4 +// +// Basic usage (see HDR discussion below for HDR usage): +// int x,y,n; +// unsigned char *data = stbi_load(filename, &x, &y, &n, 0); +// // ... process data if not NULL ... +// // ... x = width, y = height, n = # 8-bit components per pixel ... +// // ... replace '0' with '1'..'4' to force that many components per pixel +// // ... but 'n' will always be the number that it would have been if you said 0 +// stbi_image_free(data); +// +// Standard parameters: +// int *x -- outputs image width in pixels +// int *y -- outputs image height in pixels +// int *channels_in_file -- outputs # of image components in image file +// int desired_channels -- if non-zero, # of image components requested in result +// +// The return value from an image loader is an 'unsigned char *' which points +// to the pixel data, or NULL on an allocation failure or if the image is +// corrupt or invalid. The pixel data consists of *y scanlines of *x pixels, +// with each pixel consisting of N interleaved 8-bit components; the first +// pixel pointed to is top-left-most in the image. 
There is no padding between +// image scanlines or between pixels, regardless of format. The number of +// components N is 'desired_channels' if desired_channels is non-zero, or +// *channels_in_file otherwise. If desired_channels is non-zero, +// *channels_in_file has the number of components that _would_ have been +// output otherwise. E.g. if you set desired_channels to 4, you will always +// get RGBA output, but you can check *channels_in_file to see if it's trivially +// opaque because e.g. there were only 3 channels in the source image. +// +// An output image with N components has the following components interleaved +// in this order in each pixel: +// +// N=#comp components +// 1 grey +// 2 grey, alpha +// 3 red, green, blue +// 4 red, green, blue, alpha +// +// If image loading fails for any reason, the return value will be NULL, +// and *x, *y, *channels_in_file will be unchanged. The function +// stbi_failure_reason() can be queried for an extremely brief, end-user +// unfriendly explanation of why the load failed. Define STBI_NO_FAILURE_STRINGS +// to avoid compiling these strings at all, and STBI_FAILURE_USERMSG to get slightly +// more user-friendly ones. +// +// Paletted PNG, BMP, GIF, and PIC images are automatically depalettized. +// +// To query the width, height and component count of an image without having to +// decode the full file, you can use the stbi_info family of functions: +// +// int x,y,n,ok; +// ok = stbi_info(filename, &x, &y, &n); +// // returns ok=1 and sets x, y, n if image is a supported format, +// // 0 otherwise. +// +// Note that stb_image pervasively uses ints in its public API for sizes, +// including sizes of memory buffers. This is now part of the API and thus +// hard to change without causing breakage. As a result, the various image +// loaders all have certain limits on image size; these differ somewhat +// by format but generally boil down to either just under 2GB or just under +// 1GB. When the decoded image would be larger than this, stb_image decoding +// will fail. +// +// Additionally, stb_image will reject image files that have any of their +// dimensions set to a larger value than the configurable STBI_MAX_DIMENSIONS, +// which defaults to 2**24 = 16777216 pixels. Due to the above memory limit, +// the only way to have an image with such dimensions load correctly +// is for it to have a rather extreme aspect ratio. Either way, the +// assumption here is that such larger images are likely to be malformed +// or malicious. If you do need to load an image with individual dimensions +// larger than that, and it still fits in the overall size limit, you can +// #define STBI_MAX_DIMENSIONS on your own to be something larger. +// +// =========================================================================== +// +// UNICODE: +// +// If compiling for Windows and you wish to use Unicode filenames, compile +// with +// #define STBI_WINDOWS_UTF8 +// and pass utf8-encoded filenames. Call stbi_convert_wchar_to_utf8 to convert +// Windows wchar_t filenames to utf8. +// +// =========================================================================== +// +// Philosophy +// +// stb libraries are designed with the following priorities: +// +// 1. easy to use +// 2. easy to maintain +// 3. good performance +// +// Sometimes I let "good performance" creep up in priority over "easy to maintain", +// and for best performance I may provide less-easy-to-use APIs that give higher +// performance, in addition to the easy-to-use ones. 
Nevertheless, it's important +// to keep in mind that from the standpoint of you, a client of this library, +// all you care about is #1 and #3, and stb libraries DO NOT emphasize #3 above all. +// +// Some secondary priorities arise directly from the first two, some of which +// provide more explicit reasons why performance can't be emphasized. +// +// - Portable ("ease of use") +// - Small source code footprint ("easy to maintain") +// - No dependencies ("ease of use") +// +// =========================================================================== +// +// I/O callbacks +// +// I/O callbacks allow you to read from arbitrary sources, like packaged +// files or some other source. Data read from callbacks are processed +// through a small internal buffer (currently 128 bytes) to try to reduce +// overhead. +// +// The three functions you must define are "read" (reads some bytes of data), +// "skip" (skips some bytes of data), "eof" (reports if the stream is at the end). +// +// =========================================================================== +// +// SIMD support +// +// The JPEG decoder will try to automatically use SIMD kernels on x86 when +// supported by the compiler. For ARM Neon support, you must explicitly +// request it. +// +// (The old do-it-yourself SIMD API is no longer supported in the current +// code.) +// +// On x86, SSE2 will automatically be used when available based on a run-time +// test; if not, the generic C versions are used as a fall-back. On ARM targets, +// the typical path is to have separate builds for NEON and non-NEON devices +// (at least this is true for iOS and Android). Therefore, the NEON support is +// toggled by a build flag: define STBI_NEON to get NEON loops. +// +// If for some reason you do not want to use any of SIMD code, or if +// you have issues compiling it, you can disable it entirely by +// defining STBI_NO_SIMD. +// +// =========================================================================== +// +// HDR image support (disable by defining STBI_NO_HDR) +// +// stb_image supports loading HDR images in general, and currently the Radiance +// .HDR file format specifically. You can still load any file through the existing +// interface; if you attempt to load an HDR file, it will be automatically remapped +// to LDR, assuming gamma 2.2 and an arbitrary scale factor defaulting to 1; +// both of these constants can be reconfigured through this interface: +// +// stbi_hdr_to_ldr_gamma(2.2f); +// stbi_hdr_to_ldr_scale(1.0f); +// +// (note, do not use _inverse_ constants; stbi_image will invert them +// appropriately). 
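+//
+// As a minimal sketch of that remapping (the file name below is purely
+// illustrative), configure the constants before loading and the HDR data
+// comes back as ordinary 8-bit LDR pixels:
+//
+//    stbi_hdr_to_ldr_gamma(2.2f);
+//    stbi_hdr_to_ldr_scale(1.0f);
+//    int x, y, n;
+//    unsigned char *ldr = stbi_load("environment.hdr", &x, &y, &n, 0);
+//    // ... use ldr like any other 8-bit image ...
+//    stbi_image_free(ldr);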
+// +// Additionally, there is a new, parallel interface for loading files as +// (linear) floats to preserve the full dynamic range: +// +// float *data = stbi_loadf(filename, &x, &y, &n, 0); +// +// If you load LDR images through this interface, those images will +// be promoted to floating point values, run through the inverse of +// constants corresponding to the above: +// +// stbi_ldr_to_hdr_scale(1.0f); +// stbi_ldr_to_hdr_gamma(2.2f); +// +// Finally, given a filename (or an open file or memory block--see header +// file for details) containing image data, you can query for the "most +// appropriate" interface to use (that is, whether the image is HDR or +// not), using: +// +// stbi_is_hdr(char *filename); +// +// =========================================================================== +// +// iPhone PNG support: +// +// We optionally support converting iPhone-formatted PNGs (which store +// premultiplied BGRA) back to RGB, even though they're internally encoded +// differently. To enable this conversion, call +// stbi_convert_iphone_png_to_rgb(1). +// +// Call stbi_set_unpremultiply_on_load(1) as well to force a divide per +// pixel to remove any premultiplied alpha *only* if the image file explicitly +// says there's premultiplied data (currently only happens in iPhone images, +// and only if iPhone convert-to-rgb processing is on). +// +// =========================================================================== +// +// ADDITIONAL CONFIGURATION +// +// - You can suppress implementation of any of the decoders to reduce +// your code footprint by #defining one or more of the following +// symbols before creating the implementation. +// +// STBI_NO_JPEG +// STBI_NO_PNG +// STBI_NO_BMP +// STBI_NO_PSD +// STBI_NO_TGA +// STBI_NO_GIF +// STBI_NO_HDR +// STBI_NO_PIC +// STBI_NO_PNM (.ppm and .pgm) +// +// - You can request *only* certain decoders and suppress all other ones +// (this will be more forward-compatible, as addition of new decoders +// doesn't require you to disable them explicitly): +// +// STBI_ONLY_JPEG +// STBI_ONLY_PNG +// STBI_ONLY_BMP +// STBI_ONLY_PSD +// STBI_ONLY_TGA +// STBI_ONLY_GIF +// STBI_ONLY_HDR +// STBI_ONLY_PIC +// STBI_ONLY_PNM (.ppm and .pgm) +// +// - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still +// want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB +// +// - If you define STBI_MAX_DIMENSIONS, stb_image will reject images greater +// than that size (in either width or height) without further processing. +// This is to let programs in the wild set an upper bound to prevent +// denial-of-service attacks on untrusted data, as one could generate a +// valid image of gigantic dimensions and force stb_image to allocate a +// huge block of memory and spend disproportionate time decoding it. By +// default this is set to (1 << 24), which is 16777216, but that's still +// very big. 
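+//
+// ===========================================================================
+//
+// A minimal sketch tying the 8-bit interface above together; the buffer names
+// (file_bytes/file_len) and the error handling are illustrative only:
+//
+//    int w, h, n;
+//    unsigned char *pixels = stbi_load_from_memory(file_bytes, file_len, &w, &h, &n, 4); // force RGBA
+//    if (pixels == NULL) {
+//       printf("stb_image: %s\n", stbi_failure_reason());
+//    } else {
+//       // ... w*h RGBA pixels, tightly packed, top-left pixel first ...
+//       stbi_image_free(pixels);
+//    }
+//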
+
+#ifndef STBI_NO_STDIO
+#include <stdio.h>
+#endif // STBI_NO_STDIO
+
+#define STBI_VERSION 1
+
+enum
+{
+   STBI_default = 0, // only used for desired_channels
+
+   STBI_grey       = 1,
+   STBI_grey_alpha = 2,
+   STBI_rgb        = 3,
+   STBI_rgb_alpha  = 4
+};
+
+#include <stdlib.h>
+typedef unsigned char stbi_uc;
+typedef unsigned short stbi_us;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef STBIDEF
+#ifdef STB_IMAGE_STATIC
+#define STBIDEF static
+#else
+#define STBIDEF extern
+#endif
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// PRIMARY API - works on images of any type
+//
+
+//
+// load image by filename, open file, or memory buffer
+//
+
+typedef struct
+{
+   int      (*read) (void *user,char *data,int size);   // fill 'data' with 'size' bytes.  return number of bytes actually read
+   void     (*skip) (void *user,int n);                  // skip the next 'n' bytes, or 'unget' the last -n bytes if negative
+   int      (*eof)  (void *user);                        // returns nonzero if we are at end of file/data
+} stbi_io_callbacks;
+
+////////////////////////////////////
+//
+// 8-bits-per-channel interface
+//
+
+STBIDEF stbi_uc *stbi_load_from_memory   (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels);
+
+#ifndef STBI_NO_STDIO
+STBIDEF stbi_uc *stbi_load          (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
+// for stbi_load_from_file, file pointer is left pointing immediately after image
+#endif
+
+#ifndef STBI_NO_GIF
+STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp);
+#endif
+
+#ifdef STBI_WINDOWS_UTF8
+STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
+#endif
+
+////////////////////////////////////
+//
+// 16-bits-per-channel interface
+//
+
+STBIDEF stbi_us *stbi_load_16_from_memory   (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels);
+
+#ifndef STBI_NO_STDIO
+STBIDEF stbi_us *stbi_load_16          (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_us *stbi_load_from_file_16(FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
+#endif
+
+////////////////////////////////////
+//
+// float-per-channel interface
+//
+#ifndef STBI_NO_LINEAR
+   STBIDEF float *stbi_loadf_from_memory   (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
+   STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels);
+
+   #ifndef STBI_NO_STDIO
+   STBIDEF float *stbi_loadf          (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+   STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
+   #endif
+#endif
+
+#ifndef STBI_NO_HDR
+   STBIDEF void stbi_hdr_to_ldr_gamma(float gamma);
+   STBIDEF void stbi_hdr_to_ldr_scale(float scale);
+#endif // STBI_NO_HDR
+
+#ifndef STBI_NO_LINEAR
+   STBIDEF void stbi_ldr_to_hdr_gamma(float gamma);
+   STBIDEF void
stbi_ldr_to_hdr_scale(float scale); +#endif // STBI_NO_LINEAR + +// stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR +STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user); +STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len); +#ifndef STBI_NO_STDIO +STBIDEF int stbi_is_hdr (char const *filename); +STBIDEF int stbi_is_hdr_from_file(FILE *f); +#endif // STBI_NO_STDIO + + +// get a VERY brief reason for failure +// on most compilers (and ALL modern mainstream compilers) this is threadsafe +STBIDEF const char *stbi_failure_reason (void); + +// free the loaded image -- this is just free() +STBIDEF void stbi_image_free (void *retval_from_stbi_load); + +// get image dimensions & components without fully decoding +STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp); +STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp); +STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len); +STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *clbk, void *user); + +#ifndef STBI_NO_STDIO +STBIDEF int stbi_info (char const *filename, int *x, int *y, int *comp); +STBIDEF int stbi_info_from_file (FILE *f, int *x, int *y, int *comp); +STBIDEF int stbi_is_16_bit (char const *filename); +STBIDEF int stbi_is_16_bit_from_file(FILE *f); +#endif + + + +// for image formats that explicitly notate that they have premultiplied alpha, +// we just return the colors as stored in the file. set this flag to force +// unpremultiplication. results are undefined if the unpremultiply overflow. +STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply); + +// indicate whether we should process iphone images back to canonical format, +// or just pass them through "as-is" +STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert); + +// flip the image vertically, so the first pixel in the output array is the bottom left +STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip); + +// as above, but only applies to images loaded on the thread that calls the function +// this function is only available if your compiler supports thread-local variables; +// calling it will fail to link if your compiler doesn't +STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply); +STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert); +STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip); + +// ZLIB client - used by PNG, available for other purposes + +STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen); +STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header); +STBIDEF char *stbi_zlib_decode_malloc(const char *buffer, int len, int *outlen); +STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, const char *ibuffer, int ilen); + +STBIDEF char *stbi_zlib_decode_noheader_malloc(const char *buffer, int len, int *outlen); +STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen); + + +#ifdef __cplusplus +} +#endif + +// +// +//// end header file ///////////////////////////////////////////////////// +#endif // STBI_INCLUDE_STB_IMAGE_H + +#ifdef STB_IMAGE_IMPLEMENTATION + +#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || 
defined(STBI_ONLY_BMP) \
+  || defined(STBI_ONLY_TGA) || defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) \
+  || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || defined(STBI_ONLY_PNM) \
+  || defined(STBI_ONLY_ZLIB)
+   #ifndef STBI_ONLY_JPEG
+   #define STBI_NO_JPEG
+   #endif
+   #ifndef STBI_ONLY_PNG
+   #define STBI_NO_PNG
+   #endif
+   #ifndef STBI_ONLY_BMP
+   #define STBI_NO_BMP
+   #endif
+   #ifndef STBI_ONLY_PSD
+   #define STBI_NO_PSD
+   #endif
+   #ifndef STBI_ONLY_TGA
+   #define STBI_NO_TGA
+   #endif
+   #ifndef STBI_ONLY_GIF
+   #define STBI_NO_GIF
+   #endif
+   #ifndef STBI_ONLY_HDR
+   #define STBI_NO_HDR
+   #endif
+   #ifndef STBI_ONLY_PIC
+   #define STBI_NO_PIC
+   #endif
+   #ifndef STBI_ONLY_PNM
+   #define STBI_NO_PNM
+   #endif
+#endif
+
+#if defined(STBI_NO_PNG) && !defined(STBI_SUPPORT_ZLIB) && !defined(STBI_NO_ZLIB)
+#define STBI_NO_ZLIB
+#endif
+
+
+#include <stdarg.h>
+#include <stddef.h> // ptrdiff_t on osx
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
+#include <math.h>  // ldexp, pow
+#endif
+
+#ifndef STBI_NO_STDIO
+#include <stdio.h>
+#endif
+
+#ifndef STBI_ASSERT
+#include <assert.h>
+#define STBI_ASSERT(x) assert(x)
+#endif
+
+#ifdef __cplusplus
+#define STBI_EXTERN extern "C"
+#else
+#define STBI_EXTERN extern
+#endif
+
+
+#ifndef _MSC_VER
+   #ifdef __cplusplus
+   #define stbi_inline inline
+   #else
+   #define stbi_inline
+   #endif
+#else
+   #define stbi_inline __forceinline
+#endif
+
+#ifndef STBI_NO_THREAD_LOCALS
+   #if defined(__cplusplus) && __cplusplus >= 201103L
+      #define STBI_THREAD_LOCAL thread_local
+   #elif defined(__GNUC__) && __GNUC__ < 5
+      #define STBI_THREAD_LOCAL __thread
+   #elif defined(_MSC_VER)
+      #define STBI_THREAD_LOCAL __declspec(thread)
+   #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__)
+      #define STBI_THREAD_LOCAL _Thread_local
+   #endif
+
+   #ifndef STBI_THREAD_LOCAL
+      #if defined(__GNUC__)
+      #define STBI_THREAD_LOCAL __thread
+      #endif
+   #endif
+#endif
+
+#if defined(_MSC_VER) || defined(__SYMBIAN32__)
+typedef unsigned short stbi__uint16;
+typedef   signed short stbi__int16;
+typedef unsigned int   stbi__uint32;
+typedef   signed int   stbi__int32;
+#else
+#include <stdint.h>
+typedef uint16_t stbi__uint16;
+typedef int16_t  stbi__int16;
+typedef uint32_t stbi__uint32;
+typedef int32_t  stbi__int32;
+#endif
+
+// should produce compiler error if size is wrong
+typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
+
+#ifdef _MSC_VER
+#define STBI_NOTUSED(v)  (void)(v)
+#else
+#define STBI_NOTUSED(v)  (void)sizeof(v)
+#endif
+
+#ifdef _MSC_VER
+#define STBI_HAS_LROTL
+#endif
+
+#ifdef STBI_HAS_LROTL
+   #define stbi_lrot(x,y)  _lrotl(x,y)
+#else
+   #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (-(y) & 31)))
+#endif
+
+#if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED))
+// ok
+#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) && !defined(STBI_REALLOC) && !defined(STBI_REALLOC_SIZED)
+// ok
+#else
+#error "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC (or STBI_REALLOC_SIZED)."
+#endif + +#ifndef STBI_MALLOC +#define STBI_MALLOC(sz) malloc(sz) +#define STBI_REALLOC(p,newsz) realloc(p,newsz) +#define STBI_FREE(p) free(p) +#endif + +#ifndef STBI_REALLOC_SIZED +#define STBI_REALLOC_SIZED(p,oldsz,newsz) STBI_REALLOC(p,newsz) +#endif + +// x86/x64 detection +#if defined(__x86_64__) || defined(_M_X64) +#define STBI__X64_TARGET +#elif defined(__i386) || defined(_M_IX86) +#define STBI__X86_TARGET +#endif + +#if defined(__GNUC__) && defined(STBI__X86_TARGET) && !defined(__SSE2__) && !defined(STBI_NO_SIMD) +// gcc doesn't support sse2 intrinsics unless you compile with -msse2, +// which in turn means it gets to use SSE2 everywhere. This is unfortunate, +// but previous attempts to provide the SSE2 functions with runtime +// detection caused numerous issues. The way architecture extensions are +// exposed in GCC/Clang is, sadly, not really suited for one-file libs. +// New behavior: if compiled with -msse2, we use SSE2 without any +// detection; if not, we don't use it at all. +#define STBI_NO_SIMD +#endif + +#if defined(__MINGW32__) && defined(STBI__X86_TARGET) && !defined(STBI_MINGW_ENABLE_SSE2) && !defined(STBI_NO_SIMD) +// Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid STBI__X64_TARGET +// +// 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the +// Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant. +// As a result, enabling SSE2 on 32-bit MinGW is dangerous when not +// simultaneously enabling "-mstackrealign". +// +// See https://github.com/nothings/stb/issues/81 for more information. +// +// So default to no SSE2 on 32-bit MinGW. If you've read this far and added +// -mstackrealign to your build settings, feel free to #define STBI_MINGW_ENABLE_SSE2. +#define STBI_NO_SIMD +#endif + +#if !defined(STBI_NO_SIMD) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET)) +#define STBI_SSE2 +#include + +#ifdef _MSC_VER + +#if _MSC_VER >= 1400 // not VC6 +#include // __cpuid +static int stbi__cpuid3(void) +{ + int info[4]; + __cpuid(info,1); + return info[3]; +} +#else +static int stbi__cpuid3(void) +{ + int res; + __asm { + mov eax,1 + cpuid + mov res,edx + } + return res; +} +#endif + +#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name + +#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2) +static int stbi__sse2_available(void) +{ + int info3 = stbi__cpuid3(); + return ((info3 >> 26) & 1) != 0; +} +#endif + +#else // assume GCC-style if not VC++ +#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16))) + +#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2) +static int stbi__sse2_available(void) +{ + // If we're even attempting to compile this on GCC/Clang, that means + // -msse2 is on, which means the compiler is allowed to use SSE2 + // instructions at will, and so are we. 
+ return 1; +} +#endif + +#endif +#endif + +// ARM NEON +#if defined(STBI_NO_SIMD) && defined(STBI_NEON) +#undef STBI_NEON +#endif + +#ifdef STBI_NEON +#include +#ifdef _MSC_VER +#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name +#else +#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16))) +#endif +#endif + +#ifndef STBI_SIMD_ALIGN +#define STBI_SIMD_ALIGN(type, name) type name +#endif + +#ifndef STBI_MAX_DIMENSIONS +#define STBI_MAX_DIMENSIONS (1 << 24) +#endif + +/////////////////////////////////////////////// +// +// stbi__context struct and start_xxx functions + +// stbi__context structure is our basic context used by all images, so it +// contains all the IO context, plus some basic image information +typedef struct +{ + stbi__uint32 img_x, img_y; + int img_n, img_out_n; + + stbi_io_callbacks io; + void *io_user_data; + + int read_from_callbacks; + int buflen; + stbi_uc buffer_start[128]; + int callback_already_read; + + stbi_uc *img_buffer, *img_buffer_end; + stbi_uc *img_buffer_original, *img_buffer_original_end; +} stbi__context; + + +static void stbi__refill_buffer(stbi__context *s); + +// initialize a memory-decode context +static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len) +{ + s->io.read = NULL; + s->read_from_callbacks = 0; + s->callback_already_read = 0; + s->img_buffer = s->img_buffer_original = (stbi_uc *) buffer; + s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *) buffer+len; +} + +// initialize a callback-based context +static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c, void *user) +{ + s->io = *c; + s->io_user_data = user; + s->buflen = sizeof(s->buffer_start); + s->read_from_callbacks = 1; + s->callback_already_read = 0; + s->img_buffer = s->img_buffer_original = s->buffer_start; + stbi__refill_buffer(s); + s->img_buffer_original_end = s->img_buffer_end; +} + +#ifndef STBI_NO_STDIO + +static int stbi__stdio_read(void *user, char *data, int size) +{ + return (int) fread(data,1,size,(FILE*) user); +} + +static void stbi__stdio_skip(void *user, int n) +{ + int ch; + fseek((FILE*) user, n, SEEK_CUR); + ch = fgetc((FILE*) user); /* have to read a byte to reset feof()'s flag */ + if (ch != EOF) { + ungetc(ch, (FILE *) user); /* push byte back onto stream if valid. 
*/ + } +} + +static int stbi__stdio_eof(void *user) +{ + return feof((FILE*) user) || ferror((FILE *) user); +} + +static stbi_io_callbacks stbi__stdio_callbacks = +{ + stbi__stdio_read, + stbi__stdio_skip, + stbi__stdio_eof, +}; + +static void stbi__start_file(stbi__context *s, FILE *f) +{ + stbi__start_callbacks(s, &stbi__stdio_callbacks, (void *) f); +} + +//static void stop_file(stbi__context *s) { } + +#endif // !STBI_NO_STDIO + +static void stbi__rewind(stbi__context *s) +{ + // conceptually rewind SHOULD rewind to the beginning of the stream, + // but we just rewind to the beginning of the initial buffer, because + // we only use it after doing 'test', which only ever looks at at most 92 bytes + s->img_buffer = s->img_buffer_original; + s->img_buffer_end = s->img_buffer_original_end; +} + +enum +{ + STBI_ORDER_RGB, + STBI_ORDER_BGR +}; + +typedef struct +{ + int bits_per_channel; + int num_channels; + int channel_order; +} stbi__result_info; + +#ifndef STBI_NO_JPEG +static int stbi__jpeg_test(stbi__context *s); +static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_PNG +static int stbi__png_test(stbi__context *s); +static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp); +static int stbi__png_is16(stbi__context *s); +#endif + +#ifndef STBI_NO_BMP +static int stbi__bmp_test(stbi__context *s); +static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_TGA +static int stbi__tga_test(stbi__context *s); +static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_PSD +static int stbi__psd_test(stbi__context *s); +static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc); +static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp); +static int stbi__psd_is16(stbi__context *s); +#endif + +#ifndef STBI_NO_HDR +static int stbi__hdr_test(stbi__context *s); +static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_PIC +static int stbi__pic_test(stbi__context *s); +static void *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_GIF +static int stbi__gif_test(stbi__context *s); +static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp); +static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_PNM +static int stbi__pnm_test(stbi__context *s); +static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp); +static int 
stbi__pnm_is16(stbi__context *s); +#endif + +static +#ifdef STBI_THREAD_LOCAL +STBI_THREAD_LOCAL +#endif +const char *stbi__g_failure_reason; + +STBIDEF const char *stbi_failure_reason(void) +{ + return stbi__g_failure_reason; +} + +#ifndef STBI_NO_FAILURE_STRINGS +static int stbi__err(const char *str) +{ + stbi__g_failure_reason = str; + return 0; +} +#endif + +static void *stbi__malloc(size_t size) +{ + return STBI_MALLOC(size); +} + +// stb_image uses ints pervasively, including for offset calculations. +// therefore the largest decoded image size we can support with the +// current code, even on 64-bit targets, is INT_MAX. this is not a +// significant limitation for the intended use case. +// +// we do, however, need to make sure our size calculations don't +// overflow. hence a few helper functions for size calculations that +// multiply integers together, making sure that they're non-negative +// and no overflow occurs. + +// return 1 if the sum is valid, 0 on overflow. +// negative terms are considered invalid. +static int stbi__addsizes_valid(int a, int b) +{ + if (b < 0) return 0; + // now 0 <= b <= INT_MAX, hence also + // 0 <= INT_MAX - b <= INTMAX. + // And "a + b <= INT_MAX" (which might overflow) is the + // same as a <= INT_MAX - b (no overflow) + return a <= INT_MAX - b; +} + +// returns 1 if the product is valid, 0 on overflow. +// negative factors are considered invalid. +static int stbi__mul2sizes_valid(int a, int b) +{ + if (a < 0 || b < 0) return 0; + if (b == 0) return 1; // mul-by-0 is always safe + // portable way to check for no overflows in a*b + return a <= INT_MAX/b; +} + +#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR) +// returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow +static int stbi__mad2sizes_valid(int a, int b, int add) +{ + return stbi__mul2sizes_valid(a, b) && stbi__addsizes_valid(a*b, add); +} +#endif + +// returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow +static int stbi__mad3sizes_valid(int a, int b, int c, int add) +{ + return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) && + stbi__addsizes_valid(a*b*c, add); +} + +// returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow +#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM) +static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add) +{ + return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) && + stbi__mul2sizes_valid(a*b*c, d) && stbi__addsizes_valid(a*b*c*d, add); +} +#endif + +#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR) +// mallocs with size overflow checking +static void *stbi__malloc_mad2(int a, int b, int add) +{ + if (!stbi__mad2sizes_valid(a, b, add)) return NULL; + return stbi__malloc(a*b + add); +} +#endif + +static void *stbi__malloc_mad3(int a, int b, int c, int add) +{ + if (!stbi__mad3sizes_valid(a, b, c, add)) return NULL; + return stbi__malloc(a*b*c + add); +} + +#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM) +static void *stbi__malloc_mad4(int a, int b, int c, int d, int add) +{ + if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL; + return stbi__malloc(a*b*c*d + add); +} +#endif + +// returns 1 if the sum of two signed ints is valid (between -2^31 and 2^31-1 inclusive), 0 on overflow. 
+static int stbi__addints_valid(int a, int b) +{ + if ((a >= 0) != (b >= 0)) return 1; // a and b have different signs, so no overflow + if (a < 0 && b < 0) return a >= INT_MIN - b; // same as a + b >= INT_MIN; INT_MIN - b cannot overflow since b < 0. + return a <= INT_MAX - b; +} + +// returns 1 if the product of two ints fits in a signed short, 0 on overflow. +static int stbi__mul2shorts_valid(int a, int b) +{ + if (b == 0 || b == -1) return 1; // multiplication by 0 is always 0; check for -1 so SHRT_MIN/b doesn't overflow + if ((a >= 0) == (b >= 0)) return a <= SHRT_MAX/b; // product is positive, so similar to mul2sizes_valid + if (b < 0) return a <= SHRT_MIN / b; // same as a * b >= SHRT_MIN + return a >= SHRT_MIN / b; +} + +// stbi__err - error +// stbi__errpf - error returning pointer to float +// stbi__errpuc - error returning pointer to unsigned char + +#ifdef STBI_NO_FAILURE_STRINGS + #define stbi__err(x,y) 0 +#elif defined(STBI_FAILURE_USERMSG) + #define stbi__err(x,y) stbi__err(y) +#else + #define stbi__err(x,y) stbi__err(x) +#endif + +#define stbi__errpf(x,y) ((float *)(size_t) (stbi__err(x,y)?NULL:NULL)) +#define stbi__errpuc(x,y) ((unsigned char *)(size_t) (stbi__err(x,y)?NULL:NULL)) + +STBIDEF void stbi_image_free(void *retval_from_stbi_load) +{ + STBI_FREE(retval_from_stbi_load); +} + +#ifndef STBI_NO_LINEAR +static float *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp); +#endif + +#ifndef STBI_NO_HDR +static stbi_uc *stbi__hdr_to_ldr(float *data, int x, int y, int comp); +#endif + +static int stbi__vertically_flip_on_load_global = 0; + +STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip) +{ + stbi__vertically_flip_on_load_global = flag_true_if_should_flip; +} + +#ifndef STBI_THREAD_LOCAL +#define stbi__vertically_flip_on_load stbi__vertically_flip_on_load_global +#else +static STBI_THREAD_LOCAL int stbi__vertically_flip_on_load_local, stbi__vertically_flip_on_load_set; + +STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip) +{ + stbi__vertically_flip_on_load_local = flag_true_if_should_flip; + stbi__vertically_flip_on_load_set = 1; +} + +#define stbi__vertically_flip_on_load (stbi__vertically_flip_on_load_set \ + ? 
stbi__vertically_flip_on_load_local \ + : stbi__vertically_flip_on_load_global) +#endif // STBI_THREAD_LOCAL + +static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc) +{ + memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields + ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed + ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order + ri->num_channels = 0; + + // test the formats with a very explicit header first (at least a FOURCC + // or distinctive magic number first) + #ifndef STBI_NO_PNG + if (stbi__png_test(s)) return stbi__png_load(s,x,y,comp,req_comp, ri); + #endif + #ifndef STBI_NO_BMP + if (stbi__bmp_test(s)) return stbi__bmp_load(s,x,y,comp,req_comp, ri); + #endif + #ifndef STBI_NO_GIF + if (stbi__gif_test(s)) return stbi__gif_load(s,x,y,comp,req_comp, ri); + #endif + #ifndef STBI_NO_PSD + if (stbi__psd_test(s)) return stbi__psd_load(s,x,y,comp,req_comp, ri, bpc); + #else + STBI_NOTUSED(bpc); + #endif + #ifndef STBI_NO_PIC + if (stbi__pic_test(s)) return stbi__pic_load(s,x,y,comp,req_comp, ri); + #endif + + // then the formats that can end up attempting to load with just 1 or 2 + // bytes matching expectations; these are prone to false positives, so + // try them later + #ifndef STBI_NO_JPEG + if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri); + #endif + #ifndef STBI_NO_PNM + if (stbi__pnm_test(s)) return stbi__pnm_load(s,x,y,comp,req_comp, ri); + #endif + + #ifndef STBI_NO_HDR + if (stbi__hdr_test(s)) { + float *hdr = stbi__hdr_load(s, x,y,comp,req_comp, ri); + return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp); + } + #endif + + #ifndef STBI_NO_TGA + // test tga last because it's a crappy test! + if (stbi__tga_test(s)) + return stbi__tga_load(s,x,y,comp,req_comp, ri); + #endif + + return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt"); +} + +static stbi_uc *stbi__convert_16_to_8(stbi__uint16 *orig, int w, int h, int channels) +{ + int i; + int img_len = w * h * channels; + stbi_uc *reduced; + + reduced = (stbi_uc *) stbi__malloc(img_len); + if (reduced == NULL) return stbi__errpuc("outofmem", "Out of memory"); + + for (i = 0; i < img_len; ++i) + reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling + + STBI_FREE(orig); + return reduced; +} + +static stbi__uint16 *stbi__convert_8_to_16(stbi_uc *orig, int w, int h, int channels) +{ + int i; + int img_len = w * h * channels; + stbi__uint16 *enlarged; + + enlarged = (stbi__uint16 *) stbi__malloc(img_len*2); + if (enlarged == NULL) return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory"); + + for (i = 0; i < img_len; ++i) + enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff + + STBI_FREE(orig); + return enlarged; +} + +static void stbi__vertical_flip(void *image, int w, int h, int bytes_per_pixel) +{ + int row; + size_t bytes_per_row = (size_t)w * bytes_per_pixel; + stbi_uc temp[2048]; + stbi_uc *bytes = (stbi_uc *)image; + + for (row = 0; row < (h>>1); row++) { + stbi_uc *row0 = bytes + row*bytes_per_row; + stbi_uc *row1 = bytes + (h - row - 1)*bytes_per_row; + // swap row0 with row1 + size_t bytes_left = bytes_per_row; + while (bytes_left) { + size_t bytes_copy = (bytes_left < sizeof(temp)) ? 
bytes_left : sizeof(temp); + memcpy(temp, row0, bytes_copy); + memcpy(row0, row1, bytes_copy); + memcpy(row1, temp, bytes_copy); + row0 += bytes_copy; + row1 += bytes_copy; + bytes_left -= bytes_copy; + } + } +} + +#ifndef STBI_NO_GIF +static void stbi__vertical_flip_slices(void *image, int w, int h, int z, int bytes_per_pixel) +{ + int slice; + int slice_size = w * h * bytes_per_pixel; + + stbi_uc *bytes = (stbi_uc *)image; + for (slice = 0; slice < z; ++slice) { + stbi__vertical_flip(bytes, w, h, bytes_per_pixel); + bytes += slice_size; + } +} +#endif + +static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x, int *y, int *comp, int req_comp) +{ + stbi__result_info ri; + void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8); + + if (result == NULL) + return NULL; + + // it is the responsibility of the loaders to make sure we get either 8 or 16 bit. + STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16); + + if (ri.bits_per_channel != 8) { + result = stbi__convert_16_to_8((stbi__uint16 *) result, *x, *y, req_comp == 0 ? *comp : req_comp); + ri.bits_per_channel = 8; + } + + // @TODO: move stbi__convert_format to here + + if (stbi__vertically_flip_on_load) { + int channels = req_comp ? req_comp : *comp; + stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc)); + } + + return (unsigned char *) result; +} + +static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x, int *y, int *comp, int req_comp) +{ + stbi__result_info ri; + void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16); + + if (result == NULL) + return NULL; + + // it is the responsibility of the loaders to make sure we get either 8 or 16 bit. + STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16); + + if (ri.bits_per_channel != 16) { + result = stbi__convert_8_to_16((stbi_uc *) result, *x, *y, req_comp == 0 ? *comp : req_comp); + ri.bits_per_channel = 16; + } + + // @TODO: move stbi__convert_format16 to here + // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision + + if (stbi__vertically_flip_on_load) { + int channels = req_comp ? req_comp : *comp; + stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16)); + } + + return (stbi__uint16 *) result; +} + +#if !defined(STBI_NO_HDR) && !defined(STBI_NO_LINEAR) +static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, int req_comp) +{ + if (stbi__vertically_flip_on_load && result != NULL) { + int channels = req_comp ? 
req_comp : *comp; + stbi__vertical_flip(result, *x, *y, channels * sizeof(float)); + } +} +#endif + +#ifndef STBI_NO_STDIO + +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) +STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide); +STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default); +#endif + +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) +STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input) +{ + return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL); +} +#endif + +static FILE *stbi__fopen(char const *filename, char const *mode) +{ + FILE *f; +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) + wchar_t wMode[64]; + wchar_t wFilename[1024]; + if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename))) + return 0; + + if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode))) + return 0; + +#if defined(_MSC_VER) && _MSC_VER >= 1400 + if (0 != _wfopen_s(&f, wFilename, wMode)) + f = 0; +#else + f = _wfopen(wFilename, wMode); +#endif + +#elif defined(_MSC_VER) && _MSC_VER >= 1400 + if (0 != fopen_s(&f, filename, mode)) + f=0; +#else + f = fopen(filename, mode); +#endif + return f; +} + + +STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y, int *comp, int req_comp) +{ + FILE *f = stbi__fopen(filename, "rb"); + unsigned char *result; + if (!f) return stbi__errpuc("can't fopen", "Unable to open file"); + result = stbi_load_from_file(f,x,y,comp,req_comp); + fclose(f); + return result; +} + +STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp) +{ + unsigned char *result; + stbi__context s; + stbi__start_file(&s,f); + result = stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp); + if (result) { + // need to 'unget' all the characters in the IO buffer + fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR); + } + return result; +} + +STBIDEF stbi__uint16 *stbi_load_from_file_16(FILE *f, int *x, int *y, int *comp, int req_comp) +{ + stbi__uint16 *result; + stbi__context s; + stbi__start_file(&s,f); + result = stbi__load_and_postprocess_16bit(&s,x,y,comp,req_comp); + if (result) { + // need to 'unget' all the characters in the IO buffer + fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR); + } + return result; +} + +STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y, int *comp, int req_comp) +{ + FILE *f = stbi__fopen(filename, "rb"); + stbi__uint16 *result; + if (!f) return (stbi_us *) stbi__errpuc("can't fopen", "Unable to open file"); + result = stbi_load_from_file_16(f,x,y,comp,req_comp); + fclose(f); + return result; +} + + +#endif //!STBI_NO_STDIO + +STBIDEF stbi_us *stbi_load_16_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels) +{ + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels); +} + +STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels) +{ + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user); + return 
stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels); +} + +STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp) +{ + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp); +} + +STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp) +{ + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user); + return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp); +} + +#ifndef STBI_NO_GIF +STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp) +{ + unsigned char *result; + stbi__context s; + stbi__start_mem(&s,buffer,len); + + result = (unsigned char*) stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp); + if (stbi__vertically_flip_on_load) { + stbi__vertical_flip_slices( result, *x, *y, *z, *comp ); + } + + return result; +} +#endif + +#ifndef STBI_NO_LINEAR +static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int req_comp) +{ + unsigned char *data; + #ifndef STBI_NO_HDR + if (stbi__hdr_test(s)) { + stbi__result_info ri; + float *hdr_data = stbi__hdr_load(s,x,y,comp,req_comp, &ri); + if (hdr_data) + stbi__float_postprocess(hdr_data,x,y,comp,req_comp); + return hdr_data; + } + #endif + data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp); + if (data) + return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp); + return stbi__errpf("unknown image type", "Image not of any known type, or corrupt"); +} + +STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp) +{ + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__loadf_main(&s,x,y,comp,req_comp); +} + +STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp) +{ + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user); + return stbi__loadf_main(&s,x,y,comp,req_comp); +} + +#ifndef STBI_NO_STDIO +STBIDEF float *stbi_loadf(char const *filename, int *x, int *y, int *comp, int req_comp) +{ + float *result; + FILE *f = stbi__fopen(filename, "rb"); + if (!f) return stbi__errpf("can't fopen", "Unable to open file"); + result = stbi_loadf_from_file(f,x,y,comp,req_comp); + fclose(f); + return result; +} + +STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_comp) +{ + stbi__context s; + stbi__start_file(&s,f); + return stbi__loadf_main(&s,x,y,comp,req_comp); +} +#endif // !STBI_NO_STDIO + +#endif // !STBI_NO_LINEAR + +// these is-hdr-or-not is defined independent of whether STBI_NO_LINEAR is +// defined, for API simplicity; if STBI_NO_LINEAR is defined, it always +// reports false! 
+ +STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len) +{ + #ifndef STBI_NO_HDR + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__hdr_test(&s); + #else + STBI_NOTUSED(buffer); + STBI_NOTUSED(len); + return 0; + #endif +} + +#ifndef STBI_NO_STDIO +STBIDEF int stbi_is_hdr (char const *filename) +{ + FILE *f = stbi__fopen(filename, "rb"); + int result=0; + if (f) { + result = stbi_is_hdr_from_file(f); + fclose(f); + } + return result; +} + +STBIDEF int stbi_is_hdr_from_file(FILE *f) +{ + #ifndef STBI_NO_HDR + long pos = ftell(f); + int res; + stbi__context s; + stbi__start_file(&s,f); + res = stbi__hdr_test(&s); + fseek(f, pos, SEEK_SET); + return res; + #else + STBI_NOTUSED(f); + return 0; + #endif +} +#endif // !STBI_NO_STDIO + +STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user) +{ + #ifndef STBI_NO_HDR + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user); + return stbi__hdr_test(&s); + #else + STBI_NOTUSED(clbk); + STBI_NOTUSED(user); + return 0; + #endif +} + +#ifndef STBI_NO_LINEAR +static float stbi__l2h_gamma=2.2f, stbi__l2h_scale=1.0f; + +STBIDEF void stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; } +STBIDEF void stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; } +#endif + +static float stbi__h2l_gamma_i=1.0f/2.2f, stbi__h2l_scale_i=1.0f; + +STBIDEF void stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1/gamma; } +STBIDEF void stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1/scale; } + + +////////////////////////////////////////////////////////////////////////////// +// +// Common code used by all image loaders +// + +enum +{ + STBI__SCAN_load=0, + STBI__SCAN_type, + STBI__SCAN_header +}; + +static void stbi__refill_buffer(stbi__context *s) +{ + int n = (s->io.read)(s->io_user_data,(char*)s->buffer_start,s->buflen); + s->callback_already_read += (int) (s->img_buffer - s->img_buffer_original); + if (n == 0) { + // at end of file, treat same as if from memory, but need to handle case + // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file + s->read_from_callbacks = 0; + s->img_buffer = s->buffer_start; + s->img_buffer_end = s->buffer_start+1; + *s->img_buffer = 0; + } else { + s->img_buffer = s->buffer_start; + s->img_buffer_end = s->buffer_start + n; + } +} + +stbi_inline static stbi_uc stbi__get8(stbi__context *s) +{ + if (s->img_buffer < s->img_buffer_end) + return *s->img_buffer++; + if (s->read_from_callbacks) { + stbi__refill_buffer(s); + return *s->img_buffer++; + } + return 0; +} + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_HDR) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM) +// nothing +#else +stbi_inline static int stbi__at_eof(stbi__context *s) +{ + if (s->io.read) { + if (!(s->io.eof)(s->io_user_data)) return 0; + // if feof() is true, check if buffer = end + // special case: we've only got the special 0 character at the end + if (s->read_from_callbacks == 0) return 1; + } + + return s->img_buffer >= s->img_buffer_end; +} +#endif + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) +// nothing +#else +static void stbi__skip(stbi__context *s, int n) +{ + if (n == 0) return; // already there! 
+ if (n < 0) { + s->img_buffer = s->img_buffer_end; + return; + } + if (s->io.read) { + int blen = (int) (s->img_buffer_end - s->img_buffer); + if (blen < n) { + s->img_buffer = s->img_buffer_end; + (s->io.skip)(s->io_user_data, n - blen); + return; + } + } + s->img_buffer += n; +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_TGA) && defined(STBI_NO_HDR) && defined(STBI_NO_PNM) +// nothing +#else +static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n) +{ + if (s->io.read) { + int blen = (int) (s->img_buffer_end - s->img_buffer); + if (blen < n) { + int res, count; + + memcpy(buffer, s->img_buffer, blen); + + count = (s->io.read)(s->io_user_data, (char*) buffer + blen, n - blen); + res = (count == (n-blen)); + s->img_buffer = s->img_buffer_end; + return res; + } + } + + if (s->img_buffer+n <= s->img_buffer_end) { + memcpy(buffer, s->img_buffer, n); + s->img_buffer += n; + return 1; + } else + return 0; +} +#endif + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC) +// nothing +#else +static int stbi__get16be(stbi__context *s) +{ + int z = stbi__get8(s); + return (z << 8) + stbi__get8(s); +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC) +// nothing +#else +static stbi__uint32 stbi__get32be(stbi__context *s) +{ + stbi__uint32 z = stbi__get16be(s); + return (z << 16) + stbi__get16be(s); +} +#endif + +#if defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) +// nothing +#else +static int stbi__get16le(stbi__context *s) +{ + int z = stbi__get8(s); + return z + (stbi__get8(s) << 8); +} +#endif + +#ifndef STBI_NO_BMP +static stbi__uint32 stbi__get32le(stbi__context *s) +{ + stbi__uint32 z = stbi__get16le(s); + z += (stbi__uint32)stbi__get16le(s) << 16; + return z; +} +#endif + +#define STBI__BYTECAST(x) ((stbi_uc) ((x) & 255)) // truncate int to byte without warnings + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM) +// nothing +#else +////////////////////////////////////////////////////////////////////////////// +// +// generic converter from built-in img_n to req_comp +// individual types do this automatically as much as possible (e.g. jpeg +// does all cases internally since it needs to colorspace convert anyway, +// and it never has alpha, so very few cases ). 
png can automatically +// interleave an alpha=255 channel, but falls back to this for other cases +// +// assume data buffer is malloced, so malloc a new one and free that one +// only failure mode is malloc failing + +static stbi_uc stbi__compute_y(int r, int g, int b) +{ + return (stbi_uc) (((r*77) + (g*150) + (29*b)) >> 8); +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM) +// nothing +#else +static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int req_comp, unsigned int x, unsigned int y) +{ + int i,j; + unsigned char *good; + + if (req_comp == img_n) return data; + STBI_ASSERT(req_comp >= 1 && req_comp <= 4); + + good = (unsigned char *) stbi__malloc_mad3(req_comp, x, y, 0); + if (good == NULL) { + STBI_FREE(data); + return stbi__errpuc("outofmem", "Out of memory"); + } + + for (j=0; j < (int) y; ++j) { + unsigned char *src = data + j * x * img_n ; + unsigned char *dest = good + j * x * req_comp; + + #define STBI__COMBO(a,b) ((a)*8+(b)) + #define STBI__CASE(a,b) case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b) + // convert source image with img_n components to one with req_comp components; + // avoid switch per pixel, so use switch per scanline and massive macros + switch (STBI__COMBO(img_n, req_comp)) { + STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=255; } break; + STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=255; } break; + STBI__CASE(2,1) { dest[0]=src[0]; } break; + STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1]; } break; + STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=255; } break; + STBI__CASE(3,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); } break; + STBI__CASE(3,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = 255; } break; + STBI__CASE(4,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); } break; + STBI__CASE(4,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = src[3]; } break; + STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2]; } break; + default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return stbi__errpuc("unsupported", "Unsupported format conversion"); + } + #undef STBI__CASE + } + + STBI_FREE(data); + return good; +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) +// nothing +#else +static stbi__uint16 stbi__compute_y_16(int r, int g, int b) +{ + return (stbi__uint16) (((r*77) + (g*150) + (29*b)) >> 8); +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) +// nothing +#else +static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, int req_comp, unsigned int x, unsigned int y) +{ + int i,j; + stbi__uint16 *good; + + if (req_comp == img_n) return data; + STBI_ASSERT(req_comp >= 1 && req_comp <= 4); + + good = (stbi__uint16 *) stbi__malloc(req_comp * x * y * 2); + if (good == NULL) { + STBI_FREE(data); + return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory"); + } + + for (j=0; j < (int) y; ++j) { + stbi__uint16 *src = data + j * x * img_n ; + stbi__uint16 *dest = good + j * x * req_comp; + + #define STBI__COMBO(a,b) ((a)*8+(b)) + #define STBI__CASE(a,b) case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b) + // convert source image with img_n components to one with req_comp components; + // avoid switch per pixel, 
so use switch per scanline and massive macros + switch (STBI__COMBO(img_n, req_comp)) { + STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=0xffff; } break; + STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=0xffff; } break; + STBI__CASE(2,1) { dest[0]=src[0]; } break; + STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1]; } break; + STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=0xffff; } break; + STBI__CASE(3,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); } break; + STBI__CASE(3,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = 0xffff; } break; + STBI__CASE(4,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); } break; + STBI__CASE(4,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = src[3]; } break; + STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2]; } break; + default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return (stbi__uint16*) stbi__errpuc("unsupported", "Unsupported format conversion"); + } + #undef STBI__CASE + } + + STBI_FREE(data); + return good; +} +#endif + +#ifndef STBI_NO_LINEAR +static float *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp) +{ + int i,k,n; + float *output; + if (!data) return NULL; + output = (float *) stbi__malloc_mad4(x, y, comp, sizeof(float), 0); + if (output == NULL) { STBI_FREE(data); return stbi__errpf("outofmem", "Out of memory"); } + // compute number of non-alpha components + if (comp & 1) n = comp; else n = comp-1; + for (i=0; i < x*y; ++i) { + for (k=0; k < n; ++k) { + output[i*comp + k] = (float) (pow(data[i*comp+k]/255.0f, stbi__l2h_gamma) * stbi__l2h_scale); + } + } + if (n < comp) { + for (i=0; i < x*y; ++i) { + output[i*comp + n] = data[i*comp + n]/255.0f; + } + } + STBI_FREE(data); + return output; +} +#endif + +#ifndef STBI_NO_HDR +#define stbi__float2int(x) ((int) (x)) +static stbi_uc *stbi__hdr_to_ldr(float *data, int x, int y, int comp) +{ + int i,k,n; + stbi_uc *output; + if (!data) return NULL; + output = (stbi_uc *) stbi__malloc_mad3(x, y, comp, 0); + if (output == NULL) { STBI_FREE(data); return stbi__errpuc("outofmem", "Out of memory"); } + // compute number of non-alpha components + if (comp & 1) n = comp; else n = comp-1; + for (i=0; i < x*y; ++i) { + for (k=0; k < n; ++k) { + float z = (float) pow(data[i*comp+k]*stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f; + if (z < 0) z = 0; + if (z > 255) z = 255; + output[i*comp + k] = (stbi_uc) stbi__float2int(z); + } + if (k < comp) { + float z = data[i*comp+k] * 255 + 0.5f; + if (z < 0) z = 0; + if (z > 255) z = 255; + output[i*comp + k] = (stbi_uc) stbi__float2int(z); + } + } + STBI_FREE(data); + return output; +} +#endif + +////////////////////////////////////////////////////////////////////////////// +// +// "baseline" JPEG/JFIF decoder +// +// simple implementation +// - doesn't support delayed output of y-dimension +// - simple interface (only one output format: 8-bit interleaved RGB) +// - doesn't try to recover corrupt jpegs +// - doesn't allow partial loading, loading multiple at once +// - still fast on x86 (copying globals into locals doesn't help x86) +// - allocates lots of intermediate memory (full size of all components) +// - non-interleaved case requires this anyway +// - allows good upsampling (see next) +// high-quality +// - upsampled channels are bilinearly interpolated, even across blocks +// - quality integer IDCT derived from IJG's 
'slow' +// performance +// - fast huffman; reasonable integer IDCT +// - some SIMD kernels for common paths on targets with SSE2/NEON +// - uses a lot of intermediate memory, could cache poorly + +#ifndef STBI_NO_JPEG + +// huffman decoding acceleration +#define FAST_BITS 9 // larger handles more cases; smaller stomps less cache + +typedef struct +{ + stbi_uc fast[1 << FAST_BITS]; + // weirdly, repacking this into AoS is a 10% speed loss, instead of a win + stbi__uint16 code[256]; + stbi_uc values[256]; + stbi_uc size[257]; + unsigned int maxcode[18]; + int delta[17]; // old 'firstsymbol' - old 'firstcode' +} stbi__huffman; + +typedef struct +{ + stbi__context *s; + stbi__huffman huff_dc[4]; + stbi__huffman huff_ac[4]; + stbi__uint16 dequant[4][64]; + stbi__int16 fast_ac[4][1 << FAST_BITS]; + +// sizes for components, interleaved MCUs + int img_h_max, img_v_max; + int img_mcu_x, img_mcu_y; + int img_mcu_w, img_mcu_h; + +// definition of jpeg image component + struct + { + int id; + int h,v; + int tq; + int hd,ha; + int dc_pred; + + int x,y,w2,h2; + stbi_uc *data; + void *raw_data, *raw_coeff; + stbi_uc *linebuf; + short *coeff; // progressive only + int coeff_w, coeff_h; // number of 8x8 coefficient blocks + } img_comp[4]; + + stbi__uint32 code_buffer; // jpeg entropy-coded buffer + int code_bits; // number of valid bits + unsigned char marker; // marker seen while filling entropy buffer + int nomore; // flag if we saw a marker so must stop + + int progressive; + int spec_start; + int spec_end; + int succ_high; + int succ_low; + int eob_run; + int jfif; + int app14_color_transform; // Adobe APP14 tag + int rgb; + + int scan_n, order[4]; + int restart_interval, todo; + +// kernels + void (*idct_block_kernel)(stbi_uc *out, int out_stride, short data[64]); + void (*YCbCr_to_RGB_kernel)(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step); + stbi_uc *(*resample_row_hv_2_kernel)(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs); +} stbi__jpeg; + +static int stbi__build_huffman(stbi__huffman *h, int *count) +{ + int i,j,k=0; + unsigned int code; + // build size list for each symbol (from JPEG spec) + for (i=0; i < 16; ++i) { + for (j=0; j < count[i]; ++j) { + h->size[k++] = (stbi_uc) (i+1); + if(k >= 257) return stbi__err("bad size list","Corrupt JPEG"); + } + } + h->size[k] = 0; + + // compute actual symbols (from jpeg spec) + code = 0; + k = 0; + for(j=1; j <= 16; ++j) { + // compute delta to add to code to compute symbol id + h->delta[j] = k - code; + if (h->size[k] == j) { + while (h->size[k] == j) + h->code[k++] = (stbi__uint16) (code++); + if (code-1 >= (1u << j)) return stbi__err("bad code lengths","Corrupt JPEG"); + } + // compute largest code + 1 for this size, preshifted as needed later + h->maxcode[j] = code << (16-j); + code <<= 1; + } + h->maxcode[j] = 0xffffffff; + + // build non-spec acceleration table; 255 is flag for not-accelerated + memset(h->fast, 255, 1 << FAST_BITS); + for (i=0; i < k; ++i) { + int s = h->size[i]; + if (s <= FAST_BITS) { + int c = h->code[i] << (FAST_BITS-s); + int m = 1 << (FAST_BITS-s); + for (j=0; j < m; ++j) { + h->fast[c+j] = (stbi_uc) i; + } + } + } + return 1; +} + +// build a table that decodes both magnitude and value of small ACs in +// one go. 
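+// (Roughly: each fast_ac entry packs a fully-decoded small AC coefficient so the
+// hot loop can skip stbi__jpeg_huff_decode + stbi__extend_receive. The layout
+// built below is value*256 + run*16 + total_bits: the signed coefficient in the
+// top 8 bits, the zero-run length in the next 4 bits, and the number of bits to
+// consume in the low 4. The fast-AC path in stbi__jpeg_decode_block unpacks it
+// with r>>8, (r>>4)&15 and r&15.)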
+static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h) +{ + int i; + for (i=0; i < (1 << FAST_BITS); ++i) { + stbi_uc fast = h->fast[i]; + fast_ac[i] = 0; + if (fast < 255) { + int rs = h->values[fast]; + int run = (rs >> 4) & 15; + int magbits = rs & 15; + int len = h->size[fast]; + + if (magbits && len + magbits <= FAST_BITS) { + // magnitude code followed by receive_extend code + int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits); + int m = 1 << (magbits - 1); + if (k < m) k += (~0U << magbits) + 1; + // if the result is small enough, we can fit it in fast_ac table + if (k >= -128 && k <= 127) + fast_ac[i] = (stbi__int16) ((k * 256) + (run * 16) + (len + magbits)); + } + } + } +} + +static void stbi__grow_buffer_unsafe(stbi__jpeg *j) +{ + do { + unsigned int b = j->nomore ? 0 : stbi__get8(j->s); + if (b == 0xff) { + int c = stbi__get8(j->s); + while (c == 0xff) c = stbi__get8(j->s); // consume fill bytes + if (c != 0) { + j->marker = (unsigned char) c; + j->nomore = 1; + return; + } + } + j->code_buffer |= b << (24 - j->code_bits); + j->code_bits += 8; + } while (j->code_bits <= 24); +} + +// (1 << n) - 1 +static const stbi__uint32 stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535}; + +// decode a jpeg huffman value from the bitstream +stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h) +{ + unsigned int temp; + int c,k; + + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + + // look at the top FAST_BITS and determine what symbol ID it is, + // if the code is <= FAST_BITS + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1); + k = h->fast[c]; + if (k < 255) { + int s = h->size[k]; + if (s > j->code_bits) + return -1; + j->code_buffer <<= s; + j->code_bits -= s; + return h->values[k]; + } + + // naive test is to shift the code_buffer down so k bits are + // valid, then test against maxcode. To speed this up, we've + // preshifted maxcode left so that it has (16-k) 0s at the + // end; in other words, regardless of the number of bits, it + // wants to be compared against something shifted to have 16; + // that way we don't need to shift inside the loop. + temp = j->code_buffer >> 16; + for (k=FAST_BITS+1 ; ; ++k) + if (temp < h->maxcode[k]) + break; + if (k == 17) { + // error! code not found + j->code_bits -= 16; + return -1; + } + + if (k > j->code_bits) + return -1; + + // convert the huffman code to the symbol id + c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k]; + if(c < 0 || c >= 256) // symbol id out of bounds! 
+   return -1;
+   STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]);
+
+   // convert the id to a symbol
+   j->code_bits -= k;
+   j->code_buffer <<= k;
+   return h->values[c];
+}
+
+// bias[n] = (-1<<n) + 1
+static const int stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
+
+// combined JPEG 'receive' and JPEG 'extend', since baseline
+// always extends everything it receives.
+stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n)
+{
+   unsigned int k;
+   int sgn;
+   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s instead of continuing
+
+   sgn = j->code_buffer >> 31; // sign bit always in MSB; 0 if MSB clear (positive), 1 if MSB set (negative)
+   k = stbi_lrot(j->code_buffer, n);
+   j->code_buffer = k & ~stbi__bmask[n];
+   k &= stbi__bmask[n];
+   j->code_bits -= n;
+   return k + (stbi__jbias[n] & (sgn - 1));
+}
+
+// get some unsigned bits
+stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n)
+{
+   unsigned int k;
+   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s instead of continuing
+   k = stbi_lrot(j->code_buffer, n);
+   j->code_buffer = k & ~stbi__bmask[n];
+   k &= stbi__bmask[n];
+   j->code_bits -= n;
+   return k;
+}
+
+stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j)
+{
+   unsigned int k;
+   if (j->code_bits < 1) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < 1) return 0; // ran out of bits from stream, return 0s instead of continuing
+   k = j->code_buffer;
+   j->code_buffer <<= 1;
+   --j->code_bits;
+   return k & 0x80000000;
+}
+
+// given a value that's at position X in the zigzag stream,
+// where does it appear in the 8x8 matrix coded as row-major?
+static const stbi_uc stbi__jpeg_dezigzag[64+15] =
+{
+    0,  1,  8, 16,  9,  2,  3, 10,
+   17, 24, 32, 25, 18, 11,  4,  5,
+   12, 19, 26, 33, 40, 48, 41, 34,
+   27, 20, 13,  6,  7, 14, 21, 28,
+   35, 42, 49, 56, 57, 50, 43, 36,
+   29, 22, 15, 23, 30, 37, 44, 51,
+   58, 59, 52, 45, 38, 31, 39, 46,
+   53, 60, 61, 54, 47, 55, 62, 63,
+   // let corrupt input sample past end
+   63, 63, 63, 63, 63, 63, 63, 63,
+   63, 63, 63, 63, 63, 63, 63
+};
+
+// decode one 64-entry block--
+static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi__uint16 *dequant)
+{
+   int diff,dc,k;
+   int t;
+
+   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+   t = stbi__jpeg_huff_decode(j, hdc);
+   if (t < 0 || t > 15) return stbi__err("bad huffman code","Corrupt JPEG");
+
+   // 0 all the ac values now so we can do it 32-bits at a time
+   memset(data,0,64*sizeof(data[0]));
+
+   diff = t ?
stbi__extend_receive(j, t) : 0; + if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta","Corrupt JPEG"); + dc = j->img_comp[b].dc_pred + diff; + j->img_comp[b].dc_pred = dc; + if (!stbi__mul2shorts_valid(dc, dequant[0])) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + data[0] = (short) (dc * dequant[0]); + + // decode AC components, see JPEG spec + k = 1; + do { + unsigned int zig; + int c,r,s; + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1); + r = fac[c]; + if (r) { // fast-AC path + k += (r >> 4) & 15; // run + s = r & 15; // combined length + if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available"); + j->code_buffer <<= s; + j->code_bits -= s; + // decode into unzigzag'd location + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short) ((r >> 8) * dequant[zig]); + } else { + int rs = stbi__jpeg_huff_decode(j, hac); + if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (rs != 0xf0) break; // end block + k += 16; + } else { + k += r; + // decode into unzigzag'd location + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short) (stbi__extend_receive(j,s) * dequant[zig]); + } + } + } while (k < 64); + return 1; +} + +static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__huffman *hdc, int b) +{ + int diff,dc; + int t; + if (j->spec_end != 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + + if (j->succ_high == 0) { + // first scan for DC coefficient, must be first + memset(data,0,64*sizeof(data[0])); // 0 all the ac values now + t = stbi__jpeg_huff_decode(j, hdc); + if (t < 0 || t > 15) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + diff = t ? 
stbi__extend_receive(j, t) : 0; + + if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta", "Corrupt JPEG"); + dc = j->img_comp[b].dc_pred + diff; + j->img_comp[b].dc_pred = dc; + if (!stbi__mul2shorts_valid(dc, 1 << j->succ_low)) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + data[0] = (short) (dc * (1 << j->succ_low)); + } else { + // refinement scan for DC coefficient + if (stbi__jpeg_get_bit(j)) + data[0] += (short) (1 << j->succ_low); + } + return 1; +} + +// @OPTIMIZE: store non-zigzagged during the decode passes, +// and only de-zigzag when dequantizing +static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__huffman *hac, stbi__int16 *fac) +{ + int k; + if (j->spec_start == 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + + if (j->succ_high == 0) { + int shift = j->succ_low; + + if (j->eob_run) { + --j->eob_run; + return 1; + } + + k = j->spec_start; + do { + unsigned int zig; + int c,r,s; + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1); + r = fac[c]; + if (r) { // fast-AC path + k += (r >> 4) & 15; // run + s = r & 15; // combined length + if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available"); + j->code_buffer <<= s; + j->code_bits -= s; + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short) ((r >> 8) * (1 << shift)); + } else { + int rs = stbi__jpeg_huff_decode(j, hac); + if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (r < 15) { + j->eob_run = (1 << r); + if (r) + j->eob_run += stbi__jpeg_get_bits(j, r); + --j->eob_run; + break; + } + k += 16; + } else { + k += r; + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short) (stbi__extend_receive(j,s) * (1 << shift)); + } + } + } while (k <= j->spec_end); + } else { + // refinement scan for these AC coefficients + + short bit = (short) (1 << j->succ_low); + + if (j->eob_run) { + --j->eob_run; + for (k = j->spec_start; k <= j->spec_end; ++k) { + short *p = &data[stbi__jpeg_dezigzag[k]]; + if (*p != 0) + if (stbi__jpeg_get_bit(j)) + if ((*p & bit)==0) { + if (*p > 0) + *p += bit; + else + *p -= bit; + } + } + } else { + k = j->spec_start; + do { + int r,s; + int rs = stbi__jpeg_huff_decode(j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh + if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (r < 15) { + j->eob_run = (1 << r) - 1; + if (r) + j->eob_run += stbi__jpeg_get_bits(j, r); + r = 64; // force end of block + } else { + // r=15 s=0 should write 16 0s, so we just do + // a run of 15 0s and then write s (which is 0), + // so we don't have to do anything special here + } + } else { + if (s != 1) return stbi__err("bad huffman code", "Corrupt JPEG"); + // sign bit + if (stbi__jpeg_get_bit(j)) + s = bit; + else + s = -bit; + } + + // advance by r + while (k <= j->spec_end) { + short *p = &data[stbi__jpeg_dezigzag[k++]]; + if (*p != 0) { + if (stbi__jpeg_get_bit(j)) + if ((*p & bit)==0) { + if (*p > 0) + *p += bit; + else + *p -= bit; + } + } else { + if (r == 0) { + *p = (short) s; + break; + } + --r; + } + } + } while (k <= j->spec_end); + } + } + return 1; +} + +// take a -128..127 value and stbi__clamp it and convert to 0..255 +stbi_inline static stbi_uc stbi__clamp(int x) +{ + // trick to use a single test to catch both cases + if ((unsigned int) x > 255) { 
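+      // (casting to unsigned makes any negative x wrap to a value far above 255,
+      // so this one comparison catches both the x < 0 and x > 255 cases)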
+ if (x < 0) return 0; + if (x > 255) return 255; + } + return (stbi_uc) x; +} + +#define stbi__f2f(x) ((int) (((x) * 4096 + 0.5))) +#define stbi__fsh(x) ((x) * 4096) + +// derived from jidctint -- DCT_ISLOW +#define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \ + int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \ + p2 = s2; \ + p3 = s6; \ + p1 = (p2+p3) * stbi__f2f(0.5411961f); \ + t2 = p1 + p3*stbi__f2f(-1.847759065f); \ + t3 = p1 + p2*stbi__f2f( 0.765366865f); \ + p2 = s0; \ + p3 = s4; \ + t0 = stbi__fsh(p2+p3); \ + t1 = stbi__fsh(p2-p3); \ + x0 = t0+t3; \ + x3 = t0-t3; \ + x1 = t1+t2; \ + x2 = t1-t2; \ + t0 = s7; \ + t1 = s5; \ + t2 = s3; \ + t3 = s1; \ + p3 = t0+t2; \ + p4 = t1+t3; \ + p1 = t0+t3; \ + p2 = t1+t2; \ + p5 = (p3+p4)*stbi__f2f( 1.175875602f); \ + t0 = t0*stbi__f2f( 0.298631336f); \ + t1 = t1*stbi__f2f( 2.053119869f); \ + t2 = t2*stbi__f2f( 3.072711026f); \ + t3 = t3*stbi__f2f( 1.501321110f); \ + p1 = p5 + p1*stbi__f2f(-0.899976223f); \ + p2 = p5 + p2*stbi__f2f(-2.562915447f); \ + p3 = p3*stbi__f2f(-1.961570560f); \ + p4 = p4*stbi__f2f(-0.390180644f); \ + t3 += p1+p4; \ + t2 += p2+p3; \ + t1 += p2+p4; \ + t0 += p1+p3; + +static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64]) +{ + int i,val[64],*v=val; + stbi_uc *o; + short *d = data; + + // columns + for (i=0; i < 8; ++i,++d, ++v) { + // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing + if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0 + && d[40]==0 && d[48]==0 && d[56]==0) { + // no shortcut 0 seconds + // (1|2|3|4|5|6|7)==0 0 seconds + // all separate -0.047 seconds + // 1 && 2|3 && 4|5 && 6|7: -0.047 seconds + int dcterm = d[0]*4; + v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm; + } else { + STBI__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56]) + // constants scaled things up by 1<<12; let's bring them back + // down, but keep 2 extra bits of precision + x0 += 512; x1 += 512; x2 += 512; x3 += 512; + v[ 0] = (x0+t3) >> 10; + v[56] = (x0-t3) >> 10; + v[ 8] = (x1+t2) >> 10; + v[48] = (x1-t2) >> 10; + v[16] = (x2+t1) >> 10; + v[40] = (x2-t1) >> 10; + v[24] = (x3+t0) >> 10; + v[32] = (x3-t0) >> 10; + } + } + + for (i=0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride) { + // no fast case since the first 1D IDCT spread components out + STBI__IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7]) + // constants scaled things up by 1<<12, plus we had 1<<2 from first + // loop, plus horizontal and vertical each scale by sqrt(8) so together + // we've got an extra 1<<3, so 1<<17 total we need to remove. + // so we want to round that, which means adding 0.5 * 1<<17, + // aka 65536. Also, we'll end up with -128 to 127 that we want + // to encode as 0..255 by adding 128, so we'll add that before the shift + x0 += 65536 + (128<<17); + x1 += 65536 + (128<<17); + x2 += 65536 + (128<<17); + x3 += 65536 + (128<<17); + // tried computing the shifts into temps, or'ing the temps to see + // if any were out of range, but that was slower + o[0] = stbi__clamp((x0+t3) >> 17); + o[7] = stbi__clamp((x0-t3) >> 17); + o[1] = stbi__clamp((x1+t2) >> 17); + o[6] = stbi__clamp((x1-t2) >> 17); + o[2] = stbi__clamp((x2+t1) >> 17); + o[5] = stbi__clamp((x2-t1) >> 17); + o[3] = stbi__clamp((x3+t0) >> 17); + o[4] = stbi__clamp((x3-t0) >> 17); + } +} + +#ifdef STBI_SSE2 +// sse2 integer IDCT. not the fastest possible implementation but it +// produces bit-identical results to the generic C version so it's +// fully "transparent". 
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) +{ + // This is constructed to match our regular (generic) integer IDCT exactly. + __m128i row0, row1, row2, row3, row4, row5, row6, row7; + __m128i tmp; + + // dot product constant: even elems=x, odd elems=y + #define dct_const(x,y) _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y)) + + // out(0) = c0[even]*x + c0[odd]*y (c0, x, y 16-bit, out 32-bit) + // out(1) = c1[even]*x + c1[odd]*y + #define dct_rot(out0,out1, x,y,c0,c1) \ + __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \ + __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \ + __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \ + __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \ + __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \ + __m128i out1##_h = _mm_madd_epi16(c0##hi, c1) + + // out = in << 12 (in 16-bit, out 32-bit) + #define dct_widen(out, in) \ + __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \ + __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4) + + // wide add + #define dct_wadd(out, a, b) \ + __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \ + __m128i out##_h = _mm_add_epi32(a##_h, b##_h) + + // wide sub + #define dct_wsub(out, a, b) \ + __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \ + __m128i out##_h = _mm_sub_epi32(a##_h, b##_h) + + // butterfly a/b, add bias, then shift by "s" and pack + #define dct_bfly32o(out0, out1, a,b,bias,s) \ + { \ + __m128i abiased_l = _mm_add_epi32(a##_l, bias); \ + __m128i abiased_h = _mm_add_epi32(a##_h, bias); \ + dct_wadd(sum, abiased, b); \ + dct_wsub(dif, abiased, b); \ + out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \ + out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \ + } + + // 8-bit interleave step (for transposes) + #define dct_interleave8(a, b) \ + tmp = a; \ + a = _mm_unpacklo_epi8(a, b); \ + b = _mm_unpackhi_epi8(tmp, b) + + // 16-bit interleave step (for transposes) + #define dct_interleave16(a, b) \ + tmp = a; \ + a = _mm_unpacklo_epi16(a, b); \ + b = _mm_unpackhi_epi16(tmp, b) + + #define dct_pass(bias,shift) \ + { \ + /* even part */ \ + dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \ + __m128i sum04 = _mm_add_epi16(row0, row4); \ + __m128i dif04 = _mm_sub_epi16(row0, row4); \ + dct_widen(t0e, sum04); \ + dct_widen(t1e, dif04); \ + dct_wadd(x0, t0e, t3e); \ + dct_wsub(x3, t0e, t3e); \ + dct_wadd(x1, t1e, t2e); \ + dct_wsub(x2, t1e, t2e); \ + /* odd part */ \ + dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \ + dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \ + __m128i sum17 = _mm_add_epi16(row1, row7); \ + __m128i sum35 = _mm_add_epi16(row3, row5); \ + dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \ + dct_wadd(x4, y0o, y4o); \ + dct_wadd(x5, y1o, y5o); \ + dct_wadd(x6, y2o, y5o); \ + dct_wadd(x7, y3o, y4o); \ + dct_bfly32o(row0,row7, x0,x7,bias,shift); \ + dct_bfly32o(row1,row6, x1,x6,bias,shift); \ + dct_bfly32o(row2,row5, x2,x5,bias,shift); \ + dct_bfly32o(row3,row4, x3,x4,bias,shift); \ + } + + __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f)); + __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f( 0.765366865f), stbi__f2f(0.5411961f)); + __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f)); + __m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f)); + __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f( 0.298631336f), 
stbi__f2f(-1.961570560f)); + __m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f( 3.072711026f)); + __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f( 2.053119869f), stbi__f2f(-0.390180644f)); + __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f( 1.501321110f)); + + // rounding biases in column/row passes, see stbi__idct_block for explanation. + __m128i bias_0 = _mm_set1_epi32(512); + __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17)); + + // load + row0 = _mm_load_si128((const __m128i *) (data + 0*8)); + row1 = _mm_load_si128((const __m128i *) (data + 1*8)); + row2 = _mm_load_si128((const __m128i *) (data + 2*8)); + row3 = _mm_load_si128((const __m128i *) (data + 3*8)); + row4 = _mm_load_si128((const __m128i *) (data + 4*8)); + row5 = _mm_load_si128((const __m128i *) (data + 5*8)); + row6 = _mm_load_si128((const __m128i *) (data + 6*8)); + row7 = _mm_load_si128((const __m128i *) (data + 7*8)); + + // column pass + dct_pass(bias_0, 10); + + { + // 16bit 8x8 transpose pass 1 + dct_interleave16(row0, row4); + dct_interleave16(row1, row5); + dct_interleave16(row2, row6); + dct_interleave16(row3, row7); + + // transpose pass 2 + dct_interleave16(row0, row2); + dct_interleave16(row1, row3); + dct_interleave16(row4, row6); + dct_interleave16(row5, row7); + + // transpose pass 3 + dct_interleave16(row0, row1); + dct_interleave16(row2, row3); + dct_interleave16(row4, row5); + dct_interleave16(row6, row7); + } + + // row pass + dct_pass(bias_1, 17); + + { + // pack + __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7 + __m128i p1 = _mm_packus_epi16(row2, row3); + __m128i p2 = _mm_packus_epi16(row4, row5); + __m128i p3 = _mm_packus_epi16(row6, row7); + + // 8bit 8x8 transpose pass 1 + dct_interleave8(p0, p2); // a0e0a1e1... + dct_interleave8(p1, p3); // c0g0c1g1... + + // transpose pass 2 + dct_interleave8(p0, p1); // a0c0e0g0... + dct_interleave8(p2, p3); // b0d0f0h0... + + // transpose pass 3 + dct_interleave8(p0, p2); // a0b0c0d0... + dct_interleave8(p1, p3); // a4b4c4d4... + + // store + _mm_storel_epi64((__m128i *) out, p0); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i *) out, p2); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i *) out, p1); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i *) out, p3); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e)); + } + +#undef dct_const +#undef dct_rot +#undef dct_widen +#undef dct_wadd +#undef dct_wsub +#undef dct_bfly32o +#undef dct_interleave8 +#undef dct_interleave16 +#undef dct_pass +} + +#endif // STBI_SSE2 + +#ifdef STBI_NEON + +// NEON integer IDCT. should produce bit-identical +// results to the generic C version. 
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) +{ + int16x8_t row0, row1, row2, row3, row4, row5, row6, row7; + + int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f)); + int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f)); + int16x4_t rot0_2 = vdup_n_s16(stbi__f2f( 0.765366865f)); + int16x4_t rot1_0 = vdup_n_s16(stbi__f2f( 1.175875602f)); + int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f)); + int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f)); + int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f)); + int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f)); + int16x4_t rot3_0 = vdup_n_s16(stbi__f2f( 0.298631336f)); + int16x4_t rot3_1 = vdup_n_s16(stbi__f2f( 2.053119869f)); + int16x4_t rot3_2 = vdup_n_s16(stbi__f2f( 3.072711026f)); + int16x4_t rot3_3 = vdup_n_s16(stbi__f2f( 1.501321110f)); + +#define dct_long_mul(out, inq, coeff) \ + int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \ + int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff) + +#define dct_long_mac(out, acc, inq, coeff) \ + int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \ + int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff) + +#define dct_widen(out, inq) \ + int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \ + int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12) + +// wide add +#define dct_wadd(out, a, b) \ + int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \ + int32x4_t out##_h = vaddq_s32(a##_h, b##_h) + +// wide sub +#define dct_wsub(out, a, b) \ + int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \ + int32x4_t out##_h = vsubq_s32(a##_h, b##_h) + +// butterfly a/b, then shift using "shiftop" by "s" and pack +#define dct_bfly32o(out0,out1, a,b,shiftop,s) \ + { \ + dct_wadd(sum, a, b); \ + dct_wsub(dif, a, b); \ + out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \ + out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \ + } + +#define dct_pass(shiftop, shift) \ + { \ + /* even part */ \ + int16x8_t sum26 = vaddq_s16(row2, row6); \ + dct_long_mul(p1e, sum26, rot0_0); \ + dct_long_mac(t2e, p1e, row6, rot0_1); \ + dct_long_mac(t3e, p1e, row2, rot0_2); \ + int16x8_t sum04 = vaddq_s16(row0, row4); \ + int16x8_t dif04 = vsubq_s16(row0, row4); \ + dct_widen(t0e, sum04); \ + dct_widen(t1e, dif04); \ + dct_wadd(x0, t0e, t3e); \ + dct_wsub(x3, t0e, t3e); \ + dct_wadd(x1, t1e, t2e); \ + dct_wsub(x2, t1e, t2e); \ + /* odd part */ \ + int16x8_t sum15 = vaddq_s16(row1, row5); \ + int16x8_t sum17 = vaddq_s16(row1, row7); \ + int16x8_t sum35 = vaddq_s16(row3, row5); \ + int16x8_t sum37 = vaddq_s16(row3, row7); \ + int16x8_t sumodd = vaddq_s16(sum17, sum35); \ + dct_long_mul(p5o, sumodd, rot1_0); \ + dct_long_mac(p1o, p5o, sum17, rot1_1); \ + dct_long_mac(p2o, p5o, sum35, rot1_2); \ + dct_long_mul(p3o, sum37, rot2_0); \ + dct_long_mul(p4o, sum15, rot2_1); \ + dct_wadd(sump13o, p1o, p3o); \ + dct_wadd(sump24o, p2o, p4o); \ + dct_wadd(sump23o, p2o, p3o); \ + dct_wadd(sump14o, p1o, p4o); \ + dct_long_mac(x4, sump13o, row7, rot3_0); \ + dct_long_mac(x5, sump24o, row5, rot3_1); \ + dct_long_mac(x6, sump23o, row3, rot3_2); \ + dct_long_mac(x7, sump14o, row1, rot3_3); \ + dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \ + dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \ + dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \ + dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \ + } + + // load + row0 = vld1q_s16(data + 0*8); + row1 = vld1q_s16(data + 1*8); + row2 = vld1q_s16(data + 2*8); + row3 = vld1q_s16(data + 3*8); + row4 = vld1q_s16(data + 4*8); + row5 = 
vld1q_s16(data + 5*8); + row6 = vld1q_s16(data + 6*8); + row7 = vld1q_s16(data + 7*8); + + // add DC bias + row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0)); + + // column pass + dct_pass(vrshrn_n_s32, 10); + + // 16bit 8x8 transpose + { +// these three map to a single VTRN.16, VTRN.32, and VSWP, respectively. +// whether compilers actually get this is another story, sadly. +#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; } +#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); } +#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); } + + // pass 1 + dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6 + dct_trn16(row2, row3); + dct_trn16(row4, row5); + dct_trn16(row6, row7); + + // pass 2 + dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4 + dct_trn32(row1, row3); + dct_trn32(row4, row6); + dct_trn32(row5, row7); + + // pass 3 + dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0 + dct_trn64(row1, row5); + dct_trn64(row2, row6); + dct_trn64(row3, row7); + +#undef dct_trn16 +#undef dct_trn32 +#undef dct_trn64 + } + + // row pass + // vrshrn_n_s32 only supports shifts up to 16, we need + // 17. so do a non-rounding shift of 16 first then follow + // up with a rounding shift by 1. + dct_pass(vshrn_n_s32, 16); + + { + // pack and round + uint8x8_t p0 = vqrshrun_n_s16(row0, 1); + uint8x8_t p1 = vqrshrun_n_s16(row1, 1); + uint8x8_t p2 = vqrshrun_n_s16(row2, 1); + uint8x8_t p3 = vqrshrun_n_s16(row3, 1); + uint8x8_t p4 = vqrshrun_n_s16(row4, 1); + uint8x8_t p5 = vqrshrun_n_s16(row5, 1); + uint8x8_t p6 = vqrshrun_n_s16(row6, 1); + uint8x8_t p7 = vqrshrun_n_s16(row7, 1); + + // again, these can translate into one instruction, but often don't. +#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; } +#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); } +#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); } + + // sadly can't use interleaved stores here since we only write + // 8 bytes to each scan line! + + // 8x8 8-bit transpose pass 1 + dct_trn8_8(p0, p1); + dct_trn8_8(p2, p3); + dct_trn8_8(p4, p5); + dct_trn8_8(p6, p7); + + // pass 2 + dct_trn8_16(p0, p2); + dct_trn8_16(p1, p3); + dct_trn8_16(p4, p6); + dct_trn8_16(p5, p7); + + // pass 3 + dct_trn8_32(p0, p4); + dct_trn8_32(p1, p5); + dct_trn8_32(p2, p6); + dct_trn8_32(p3, p7); + + // store + vst1_u8(out, p0); out += out_stride; + vst1_u8(out, p1); out += out_stride; + vst1_u8(out, p2); out += out_stride; + vst1_u8(out, p3); out += out_stride; + vst1_u8(out, p4); out += out_stride; + vst1_u8(out, p5); out += out_stride; + vst1_u8(out, p6); out += out_stride; + vst1_u8(out, p7); + +#undef dct_trn8_8 +#undef dct_trn8_16 +#undef dct_trn8_32 + } + +#undef dct_long_mul +#undef dct_long_mac +#undef dct_widen +#undef dct_wadd +#undef dct_wsub +#undef dct_bfly32o +#undef dct_pass +} + +#endif // STBI_NEON + +#define STBI__MARKER_none 0xff +// if there's a pending marker from the entropy stream, return that +// otherwise, fetch from the stream and get a marker. 
if there's no +// marker, return 0xff, which is never a valid marker value +static stbi_uc stbi__get_marker(stbi__jpeg *j) +{ + stbi_uc x; + if (j->marker != STBI__MARKER_none) { x = j->marker; j->marker = STBI__MARKER_none; return x; } + x = stbi__get8(j->s); + if (x != 0xff) return STBI__MARKER_none; + while (x == 0xff) + x = stbi__get8(j->s); // consume repeated 0xff fill bytes + return x; +} + +// in each scan, we'll have scan_n components, and the order +// of the components is specified by order[] +#define STBI__RESTART(x) ((x) >= 0xd0 && (x) <= 0xd7) + +// after a restart interval, stbi__jpeg_reset the entropy decoder and +// the dc prediction +static void stbi__jpeg_reset(stbi__jpeg *j) +{ + j->code_bits = 0; + j->code_buffer = 0; + j->nomore = 0; + j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = j->img_comp[3].dc_pred = 0; + j->marker = STBI__MARKER_none; + j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff; + j->eob_run = 0; + // no more than 1<<31 MCUs if no restart_interal? that's plenty safe, + // since we don't even allow 1<<30 pixels +} + +static int stbi__parse_entropy_coded_data(stbi__jpeg *z) +{ + stbi__jpeg_reset(z); + if (!z->progressive) { + if (z->scan_n == 1) { + int i,j; + STBI_SIMD_ALIGN(short, data[64]); + int n = z->order[0]; + // non-interleaved data, we just need to process one block at a time, + // in trivial scanline order + // number of blocks to do just depends on how many actual "pixels" this + // component has, independent of interleaved MCU blocking and such + int w = (z->img_comp[n].x+7) >> 3; + int h = (z->img_comp[n].y+7) >> 3; + for (j=0; j < h; ++j) { + for (i=0; i < w; ++i) { + int ha = z->img_comp[n].ha; + if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0; + z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data); + // every data block is an MCU, so countdown the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + // if it's NOT a restart, then just bail, so we get corrupt data + // rather than no data + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } else { // interleaved + int i,j,k,x,y; + STBI_SIMD_ALIGN(short, data[64]); + for (j=0; j < z->img_mcu_y; ++j) { + for (i=0; i < z->img_mcu_x; ++i) { + // scan an interleaved mcu... 
process scan_n components in order + for (k=0; k < z->scan_n; ++k) { + int n = z->order[k]; + // scan out an mcu's worth of this component; that's just determined + // by the basic H and V specified for the component + for (y=0; y < z->img_comp[n].v; ++y) { + for (x=0; x < z->img_comp[n].h; ++x) { + int x2 = (i*z->img_comp[n].h + x)*8; + int y2 = (j*z->img_comp[n].v + y)*8; + int ha = z->img_comp[n].ha; + if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0; + z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data); + } + } + } + // after all interleaved components, that's an interleaved MCU, + // so now count down the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } + } else { + if (z->scan_n == 1) { + int i,j; + int n = z->order[0]; + // non-interleaved data, we just need to process one block at a time, + // in trivial scanline order + // number of blocks to do just depends on how many actual "pixels" this + // component has, independent of interleaved MCU blocking and such + int w = (z->img_comp[n].x+7) >> 3; + int h = (z->img_comp[n].y+7) >> 3; + for (j=0; j < h; ++j) { + for (i=0; i < w; ++i) { + short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); + if (z->spec_start == 0) { + if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) + return 0; + } else { + int ha = z->img_comp[n].ha; + if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha])) + return 0; + } + // every data block is an MCU, so countdown the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } else { // interleaved + int i,j,k,x,y; + for (j=0; j < z->img_mcu_y; ++j) { + for (i=0; i < z->img_mcu_x; ++i) { + // scan an interleaved mcu... 
process scan_n components in order + for (k=0; k < z->scan_n; ++k) { + int n = z->order[k]; + // scan out an mcu's worth of this component; that's just determined + // by the basic H and V specified for the component + for (y=0; y < z->img_comp[n].v; ++y) { + for (x=0; x < z->img_comp[n].h; ++x) { + int x2 = (i*z->img_comp[n].h + x); + int y2 = (j*z->img_comp[n].v + y); + short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w); + if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) + return 0; + } + } + } + // after all interleaved components, that's an interleaved MCU, + // so now count down the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } + } +} + +static void stbi__jpeg_dequantize(short *data, stbi__uint16 *dequant) +{ + int i; + for (i=0; i < 64; ++i) + data[i] *= dequant[i]; +} + +static void stbi__jpeg_finish(stbi__jpeg *z) +{ + if (z->progressive) { + // dequantize and idct the data + int i,j,n; + for (n=0; n < z->s->img_n; ++n) { + int w = (z->img_comp[n].x+7) >> 3; + int h = (z->img_comp[n].y+7) >> 3; + for (j=0; j < h; ++j) { + for (i=0; i < w; ++i) { + short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); + stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]); + z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data); + } + } + } + } +} + +static int stbi__process_marker(stbi__jpeg *z, int m) +{ + int L; + switch (m) { + case STBI__MARKER_none: // no marker found + return stbi__err("expected marker","Corrupt JPEG"); + + case 0xDD: // DRI - specify restart interval + if (stbi__get16be(z->s) != 4) return stbi__err("bad DRI len","Corrupt JPEG"); + z->restart_interval = stbi__get16be(z->s); + return 1; + + case 0xDB: // DQT - define quantization table + L = stbi__get16be(z->s)-2; + while (L > 0) { + int q = stbi__get8(z->s); + int p = q >> 4, sixteen = (p != 0); + int t = q & 15,i; + if (p != 0 && p != 1) return stbi__err("bad DQT type","Corrupt JPEG"); + if (t > 3) return stbi__err("bad DQT table","Corrupt JPEG"); + + for (i=0; i < 64; ++i) + z->dequant[t][stbi__jpeg_dezigzag[i]] = (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s)); + L -= (sixteen ? 129 : 65); + } + return L==0; + + case 0xC4: // DHT - define huffman table + L = stbi__get16be(z->s)-2; + while (L > 0) { + stbi_uc *v; + int sizes[16],i,n=0; + int q = stbi__get8(z->s); + int tc = q >> 4; + int th = q & 15; + if (tc > 1 || th > 3) return stbi__err("bad DHT header","Corrupt JPEG"); + for (i=0; i < 16; ++i) { + sizes[i] = stbi__get8(z->s); + n += sizes[i]; + } + if(n > 256) return stbi__err("bad DHT header","Corrupt JPEG"); // Loop over i < n would write past end of values! 
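+         // each table costs 17 header bytes (1 byte class/id + 16 code-length
+         // counts); the n symbol values read below are charged against L separately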
+ L -= 17; + if (tc == 0) { + if (!stbi__build_huffman(z->huff_dc+th, sizes)) return 0; + v = z->huff_dc[th].values; + } else { + if (!stbi__build_huffman(z->huff_ac+th, sizes)) return 0; + v = z->huff_ac[th].values; + } + for (i=0; i < n; ++i) + v[i] = stbi__get8(z->s); + if (tc != 0) + stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th); + L -= n; + } + return L==0; + } + + // check for comment block or APP blocks + if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) { + L = stbi__get16be(z->s); + if (L < 2) { + if (m == 0xFE) + return stbi__err("bad COM len","Corrupt JPEG"); + else + return stbi__err("bad APP len","Corrupt JPEG"); + } + L -= 2; + + if (m == 0xE0 && L >= 5) { // JFIF APP0 segment + static const unsigned char tag[5] = {'J','F','I','F','\0'}; + int ok = 1; + int i; + for (i=0; i < 5; ++i) + if (stbi__get8(z->s) != tag[i]) + ok = 0; + L -= 5; + if (ok) + z->jfif = 1; + } else if (m == 0xEE && L >= 12) { // Adobe APP14 segment + static const unsigned char tag[6] = {'A','d','o','b','e','\0'}; + int ok = 1; + int i; + for (i=0; i < 6; ++i) + if (stbi__get8(z->s) != tag[i]) + ok = 0; + L -= 6; + if (ok) { + stbi__get8(z->s); // version + stbi__get16be(z->s); // flags0 + stbi__get16be(z->s); // flags1 + z->app14_color_transform = stbi__get8(z->s); // color transform + L -= 6; + } + } + + stbi__skip(z->s, L); + return 1; + } + + return stbi__err("unknown marker","Corrupt JPEG"); +} + +// after we see SOS +static int stbi__process_scan_header(stbi__jpeg *z) +{ + int i; + int Ls = stbi__get16be(z->s); + z->scan_n = stbi__get8(z->s); + if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n) return stbi__err("bad SOS component count","Corrupt JPEG"); + if (Ls != 6+2*z->scan_n) return stbi__err("bad SOS len","Corrupt JPEG"); + for (i=0; i < z->scan_n; ++i) { + int id = stbi__get8(z->s), which; + int q = stbi__get8(z->s); + for (which = 0; which < z->s->img_n; ++which) + if (z->img_comp[which].id == id) + break; + if (which == z->s->img_n) return 0; // no match + z->img_comp[which].hd = q >> 4; if (z->img_comp[which].hd > 3) return stbi__err("bad DC huff","Corrupt JPEG"); + z->img_comp[which].ha = q & 15; if (z->img_comp[which].ha > 3) return stbi__err("bad AC huff","Corrupt JPEG"); + z->order[i] = which; + } + + { + int aa; + z->spec_start = stbi__get8(z->s); + z->spec_end = stbi__get8(z->s); // should be 63, but might be 0 + aa = stbi__get8(z->s); + z->succ_high = (aa >> 4); + z->succ_low = (aa & 15); + if (z->progressive) { + if (z->spec_start > 63 || z->spec_end > 63 || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13) + return stbi__err("bad SOS", "Corrupt JPEG"); + } else { + if (z->spec_start != 0) return stbi__err("bad SOS","Corrupt JPEG"); + if (z->succ_high != 0 || z->succ_low != 0) return stbi__err("bad SOS","Corrupt JPEG"); + z->spec_end = 63; + } + } + + return 1; +} + +static int stbi__free_jpeg_components(stbi__jpeg *z, int ncomp, int why) +{ + int i; + for (i=0; i < ncomp; ++i) { + if (z->img_comp[i].raw_data) { + STBI_FREE(z->img_comp[i].raw_data); + z->img_comp[i].raw_data = NULL; + z->img_comp[i].data = NULL; + } + if (z->img_comp[i].raw_coeff) { + STBI_FREE(z->img_comp[i].raw_coeff); + z->img_comp[i].raw_coeff = 0; + z->img_comp[i].coeff = 0; + } + if (z->img_comp[i].linebuf) { + STBI_FREE(z->img_comp[i].linebuf); + z->img_comp[i].linebuf = NULL; + } + } + return why; +} + +static int stbi__process_frame_header(stbi__jpeg *z, int scan) +{ + stbi__context *s = z->s; + int Lf,p,i,q, h_max=1,v_max=1,c; + Lf = stbi__get16be(s); if (Lf < 11) 
return stbi__err("bad SOF len","Corrupt JPEG"); // JPEG + p = stbi__get8(s); if (p != 8) return stbi__err("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline + s->img_y = stbi__get16be(s); if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG + s->img_x = stbi__get16be(s); if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires + if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + c = stbi__get8(s); + if (c != 3 && c != 1 && c != 4) return stbi__err("bad component count","Corrupt JPEG"); + s->img_n = c; + for (i=0; i < c; ++i) { + z->img_comp[i].data = NULL; + z->img_comp[i].linebuf = NULL; + } + + if (Lf != 8+3*s->img_n) return stbi__err("bad SOF len","Corrupt JPEG"); + + z->rgb = 0; + for (i=0; i < s->img_n; ++i) { + static const unsigned char rgb[3] = { 'R', 'G', 'B' }; + z->img_comp[i].id = stbi__get8(s); + if (s->img_n == 3 && z->img_comp[i].id == rgb[i]) + ++z->rgb; + q = stbi__get8(s); + z->img_comp[i].h = (q >> 4); if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H","Corrupt JPEG"); + z->img_comp[i].v = q & 15; if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V","Corrupt JPEG"); + z->img_comp[i].tq = stbi__get8(s); if (z->img_comp[i].tq > 3) return stbi__err("bad TQ","Corrupt JPEG"); + } + + if (scan != STBI__SCAN_load) return 1; + + if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) return stbi__err("too large", "Image too large to decode"); + + for (i=0; i < s->img_n; ++i) { + if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h; + if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v; + } + + // check that plane subsampling factors are integer ratios; our resamplers can't deal with fractional ratios + // and I've never seen a non-corrupted JPEG file actually use them + for (i=0; i < s->img_n; ++i) { + if (h_max % z->img_comp[i].h != 0) return stbi__err("bad H","Corrupt JPEG"); + if (v_max % z->img_comp[i].v != 0) return stbi__err("bad V","Corrupt JPEG"); + } + + // compute interleaved mcu info + z->img_h_max = h_max; + z->img_v_max = v_max; + z->img_mcu_w = h_max * 8; + z->img_mcu_h = v_max * 8; + // these sizes can't be more than 17 bits + z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w; + z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h; + + for (i=0; i < s->img_n; ++i) { + // number of effective pixels (e.g. for non-interleaved MCU) + z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max; + z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max; + // to simplify generation, we'll allocate enough memory to decode + // the bogus oversized data from using interleaved MCUs and their + // big blocks (e.g. 
a 16x16 iMCU on an image of width 33); we won't + // discard the extra data until colorspace conversion + // + // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier) + // so these muls can't overflow with 32-bit ints (which we require) + z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8; + z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8; + z->img_comp[i].coeff = 0; + z->img_comp[i].raw_coeff = 0; + z->img_comp[i].linebuf = NULL; + z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15); + if (z->img_comp[i].raw_data == NULL) + return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory")); + // align blocks for idct using mmx/sse + z->img_comp[i].data = (stbi_uc*) (((size_t) z->img_comp[i].raw_data + 15) & ~15); + if (z->progressive) { + // w2, h2 are multiples of 8 (see above) + z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8; + z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8; + z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15); + if (z->img_comp[i].raw_coeff == NULL) + return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory")); + z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15); + } + } + + return 1; +} + +// use comparisons since in some cases we handle more than one case (e.g. SOF) +#define stbi__DNL(x) ((x) == 0xdc) +#define stbi__SOI(x) ((x) == 0xd8) +#define stbi__EOI(x) ((x) == 0xd9) +#define stbi__SOF(x) ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2) +#define stbi__SOS(x) ((x) == 0xda) + +#define stbi__SOF_progressive(x) ((x) == 0xc2) + +static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan) +{ + int m; + z->jfif = 0; + z->app14_color_transform = -1; // valid values are 0,1,2 + z->marker = STBI__MARKER_none; // initialize cached marker to empty + m = stbi__get_marker(z); + if (!stbi__SOI(m)) return stbi__err("no SOI","Corrupt JPEG"); + if (scan == STBI__SCAN_type) return 1; + m = stbi__get_marker(z); + while (!stbi__SOF(m)) { + if (!stbi__process_marker(z,m)) return 0; + m = stbi__get_marker(z); + while (m == STBI__MARKER_none) { + // some files have extra padding after their blocks, so ok, we'll scan + if (stbi__at_eof(z->s)) return stbi__err("no SOF", "Corrupt JPEG"); + m = stbi__get_marker(z); + } + } + z->progressive = stbi__SOF_progressive(m); + if (!stbi__process_frame_header(z, scan)) return 0; + return 1; +} + +static stbi_uc stbi__skip_jpeg_junk_at_end(stbi__jpeg *j) +{ + // some JPEGs have junk at end, skip over it but if we find what looks + // like a valid marker, resume there + while (!stbi__at_eof(j->s)) { + stbi_uc x = stbi__get8(j->s); + while (x == 0xff) { // might be a marker + if (stbi__at_eof(j->s)) return STBI__MARKER_none; + x = stbi__get8(j->s); + if (x != 0x00 && x != 0xff) { + // not a stuffed zero or lead-in to another marker, looks + // like an actual marker, return it + return x; + } + // stuffed zero has x=0 now which ends the loop, meaning we go + // back to regular scan loop. + // repeated 0xff keeps trying to read the next byte of the marker. 
+ } + } + return STBI__MARKER_none; +} + +// decode image to YCbCr format +static int stbi__decode_jpeg_image(stbi__jpeg *j) +{ + int m; + for (m = 0; m < 4; m++) { + j->img_comp[m].raw_data = NULL; + j->img_comp[m].raw_coeff = NULL; + } + j->restart_interval = 0; + if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) return 0; + m = stbi__get_marker(j); + while (!stbi__EOI(m)) { + if (stbi__SOS(m)) { + if (!stbi__process_scan_header(j)) return 0; + if (!stbi__parse_entropy_coded_data(j)) return 0; + if (j->marker == STBI__MARKER_none ) { + j->marker = stbi__skip_jpeg_junk_at_end(j); + // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0 + } + m = stbi__get_marker(j); + if (STBI__RESTART(m)) + m = stbi__get_marker(j); + } else if (stbi__DNL(m)) { + int Ld = stbi__get16be(j->s); + stbi__uint32 NL = stbi__get16be(j->s); + if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG"); + if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG"); + m = stbi__get_marker(j); + } else { + if (!stbi__process_marker(j, m)) return 1; + m = stbi__get_marker(j); + } + } + if (j->progressive) + stbi__jpeg_finish(j); + return 1; +} + +// static jfif-centered resampling (across block boundaries) + +typedef stbi_uc *(*resample_row_func)(stbi_uc *out, stbi_uc *in0, stbi_uc *in1, + int w, int hs); + +#define stbi__div4(x) ((stbi_uc) ((x) >> 2)) + +static stbi_uc *resample_row_1(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + STBI_NOTUSED(out); + STBI_NOTUSED(in_far); + STBI_NOTUSED(w); + STBI_NOTUSED(hs); + return in_near; +} + +static stbi_uc* stbi__resample_row_v_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // need to generate two samples vertically for every one in input + int i; + STBI_NOTUSED(hs); + for (i=0; i < w; ++i) + out[i] = stbi__div4(3*in_near[i] + in_far[i] + 2); + return out; +} + +static stbi_uc* stbi__resample_row_h_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // need to generate two samples horizontally for every one in input + int i; + stbi_uc *input = in_near; + + if (w == 1) { + // if only one sample, can't do any interpolation + out[0] = out[1] = input[0]; + return out; + } + + out[0] = input[0]; + out[1] = stbi__div4(input[0]*3 + input[1] + 2); + for (i=1; i < w-1; ++i) { + int n = 3*input[i]+2; + out[i*2+0] = stbi__div4(n+input[i-1]); + out[i*2+1] = stbi__div4(n+input[i+1]); + } + out[i*2+0] = stbi__div4(input[w-2]*3 + input[w-1] + 2); + out[i*2+1] = input[w-1]; + + STBI_NOTUSED(in_far); + STBI_NOTUSED(hs); + + return out; +} + +#define stbi__div16(x) ((stbi_uc) ((x) >> 4)) + +static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // need to generate 2x2 samples for every one in input + int i,t0,t1; + if (w == 1) { + out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2); + return out; + } + + t1 = 3*in_near[0] + in_far[0]; + out[0] = stbi__div4(t1+2); + for (i=1; i < w; ++i) { + t0 = t1; + t1 = 3*in_near[i]+in_far[i]; + out[i*2-1] = stbi__div16(3*t0 + t1 + 8); + out[i*2 ] = stbi__div16(3*t1 + t0 + 8); + } + out[w*2-1] = stbi__div4(t1+2); + + STBI_NOTUSED(hs); + + return out; +} + +#if defined(STBI_SSE2) || defined(STBI_NEON) +static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // need to generate 2x2 samples for every one in input + int i=0,t0,t1; + + if (w == 1) { + out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 
2); + return out; + } + + t1 = 3*in_near[0] + in_far[0]; + // process groups of 8 pixels for as long as we can. + // note we can't handle the last pixel in a row in this loop + // because we need to handle the filter boundary conditions. + for (; i < ((w-1) & ~7); i += 8) { +#if defined(STBI_SSE2) + // load and perform the vertical filtering pass + // this uses 3*x + y = 4*x + (y - x) + __m128i zero = _mm_setzero_si128(); + __m128i farb = _mm_loadl_epi64((__m128i *) (in_far + i)); + __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i)); + __m128i farw = _mm_unpacklo_epi8(farb, zero); + __m128i nearw = _mm_unpacklo_epi8(nearb, zero); + __m128i diff = _mm_sub_epi16(farw, nearw); + __m128i nears = _mm_slli_epi16(nearw, 2); + __m128i curr = _mm_add_epi16(nears, diff); // current row + + // horizontal filter works the same based on shifted vers of current + // row. "prev" is current row shifted right by 1 pixel; we need to + // insert the previous pixel value (from t1). + // "next" is current row shifted left by 1 pixel, with first pixel + // of next block of 8 pixels added in. + __m128i prv0 = _mm_slli_si128(curr, 2); + __m128i nxt0 = _mm_srli_si128(curr, 2); + __m128i prev = _mm_insert_epi16(prv0, t1, 0); + __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7); + + // horizontal filter, polyphase implementation since it's convenient: + // even pixels = 3*cur + prev = cur*4 + (prev - cur) + // odd pixels = 3*cur + next = cur*4 + (next - cur) + // note the shared term. + __m128i bias = _mm_set1_epi16(8); + __m128i curs = _mm_slli_epi16(curr, 2); + __m128i prvd = _mm_sub_epi16(prev, curr); + __m128i nxtd = _mm_sub_epi16(next, curr); + __m128i curb = _mm_add_epi16(curs, bias); + __m128i even = _mm_add_epi16(prvd, curb); + __m128i odd = _mm_add_epi16(nxtd, curb); + + // interleave even and odd pixels, then undo scaling. + __m128i int0 = _mm_unpacklo_epi16(even, odd); + __m128i int1 = _mm_unpackhi_epi16(even, odd); + __m128i de0 = _mm_srli_epi16(int0, 4); + __m128i de1 = _mm_srli_epi16(int1, 4); + + // pack and write output + __m128i outv = _mm_packus_epi16(de0, de1); + _mm_storeu_si128((__m128i *) (out + i*2), outv); +#elif defined(STBI_NEON) + // load and perform the vertical filtering pass + // this uses 3*x + y = 4*x + (y - x) + uint8x8_t farb = vld1_u8(in_far + i); + uint8x8_t nearb = vld1_u8(in_near + i); + int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb)); + int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2)); + int16x8_t curr = vaddq_s16(nears, diff); // current row + + // horizontal filter works the same based on shifted vers of current + // row. "prev" is current row shifted right by 1 pixel; we need to + // insert the previous pixel value (from t1). + // "next" is current row shifted left by 1 pixel, with first pixel + // of next block of 8 pixels added in. + int16x8_t prv0 = vextq_s16(curr, curr, 7); + int16x8_t nxt0 = vextq_s16(curr, curr, 1); + int16x8_t prev = vsetq_lane_s16(t1, prv0, 0); + int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7); + + // horizontal filter, polyphase implementation since it's convenient: + // even pixels = 3*cur + prev = cur*4 + (prev - cur) + // odd pixels = 3*cur + next = cur*4 + (next - cur) + // note the shared term. 
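+      // (each even/odd value accumulates the usual 9/3/3/1 bilinear weights, i.e.
+      //  16x the final sample; the vqrshrun_n_s16(..., 4) below divides by 16
+      //  with rounding and saturates back to bytes)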
+ int16x8_t curs = vshlq_n_s16(curr, 2); + int16x8_t prvd = vsubq_s16(prev, curr); + int16x8_t nxtd = vsubq_s16(next, curr); + int16x8_t even = vaddq_s16(curs, prvd); + int16x8_t odd = vaddq_s16(curs, nxtd); + + // undo scaling and round, then store with even/odd phases interleaved + uint8x8x2_t o; + o.val[0] = vqrshrun_n_s16(even, 4); + o.val[1] = vqrshrun_n_s16(odd, 4); + vst2_u8(out + i*2, o); +#endif + + // "previous" value for next iter + t1 = 3*in_near[i+7] + in_far[i+7]; + } + + t0 = t1; + t1 = 3*in_near[i] + in_far[i]; + out[i*2] = stbi__div16(3*t1 + t0 + 8); + + for (++i; i < w; ++i) { + t0 = t1; + t1 = 3*in_near[i]+in_far[i]; + out[i*2-1] = stbi__div16(3*t0 + t1 + 8); + out[i*2 ] = stbi__div16(3*t1 + t0 + 8); + } + out[w*2-1] = stbi__div4(t1+2); + + STBI_NOTUSED(hs); + + return out; +} +#endif + +static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // resample with nearest-neighbor + int i,j; + STBI_NOTUSED(in_far); + for (i=0; i < w; ++i) + for (j=0; j < hs; ++j) + out[i*hs+j] = in_near[i]; + return out; +} + +// this is a reduced-precision calculation of YCbCr-to-RGB introduced +// to make sure the code produces the same results in both SIMD and scalar +#define stbi__float2fixed(x) (((int) ((x) * 4096.0f + 0.5f)) << 8) +static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step) +{ + int i; + for (i=0; i < count; ++i) { + int y_fixed = (y[i] << 20) + (1<<19); // rounding + int r,g,b; + int cr = pcr[i] - 128; + int cb = pcb[i] - 128; + r = y_fixed + cr* stbi__float2fixed(1.40200f); + g = y_fixed + (cr*-stbi__float2fixed(0.71414f)) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000); + b = y_fixed + cb* stbi__float2fixed(1.77200f); + r >>= 20; + g >>= 20; + b >>= 20; + if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } + if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; } + if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; } + out[0] = (stbi_uc)r; + out[1] = (stbi_uc)g; + out[2] = (stbi_uc)b; + out[3] = 255; + out += step; + } +} + +#if defined(STBI_SSE2) || defined(STBI_NEON) +static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step) +{ + int i = 0; + +#ifdef STBI_SSE2 + // step == 3 is pretty ugly on the final interleave, and i'm not convinced + // it's useful in practice (you wouldn't use it for textures, for example). + // so just accelerate step == 4 case. + if (step == 4) { + // this is a fairly straightforward implementation and not super-optimized. 
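+      // (fixed-point scheme used below: the constants are coeff*4096; cr/cb are
+      //  unpacked into the high byte so crw/cbw = (value-128)*256, and
+      //  _mm_mulhi_epi16 keeps the top 16 bits of the product, i.e. coeff*16*(value-128);
+      //  yws = y*16 + 8 carries the rounding bias, so the >>4 descale at the end
+      //  lands back on the 0..255 scale)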
+ __m128i signflip = _mm_set1_epi8(-0x80); + __m128i cr_const0 = _mm_set1_epi16( (short) ( 1.40200f*4096.0f+0.5f)); + __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f)); + __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f)); + __m128i cb_const1 = _mm_set1_epi16( (short) ( 1.77200f*4096.0f+0.5f)); + __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128); + __m128i xw = _mm_set1_epi16(255); // alpha channel + + for (; i+7 < count; i += 8) { + // load + __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i)); + __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i)); + __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i)); + __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128 + __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128 + + // unpack to short (and left-shift cr, cb by 8) + __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes); + __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased); + __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased); + + // color transform + __m128i yws = _mm_srli_epi16(yw, 4); + __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw); + __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw); + __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1); + __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1); + __m128i rws = _mm_add_epi16(cr0, yws); + __m128i gwt = _mm_add_epi16(cb0, yws); + __m128i bws = _mm_add_epi16(yws, cb1); + __m128i gws = _mm_add_epi16(gwt, cr1); + + // descale + __m128i rw = _mm_srai_epi16(rws, 4); + __m128i bw = _mm_srai_epi16(bws, 4); + __m128i gw = _mm_srai_epi16(gws, 4); + + // back to byte, set up for transpose + __m128i brb = _mm_packus_epi16(rw, bw); + __m128i gxb = _mm_packus_epi16(gw, xw); + + // transpose to interleave channels + __m128i t0 = _mm_unpacklo_epi8(brb, gxb); + __m128i t1 = _mm_unpackhi_epi8(brb, gxb); + __m128i o0 = _mm_unpacklo_epi16(t0, t1); + __m128i o1 = _mm_unpackhi_epi16(t0, t1); + + // store + _mm_storeu_si128((__m128i *) (out + 0), o0); + _mm_storeu_si128((__m128i *) (out + 16), o1); + out += 32; + } + } +#endif + +#ifdef STBI_NEON + // in this version, step=3 support would be easy to add. but is there demand? + if (step == 4) { + // this is a fairly straightforward implementation and not super-optimized. 
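+      // (same 1/16-step fixed point as the scalar and SSE2 paths: y is widened to
+      //  y*16 via vshll_n_u8(..., 4), cr/cb become (value-128)*128, and
+      //  vqdmulhq_s16 computes (2*a*b)>>16 = coeff*16*(value-128);
+      //  vqrshrun_n_s16(..., 4) rounds, descales and saturates to u8)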
+ uint8x8_t signflip = vdup_n_u8(0x80); + int16x8_t cr_const0 = vdupq_n_s16( (short) ( 1.40200f*4096.0f+0.5f)); + int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f)); + int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f)); + int16x8_t cb_const1 = vdupq_n_s16( (short) ( 1.77200f*4096.0f+0.5f)); + + for (; i+7 < count; i += 8) { + // load + uint8x8_t y_bytes = vld1_u8(y + i); + uint8x8_t cr_bytes = vld1_u8(pcr + i); + uint8x8_t cb_bytes = vld1_u8(pcb + i); + int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip)); + int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip)); + + // expand to s16 + int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4)); + int16x8_t crw = vshll_n_s8(cr_biased, 7); + int16x8_t cbw = vshll_n_s8(cb_biased, 7); + + // color transform + int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0); + int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0); + int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1); + int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1); + int16x8_t rws = vaddq_s16(yws, cr0); + int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1); + int16x8_t bws = vaddq_s16(yws, cb1); + + // undo scaling, round, convert to byte + uint8x8x4_t o; + o.val[0] = vqrshrun_n_s16(rws, 4); + o.val[1] = vqrshrun_n_s16(gws, 4); + o.val[2] = vqrshrun_n_s16(bws, 4); + o.val[3] = vdup_n_u8(255); + + // store, interleaving r/g/b/a + vst4_u8(out, o); + out += 8*4; + } + } +#endif + + for (; i < count; ++i) { + int y_fixed = (y[i] << 20) + (1<<19); // rounding + int r,g,b; + int cr = pcr[i] - 128; + int cb = pcb[i] - 128; + r = y_fixed + cr* stbi__float2fixed(1.40200f); + g = y_fixed + cr*-stbi__float2fixed(0.71414f) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000); + b = y_fixed + cb* stbi__float2fixed(1.77200f); + r >>= 20; + g >>= 20; + b >>= 20; + if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } + if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; } + if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; } + out[0] = (stbi_uc)r; + out[1] = (stbi_uc)g; + out[2] = (stbi_uc)b; + out[3] = 255; + out += step; + } +} +#endif + +// set up the kernels +static void stbi__setup_jpeg(stbi__jpeg *j) +{ + j->idct_block_kernel = stbi__idct_block; + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row; + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2; + +#ifdef STBI_SSE2 + if (stbi__sse2_available()) { + j->idct_block_kernel = stbi__idct_simd; + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd; + } +#endif + +#ifdef STBI_NEON + j->idct_block_kernel = stbi__idct_simd; + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd; +#endif +} + +// clean up the temporary component buffers +static void stbi__cleanup_jpeg(stbi__jpeg *j) +{ + stbi__free_jpeg_components(j, j->s->img_n, 0); +} + +typedef struct +{ + resample_row_func resample; + stbi_uc *line0,*line1; + int hs,vs; // expansion factor in each axis + int w_lores; // horizontal pixels pre-expansion + int ystep; // how far through vertical expansion we are + int ypos; // which pre-expansion row we're on +} stbi__resample; + +// fast 0..255 * 0..255 => 0..255 rounded multiplication +static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y) +{ + unsigned int t = x*y + 128; + return (stbi_uc) ((t + (t >>8)) >> 8); +} + +static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp, int req_comp) +{ + int n, decode_n, is_rgb; + z->s->img_n = 0; // make 
stbi__cleanup_jpeg safe + + // validate req_comp + if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error"); + + // load a jpeg image from whichever source, but leave in YCbCr format + if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; } + + // determine actual number of components to generate + n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1; + + is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif)); + + if (z->s->img_n == 3 && n < 3 && !is_rgb) + decode_n = 1; + else + decode_n = z->s->img_n; + + // nothing to do if no components requested; check this now to avoid + // accessing uninitialized coutput[0] later + if (decode_n <= 0) { stbi__cleanup_jpeg(z); return NULL; } + + // resample and color-convert + { + int k; + unsigned int i,j; + stbi_uc *output; + stbi_uc *coutput[4] = { NULL, NULL, NULL, NULL }; + + stbi__resample res_comp[4]; + + for (k=0; k < decode_n; ++k) { + stbi__resample *r = &res_comp[k]; + + // allocate line buffer big enough for upsampling off the edges + // with upsample factor of 4 + z->img_comp[k].linebuf = (stbi_uc *) stbi__malloc(z->s->img_x + 3); + if (!z->img_comp[k].linebuf) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); } + + r->hs = z->img_h_max / z->img_comp[k].h; + r->vs = z->img_v_max / z->img_comp[k].v; + r->ystep = r->vs >> 1; + r->w_lores = (z->s->img_x + r->hs-1) / r->hs; + r->ypos = 0; + r->line0 = r->line1 = z->img_comp[k].data; + + if (r->hs == 1 && r->vs == 1) r->resample = resample_row_1; + else if (r->hs == 1 && r->vs == 2) r->resample = stbi__resample_row_v_2; + else if (r->hs == 2 && r->vs == 1) r->resample = stbi__resample_row_h_2; + else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel; + else r->resample = stbi__resample_row_generic; + } + + // can't error after this so, this is safe + output = (stbi_uc *) stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1); + if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); } + + // now go ahead and resample + for (j=0; j < z->s->img_y; ++j) { + stbi_uc *out = output + n * z->s->img_x * j; + for (k=0; k < decode_n; ++k) { + stbi__resample *r = &res_comp[k]; + int y_bot = r->ystep >= (r->vs >> 1); + coutput[k] = r->resample(z->img_comp[k].linebuf, + y_bot ? r->line1 : r->line0, + y_bot ? 
r->line0 : r->line1, + r->w_lores, r->hs); + if (++r->ystep >= r->vs) { + r->ystep = 0; + r->line0 = r->line1; + if (++r->ypos < z->img_comp[k].y) + r->line1 += z->img_comp[k].w2; + } + } + if (n >= 3) { + stbi_uc *y = coutput[0]; + if (z->s->img_n == 3) { + if (is_rgb) { + for (i=0; i < z->s->img_x; ++i) { + out[0] = y[i]; + out[1] = coutput[1][i]; + out[2] = coutput[2][i]; + out[3] = 255; + out += n; + } + } else { + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + } + } else if (z->s->img_n == 4) { + if (z->app14_color_transform == 0) { // CMYK + for (i=0; i < z->s->img_x; ++i) { + stbi_uc m = coutput[3][i]; + out[0] = stbi__blinn_8x8(coutput[0][i], m); + out[1] = stbi__blinn_8x8(coutput[1][i], m); + out[2] = stbi__blinn_8x8(coutput[2][i], m); + out[3] = 255; + out += n; + } + } else if (z->app14_color_transform == 2) { // YCCK + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + for (i=0; i < z->s->img_x; ++i) { + stbi_uc m = coutput[3][i]; + out[0] = stbi__blinn_8x8(255 - out[0], m); + out[1] = stbi__blinn_8x8(255 - out[1], m); + out[2] = stbi__blinn_8x8(255 - out[2], m); + out += n; + } + } else { // YCbCr + alpha? Ignore the fourth channel for now + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + } + } else + for (i=0; i < z->s->img_x; ++i) { + out[0] = out[1] = out[2] = y[i]; + out[3] = 255; // not used if n==3 + out += n; + } + } else { + if (is_rgb) { + if (n == 1) + for (i=0; i < z->s->img_x; ++i) + *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); + else { + for (i=0; i < z->s->img_x; ++i, out += 2) { + out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); + out[1] = 255; + } + } + } else if (z->s->img_n == 4 && z->app14_color_transform == 0) { + for (i=0; i < z->s->img_x; ++i) { + stbi_uc m = coutput[3][i]; + stbi_uc r = stbi__blinn_8x8(coutput[0][i], m); + stbi_uc g = stbi__blinn_8x8(coutput[1][i], m); + stbi_uc b = stbi__blinn_8x8(coutput[2][i], m); + out[0] = stbi__compute_y(r, g, b); + out[1] = 255; + out += n; + } + } else if (z->s->img_n == 4 && z->app14_color_transform == 2) { + for (i=0; i < z->s->img_x; ++i) { + out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]); + out[1] = 255; + out += n; + } + } else { + stbi_uc *y = coutput[0]; + if (n == 1) + for (i=0; i < z->s->img_x; ++i) out[i] = y[i]; + else + for (i=0; i < z->s->img_x; ++i) { *out++ = y[i]; *out++ = 255; } + } + } + } + stbi__cleanup_jpeg(z); + *out_x = z->s->img_x; + *out_y = z->s->img_y; + if (comp) *comp = z->s->img_n >= 3 ? 
3 : 1; // report original components, not output + return output; + } +} + +static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + unsigned char* result; + stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg)); + if (!j) return stbi__errpuc("outofmem", "Out of memory"); + memset(j, 0, sizeof(stbi__jpeg)); + STBI_NOTUSED(ri); + j->s = s; + stbi__setup_jpeg(j); + result = load_jpeg_image(j, x,y,comp,req_comp); + STBI_FREE(j); + return result; +} + +static int stbi__jpeg_test(stbi__context *s) +{ + int r; + stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg)); + if (!j) return stbi__err("outofmem", "Out of memory"); + memset(j, 0, sizeof(stbi__jpeg)); + j->s = s; + stbi__setup_jpeg(j); + r = stbi__decode_jpeg_header(j, STBI__SCAN_type); + stbi__rewind(s); + STBI_FREE(j); + return r; +} + +static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp) +{ + if (!stbi__decode_jpeg_header(j, STBI__SCAN_header)) { + stbi__rewind( j->s ); + return 0; + } + if (x) *x = j->s->img_x; + if (y) *y = j->s->img_y; + if (comp) *comp = j->s->img_n >= 3 ? 3 : 1; + return 1; +} + +static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp) +{ + int result; + stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg))); + if (!j) return stbi__err("outofmem", "Out of memory"); + memset(j, 0, sizeof(stbi__jpeg)); + j->s = s; + result = stbi__jpeg_info_raw(j, x, y, comp); + STBI_FREE(j); + return result; +} +#endif + +// public domain zlib decode v0.2 Sean Barrett 2006-11-18 +// simple implementation +// - all input must be provided in an upfront buffer +// - all output is written to a single output buffer (can malloc/realloc) +// performance +// - fast huffman + +#ifndef STBI_NO_ZLIB + +// fast-way is faster to check than jpeg huffman, but slow way is slower +#define STBI__ZFAST_BITS 9 // accelerate all cases in default tables +#define STBI__ZFAST_MASK ((1 << STBI__ZFAST_BITS) - 1) +#define STBI__ZNSYMS 288 // number of symbols in literal/length alphabet + +// zlib-style huffman encoding +// (jpegs packs from left, zlib from right, so can't share code) +typedef struct +{ + stbi__uint16 fast[1 << STBI__ZFAST_BITS]; + stbi__uint16 firstcode[16]; + int maxcode[17]; + stbi__uint16 firstsymbol[16]; + stbi_uc size[STBI__ZNSYMS]; + stbi__uint16 value[STBI__ZNSYMS]; +} stbi__zhuffman; + +stbi_inline static int stbi__bitreverse16(int n) +{ + n = ((n & 0xAAAA) >> 1) | ((n & 0x5555) << 1); + n = ((n & 0xCCCC) >> 2) | ((n & 0x3333) << 2); + n = ((n & 0xF0F0) >> 4) | ((n & 0x0F0F) << 4); + n = ((n & 0xFF00) >> 8) | ((n & 0x00FF) << 8); + return n; +} + +stbi_inline static int stbi__bit_reverse(int v, int bits) +{ + STBI_ASSERT(bits <= 16); + // to bit reverse n bits, reverse 16 and shift + // e.g. 
11 bits, bit reverse and shift away 5 + return stbi__bitreverse16(v) >> (16-bits); +} + +static int stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist, int num) +{ + int i,k=0; + int code, next_code[16], sizes[17]; + + // DEFLATE spec for generating codes + memset(sizes, 0, sizeof(sizes)); + memset(z->fast, 0, sizeof(z->fast)); + for (i=0; i < num; ++i) + ++sizes[sizelist[i]]; + sizes[0] = 0; + for (i=1; i < 16; ++i) + if (sizes[i] > (1 << i)) + return stbi__err("bad sizes", "Corrupt PNG"); + code = 0; + for (i=1; i < 16; ++i) { + next_code[i] = code; + z->firstcode[i] = (stbi__uint16) code; + z->firstsymbol[i] = (stbi__uint16) k; + code = (code + sizes[i]); + if (sizes[i]) + if (code-1 >= (1 << i)) return stbi__err("bad codelengths","Corrupt PNG"); + z->maxcode[i] = code << (16-i); // preshift for inner loop + code <<= 1; + k += sizes[i]; + } + z->maxcode[16] = 0x10000; // sentinel + for (i=0; i < num; ++i) { + int s = sizelist[i]; + if (s) { + int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s]; + stbi__uint16 fastv = (stbi__uint16) ((s << 9) | i); + z->size [c] = (stbi_uc ) s; + z->value[c] = (stbi__uint16) i; + if (s <= STBI__ZFAST_BITS) { + int j = stbi__bit_reverse(next_code[s],s); + while (j < (1 << STBI__ZFAST_BITS)) { + z->fast[j] = fastv; + j += (1 << s); + } + } + ++next_code[s]; + } + } + return 1; +} + +// zlib-from-memory implementation for PNG reading +// because PNG allows splitting the zlib stream arbitrarily, +// and it's annoying structurally to have PNG call ZLIB call PNG, +// we require PNG read all the IDATs and combine them into a single +// memory buffer + +typedef struct +{ + stbi_uc *zbuffer, *zbuffer_end; + int num_bits; + int hit_zeof_once; + stbi__uint32 code_buffer; + + char *zout; + char *zout_start; + char *zout_end; + int z_expandable; + + stbi__zhuffman z_length, z_distance; +} stbi__zbuf; + +stbi_inline static int stbi__zeof(stbi__zbuf *z) +{ + return (z->zbuffer >= z->zbuffer_end); +} + +stbi_inline static stbi_uc stbi__zget8(stbi__zbuf *z) +{ + return stbi__zeof(z) ? 0 : *z->zbuffer++; +} + +static void stbi__fill_bits(stbi__zbuf *z) +{ + do { + if (z->code_buffer >= (1U << z->num_bits)) { + z->zbuffer = z->zbuffer_end; /* treat this as EOF so we fail. */ + return; + } + z->code_buffer |= (unsigned int) stbi__zget8(z) << z->num_bits; + z->num_bits += 8; + } while (z->num_bits <= 24); +} + +stbi_inline static unsigned int stbi__zreceive(stbi__zbuf *z, int n) +{ + unsigned int k; + if (z->num_bits < n) stbi__fill_bits(z); + k = z->code_buffer & ((1 << n) - 1); + z->code_buffer >>= n; + z->num_bits -= n; + return k; +} + +static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z) +{ + int b,s,k; + // not resolved by fast table, so compute it the slow way + // use jpeg approach, which requires MSbits at top + k = stbi__bit_reverse(a->code_buffer, 16); + for (s=STBI__ZFAST_BITS+1; ; ++s) + if (k < z->maxcode[s]) + break; + if (s >= 16) return -1; // invalid code! + // code size is s, so: + b = (k >> (16-s)) - z->firstcode[s] + z->firstsymbol[s]; + if (b >= STBI__ZNSYMS) return -1; // some data was corrupt somewhere! + if (z->size[b] != s) return -1; // was originally an assert, but report failure instead. 
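+   // (the code was located using the bit-reversed view of the buffer, but the
+   //  buffer itself is still LSB-first, so consuming the s code bits is just a
+   //  plain right shift)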
+ a->code_buffer >>= s; + a->num_bits -= s; + return z->value[b]; +} + +stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z) +{ + int b,s; + if (a->num_bits < 16) { + if (stbi__zeof(a)) { + if (!a->hit_zeof_once) { + // This is the first time we hit eof, insert 16 extra padding btis + // to allow us to keep going; if we actually consume any of them + // though, that is invalid data. This is caught later. + a->hit_zeof_once = 1; + a->num_bits += 16; // add 16 implicit zero bits + } else { + // We already inserted our extra 16 padding bits and are again + // out, this stream is actually prematurely terminated. + return -1; + } + } else { + stbi__fill_bits(a); + } + } + b = z->fast[a->code_buffer & STBI__ZFAST_MASK]; + if (b) { + s = b >> 9; + a->code_buffer >>= s; + a->num_bits -= s; + return b & 511; + } + return stbi__zhuffman_decode_slowpath(a, z); +} + +static int stbi__zexpand(stbi__zbuf *z, char *zout, int n) // need to make room for n bytes +{ + char *q; + unsigned int cur, limit, old_limit; + z->zout = zout; + if (!z->z_expandable) return stbi__err("output buffer limit","Corrupt PNG"); + cur = (unsigned int) (z->zout - z->zout_start); + limit = old_limit = (unsigned) (z->zout_end - z->zout_start); + if (UINT_MAX - cur < (unsigned) n) return stbi__err("outofmem", "Out of memory"); + while (cur + n > limit) { + if(limit > UINT_MAX / 2) return stbi__err("outofmem", "Out of memory"); + limit *= 2; + } + q = (char *) STBI_REALLOC_SIZED(z->zout_start, old_limit, limit); + STBI_NOTUSED(old_limit); + if (q == NULL) return stbi__err("outofmem", "Out of memory"); + z->zout_start = q; + z->zout = q + cur; + z->zout_end = q + limit; + return 1; +} + +static const int stbi__zlength_base[31] = { + 3,4,5,6,7,8,9,10,11,13, + 15,17,19,23,27,31,35,43,51,59, + 67,83,99,115,131,163,195,227,258,0,0 }; + +static const int stbi__zlength_extra[31]= +{ 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 }; + +static const int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193, +257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0}; + +static const int stbi__zdist_extra[32] = +{ 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13}; + +static int stbi__parse_huffman_block(stbi__zbuf *a) +{ + char *zout = a->zout; + for(;;) { + int z = stbi__zhuffman_decode(a, &a->z_length); + if (z < 256) { + if (z < 0) return stbi__err("bad huffman code","Corrupt PNG"); // error in huffman codes + if (zout >= a->zout_end) { + if (!stbi__zexpand(a, zout, 1)) return 0; + zout = a->zout; + } + *zout++ = (char) z; + } else { + stbi_uc *p; + int len,dist; + if (z == 256) { + a->zout = zout; + if (a->hit_zeof_once && a->num_bits < 16) { + // The first time we hit zeof, we inserted 16 extra zero bits into our bit + // buffer so the decoder can just do its speculative decoding. But if we + // actually consumed any of those bits (which is the case when num_bits < 16), + // the stream actually read past the end so it is malformed. 
+ return stbi__err("unexpected end","Corrupt PNG"); + } + return 1; + } + if (z >= 286) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, length codes 286 and 287 must not appear in compressed data + z -= 257; + len = stbi__zlength_base[z]; + if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]); + z = stbi__zhuffman_decode(a, &a->z_distance); + if (z < 0 || z >= 30) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, distance codes 30 and 31 must not appear in compressed data + dist = stbi__zdist_base[z]; + if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]); + if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG"); + if (len > a->zout_end - zout) { + if (!stbi__zexpand(a, zout, len)) return 0; + zout = a->zout; + } + p = (stbi_uc *) (zout - dist); + if (dist == 1) { // run of one byte; common in images. + stbi_uc v = *p; + if (len) { do *zout++ = v; while (--len); } + } else { + if (len) { do *zout++ = *p++; while (--len); } + } + } + } +} + +static int stbi__compute_huffman_codes(stbi__zbuf *a) +{ + static const stbi_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 }; + stbi__zhuffman z_codelength; + stbi_uc lencodes[286+32+137];//padding for maximum single op + stbi_uc codelength_sizes[19]; + int i,n; + + int hlit = stbi__zreceive(a,5) + 257; + int hdist = stbi__zreceive(a,5) + 1; + int hclen = stbi__zreceive(a,4) + 4; + int ntot = hlit + hdist; + + memset(codelength_sizes, 0, sizeof(codelength_sizes)); + for (i=0; i < hclen; ++i) { + int s = stbi__zreceive(a,3); + codelength_sizes[length_dezigzag[i]] = (stbi_uc) s; + } + if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0; + + n = 0; + while (n < ntot) { + int c = stbi__zhuffman_decode(a, &z_codelength); + if (c < 0 || c >= 19) return stbi__err("bad codelengths", "Corrupt PNG"); + if (c < 16) + lencodes[n++] = (stbi_uc) c; + else { + stbi_uc fill = 0; + if (c == 16) { + c = stbi__zreceive(a,2)+3; + if (n == 0) return stbi__err("bad codelengths", "Corrupt PNG"); + fill = lencodes[n-1]; + } else if (c == 17) { + c = stbi__zreceive(a,3)+3; + } else if (c == 18) { + c = stbi__zreceive(a,7)+11; + } else { + return stbi__err("bad codelengths", "Corrupt PNG"); + } + if (ntot - n < c) return stbi__err("bad codelengths", "Corrupt PNG"); + memset(lencodes+n, fill, c); + n += c; + } + } + if (n != ntot) return stbi__err("bad codelengths","Corrupt PNG"); + if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0; + if (!stbi__zbuild_huffman(&a->z_distance, lencodes+hlit, hdist)) return 0; + return 1; +} + +static int stbi__parse_uncompressed_block(stbi__zbuf *a) +{ + stbi_uc header[4]; + int len,nlen,k; + if (a->num_bits & 7) + stbi__zreceive(a, a->num_bits & 7); // discard + // drain the bit-packed data into header + k = 0; + while (a->num_bits > 0) { + header[k++] = (stbi_uc) (a->code_buffer & 255); // suppress MSVC run-time check + a->code_buffer >>= 8; + a->num_bits -= 8; + } + if (a->num_bits < 0) return stbi__err("zlib corrupt","Corrupt PNG"); + // now fill header the normal way + while (k < 4) + header[k++] = stbi__zget8(a); + len = header[1] * 256 + header[0]; + nlen = header[3] * 256 + header[2]; + if (nlen != (len ^ 0xffff)) return stbi__err("zlib corrupt","Corrupt PNG"); + if (a->zbuffer + len > a->zbuffer_end) return stbi__err("read past buffer","Corrupt PNG"); + if (a->zout + len > a->zout_end) + if (!stbi__zexpand(a, a->zout, len)) return 0; + memcpy(a->zout, a->zbuffer, len); + 
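+   // (the bit buffer was fully drained into header[] above, so the copy and the
+   //  pointer advances below operate purely on whole input bytes)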
a->zbuffer += len; + a->zout += len; + return 1; +} + +static int stbi__parse_zlib_header(stbi__zbuf *a) +{ + int cmf = stbi__zget8(a); + int cm = cmf & 15; + /* int cinfo = cmf >> 4; */ + int flg = stbi__zget8(a); + if (stbi__zeof(a)) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec + if ((cmf*256+flg) % 31 != 0) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec + if (flg & 32) return stbi__err("no preset dict","Corrupt PNG"); // preset dictionary not allowed in png + if (cm != 8) return stbi__err("bad compression","Corrupt PNG"); // DEFLATE required for png + // window = 1 << (8 + cinfo)... but who cares, we fully buffer output + return 1; +} + +static const stbi_uc stbi__zdefault_length[STBI__ZNSYMS] = +{ + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8 +}; +static const stbi_uc stbi__zdefault_distance[32] = +{ + 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5 +}; +/* +Init algorithm: +{ + int i; // use <= to match clearly with spec + for (i=0; i <= 143; ++i) stbi__zdefault_length[i] = 8; + for ( ; i <= 255; ++i) stbi__zdefault_length[i] = 9; + for ( ; i <= 279; ++i) stbi__zdefault_length[i] = 7; + for ( ; i <= 287; ++i) stbi__zdefault_length[i] = 8; + + for (i=0; i <= 31; ++i) stbi__zdefault_distance[i] = 5; +} +*/ + +static int stbi__parse_zlib(stbi__zbuf *a, int parse_header) +{ + int final, type; + if (parse_header) + if (!stbi__parse_zlib_header(a)) return 0; + a->num_bits = 0; + a->code_buffer = 0; + a->hit_zeof_once = 0; + do { + final = stbi__zreceive(a,1); + type = stbi__zreceive(a,2); + if (type == 0) { + if (!stbi__parse_uncompressed_block(a)) return 0; + } else if (type == 3) { + return 0; + } else { + if (type == 1) { + // use fixed code lengths + if (!stbi__zbuild_huffman(&a->z_length , stbi__zdefault_length , STBI__ZNSYMS)) return 0; + if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance, 32)) return 0; + } else { + if (!stbi__compute_huffman_codes(a)) return 0; + } + if (!stbi__parse_huffman_block(a)) return 0; + } + } while (!final); + return 1; +} + +static int stbi__do_zlib(stbi__zbuf *a, char *obuf, int olen, int exp, int parse_header) +{ + a->zout_start = obuf; + a->zout = obuf; + a->zout_end = obuf + olen; + a->z_expandable = exp; + + return stbi__parse_zlib(a, parse_header); +} + +STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen) +{ + stbi__zbuf a; + char *p = (char *) stbi__malloc(initial_size); + if (p == NULL) return NULL; + a.zbuffer = (stbi_uc *) buffer; + a.zbuffer_end = (stbi_uc *) buffer + len; + if (stbi__do_zlib(&a, p, initial_size, 1, 1)) { + if (outlen) *outlen = (int) (a.zout - a.zout_start); + return a.zout_start; + } else { + STBI_FREE(a.zout_start); + return NULL; + } +} + +STBIDEF char *stbi_zlib_decode_malloc(char const *buffer, int len, int *outlen) +{ + return stbi_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen); +} + +STBIDEF char 
*stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header) +{ + stbi__zbuf a; + char *p = (char *) stbi__malloc(initial_size); + if (p == NULL) return NULL; + a.zbuffer = (stbi_uc *) buffer; + a.zbuffer_end = (stbi_uc *) buffer + len; + if (stbi__do_zlib(&a, p, initial_size, 1, parse_header)) { + if (outlen) *outlen = (int) (a.zout - a.zout_start); + return a.zout_start; + } else { + STBI_FREE(a.zout_start); + return NULL; + } +} + +STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, char const *ibuffer, int ilen) +{ + stbi__zbuf a; + a.zbuffer = (stbi_uc *) ibuffer; + a.zbuffer_end = (stbi_uc *) ibuffer + ilen; + if (stbi__do_zlib(&a, obuffer, olen, 0, 1)) + return (int) (a.zout - a.zout_start); + else + return -1; +} + +STBIDEF char *stbi_zlib_decode_noheader_malloc(char const *buffer, int len, int *outlen) +{ + stbi__zbuf a; + char *p = (char *) stbi__malloc(16384); + if (p == NULL) return NULL; + a.zbuffer = (stbi_uc *) buffer; + a.zbuffer_end = (stbi_uc *) buffer+len; + if (stbi__do_zlib(&a, p, 16384, 1, 0)) { + if (outlen) *outlen = (int) (a.zout - a.zout_start); + return a.zout_start; + } else { + STBI_FREE(a.zout_start); + return NULL; + } +} + +STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen) +{ + stbi__zbuf a; + a.zbuffer = (stbi_uc *) ibuffer; + a.zbuffer_end = (stbi_uc *) ibuffer + ilen; + if (stbi__do_zlib(&a, obuffer, olen, 0, 0)) + return (int) (a.zout - a.zout_start); + else + return -1; +} +#endif + +// public domain "baseline" PNG decoder v0.10 Sean Barrett 2006-11-18 +// simple implementation +// - only 8-bit samples +// - no CRC checking +// - allocates lots of intermediate memory +// - avoids problem of streaming data between subsystems +// - avoids explicit window management +// performance +// - uses stb_zlib, a PD zlib implementation with fast huffman decoding + +#ifndef STBI_NO_PNG +typedef struct +{ + stbi__uint32 length; + stbi__uint32 type; +} stbi__pngchunk; + +static stbi__pngchunk stbi__get_chunk_header(stbi__context *s) +{ + stbi__pngchunk c; + c.length = stbi__get32be(s); + c.type = stbi__get32be(s); + return c; +} + +static int stbi__check_png_header(stbi__context *s) +{ + static const stbi_uc png_sig[8] = { 137,80,78,71,13,10,26,10 }; + int i; + for (i=0; i < 8; ++i) + if (stbi__get8(s) != png_sig[i]) return stbi__err("bad png sig","Not a PNG"); + return 1; +} + +typedef struct +{ + stbi__context *s; + stbi_uc *idata, *expanded, *out; + int depth; +} stbi__png; + + +enum { + STBI__F_none=0, + STBI__F_sub=1, + STBI__F_up=2, + STBI__F_avg=3, + STBI__F_paeth=4, + // synthetic filter used for first scanline to avoid needing a dummy row of 0s + STBI__F_avg_first +}; + +static stbi_uc first_row_filter[5] = +{ + STBI__F_none, + STBI__F_sub, + STBI__F_none, + STBI__F_avg_first, + STBI__F_sub // Paeth with b=c=0 turns out to be equivalent to sub +}; + +static int stbi__paeth(int a, int b, int c) +{ + // This formulation looks very different from the reference in the PNG spec, but is + // actually equivalent and has favorable data dependencies and admits straightforward + // generation of branch-free code, which helps performance significantly. + int thresh = c*3 - (a + b); + int lo = a < b ? a : b; + int hi = a < b ? b : a; + int t0 = (hi <= thresh) ? lo : c; + int t1 = (thresh <= lo) ? 
hi : t0; + return t1; +} + +static const stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 }; + +// adds an extra all-255 alpha channel +// dest == src is legal +// img_n must be 1 or 3 +static void stbi__create_png_alpha_expand8(stbi_uc *dest, stbi_uc *src, stbi__uint32 x, int img_n) +{ + int i; + // must process data backwards since we allow dest==src + if (img_n == 1) { + for (i=x-1; i >= 0; --i) { + dest[i*2+1] = 255; + dest[i*2+0] = src[i]; + } + } else { + STBI_ASSERT(img_n == 3); + for (i=x-1; i >= 0; --i) { + dest[i*4+3] = 255; + dest[i*4+2] = src[i*3+2]; + dest[i*4+1] = src[i*3+1]; + dest[i*4+0] = src[i*3+0]; + } + } +} + +// create the png data from post-deflated data +static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y, int depth, int color) +{ + int bytes = (depth == 16 ? 2 : 1); + stbi__context *s = a->s; + stbi__uint32 i,j,stride = x*out_n*bytes; + stbi__uint32 img_len, img_width_bytes; + stbi_uc *filter_buf; + int all_ok = 1; + int k; + int img_n = s->img_n; // copy it into a local for later + + int output_bytes = out_n*bytes; + int filter_bytes = img_n*bytes; + int width = x; + + STBI_ASSERT(out_n == s->img_n || out_n == s->img_n+1); + a->out = (stbi_uc *) stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into + if (!a->out) return stbi__err("outofmem", "Out of memory"); + + // note: error exits here don't need to clean up a->out individually, + // stbi__do_png always does on error. + if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) return stbi__err("too large", "Corrupt PNG"); + img_width_bytes = (((img_n * x * depth) + 7) >> 3); + if (!stbi__mad2sizes_valid(img_width_bytes, y, img_width_bytes)) return stbi__err("too large", "Corrupt PNG"); + img_len = (img_width_bytes + 1) * y; + + // we used to check for exact match between raw_len and img_len on non-interlaced PNGs, + // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros), + // so just check for raw_len < img_len always. + if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG"); + + // Allocate two scan lines worth of filter workspace buffer. 
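+   // The filters only ever look at the current and previous scanline, so the two
+   // halves of this buffer ping-pong between "cur" and "prior" via (j & 1) below.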
+ filter_buf = (stbi_uc *) stbi__malloc_mad2(img_width_bytes, 2, 0); + if (!filter_buf) return stbi__err("outofmem", "Out of memory"); + + // Filtering for low-bit-depth images + if (depth < 8) { + filter_bytes = 1; + width = img_width_bytes; + } + + for (j=0; j < y; ++j) { + // cur/prior filter buffers alternate + stbi_uc *cur = filter_buf + (j & 1)*img_width_bytes; + stbi_uc *prior = filter_buf + (~j & 1)*img_width_bytes; + stbi_uc *dest = a->out + stride*j; + int nk = width * filter_bytes; + int filter = *raw++; + + // check filter type + if (filter > 4) { + all_ok = stbi__err("invalid filter","Corrupt PNG"); + break; + } + + // if first row, use special filter that doesn't sample previous row + if (j == 0) filter = first_row_filter[filter]; + + // perform actual filtering + switch (filter) { + case STBI__F_none: + memcpy(cur, raw, nk); + break; + case STBI__F_sub: + memcpy(cur, raw, filter_bytes); + for (k = filter_bytes; k < nk; ++k) + cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); + break; + case STBI__F_up: + for (k = 0; k < nk; ++k) + cur[k] = STBI__BYTECAST(raw[k] + prior[k]); + break; + case STBI__F_avg: + for (k = 0; k < filter_bytes; ++k) + cur[k] = STBI__BYTECAST(raw[k] + (prior[k]>>1)); + for (k = filter_bytes; k < nk; ++k) + cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); + break; + case STBI__F_paeth: + for (k = 0; k < filter_bytes; ++k) + cur[k] = STBI__BYTECAST(raw[k] + prior[k]); // prior[k] == stbi__paeth(0,prior[k],0) + for (k = filter_bytes; k < nk; ++k) + cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes], prior[k], prior[k-filter_bytes])); + break; + case STBI__F_avg_first: + memcpy(cur, raw, filter_bytes); + for (k = filter_bytes; k < nk; ++k) + cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); + break; + } + + raw += nk; + + // expand decoded bits in cur to dest, also adding an extra alpha channel if desired + if (depth < 8) { + stbi_uc scale = (color == 0) ? 
stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range + stbi_uc *in = cur; + stbi_uc *out = dest; + stbi_uc inb = 0; + stbi__uint32 nsmp = x*img_n; + + // expand bits to bytes first + if (depth == 4) { + for (i=0; i < nsmp; ++i) { + if ((i & 1) == 0) inb = *in++; + *out++ = scale * (inb >> 4); + inb <<= 4; + } + } else if (depth == 2) { + for (i=0; i < nsmp; ++i) { + if ((i & 3) == 0) inb = *in++; + *out++ = scale * (inb >> 6); + inb <<= 2; + } + } else { + STBI_ASSERT(depth == 1); + for (i=0; i < nsmp; ++i) { + if ((i & 7) == 0) inb = *in++; + *out++ = scale * (inb >> 7); + inb <<= 1; + } + } + + // insert alpha=255 values if desired + if (img_n != out_n) + stbi__create_png_alpha_expand8(dest, dest, x, img_n); + } else if (depth == 8) { + if (img_n == out_n) + memcpy(dest, cur, x*img_n); + else + stbi__create_png_alpha_expand8(dest, cur, x, img_n); + } else if (depth == 16) { + // convert the image data from big-endian to platform-native + stbi__uint16 *dest16 = (stbi__uint16*)dest; + stbi__uint32 nsmp = x*img_n; + + if (img_n == out_n) { + for (i = 0; i < nsmp; ++i, ++dest16, cur += 2) + *dest16 = (cur[0] << 8) | cur[1]; + } else { + STBI_ASSERT(img_n+1 == out_n); + if (img_n == 1) { + for (i = 0; i < x; ++i, dest16 += 2, cur += 2) { + dest16[0] = (cur[0] << 8) | cur[1]; + dest16[1] = 0xffff; + } + } else { + STBI_ASSERT(img_n == 3); + for (i = 0; i < x; ++i, dest16 += 4, cur += 6) { + dest16[0] = (cur[0] << 8) | cur[1]; + dest16[1] = (cur[2] << 8) | cur[3]; + dest16[2] = (cur[4] << 8) | cur[5]; + dest16[3] = 0xffff; + } + } + } + } + } + + STBI_FREE(filter_buf); + if (!all_ok) return 0; + + return 1; +} + +static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint32 image_data_len, int out_n, int depth, int color, int interlaced) +{ + int bytes = (depth == 16 ? 2 : 1); + int out_bytes = out_n * bytes; + stbi_uc *final; + int p; + if (!interlaced) + return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color); + + // de-interlacing + final = (stbi_uc *) stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0); + if (!final) return stbi__err("outofmem", "Out of memory"); + for (p=0; p < 7; ++p) { + int xorig[] = { 0,4,0,2,0,1,0 }; + int yorig[] = { 0,0,4,0,2,0,1 }; + int xspc[] = { 8,8,4,4,2,2,1 }; + int yspc[] = { 8,8,8,4,4,2,2 }; + int i,j,x,y; + // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1 + x = (a->s->img_x - xorig[p] + xspc[p]-1) / xspc[p]; + y = (a->s->img_y - yorig[p] + yspc[p]-1) / yspc[p]; + if (x && y) { + stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y; + if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) { + STBI_FREE(final); + return 0; + } + for (j=0; j < y; ++j) { + for (i=0; i < x; ++i) { + int out_y = j*yspc[p]+yorig[p]; + int out_x = i*xspc[p]+xorig[p]; + memcpy(final + out_y*a->s->img_x*out_bytes + out_x*out_bytes, + a->out + (j*x+i)*out_bytes, out_bytes); + } + } + STBI_FREE(a->out); + image_data += img_len; + image_data_len -= img_len; + } + } + a->out = final; + + return 1; +} + +static int stbi__compute_transparency(stbi__png *z, stbi_uc tc[3], int out_n) +{ + stbi__context *s = z->s; + stbi__uint32 i, pixel_count = s->img_x * s->img_y; + stbi_uc *p = z->out; + + // compute color-based transparency, assuming we've + // already got 255 as the alpha value in the output + STBI_ASSERT(out_n == 2 || out_n == 4); + + if (out_n == 2) { + for (i=0; i < pixel_count; ++i) { + p[1] = (p[0] == tc[0] ? 
0 : 255); + p += 2; + } + } else { + for (i=0; i < pixel_count; ++i) { + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + p[3] = 0; + p += 4; + } + } + return 1; +} + +static int stbi__compute_transparency16(stbi__png *z, stbi__uint16 tc[3], int out_n) +{ + stbi__context *s = z->s; + stbi__uint32 i, pixel_count = s->img_x * s->img_y; + stbi__uint16 *p = (stbi__uint16*) z->out; + + // compute color-based transparency, assuming we've + // already got 65535 as the alpha value in the output + STBI_ASSERT(out_n == 2 || out_n == 4); + + if (out_n == 2) { + for (i = 0; i < pixel_count; ++i) { + p[1] = (p[0] == tc[0] ? 0 : 65535); + p += 2; + } + } else { + for (i = 0; i < pixel_count; ++i) { + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + p[3] = 0; + p += 4; + } + } + return 1; +} + +static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int pal_img_n) +{ + stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y; + stbi_uc *p, *temp_out, *orig = a->out; + + p = (stbi_uc *) stbi__malloc_mad2(pixel_count, pal_img_n, 0); + if (p == NULL) return stbi__err("outofmem", "Out of memory"); + + // between here and free(out) below, exitting would leak + temp_out = p; + + if (pal_img_n == 3) { + for (i=0; i < pixel_count; ++i) { + int n = orig[i]*4; + p[0] = palette[n ]; + p[1] = palette[n+1]; + p[2] = palette[n+2]; + p += 3; + } + } else { + for (i=0; i < pixel_count; ++i) { + int n = orig[i]*4; + p[0] = palette[n ]; + p[1] = palette[n+1]; + p[2] = palette[n+2]; + p[3] = palette[n+3]; + p += 4; + } + } + STBI_FREE(a->out); + a->out = temp_out; + + STBI_NOTUSED(len); + + return 1; +} + +static int stbi__unpremultiply_on_load_global = 0; +static int stbi__de_iphone_flag_global = 0; + +STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply) +{ + stbi__unpremultiply_on_load_global = flag_true_if_should_unpremultiply; +} + +STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert) +{ + stbi__de_iphone_flag_global = flag_true_if_should_convert; +} + +#ifndef STBI_THREAD_LOCAL +#define stbi__unpremultiply_on_load stbi__unpremultiply_on_load_global +#define stbi__de_iphone_flag stbi__de_iphone_flag_global +#else +static STBI_THREAD_LOCAL int stbi__unpremultiply_on_load_local, stbi__unpremultiply_on_load_set; +static STBI_THREAD_LOCAL int stbi__de_iphone_flag_local, stbi__de_iphone_flag_set; + +STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply) +{ + stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply; + stbi__unpremultiply_on_load_set = 1; +} + +STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert) +{ + stbi__de_iphone_flag_local = flag_true_if_should_convert; + stbi__de_iphone_flag_set = 1; +} + +#define stbi__unpremultiply_on_load (stbi__unpremultiply_on_load_set \ + ? stbi__unpremultiply_on_load_local \ + : stbi__unpremultiply_on_load_global) +#define stbi__de_iphone_flag (stbi__de_iphone_flag_set \ + ? 
stbi__de_iphone_flag_local \ + : stbi__de_iphone_flag_global) +#endif // STBI_THREAD_LOCAL + +static void stbi__de_iphone(stbi__png *z) +{ + stbi__context *s = z->s; + stbi__uint32 i, pixel_count = s->img_x * s->img_y; + stbi_uc *p = z->out; + + if (s->img_out_n == 3) { // convert bgr to rgb + for (i=0; i < pixel_count; ++i) { + stbi_uc t = p[0]; + p[0] = p[2]; + p[2] = t; + p += 3; + } + } else { + STBI_ASSERT(s->img_out_n == 4); + if (stbi__unpremultiply_on_load) { + // convert bgr to rgb and unpremultiply + for (i=0; i < pixel_count; ++i) { + stbi_uc a = p[3]; + stbi_uc t = p[0]; + if (a) { + stbi_uc half = a / 2; + p[0] = (p[2] * 255 + half) / a; + p[1] = (p[1] * 255 + half) / a; + p[2] = ( t * 255 + half) / a; + } else { + p[0] = p[2]; + p[2] = t; + } + p += 4; + } + } else { + // convert bgr to rgb + for (i=0; i < pixel_count; ++i) { + stbi_uc t = p[0]; + p[0] = p[2]; + p[2] = t; + p += 4; + } + } + } +} + +#define STBI__PNG_TYPE(a,b,c,d) (((unsigned) (a) << 24) + ((unsigned) (b) << 16) + ((unsigned) (c) << 8) + (unsigned) (d)) + +static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp) +{ + stbi_uc palette[1024], pal_img_n=0; + stbi_uc has_trans=0, tc[3]={0}; + stbi__uint16 tc16[3]; + stbi__uint32 ioff=0, idata_limit=0, i, pal_len=0; + int first=1,k,interlace=0, color=0, is_iphone=0; + stbi__context *s = z->s; + + z->expanded = NULL; + z->idata = NULL; + z->out = NULL; + + if (!stbi__check_png_header(s)) return 0; + + if (scan == STBI__SCAN_type) return 1; + + for (;;) { + stbi__pngchunk c = stbi__get_chunk_header(s); + switch (c.type) { + case STBI__PNG_TYPE('C','g','B','I'): + is_iphone = 1; + stbi__skip(s, c.length); + break; + case STBI__PNG_TYPE('I','H','D','R'): { + int comp,filter; + if (!first) return stbi__err("multiple IHDR","Corrupt PNG"); + first = 0; + if (c.length != 13) return stbi__err("bad IHDR len","Corrupt PNG"); + s->img_x = stbi__get32be(s); + s->img_y = stbi__get32be(s); + if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + z->depth = stbi__get8(s); if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16) return stbi__err("1/2/4/8/16-bit only","PNG not supported: 1/2/4/8/16-bit only"); + color = stbi__get8(s); if (color > 6) return stbi__err("bad ctype","Corrupt PNG"); + if (color == 3 && z->depth == 16) return stbi__err("bad ctype","Corrupt PNG"); + if (color == 3) pal_img_n = 3; else if (color & 1) return stbi__err("bad ctype","Corrupt PNG"); + comp = stbi__get8(s); if (comp) return stbi__err("bad comp method","Corrupt PNG"); + filter= stbi__get8(s); if (filter) return stbi__err("bad filter method","Corrupt PNG"); + interlace = stbi__get8(s); if (interlace>1) return stbi__err("bad interlace method","Corrupt PNG"); + if (!s->img_x || !s->img_y) return stbi__err("0-pixel image","Corrupt PNG"); + if (!pal_img_n) { + s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0); + if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode"); + } else { + // if paletted, then pal_n is our final components, and + // img_n is # components to decompress/filter. 
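+         // (a paletted PNG stores one palette index per pixel, so only a single
+         //  channel is filtered here; expansion to pal_img_n components happens
+         //  later in stbi__expand_png_palette)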
+ s->img_n = 1; + if ((1 << 30) / s->img_x / 4 < s->img_y) return stbi__err("too large","Corrupt PNG"); + } + // even with SCAN_header, have to scan to see if we have a tRNS + break; + } + + case STBI__PNG_TYPE('P','L','T','E'): { + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if (c.length > 256*3) return stbi__err("invalid PLTE","Corrupt PNG"); + pal_len = c.length / 3; + if (pal_len * 3 != c.length) return stbi__err("invalid PLTE","Corrupt PNG"); + for (i=0; i < pal_len; ++i) { + palette[i*4+0] = stbi__get8(s); + palette[i*4+1] = stbi__get8(s); + palette[i*4+2] = stbi__get8(s); + palette[i*4+3] = 255; + } + break; + } + + case STBI__PNG_TYPE('t','R','N','S'): { + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if (z->idata) return stbi__err("tRNS after IDAT","Corrupt PNG"); + if (pal_img_n) { + if (scan == STBI__SCAN_header) { s->img_n = 4; return 1; } + if (pal_len == 0) return stbi__err("tRNS before PLTE","Corrupt PNG"); + if (c.length > pal_len) return stbi__err("bad tRNS len","Corrupt PNG"); + pal_img_n = 4; + for (i=0; i < c.length; ++i) + palette[i*4+3] = stbi__get8(s); + } else { + if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG"); + if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG"); + has_trans = 1; + // non-paletted with tRNS = constant alpha. if header-scanning, we can stop now. + if (scan == STBI__SCAN_header) { ++s->img_n; return 1; } + if (z->depth == 16) { + for (k = 0; k < s->img_n && k < 3; ++k) // extra loop test to suppress false GCC warning + tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is + } else { + for (k = 0; k < s->img_n && k < 3; ++k) + tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger + } + } + break; + } + + case STBI__PNG_TYPE('I','D','A','T'): { + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if (pal_img_n && !pal_len) return stbi__err("no PLTE","Corrupt PNG"); + if (scan == STBI__SCAN_header) { + // header scan definitely stops at first IDAT + if (pal_img_n) + s->img_n = pal_img_n; + return 1; + } + if (c.length > (1u << 30)) return stbi__err("IDAT size limit", "IDAT section larger than 2^30 bytes"); + if ((int)(ioff + c.length) < (int)ioff) return 0; + if (ioff + c.length > idata_limit) { + stbi__uint32 idata_limit_old = idata_limit; + stbi_uc *p; + if (idata_limit == 0) idata_limit = c.length > 4096 ? 
c.length : 4096; + while (ioff + c.length > idata_limit) + idata_limit *= 2; + STBI_NOTUSED(idata_limit_old); + p = (stbi_uc *) STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); if (p == NULL) return stbi__err("outofmem", "Out of memory"); + z->idata = p; + } + if (!stbi__getn(s, z->idata+ioff,c.length)) return stbi__err("outofdata","Corrupt PNG"); + ioff += c.length; + break; + } + + case STBI__PNG_TYPE('I','E','N','D'): { + stbi__uint32 raw_len, bpl; + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if (scan != STBI__SCAN_load) return 1; + if (z->idata == NULL) return stbi__err("no IDAT","Corrupt PNG"); + // initial guess for decoded data size to avoid unnecessary reallocs + bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component + raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */; + z->expanded = (stbi_uc *) stbi_zlib_decode_malloc_guesssize_headerflag((char *) z->idata, ioff, raw_len, (int *) &raw_len, !is_iphone); + if (z->expanded == NULL) return 0; // zlib should set error + STBI_FREE(z->idata); z->idata = NULL; + if ((req_comp == s->img_n+1 && req_comp != 3 && !pal_img_n) || has_trans) + s->img_out_n = s->img_n+1; + else + s->img_out_n = s->img_n; + if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) return 0; + if (has_trans) { + if (z->depth == 16) { + if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) return 0; + } else { + if (!stbi__compute_transparency(z, tc, s->img_out_n)) return 0; + } + } + if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2) + stbi__de_iphone(z); + if (pal_img_n) { + // pal_img_n == 3 or 4 + s->img_n = pal_img_n; // record the actual colors we had + s->img_out_n = pal_img_n; + if (req_comp >= 3) s->img_out_n = req_comp; + if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n)) + return 0; + } else if (has_trans) { + // non-paletted image with tRNS -> source image has (constant) alpha + ++s->img_n; + } + STBI_FREE(z->expanded); z->expanded = NULL; + // end of PNG chunk, read and skip CRC + stbi__get32be(s); + return 1; + } + + default: + // if critical, fail + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if ((c.type & (1 << 29)) == 0) { + #ifndef STBI_NO_FAILURE_STRINGS + // not threadsafe + static char invalid_chunk[] = "XXXX PNG chunk not known"; + invalid_chunk[0] = STBI__BYTECAST(c.type >> 24); + invalid_chunk[1] = STBI__BYTECAST(c.type >> 16); + invalid_chunk[2] = STBI__BYTECAST(c.type >> 8); + invalid_chunk[3] = STBI__BYTECAST(c.type >> 0); + #endif + return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type"); + } + stbi__skip(s, c.length); + break; + } + // end of PNG chunk, read and skip CRC + stbi__get32be(s); + } +} + +static void *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp, stbi__result_info *ri) +{ + void *result=NULL; + if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error"); + if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) { + if (p->depth <= 8) + ri->bits_per_channel = 8; + else if (p->depth == 16) + ri->bits_per_channel = 16; + else + return stbi__errpuc("bad bits_per_channel", "PNG not supported: unsupported color depth"); + result = p->out; + p->out = NULL; + if (req_comp && req_comp != p->s->img_out_n) { + if (ri->bits_per_channel == 8) + result = stbi__convert_format((unsigned char *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); + else + result = stbi__convert_format16((stbi__uint16 
*) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); + p->s->img_out_n = req_comp; + if (result == NULL) return result; + } + *x = p->s->img_x; + *y = p->s->img_y; + if (n) *n = p->s->img_n; + } + STBI_FREE(p->out); p->out = NULL; + STBI_FREE(p->expanded); p->expanded = NULL; + STBI_FREE(p->idata); p->idata = NULL; + + return result; +} + +static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + stbi__png p; + p.s = s; + return stbi__do_png(&p, x,y,comp,req_comp, ri); +} + +static int stbi__png_test(stbi__context *s) +{ + int r; + r = stbi__check_png_header(s); + stbi__rewind(s); + return r; +} + +static int stbi__png_info_raw(stbi__png *p, int *x, int *y, int *comp) +{ + if (!stbi__parse_png_file(p, STBI__SCAN_header, 0)) { + stbi__rewind( p->s ); + return 0; + } + if (x) *x = p->s->img_x; + if (y) *y = p->s->img_y; + if (comp) *comp = p->s->img_n; + return 1; +} + +static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp) +{ + stbi__png p; + p.s = s; + return stbi__png_info_raw(&p, x, y, comp); +} + +static int stbi__png_is16(stbi__context *s) +{ + stbi__png p; + p.s = s; + if (!stbi__png_info_raw(&p, NULL, NULL, NULL)) + return 0; + if (p.depth != 16) { + stbi__rewind(p.s); + return 0; + } + return 1; +} +#endif + +// Microsoft/Windows BMP image + +#ifndef STBI_NO_BMP +static int stbi__bmp_test_raw(stbi__context *s) +{ + int r; + int sz; + if (stbi__get8(s) != 'B') return 0; + if (stbi__get8(s) != 'M') return 0; + stbi__get32le(s); // discard filesize + stbi__get16le(s); // discard reserved + stbi__get16le(s); // discard reserved + stbi__get32le(s); // discard data offset + sz = stbi__get32le(s); + r = (sz == 12 || sz == 40 || sz == 56 || sz == 108 || sz == 124); + return r; +} + +static int stbi__bmp_test(stbi__context *s) +{ + int r = stbi__bmp_test_raw(s); + stbi__rewind(s); + return r; +} + + +// returns 0..31 for the highest set bit +static int stbi__high_bit(unsigned int z) +{ + int n=0; + if (z == 0) return -1; + if (z >= 0x10000) { n += 16; z >>= 16; } + if (z >= 0x00100) { n += 8; z >>= 8; } + if (z >= 0x00010) { n += 4; z >>= 4; } + if (z >= 0x00004) { n += 2; z >>= 2; } + if (z >= 0x00002) { n += 1;/* >>= 1;*/ } + return n; +} + +static int stbi__bitcount(unsigned int a) +{ + a = (a & 0x55555555) + ((a >> 1) & 0x55555555); // max 2 + a = (a & 0x33333333) + ((a >> 2) & 0x33333333); // max 4 + a = (a + (a >> 4)) & 0x0f0f0f0f; // max 8 per 4, now 8 bits + a = (a + (a >> 8)); // max 16 per 8 bits + a = (a + (a >> 16)); // max 32 per 8 bits + return a & 0xff; +} + +// extract an arbitrarily-aligned N-bit value (N=bits) +// from v, and then make it 8-bits long and fractionally +// extend it to full full range. 
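+// (e.g. for a 5-bit field: after the shifts v is 0..31, and v*0x21 = (v<<5)|v,
+//  so (v*0x21)>>2 maps 0 -> 0 and 31 -> 255, i.e. the top bits are replicated
+//  to fill the 8-bit range, matching v*255/31 to within rounding)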
+static int stbi__shiftsigned(unsigned int v, int shift, int bits) +{ + static unsigned int mul_table[9] = { + 0, + 0xff/*0b11111111*/, 0x55/*0b01010101*/, 0x49/*0b01001001*/, 0x11/*0b00010001*/, + 0x21/*0b00100001*/, 0x41/*0b01000001*/, 0x81/*0b10000001*/, 0x01/*0b00000001*/, + }; + static unsigned int shift_table[9] = { + 0, 0,0,1,0,2,4,6,0, + }; + if (shift < 0) + v <<= -shift; + else + v >>= shift; + STBI_ASSERT(v < 256); + v >>= (8-bits); + STBI_ASSERT(bits >= 0 && bits <= 8); + return (int) ((unsigned) v * mul_table[bits]) >> shift_table[bits]; +} + +typedef struct +{ + int bpp, offset, hsz; + unsigned int mr,mg,mb,ma, all_a; + int extra_read; +} stbi__bmp_data; + +static int stbi__bmp_set_mask_defaults(stbi__bmp_data *info, int compress) +{ + // BI_BITFIELDS specifies masks explicitly, don't override + if (compress == 3) + return 1; + + if (compress == 0) { + if (info->bpp == 16) { + info->mr = 31u << 10; + info->mg = 31u << 5; + info->mb = 31u << 0; + } else if (info->bpp == 32) { + info->mr = 0xffu << 16; + info->mg = 0xffu << 8; + info->mb = 0xffu << 0; + info->ma = 0xffu << 24; + info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0 + } else { + // otherwise, use defaults, which is all-0 + info->mr = info->mg = info->mb = info->ma = 0; + } + return 1; + } + return 0; // error +} + +static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info) +{ + int hsz; + if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') return stbi__errpuc("not BMP", "Corrupt BMP"); + stbi__get32le(s); // discard filesize + stbi__get16le(s); // discard reserved + stbi__get16le(s); // discard reserved + info->offset = stbi__get32le(s); + info->hsz = hsz = stbi__get32le(s); + info->mr = info->mg = info->mb = info->ma = 0; + info->extra_read = 14; + + if (info->offset < 0) return stbi__errpuc("bad BMP", "bad BMP"); + + if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) return stbi__errpuc("unknown BMP", "BMP type not supported: unknown"); + if (hsz == 12) { + s->img_x = stbi__get16le(s); + s->img_y = stbi__get16le(s); + } else { + s->img_x = stbi__get32le(s); + s->img_y = stbi__get32le(s); + } + if (stbi__get16le(s) != 1) return stbi__errpuc("bad BMP", "bad BMP"); + info->bpp = stbi__get16le(s); + if (hsz != 12) { + int compress = stbi__get32le(s); + if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE"); + if (compress >= 4) return stbi__errpuc("BMP JPEG/PNG", "BMP type not supported: unsupported compression"); // this includes PNG/JPEG modes + if (compress == 3 && info->bpp != 16 && info->bpp != 32) return stbi__errpuc("bad BMP", "bad BMP"); // bitfields requires 16 or 32 bits/pixel + stbi__get32le(s); // discard sizeof + stbi__get32le(s); // discard hres + stbi__get32le(s); // discard vres + stbi__get32le(s); // discard colorsused + stbi__get32le(s); // discard max important + if (hsz == 40 || hsz == 56) { + if (hsz == 56) { + stbi__get32le(s); + stbi__get32le(s); + stbi__get32le(s); + stbi__get32le(s); + } + if (info->bpp == 16 || info->bpp == 32) { + if (compress == 0) { + stbi__bmp_set_mask_defaults(info, compress); + } else if (compress == 3) { + info->mr = stbi__get32le(s); + info->mg = stbi__get32le(s); + info->mb = stbi__get32le(s); + info->extra_read += 12; + // not documented, but generated by photoshop and handled by mspaint + if (info->mr == info->mg && info->mg == info->mb) { + // ?!?!? 
+ return stbi__errpuc("bad BMP", "bad BMP"); + } + } else + return stbi__errpuc("bad BMP", "bad BMP"); + } + } else { + // V4/V5 header + int i; + if (hsz != 108 && hsz != 124) + return stbi__errpuc("bad BMP", "bad BMP"); + info->mr = stbi__get32le(s); + info->mg = stbi__get32le(s); + info->mb = stbi__get32le(s); + info->ma = stbi__get32le(s); + if (compress != 3) // override mr/mg/mb unless in BI_BITFIELDS mode, as per docs + stbi__bmp_set_mask_defaults(info, compress); + stbi__get32le(s); // discard color space + for (i=0; i < 12; ++i) + stbi__get32le(s); // discard color space parameters + if (hsz == 124) { + stbi__get32le(s); // discard rendering intent + stbi__get32le(s); // discard offset of profile data + stbi__get32le(s); // discard size of profile data + stbi__get32le(s); // discard reserved + } + } + } + return (void *) 1; +} + + +static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + stbi_uc *out; + unsigned int mr=0,mg=0,mb=0,ma=0, all_a; + stbi_uc pal[256][4]; + int psize=0,i,j,width; + int flip_vertically, pad, target; + stbi__bmp_data info; + STBI_NOTUSED(ri); + + info.all_a = 255; + if (stbi__bmp_parse_header(s, &info) == NULL) + return NULL; // error code already set + + flip_vertically = ((int) s->img_y) > 0; + s->img_y = abs((int) s->img_y); + + if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + + mr = info.mr; + mg = info.mg; + mb = info.mb; + ma = info.ma; + all_a = info.all_a; + + if (info.hsz == 12) { + if (info.bpp < 24) + psize = (info.offset - info.extra_read - 24) / 3; + } else { + if (info.bpp < 16) + psize = (info.offset - info.extra_read - info.hsz) >> 2; + } + if (psize == 0) { + // accept some number of extra bytes after the header, but if the offset points either to before + // the header ends or implies a large amount of extra data, reject the file as malformed + int bytes_read_so_far = s->callback_already_read + (int)(s->img_buffer - s->img_buffer_original); + int header_limit = 1024; // max we actually read is below 256 bytes currently. + int extra_data_limit = 256*4; // what ordinarily goes here is a palette; 256 entries*4 bytes is its max size. + if (bytes_read_so_far <= 0 || bytes_read_so_far > header_limit) { + return stbi__errpuc("bad header", "Corrupt BMP"); + } + // we established that bytes_read_so_far is positive and sensible. + // the first half of this test rejects offsets that are either too small positives, or + // negative, and guarantees that info.offset >= bytes_read_so_far > 0. this in turn + // ensures the number computed in the second half of the test can't overflow. + if (info.offset < bytes_read_so_far || info.offset - bytes_read_so_far > extra_data_limit) { + return stbi__errpuc("bad offset", "Corrupt BMP"); + } else { + stbi__skip(s, info.offset - bytes_read_so_far); + } + } + + if (info.bpp == 24 && ma == 0xff000000) + s->img_n = 3; + else + s->img_n = ma ? 
4 : 3; + if (req_comp && req_comp >= 3) // we can directly decode 3 or 4 + target = req_comp; + else + target = s->img_n; // if they want monochrome, we'll post-convert + + // sanity-check size + if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0)) + return stbi__errpuc("too large", "Corrupt BMP"); + + out = (stbi_uc *) stbi__malloc_mad3(target, s->img_x, s->img_y, 0); + if (!out) return stbi__errpuc("outofmem", "Out of memory"); + if (info.bpp < 16) { + int z=0; + if (psize == 0 || psize > 256) { STBI_FREE(out); return stbi__errpuc("invalid", "Corrupt BMP"); } + for (i=0; i < psize; ++i) { + pal[i][2] = stbi__get8(s); + pal[i][1] = stbi__get8(s); + pal[i][0] = stbi__get8(s); + if (info.hsz != 12) stbi__get8(s); + pal[i][3] = 255; + } + stbi__skip(s, info.offset - info.extra_read - info.hsz - psize * (info.hsz == 12 ? 3 : 4)); + if (info.bpp == 1) width = (s->img_x + 7) >> 3; + else if (info.bpp == 4) width = (s->img_x + 1) >> 1; + else if (info.bpp == 8) width = s->img_x; + else { STBI_FREE(out); return stbi__errpuc("bad bpp", "Corrupt BMP"); } + pad = (-width)&3; + if (info.bpp == 1) { + for (j=0; j < (int) s->img_y; ++j) { + int bit_offset = 7, v = stbi__get8(s); + for (i=0; i < (int) s->img_x; ++i) { + int color = (v>>bit_offset)&0x1; + out[z++] = pal[color][0]; + out[z++] = pal[color][1]; + out[z++] = pal[color][2]; + if (target == 4) out[z++] = 255; + if (i+1 == (int) s->img_x) break; + if((--bit_offset) < 0) { + bit_offset = 7; + v = stbi__get8(s); + } + } + stbi__skip(s, pad); + } + } else { + for (j=0; j < (int) s->img_y; ++j) { + for (i=0; i < (int) s->img_x; i += 2) { + int v=stbi__get8(s),v2=0; + if (info.bpp == 4) { + v2 = v & 15; + v >>= 4; + } + out[z++] = pal[v][0]; + out[z++] = pal[v][1]; + out[z++] = pal[v][2]; + if (target == 4) out[z++] = 255; + if (i+1 == (int) s->img_x) break; + v = (info.bpp == 8) ? stbi__get8(s) : v2; + out[z++] = pal[v][0]; + out[z++] = pal[v][1]; + out[z++] = pal[v][2]; + if (target == 4) out[z++] = 255; + } + stbi__skip(s, pad); + } + } + } else { + int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0; + int z = 0; + int easy=0; + stbi__skip(s, info.offset - info.extra_read - info.hsz); + if (info.bpp == 24) width = 3 * s->img_x; + else if (info.bpp == 16) width = 2*s->img_x; + else /* bpp = 32 and pad = 0 */ width=0; + pad = (-width) & 3; + if (info.bpp == 24) { + easy = 1; + } else if (info.bpp == 32) { + if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000) + easy = 2; + } + if (!easy) { + if (!mr || !mg || !mb) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); } + // right shift amt to put high bit in position #7 + rshift = stbi__high_bit(mr)-7; rcount = stbi__bitcount(mr); + gshift = stbi__high_bit(mg)-7; gcount = stbi__bitcount(mg); + bshift = stbi__high_bit(mb)-7; bcount = stbi__bitcount(mb); + ashift = stbi__high_bit(ma)-7; acount = stbi__bitcount(ma); + if (rcount > 8 || gcount > 8 || bcount > 8 || acount > 8) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); } + } + for (j=0; j < (int) s->img_y; ++j) { + if (easy) { + for (i=0; i < (int) s->img_x; ++i) { + unsigned char a; + out[z+2] = stbi__get8(s); + out[z+1] = stbi__get8(s); + out[z+0] = stbi__get8(s); + z += 3; + a = (easy == 2 ? stbi__get8(s) : 255); + all_a |= a; + if (target == 4) out[z++] = a; + } + } else { + int bpp = info.bpp; + for (i=0; i < (int) s->img_x; ++i) { + stbi__uint32 v = (bpp == 16 ? 
(stbi__uint32) stbi__get16le(s) : stbi__get32le(s)); + unsigned int a; + out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount)); + out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount)); + out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount)); + a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255); + all_a |= a; + if (target == 4) out[z++] = STBI__BYTECAST(a); + } + } + stbi__skip(s, pad); + } + } + + // if alpha channel is all 0s, replace with all 255s + if (target == 4 && all_a == 0) + for (i=4*s->img_x*s->img_y-1; i >= 0; i -= 4) + out[i] = 255; + + if (flip_vertically) { + stbi_uc t; + for (j=0; j < (int) s->img_y>>1; ++j) { + stbi_uc *p1 = out + j *s->img_x*target; + stbi_uc *p2 = out + (s->img_y-1-j)*s->img_x*target; + for (i=0; i < (int) s->img_x*target; ++i) { + t = p1[i]; p1[i] = p2[i]; p2[i] = t; + } + } + } + + if (req_comp && req_comp != target) { + out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y); + if (out == NULL) return out; // stbi__convert_format frees input on failure + } + + *x = s->img_x; + *y = s->img_y; + if (comp) *comp = s->img_n; + return out; +} +#endif + +// Targa Truevision - TGA +// by Jonathan Dummer +#ifndef STBI_NO_TGA +// returns STBI_rgb or whatever, 0 on error +static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int* is_rgb16) +{ + // only RGB or RGBA (incl. 16bit) or grey allowed + if (is_rgb16) *is_rgb16 = 0; + switch(bits_per_pixel) { + case 8: return STBI_grey; + case 16: if(is_grey) return STBI_grey_alpha; + // fallthrough + case 15: if(is_rgb16) *is_rgb16 = 1; + return STBI_rgb; + case 24: // fallthrough + case 32: return bits_per_pixel/8; + default: return 0; + } +} + +static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp) +{ + int tga_w, tga_h, tga_comp, tga_image_type, tga_bits_per_pixel, tga_colormap_bpp; + int sz, tga_colormap_type; + stbi__get8(s); // discard Offset + tga_colormap_type = stbi__get8(s); // colormap type + if( tga_colormap_type > 1 ) { + stbi__rewind(s); + return 0; // only RGB or indexed allowed + } + tga_image_type = stbi__get8(s); // image type + if ( tga_colormap_type == 1 ) { // colormapped (paletted) image + if (tga_image_type != 1 && tga_image_type != 9) { + stbi__rewind(s); + return 0; + } + stbi__skip(s,4); // skip index of first colormap entry and number of entries + sz = stbi__get8(s); // check bits per palette color entry + if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) { + stbi__rewind(s); + return 0; + } + stbi__skip(s,4); // skip image x and y origin + tga_colormap_bpp = sz; + } else { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE + if ( (tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11) ) { + stbi__rewind(s); + return 0; // only RGB or grey allowed, +/- RLE + } + stbi__skip(s,9); // skip colormap specification and image x/y origin + tga_colormap_bpp = 0; + } + tga_w = stbi__get16le(s); + if( tga_w < 1 ) { + stbi__rewind(s); + return 0; // test width + } + tga_h = stbi__get16le(s); + if( tga_h < 1 ) { + stbi__rewind(s); + return 0; // test height + } + tga_bits_per_pixel = stbi__get8(s); // bits per pixel + stbi__get8(s); // ignore alpha bits + if (tga_colormap_bpp != 0) { + if((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) { + // when using a colormap, tga_bits_per_pixel is the size of the indexes + // I don't think anything but 8 or 16bit indexes makes sense + stbi__rewind(s); + return 0; + } + tga_comp = 
stbi__tga_get_comp(tga_colormap_bpp, 0, NULL); + } else { + tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3) || (tga_image_type == 11), NULL); + } + if(!tga_comp) { + stbi__rewind(s); + return 0; + } + if (x) *x = tga_w; + if (y) *y = tga_h; + if (comp) *comp = tga_comp; + return 1; // seems to have passed everything +} + +static int stbi__tga_test(stbi__context *s) +{ + int res = 0; + int sz, tga_color_type; + stbi__get8(s); // discard Offset + tga_color_type = stbi__get8(s); // color type + if ( tga_color_type > 1 ) goto errorEnd; // only RGB or indexed allowed + sz = stbi__get8(s); // image type + if ( tga_color_type == 1 ) { // colormapped (paletted) image + if (sz != 1 && sz != 9) goto errorEnd; // colortype 1 demands image type 1 or 9 + stbi__skip(s,4); // skip index of first colormap entry and number of entries + sz = stbi__get8(s); // check bits per palette color entry + if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd; + stbi__skip(s,4); // skip image x and y origin + } else { // "normal" image w/o colormap + if ( (sz != 2) && (sz != 3) && (sz != 10) && (sz != 11) ) goto errorEnd; // only RGB or grey allowed, +/- RLE + stbi__skip(s,9); // skip colormap specification and image x/y origin + } + if ( stbi__get16le(s) < 1 ) goto errorEnd; // test width + if ( stbi__get16le(s) < 1 ) goto errorEnd; // test height + sz = stbi__get8(s); // bits per pixel + if ( (tga_color_type == 1) && (sz != 8) && (sz != 16) ) goto errorEnd; // for colormapped images, bpp is size of an index + if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd; + + res = 1; // if we got this far, everything's good and we can return 1 instead of 0 + +errorEnd: + stbi__rewind(s); + return res; +} + +// read 16bit value and convert to 24bit RGB +static void stbi__tga_read_rgb16(stbi__context *s, stbi_uc* out) +{ + stbi__uint16 px = (stbi__uint16)stbi__get16le(s); + stbi__uint16 fiveBitMask = 31; + // we have 3 channels with 5bits each + int r = (px >> 10) & fiveBitMask; + int g = (px >> 5) & fiveBitMask; + int b = px & fiveBitMask; + // Note that this saves the data in RGB(A) order, so it doesn't need to be swapped later + out[0] = (stbi_uc)((r * 255)/31); + out[1] = (stbi_uc)((g * 255)/31); + out[2] = (stbi_uc)((b * 255)/31); + + // some people claim that the most significant bit might be used for alpha + // (possibly if an alpha-bit is set in the "image descriptor byte") + // but that only made 16bit test images completely translucent.. + // so let's treat all 15 and 16bit TGAs as RGB with no alpha. +} + +static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + // read in the TGA header stuff + int tga_offset = stbi__get8(s); + int tga_indexed = stbi__get8(s); + int tga_image_type = stbi__get8(s); + int tga_is_RLE = 0; + int tga_palette_start = stbi__get16le(s); + int tga_palette_len = stbi__get16le(s); + int tga_palette_bits = stbi__get8(s); + int tga_x_origin = stbi__get16le(s); + int tga_y_origin = stbi__get16le(s); + int tga_width = stbi__get16le(s); + int tga_height = stbi__get16le(s); + int tga_bits_per_pixel = stbi__get8(s); + int tga_comp, tga_rgb16=0; + int tga_inverted = stbi__get8(s); + // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused (useless?) 
+ // image data + unsigned char *tga_data; + unsigned char *tga_palette = NULL; + int i, j; + unsigned char raw_data[4] = {0}; + int RLE_count = 0; + int RLE_repeating = 0; + int read_next_pixel = 1; + STBI_NOTUSED(ri); + STBI_NOTUSED(tga_x_origin); // @TODO + STBI_NOTUSED(tga_y_origin); // @TODO + + if (tga_height > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (tga_width > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + + // do a tiny bit of precessing + if ( tga_image_type >= 8 ) + { + tga_image_type -= 8; + tga_is_RLE = 1; + } + tga_inverted = 1 - ((tga_inverted >> 5) & 1); + + // If I'm paletted, then I'll use the number of bits from the palette + if ( tga_indexed ) tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16); + else tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16); + + if(!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency + return stbi__errpuc("bad format", "Can't find out TGA pixelformat"); + + // tga info + *x = tga_width; + *y = tga_height; + if (comp) *comp = tga_comp; + + if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0)) + return stbi__errpuc("too large", "Corrupt TGA"); + + tga_data = (unsigned char*)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0); + if (!tga_data) return stbi__errpuc("outofmem", "Out of memory"); + + // skip to the data's starting position (offset usually = 0) + stbi__skip(s, tga_offset ); + + if ( !tga_indexed && !tga_is_RLE && !tga_rgb16 ) { + for (i=0; i < tga_height; ++i) { + int row = tga_inverted ? tga_height -i - 1 : i; + stbi_uc *tga_row = tga_data + row*tga_width*tga_comp; + stbi__getn(s, tga_row, tga_width * tga_comp); + } + } else { + // do I need to load a palette? + if ( tga_indexed) + { + if (tga_palette_len == 0) { /* you have to have at least one entry! */ + STBI_FREE(tga_data); + return stbi__errpuc("bad palette", "Corrupt TGA"); + } + + // any data to skip? (offset usually = 0) + stbi__skip(s, tga_palette_start ); + // load the palette + tga_palette = (unsigned char*)stbi__malloc_mad2(tga_palette_len, tga_comp, 0); + if (!tga_palette) { + STBI_FREE(tga_data); + return stbi__errpuc("outofmem", "Out of memory"); + } + if (tga_rgb16) { + stbi_uc *pal_entry = tga_palette; + STBI_ASSERT(tga_comp == STBI_rgb); + for (i=0; i < tga_palette_len; ++i) { + stbi__tga_read_rgb16(s, pal_entry); + pal_entry += tga_comp; + } + } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) { + STBI_FREE(tga_data); + STBI_FREE(tga_palette); + return stbi__errpuc("bad palette", "Corrupt TGA"); + } + } + // load the data + for (i=0; i < tga_width * tga_height; ++i) + { + // if I'm in RLE mode, do I need to get a RLE stbi__pngchunk? + if ( tga_is_RLE ) + { + if ( RLE_count == 0 ) + { + // yep, get the next byte as a RLE command + int RLE_cmd = stbi__get8(s); + RLE_count = 1 + (RLE_cmd & 127); + RLE_repeating = RLE_cmd >> 7; + read_next_pixel = 1; + } else if ( !RLE_repeating ) + { + read_next_pixel = 1; + } + } else + { + read_next_pixel = 1; + } + // OK, if I need to read a pixel, do it now + if ( read_next_pixel ) + { + // load however much data we did have + if ( tga_indexed ) + { + // read in index, then perform the lookup + int pal_idx = (tga_bits_per_pixel == 8) ? 
stbi__get8(s) : stbi__get16le(s); + if ( pal_idx >= tga_palette_len ) { + // invalid index + pal_idx = 0; + } + pal_idx *= tga_comp; + for (j = 0; j < tga_comp; ++j) { + raw_data[j] = tga_palette[pal_idx+j]; + } + } else if(tga_rgb16) { + STBI_ASSERT(tga_comp == STBI_rgb); + stbi__tga_read_rgb16(s, raw_data); + } else { + // read in the data raw + for (j = 0; j < tga_comp; ++j) { + raw_data[j] = stbi__get8(s); + } + } + // clear the reading flag for the next pixel + read_next_pixel = 0; + } // end of reading a pixel + + // copy data + for (j = 0; j < tga_comp; ++j) + tga_data[i*tga_comp+j] = raw_data[j]; + + // in case we're in RLE mode, keep counting down + --RLE_count; + } + // do I need to invert the image? + if ( tga_inverted ) + { + for (j = 0; j*2 < tga_height; ++j) + { + int index1 = j * tga_width * tga_comp; + int index2 = (tga_height - 1 - j) * tga_width * tga_comp; + for (i = tga_width * tga_comp; i > 0; --i) + { + unsigned char temp = tga_data[index1]; + tga_data[index1] = tga_data[index2]; + tga_data[index2] = temp; + ++index1; + ++index2; + } + } + } + // clear my palette, if I had one + if ( tga_palette != NULL ) + { + STBI_FREE( tga_palette ); + } + } + + // swap RGB - if the source data was RGB16, it already is in the right order + if (tga_comp >= 3 && !tga_rgb16) + { + unsigned char* tga_pixel = tga_data; + for (i=0; i < tga_width * tga_height; ++i) + { + unsigned char temp = tga_pixel[0]; + tga_pixel[0] = tga_pixel[2]; + tga_pixel[2] = temp; + tga_pixel += tga_comp; + } + } + + // convert to target component count + if (req_comp && req_comp != tga_comp) + tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width, tga_height); + + // the things I do to get rid of an error message, and yet keep + // Microsoft's C compilers happy... [8^( + tga_palette_start = tga_palette_len = tga_palette_bits = + tga_x_origin = tga_y_origin = 0; + STBI_NOTUSED(tga_palette_start); + // OK, done + return tga_data; +} +#endif + +// ************************************************************************************************* +// Photoshop PSD loader -- PD by Thatcher Ulrich, integration by Nicolas Schulz, tweaked by STB + +#ifndef STBI_NO_PSD +static int stbi__psd_test(stbi__context *s) +{ + int r = (stbi__get32be(s) == 0x38425053); + stbi__rewind(s); + return r; +} + +static int stbi__psd_decode_rle(stbi__context *s, stbi_uc *p, int pixelCount) +{ + int count, nleft, len; + + count = 0; + while ((nleft = pixelCount - count) > 0) { + len = stbi__get8(s); + if (len == 128) { + // No-op. + } else if (len < 128) { + // Copy next len+1 bytes literally. + len++; + if (len > nleft) return 0; // corrupt data + count += len; + while (len) { + *p = stbi__get8(s); + p += 4; + len--; + } + } else if (len > 128) { + stbi_uc val; + // Next -len+1 bytes in the dest are replicated from next source byte. + // (Interpret len as a negative 8-bit int.) + len = 257 - len; + if (len > nleft) return 0; // corrupt data + val = stbi__get8(s); + count += len; + while (len) { + *p = val; + p += 4; + len--; + } + } + } + + return 1; +} + +static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc) +{ + int pixelCount; + int channelCount, compression; + int channel, i; + int bitdepth; + int w,h; + stbi_uc *out; + STBI_NOTUSED(ri); + + // Check identifier + if (stbi__get32be(s) != 0x38425053) // "8BPS" + return stbi__errpuc("not PSD", "Corrupt PSD image"); + + // Check file type version. 
+ if (stbi__get16be(s) != 1) + return stbi__errpuc("wrong version", "Unsupported version of PSD image"); + + // Skip 6 reserved bytes. + stbi__skip(s, 6 ); + + // Read the number of channels (R, G, B, A, etc). + channelCount = stbi__get16be(s); + if (channelCount < 0 || channelCount > 16) + return stbi__errpuc("wrong channel count", "Unsupported number of channels in PSD image"); + + // Read the rows and columns of the image. + h = stbi__get32be(s); + w = stbi__get32be(s); + + if (h > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (w > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + + // Make sure the depth is 8 bits. + bitdepth = stbi__get16be(s); + if (bitdepth != 8 && bitdepth != 16) + return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit"); + + // Make sure the color mode is RGB. + // Valid options are: + // 0: Bitmap + // 1: Grayscale + // 2: Indexed color + // 3: RGB color + // 4: CMYK color + // 7: Multichannel + // 8: Duotone + // 9: Lab color + if (stbi__get16be(s) != 3) + return stbi__errpuc("wrong color format", "PSD is not in RGB color format"); + + // Skip the Mode Data. (It's the palette for indexed color; other info for other modes.) + stbi__skip(s,stbi__get32be(s) ); + + // Skip the image resources. (resolution, pen tool paths, etc) + stbi__skip(s, stbi__get32be(s) ); + + // Skip the reserved data. + stbi__skip(s, stbi__get32be(s) ); + + // Find out if the data is compressed. + // Known values: + // 0: no compression + // 1: RLE compressed + compression = stbi__get16be(s); + if (compression > 1) + return stbi__errpuc("bad compression", "PSD has an unknown compression format"); + + // Check size + if (!stbi__mad3sizes_valid(4, w, h, 0)) + return stbi__errpuc("too large", "Corrupt PSD"); + + // Create the destination image. + + if (!compression && bitdepth == 16 && bpc == 16) { + out = (stbi_uc *) stbi__malloc_mad3(8, w, h, 0); + ri->bits_per_channel = 16; + } else + out = (stbi_uc *) stbi__malloc(4 * w*h); + + if (!out) return stbi__errpuc("outofmem", "Out of memory"); + pixelCount = w*h; + + // Initialize the data to zero. + //memset( out, 0, pixelCount * 4 ); + + // Finally, the image data. + if (compression) { + // RLE as used by .PSD and .TIFF + // Loop until you get the number of unpacked bytes you are expecting: + // Read the next source byte into n. + // If n is between 0 and 127 inclusive, copy the next n+1 bytes literally. + // Else if n is between -127 and -1 inclusive, copy the next byte -n+1 times. + // Else if n is 128, noop. + // Endloop + + // The RLE-compressed data is preceded by a 2-byte data count for each row in the data, + // which we're going to just skip. + stbi__skip(s, h * channelCount * 2 ); + + // Read the RLE data by channel. + for (channel = 0; channel < 4; channel++) { + stbi_uc *p; + + p = out+channel; + if (channel >= channelCount) { + // Fill this channel with default data. + for (i = 0; i < pixelCount; i++, p += 4) + *p = (channel == 3 ? 255 : 0); + } else { + // Read the RLE data. + if (!stbi__psd_decode_rle(s, p, pixelCount)) { + STBI_FREE(out); + return stbi__errpuc("corrupt", "bad RLE data"); + } + } + } + + } else { + // We're at the raw image data. It's each channel in order (Red, Green, Blue, Alpha, ...) + // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image. + + // Read the data by channel. 
+ for (channel = 0; channel < 4; channel++) { + if (channel >= channelCount) { + // Fill this channel with default data. + if (bitdepth == 16 && bpc == 16) { + stbi__uint16 *q = ((stbi__uint16 *) out) + channel; + stbi__uint16 val = channel == 3 ? 65535 : 0; + for (i = 0; i < pixelCount; i++, q += 4) + *q = val; + } else { + stbi_uc *p = out+channel; + stbi_uc val = channel == 3 ? 255 : 0; + for (i = 0; i < pixelCount; i++, p += 4) + *p = val; + } + } else { + if (ri->bits_per_channel == 16) { // output bpc + stbi__uint16 *q = ((stbi__uint16 *) out) + channel; + for (i = 0; i < pixelCount; i++, q += 4) + *q = (stbi__uint16) stbi__get16be(s); + } else { + stbi_uc *p = out+channel; + if (bitdepth == 16) { // input bpc + for (i = 0; i < pixelCount; i++, p += 4) + *p = (stbi_uc) (stbi__get16be(s) >> 8); + } else { + for (i = 0; i < pixelCount; i++, p += 4) + *p = stbi__get8(s); + } + } + } + } + } + + // remove weird white matte from PSD + if (channelCount >= 4) { + if (ri->bits_per_channel == 16) { + for (i=0; i < w*h; ++i) { + stbi__uint16 *pixel = (stbi__uint16 *) out + 4*i; + if (pixel[3] != 0 && pixel[3] != 65535) { + float a = pixel[3] / 65535.0f; + float ra = 1.0f / a; + float inv_a = 65535.0f * (1 - ra); + pixel[0] = (stbi__uint16) (pixel[0]*ra + inv_a); + pixel[1] = (stbi__uint16) (pixel[1]*ra + inv_a); + pixel[2] = (stbi__uint16) (pixel[2]*ra + inv_a); + } + } + } else { + for (i=0; i < w*h; ++i) { + unsigned char *pixel = out + 4*i; + if (pixel[3] != 0 && pixel[3] != 255) { + float a = pixel[3] / 255.0f; + float ra = 1.0f / a; + float inv_a = 255.0f * (1 - ra); + pixel[0] = (unsigned char) (pixel[0]*ra + inv_a); + pixel[1] = (unsigned char) (pixel[1]*ra + inv_a); + pixel[2] = (unsigned char) (pixel[2]*ra + inv_a); + } + } + } + } + + // convert to desired output format + if (req_comp && req_comp != 4) { + if (ri->bits_per_channel == 16) + out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, 4, req_comp, w, h); + else + out = stbi__convert_format(out, 4, req_comp, w, h); + if (out == NULL) return out; // stbi__convert_format frees input on failure + } + + if (comp) *comp = 4; + *y = h; + *x = w; + + return out; +} +#endif + +// ************************************************************************************************* +// Softimage PIC loader +// by Tom Seddon +// +// See http://softimage.wiki.softimage.com/index.php/INFO:_PIC_file_format +// See http://ozviz.wasp.uwa.edu.au/~pbourke/dataformats/softimagepic/ + +#ifndef STBI_NO_PIC +static int stbi__pic_is4(stbi__context *s,const char *str) +{ + int i; + for (i=0; i<4; ++i) + if (stbi__get8(s) != (stbi_uc)str[i]) + return 0; + + return 1; +} + +static int stbi__pic_test_core(stbi__context *s) +{ + int i; + + if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) + return 0; + + for(i=0;i<84;++i) + stbi__get8(s); + + if (!stbi__pic_is4(s,"PICT")) + return 0; + + return 1; +} + +typedef struct +{ + stbi_uc size,type,channel; +} stbi__pic_packet; + +static stbi_uc *stbi__readval(stbi__context *s, int channel, stbi_uc *dest) +{ + int mask=0x80, i; + + for (i=0; i<4; ++i, mask>>=1) { + if (channel & mask) { + if (stbi__at_eof(s)) return stbi__errpuc("bad file","PIC file too short"); + dest[i]=stbi__get8(s); + } + } + + return dest; +} + +static void stbi__copyval(int channel,stbi_uc *dest,const stbi_uc *src) +{ + int mask=0x80,i; + + for (i=0;i<4; ++i, mask>>=1) + if (channel&mask) + dest[i]=src[i]; +} + +static stbi_uc *stbi__pic_load_core(stbi__context *s,int width,int height,int *comp, stbi_uc *result) +{ + int 
act_comp=0,num_packets=0,y,chained;
+   stbi__pic_packet packets[10];
+
+   // this will (should...) cater for even some bizarre stuff like having data
+   // for the same channel in multiple packets.
+   do {
+      stbi__pic_packet *packet;
+
+      if (num_packets==sizeof(packets)/sizeof(packets[0]))
+         return stbi__errpuc("bad format","too many packets");
+
+      packet = &packets[num_packets++];
+
+      chained = stbi__get8(s);
+      packet->size    = stbi__get8(s);
+      packet->type    = stbi__get8(s);
+      packet->channel = stbi__get8(s);
+
+      act_comp |= packet->channel;
+
+      if (stbi__at_eof(s))    return stbi__errpuc("bad file","file too short (reading packets)");
+      if (packet->size != 8)  return stbi__errpuc("bad format","packet isn't 8bpp");
+   } while (chained);
+
+   *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel?
+
+   for(y=0; y<height; ++y) {
+      int packet_idx;
+
+      for(packet_idx=0; packet_idx < num_packets; ++packet_idx) {
+         stbi__pic_packet *packet = &packets[packet_idx];
+         stbi_uc *dest = result+y*width*4;
+
+         switch (packet->type) {
+            default:
+               return stbi__errpuc("bad format","packet has bad compression type");
+
+            case 0: {//uncompressed
+               int x;
+
+               for(x=0;x<width;++x, dest+=4)
+                  if (!stbi__readval(s,packet->channel,dest))
+                     return 0;
+               break;
+            }
+
+            case 1://Pure RLE
+               {
+                  int left=width, i;
+
+                  while (left>0) {
+                     stbi_uc count,value[4];
+
+                     count=stbi__get8(s);
+                     if (stbi__at_eof(s))   return stbi__errpuc("bad file","file too short (pure read count)");
+
+                     if (count > left)
+                        count = (stbi_uc) left;
+
+                     if (!stbi__readval(s,packet->channel,value))  return 0;
+
+                     for(i=0; i<count; ++i,dest+=4)
+                        stbi__copyval(packet->channel,dest,value);
+                     left -= count;
+                  }
+               }
+               break;
+
+            case 2: {//Mixed RLE
+               int left=width;
+               while (left>0) {
+                  int count = stbi__get8(s), i;
+                  if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (mixed read count)");
+
+                  if (count >= 128) { // Repeated
+                     stbi_uc value[4];
+
+                     if (count==128)
+                        count = stbi__get16be(s);
+                     else
+                        count -= 127;
+                     if (count > left)
+                        return stbi__errpuc("bad file","scanline overrun");
+
+                     if (!stbi__readval(s,packet->channel,value))
+                        return 0;
+
+                     for(i=0;i<count;++i, dest += 4)
+                        stbi__copyval(packet->channel,dest,value);
+                  } else { // Raw
+                     ++count;
+                     if (count>left) return stbi__errpuc("bad file","scanline overrun");
+
+                     for(i=0;i<count;++i, dest+=4)
+                        if (!stbi__readval(s,packet->channel,dest))
+                           return 0;
+                  }
+                  left-=count;
+               }
+               break;
+            }
+         }
+      }
+   }
+
+   return result;
+}
+
+static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_comp, stbi__result_info *ri)
+{
+   stbi_uc *result;
+   int i, x,y, internal_comp;
+   STBI_NOTUSED(ri);
+
+   if (!comp) comp = &internal_comp;
+
+   for (i=0; i<92; ++i)
+      stbi__get8(s);
+
+   x = stbi__get16be(s);
+   y = stbi__get16be(s);
+
+   if (y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+
+   if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (pic header)");
+   if (!stbi__mad3sizes_valid(x, y, 4, 0)) return stbi__errpuc("too large", "PIC image too large to decode");
+
+   stbi__get32be(s); //skip `ratio'
+   stbi__get16be(s); //skip `fields'
+   stbi__get16be(s); //skip `pad'
+
+   // intermediate buffer is RGBA
+   result = (stbi_uc *) stbi__malloc_mad3(x, y, 4, 0);
+   if (!result) return stbi__errpuc("outofmem", "Out of memory");
+   memset(result, 0xff, x*y*4);
+
+   if (!stbi__pic_load_core(s,x,y,comp, result)) {
+      STBI_FREE(result);
+      result=0;
+   }
+   *px = x;
+   *py = y;
+   if (req_comp == 0) req_comp = *comp;
+   result=stbi__convert_format(result,4,req_comp,x,y);
+
+   return result;
+}
+
+static int stbi__pic_test(stbi__context *s)
+{
+   int r = stbi__pic_test_core(s);
+   stbi__rewind(s);
+   return r;
+}
+#endif
+
+// *************************************************************************************************
+// GIF loader -- public domain by 
Jean-Marc Lienher -- simplified/shrunk by stb + +#ifndef STBI_NO_GIF +typedef struct +{ + stbi__int16 prefix; + stbi_uc first; + stbi_uc suffix; +} stbi__gif_lzw; + +typedef struct +{ + int w,h; + stbi_uc *out; // output buffer (always 4 components) + stbi_uc *background; // The current "background" as far as a gif is concerned + stbi_uc *history; + int flags, bgindex, ratio, transparent, eflags; + stbi_uc pal[256][4]; + stbi_uc lpal[256][4]; + stbi__gif_lzw codes[8192]; + stbi_uc *color_table; + int parse, step; + int lflags; + int start_x, start_y; + int max_x, max_y; + int cur_x, cur_y; + int line_size; + int delay; +} stbi__gif; + +static int stbi__gif_test_raw(stbi__context *s) +{ + int sz; + if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') return 0; + sz = stbi__get8(s); + if (sz != '9' && sz != '7') return 0; + if (stbi__get8(s) != 'a') return 0; + return 1; +} + +static int stbi__gif_test(stbi__context *s) +{ + int r = stbi__gif_test_raw(s); + stbi__rewind(s); + return r; +} + +static void stbi__gif_parse_colortable(stbi__context *s, stbi_uc pal[256][4], int num_entries, int transp) +{ + int i; + for (i=0; i < num_entries; ++i) { + pal[i][2] = stbi__get8(s); + pal[i][1] = stbi__get8(s); + pal[i][0] = stbi__get8(s); + pal[i][3] = transp == i ? 0 : 255; + } +} + +static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_info) +{ + stbi_uc version; + if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') + return stbi__err("not GIF", "Corrupt GIF"); + + version = stbi__get8(s); + if (version != '7' && version != '9') return stbi__err("not GIF", "Corrupt GIF"); + if (stbi__get8(s) != 'a') return stbi__err("not GIF", "Corrupt GIF"); + + stbi__g_failure_reason = ""; + g->w = stbi__get16le(s); + g->h = stbi__get16le(s); + g->flags = stbi__get8(s); + g->bgindex = stbi__get8(s); + g->ratio = stbi__get8(s); + g->transparent = -1; + + if (g->w > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + if (g->h > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + + if (comp != 0) *comp = 4; // can't actually tell whether it's 3 or 4 until we parse the comments + + if (is_info) return 1; + + if (g->flags & 0x80) + stbi__gif_parse_colortable(s,g->pal, 2 << (g->flags & 7), -1); + + return 1; +} + +static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp) +{ + stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif)); + if (!g) return stbi__err("outofmem", "Out of memory"); + if (!stbi__gif_header(s, g, comp, 1)) { + STBI_FREE(g); + stbi__rewind( s ); + return 0; + } + if (x) *x = g->w; + if (y) *y = g->h; + STBI_FREE(g); + return 1; +} + +static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code) +{ + stbi_uc *p, *c; + int idx; + + // recurse to decode the prefixes, since the linked-list is backwards, + // and working backwards through an interleaved image would be nasty + if (g->codes[code].prefix >= 0) + stbi__out_gif_code(g, g->codes[code].prefix); + + if (g->cur_y >= g->max_y) return; + + idx = g->cur_x + g->cur_y; + p = &g->out[idx]; + g->history[idx / 4] = 1; + + c = &g->color_table[g->codes[code].suffix * 4]; + if (c[3] > 128) { // don't render transparent pixels; + p[0] = c[2]; + p[1] = c[1]; + p[2] = c[0]; + p[3] = c[3]; + } + g->cur_x += 4; + + if (g->cur_x >= g->max_x) { + g->cur_x = g->start_x; + g->cur_y += g->step; + + while (g->cur_y >= g->max_y && g->parse > 0) { + g->step = (1 << 
g->parse) * g->line_size; + g->cur_y = g->start_y + (g->step >> 1); + --g->parse; + } + } +} + +static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g) +{ + stbi_uc lzw_cs; + stbi__int32 len, init_code; + stbi__uint32 first; + stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear; + stbi__gif_lzw *p; + + lzw_cs = stbi__get8(s); + if (lzw_cs > 12) return NULL; + clear = 1 << lzw_cs; + first = 1; + codesize = lzw_cs + 1; + codemask = (1 << codesize) - 1; + bits = 0; + valid_bits = 0; + for (init_code = 0; init_code < clear; init_code++) { + g->codes[init_code].prefix = -1; + g->codes[init_code].first = (stbi_uc) init_code; + g->codes[init_code].suffix = (stbi_uc) init_code; + } + + // support no starting clear code + avail = clear+2; + oldcode = -1; + + len = 0; + for(;;) { + if (valid_bits < codesize) { + if (len == 0) { + len = stbi__get8(s); // start new block + if (len == 0) + return g->out; + } + --len; + bits |= (stbi__int32) stbi__get8(s) << valid_bits; + valid_bits += 8; + } else { + stbi__int32 code = bits & codemask; + bits >>= codesize; + valid_bits -= codesize; + // @OPTIMIZE: is there some way we can accelerate the non-clear path? + if (code == clear) { // clear code + codesize = lzw_cs + 1; + codemask = (1 << codesize) - 1; + avail = clear + 2; + oldcode = -1; + first = 0; + } else if (code == clear + 1) { // end of stream code + stbi__skip(s, len); + while ((len = stbi__get8(s)) > 0) + stbi__skip(s,len); + return g->out; + } else if (code <= avail) { + if (first) { + return stbi__errpuc("no clear code", "Corrupt GIF"); + } + + if (oldcode >= 0) { + p = &g->codes[avail++]; + if (avail > 8192) { + return stbi__errpuc("too many codes", "Corrupt GIF"); + } + + p->prefix = (stbi__int16) oldcode; + p->first = g->codes[oldcode].first; + p->suffix = (code == avail) ? p->first : g->codes[code].first; + } else if (code == avail) + return stbi__errpuc("illegal code in raster", "Corrupt GIF"); + + stbi__out_gif_code(g, (stbi__uint16) code); + + if ((avail & codemask) == 0 && avail <= 0x0FFF) { + codesize++; + codemask = (1 << codesize) - 1; + } + + oldcode = code; + } else { + return stbi__errpuc("illegal code in raster", "Corrupt GIF"); + } + } + } +} + +// this function is designed to support animated gifs, although stb_image doesn't support it +// two back is the image from two frames ago, used for a very specific disposal format +static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp, stbi_uc *two_back) +{ + int dispose; + int first_frame; + int pi; + int pcount; + STBI_NOTUSED(req_comp); + + // on first frame, any non-written pixels get the background colour (non-transparent) + first_frame = 0; + if (g->out == 0) { + if (!stbi__gif_header(s, g, comp,0)) return 0; // stbi__g_failure_reason set by stbi__gif_header + if (!stbi__mad3sizes_valid(4, g->w, g->h, 0)) + return stbi__errpuc("too large", "GIF image is too large"); + pcount = g->w * g->h; + g->out = (stbi_uc *) stbi__malloc(4 * pcount); + g->background = (stbi_uc *) stbi__malloc(4 * pcount); + g->history = (stbi_uc *) stbi__malloc(pcount); + if (!g->out || !g->background || !g->history) + return stbi__errpuc("outofmem", "Out of memory"); + + // image is treated as "transparent" at the start - ie, nothing overwrites the current background; + // background colour is only used for pixels that are not rendered first frame, after that "background" + // color refers to the color that was there the previous frame. 
+ memset(g->out, 0x00, 4 * pcount); + memset(g->background, 0x00, 4 * pcount); // state of the background (starts transparent) + memset(g->history, 0x00, pcount); // pixels that were affected previous frame + first_frame = 1; + } else { + // second frame - how do we dispose of the previous one? + dispose = (g->eflags & 0x1C) >> 2; + pcount = g->w * g->h; + + if ((dispose == 3) && (two_back == 0)) { + dispose = 2; // if I don't have an image to revert back to, default to the old background + } + + if (dispose == 3) { // use previous graphic + for (pi = 0; pi < pcount; ++pi) { + if (g->history[pi]) { + memcpy( &g->out[pi * 4], &two_back[pi * 4], 4 ); + } + } + } else if (dispose == 2) { + // restore what was changed last frame to background before that frame; + for (pi = 0; pi < pcount; ++pi) { + if (g->history[pi]) { + memcpy( &g->out[pi * 4], &g->background[pi * 4], 4 ); + } + } + } else { + // This is a non-disposal case eithe way, so just + // leave the pixels as is, and they will become the new background + // 1: do not dispose + // 0: not specified. + } + + // background is what out is after the undoing of the previou frame; + memcpy( g->background, g->out, 4 * g->w * g->h ); + } + + // clear my history; + memset( g->history, 0x00, g->w * g->h ); // pixels that were affected previous frame + + for (;;) { + int tag = stbi__get8(s); + switch (tag) { + case 0x2C: /* Image Descriptor */ + { + stbi__int32 x, y, w, h; + stbi_uc *o; + + x = stbi__get16le(s); + y = stbi__get16le(s); + w = stbi__get16le(s); + h = stbi__get16le(s); + if (((x + w) > (g->w)) || ((y + h) > (g->h))) + return stbi__errpuc("bad Image Descriptor", "Corrupt GIF"); + + g->line_size = g->w * 4; + g->start_x = x * 4; + g->start_y = y * g->line_size; + g->max_x = g->start_x + w * 4; + g->max_y = g->start_y + h * g->line_size; + g->cur_x = g->start_x; + g->cur_y = g->start_y; + + // if the width of the specified rectangle is 0, that means + // we may not see *any* pixels or the image is malformed; + // to make sure this is caught, move the current y down to + // max_y (which is what out_gif_code checks). + if (w == 0) + g->cur_y = g->max_y; + + g->lflags = stbi__get8(s); + + if (g->lflags & 0x40) { + g->step = 8 * g->line_size; // first interlaced spacing + g->parse = 3; + } else { + g->step = g->line_size; + g->parse = 0; + } + + if (g->lflags & 0x80) { + stbi__gif_parse_colortable(s,g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? g->transparent : -1); + g->color_table = (stbi_uc *) g->lpal; + } else if (g->flags & 0x80) { + g->color_table = (stbi_uc *) g->pal; + } else + return stbi__errpuc("missing color table", "Corrupt GIF"); + + o = stbi__process_gif_raster(s, g); + if (!o) return NULL; + + // if this was the first frame, + pcount = g->w * g->h; + if (first_frame && (g->bgindex > 0)) { + // if first frame, any pixel not drawn to gets the background color + for (pi = 0; pi < pcount; ++pi) { + if (g->history[pi] == 0) { + g->pal[g->bgindex][3] = 255; // just in case it was made transparent, undo that; It will be reset next frame if need be; + memcpy( &g->out[pi * 4], &g->pal[g->bgindex], 4 ); + } + } + } + + return o; + } + + case 0x21: // Comment Extension. + { + int len; + int ext = stbi__get8(s); + if (ext == 0xF9) { // Graphic Control Extension. + len = stbi__get8(s); + if (len == 4) { + g->eflags = stbi__get8(s); + g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths. 
+ + // unset old transparent + if (g->transparent >= 0) { + g->pal[g->transparent][3] = 255; + } + if (g->eflags & 0x01) { + g->transparent = stbi__get8(s); + if (g->transparent >= 0) { + g->pal[g->transparent][3] = 0; + } + } else { + // don't need transparent + stbi__skip(s, 1); + g->transparent = -1; + } + } else { + stbi__skip(s, len); + break; + } + } + while ((len = stbi__get8(s)) != 0) { + stbi__skip(s, len); + } + break; + } + + case 0x3B: // gif stream termination code + return (stbi_uc *) s; // using '1' causes warning on some compilers + + default: + return stbi__errpuc("unknown code", "Corrupt GIF"); + } + } +} + +static void *stbi__load_gif_main_outofmem(stbi__gif *g, stbi_uc *out, int **delays) +{ + STBI_FREE(g->out); + STBI_FREE(g->history); + STBI_FREE(g->background); + + if (out) STBI_FREE(out); + if (delays && *delays) STBI_FREE(*delays); + return stbi__errpuc("outofmem", "Out of memory"); +} + +static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp) +{ + if (stbi__gif_test(s)) { + int layers = 0; + stbi_uc *u = 0; + stbi_uc *out = 0; + stbi_uc *two_back = 0; + stbi__gif g; + int stride; + int out_size = 0; + int delays_size = 0; + + STBI_NOTUSED(out_size); + STBI_NOTUSED(delays_size); + + memset(&g, 0, sizeof(g)); + if (delays) { + *delays = 0; + } + + do { + u = stbi__gif_load_next(s, &g, comp, req_comp, two_back); + if (u == (stbi_uc *) s) u = 0; // end of animated gif marker + + if (u) { + *x = g.w; + *y = g.h; + ++layers; + stride = g.w * g.h * 4; + + if (out) { + void *tmp = (stbi_uc*) STBI_REALLOC_SIZED( out, out_size, layers * stride ); + if (!tmp) + return stbi__load_gif_main_outofmem(&g, out, delays); + else { + out = (stbi_uc*) tmp; + out_size = layers * stride; + } + + if (delays) { + int *new_delays = (int*) STBI_REALLOC_SIZED( *delays, delays_size, sizeof(int) * layers ); + if (!new_delays) + return stbi__load_gif_main_outofmem(&g, out, delays); + *delays = new_delays; + delays_size = layers * sizeof(int); + } + } else { + out = (stbi_uc*)stbi__malloc( layers * stride ); + if (!out) + return stbi__load_gif_main_outofmem(&g, out, delays); + out_size = layers * stride; + if (delays) { + *delays = (int*) stbi__malloc( layers * sizeof(int) ); + if (!*delays) + return stbi__load_gif_main_outofmem(&g, out, delays); + delays_size = layers * sizeof(int); + } + } + memcpy( out + ((layers - 1) * stride), u, stride ); + if (layers >= 2) { + two_back = out - 2 * stride; + } + + if (delays) { + (*delays)[layers - 1U] = g.delay; + } + } + } while (u != 0); + + // free temp buffer; + STBI_FREE(g.out); + STBI_FREE(g.history); + STBI_FREE(g.background); + + // do the final conversion after loading everything; + if (req_comp && req_comp != 4) + out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h); + + *z = layers; + return out; + } else { + return stbi__errpuc("not GIF", "Image was not as a gif type."); + } +} + +static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + stbi_uc *u = 0; + stbi__gif g; + memset(&g, 0, sizeof(g)); + STBI_NOTUSED(ri); + + u = stbi__gif_load_next(s, &g, comp, req_comp, 0); + if (u == (stbi_uc *) s) u = 0; // end of animated gif marker + if (u) { + *x = g.w; + *y = g.h; + + // moved conversion to after successful load so that the same + // can be done for multiple frames. 
+ if (req_comp && req_comp != 4) + u = stbi__convert_format(u, 4, req_comp, g.w, g.h); + } else if (g.out) { + // if there was an error and we allocated an image buffer, free it! + STBI_FREE(g.out); + } + + // free buffers needed for multiple frame loading; + STBI_FREE(g.history); + STBI_FREE(g.background); + + return u; +} + +static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp) +{ + return stbi__gif_info_raw(s,x,y,comp); +} +#endif + +// ************************************************************************************************* +// Radiance RGBE HDR loader +// originally by Nicolas Schulz +#ifndef STBI_NO_HDR +static int stbi__hdr_test_core(stbi__context *s, const char *signature) +{ + int i; + for (i=0; signature[i]; ++i) + if (stbi__get8(s) != signature[i]) + return 0; + stbi__rewind(s); + return 1; +} + +static int stbi__hdr_test(stbi__context* s) +{ + int r = stbi__hdr_test_core(s, "#?RADIANCE\n"); + stbi__rewind(s); + if(!r) { + r = stbi__hdr_test_core(s, "#?RGBE\n"); + stbi__rewind(s); + } + return r; +} + +#define STBI__HDR_BUFLEN 1024 +static char *stbi__hdr_gettoken(stbi__context *z, char *buffer) +{ + int len=0; + char c = '\0'; + + c = (char) stbi__get8(z); + + while (!stbi__at_eof(z) && c != '\n') { + buffer[len++] = c; + if (len == STBI__HDR_BUFLEN-1) { + // flush to end of line + while (!stbi__at_eof(z) && stbi__get8(z) != '\n') + ; + break; + } + c = (char) stbi__get8(z); + } + + buffer[len] = 0; + return buffer; +} + +static void stbi__hdr_convert(float *output, stbi_uc *input, int req_comp) +{ + if ( input[3] != 0 ) { + float f1; + // Exponent + f1 = (float) ldexp(1.0f, input[3] - (int)(128 + 8)); + if (req_comp <= 2) + output[0] = (input[0] + input[1] + input[2]) * f1 / 3; + else { + output[0] = input[0] * f1; + output[1] = input[1] * f1; + output[2] = input[2] * f1; + } + if (req_comp == 2) output[1] = 1; + if (req_comp == 4) output[3] = 1; + } else { + switch (req_comp) { + case 4: output[3] = 1; /* fallthrough */ + case 3: output[0] = output[1] = output[2] = 0; + break; + case 2: output[1] = 1; /* fallthrough */ + case 1: output[0] = 0; + break; + } + } +} + +static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + char buffer[STBI__HDR_BUFLEN]; + char *token; + int valid = 0; + int width, height; + stbi_uc *scanline; + float *hdr_data; + int len; + unsigned char count, value; + int i, j, k, c1,c2, z; + const char *headerToken; + STBI_NOTUSED(ri); + + // Check identifier + headerToken = stbi__hdr_gettoken(s,buffer); + if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0) + return stbi__errpf("not HDR", "Corrupt HDR image"); + + // Parse header + for(;;) { + token = stbi__hdr_gettoken(s,buffer); + if (token[0] == 0) break; + if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1; + } + + if (!valid) return stbi__errpf("unsupported format", "Unsupported HDR format"); + + // Parse width and height + // can't use sscanf() if we're not using stdio! 
+ token = stbi__hdr_gettoken(s,buffer); + if (strncmp(token, "-Y ", 3)) return stbi__errpf("unsupported data layout", "Unsupported HDR format"); + token += 3; + height = (int) strtol(token, &token, 10); + while (*token == ' ') ++token; + if (strncmp(token, "+X ", 3)) return stbi__errpf("unsupported data layout", "Unsupported HDR format"); + token += 3; + width = (int) strtol(token, NULL, 10); + + if (height > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)"); + if (width > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)"); + + *x = width; + *y = height; + + if (comp) *comp = 3; + if (req_comp == 0) req_comp = 3; + + if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0)) + return stbi__errpf("too large", "HDR image is too large"); + + // Read data + hdr_data = (float *) stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0); + if (!hdr_data) + return stbi__errpf("outofmem", "Out of memory"); + + // Load image data + // image data is stored as some number of sca + if ( width < 8 || width >= 32768) { + // Read flat data + for (j=0; j < height; ++j) { + for (i=0; i < width; ++i) { + stbi_uc rgbe[4]; + main_decode_loop: + stbi__getn(s, rgbe, 4); + stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, req_comp); + } + } + } else { + // Read RLE-encoded data + scanline = NULL; + + for (j = 0; j < height; ++j) { + c1 = stbi__get8(s); + c2 = stbi__get8(s); + len = stbi__get8(s); + if (c1 != 2 || c2 != 2 || (len & 0x80)) { + // not run-length encoded, so we have to actually use THIS data as a decoded + // pixel (note this can't be a valid pixel--one of RGB must be >= 128) + stbi_uc rgbe[4]; + rgbe[0] = (stbi_uc) c1; + rgbe[1] = (stbi_uc) c2; + rgbe[2] = (stbi_uc) len; + rgbe[3] = (stbi_uc) stbi__get8(s); + stbi__hdr_convert(hdr_data, rgbe, req_comp); + i = 1; + j = 0; + STBI_FREE(scanline); + goto main_decode_loop; // yes, this makes no sense + } + len <<= 8; + len |= stbi__get8(s); + if (len != width) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); } + if (scanline == NULL) { + scanline = (stbi_uc *) stbi__malloc_mad2(width, 4, 0); + if (!scanline) { + STBI_FREE(hdr_data); + return stbi__errpf("outofmem", "Out of memory"); + } + } + + for (k = 0; k < 4; ++k) { + int nleft; + i = 0; + while ((nleft = width - i) > 0) { + count = stbi__get8(s); + if (count > 128) { + // Run + value = stbi__get8(s); + count -= 128; + if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); } + for (z = 0; z < count; ++z) + scanline[i++ * 4 + k] = value; + } else { + // Dump + if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); } + for (z = 0; z < count; ++z) + scanline[i++ * 4 + k] = stbi__get8(s); + } + } + } + for (i=0; i < width; ++i) + stbi__hdr_convert(hdr_data+(j*width + i)*req_comp, scanline + i*4, req_comp); + } + if (scanline) + STBI_FREE(scanline); + } + + return hdr_data; +} + +static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp) +{ + char buffer[STBI__HDR_BUFLEN]; + char *token; + int valid = 0; + int dummy; + + if (!x) x = &dummy; + if (!y) y = &dummy; + if (!comp) comp = &dummy; + + if (stbi__hdr_test(s) == 0) { + stbi__rewind( s ); + return 0; + } + + for(;;) { + token = stbi__hdr_gettoken(s,buffer); + if (token[0] == 0) break; + if (strcmp(token, 
"FORMAT=32-bit_rle_rgbe") == 0) valid = 1; + } + + if (!valid) { + stbi__rewind( s ); + return 0; + } + token = stbi__hdr_gettoken(s,buffer); + if (strncmp(token, "-Y ", 3)) { + stbi__rewind( s ); + return 0; + } + token += 3; + *y = (int) strtol(token, &token, 10); + while (*token == ' ') ++token; + if (strncmp(token, "+X ", 3)) { + stbi__rewind( s ); + return 0; + } + token += 3; + *x = (int) strtol(token, NULL, 10); + *comp = 3; + return 1; +} +#endif // STBI_NO_HDR + +#ifndef STBI_NO_BMP +static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp) +{ + void *p; + stbi__bmp_data info; + + info.all_a = 255; + p = stbi__bmp_parse_header(s, &info); + if (p == NULL) { + stbi__rewind( s ); + return 0; + } + if (x) *x = s->img_x; + if (y) *y = s->img_y; + if (comp) { + if (info.bpp == 24 && info.ma == 0xff000000) + *comp = 3; + else + *comp = info.ma ? 4 : 3; + } + return 1; +} +#endif + +#ifndef STBI_NO_PSD +static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp) +{ + int channelCount, dummy, depth; + if (!x) x = &dummy; + if (!y) y = &dummy; + if (!comp) comp = &dummy; + if (stbi__get32be(s) != 0x38425053) { + stbi__rewind( s ); + return 0; + } + if (stbi__get16be(s) != 1) { + stbi__rewind( s ); + return 0; + } + stbi__skip(s, 6); + channelCount = stbi__get16be(s); + if (channelCount < 0 || channelCount > 16) { + stbi__rewind( s ); + return 0; + } + *y = stbi__get32be(s); + *x = stbi__get32be(s); + depth = stbi__get16be(s); + if (depth != 8 && depth != 16) { + stbi__rewind( s ); + return 0; + } + if (stbi__get16be(s) != 3) { + stbi__rewind( s ); + return 0; + } + *comp = 4; + return 1; +} + +static int stbi__psd_is16(stbi__context *s) +{ + int channelCount, depth; + if (stbi__get32be(s) != 0x38425053) { + stbi__rewind( s ); + return 0; + } + if (stbi__get16be(s) != 1) { + stbi__rewind( s ); + return 0; + } + stbi__skip(s, 6); + channelCount = stbi__get16be(s); + if (channelCount < 0 || channelCount > 16) { + stbi__rewind( s ); + return 0; + } + STBI_NOTUSED(stbi__get32be(s)); + STBI_NOTUSED(stbi__get32be(s)); + depth = stbi__get16be(s); + if (depth != 16) { + stbi__rewind( s ); + return 0; + } + return 1; +} +#endif + +#ifndef STBI_NO_PIC +static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp) +{ + int act_comp=0,num_packets=0,chained,dummy; + stbi__pic_packet packets[10]; + + if (!x) x = &dummy; + if (!y) y = &dummy; + if (!comp) comp = &dummy; + + if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) { + stbi__rewind(s); + return 0; + } + + stbi__skip(s, 88); + + *x = stbi__get16be(s); + *y = stbi__get16be(s); + if (stbi__at_eof(s)) { + stbi__rewind( s); + return 0; + } + if ( (*x) != 0 && (1 << 28) / (*x) < (*y)) { + stbi__rewind( s ); + return 0; + } + + stbi__skip(s, 8); + + do { + stbi__pic_packet *packet; + + if (num_packets==sizeof(packets)/sizeof(packets[0])) + return 0; + + packet = &packets[num_packets++]; + chained = stbi__get8(s); + packet->size = stbi__get8(s); + packet->type = stbi__get8(s); + packet->channel = stbi__get8(s); + act_comp |= packet->channel; + + if (stbi__at_eof(s)) { + stbi__rewind( s ); + return 0; + } + if (packet->size != 8) { + stbi__rewind( s ); + return 0; + } + } while (chained); + + *comp = (act_comp & 0x10 ? 
4 : 3); + + return 1; +} +#endif + +// ************************************************************************************************* +// Portable Gray Map and Portable Pixel Map loader +// by Ken Miller +// +// PGM: http://netpbm.sourceforge.net/doc/pgm.html +// PPM: http://netpbm.sourceforge.net/doc/ppm.html +// +// Known limitations: +// Does not support comments in the header section +// Does not support ASCII image data (formats P2 and P3) + +#ifndef STBI_NO_PNM + +static int stbi__pnm_test(stbi__context *s) +{ + char p, t; + p = (char) stbi__get8(s); + t = (char) stbi__get8(s); + if (p != 'P' || (t != '5' && t != '6')) { + stbi__rewind( s ); + return 0; + } + return 1; +} + +static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + stbi_uc *out; + STBI_NOTUSED(ri); + + ri->bits_per_channel = stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n); + if (ri->bits_per_channel == 0) + return 0; + + if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + + *x = s->img_x; + *y = s->img_y; + if (comp) *comp = s->img_n; + + if (!stbi__mad4sizes_valid(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0)) + return stbi__errpuc("too large", "PNM too large"); + + out = (stbi_uc *) stbi__malloc_mad4(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0); + if (!out) return stbi__errpuc("outofmem", "Out of memory"); + if (!stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8))) { + STBI_FREE(out); + return stbi__errpuc("bad PNM", "PNM file truncated"); + } + + if (req_comp && req_comp != s->img_n) { + if (ri->bits_per_channel == 16) { + out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, s->img_n, req_comp, s->img_x, s->img_y); + } else { + out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y); + } + if (out == NULL) return out; // stbi__convert_format frees input on failure + } + return out; +} + +static int stbi__pnm_isspace(char c) +{ + return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r'; +} + +static void stbi__pnm_skip_whitespace(stbi__context *s, char *c) +{ + for (;;) { + while (!stbi__at_eof(s) && stbi__pnm_isspace(*c)) + *c = (char) stbi__get8(s); + + if (stbi__at_eof(s) || *c != '#') + break; + + while (!stbi__at_eof(s) && *c != '\n' && *c != '\r' ) + *c = (char) stbi__get8(s); + } +} + +static int stbi__pnm_isdigit(char c) +{ + return c >= '0' && c <= '9'; +} + +static int stbi__pnm_getinteger(stbi__context *s, char *c) +{ + int value = 0; + + while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) { + value = value*10 + (*c - '0'); + *c = (char) stbi__get8(s); + if((value > 214748364) || (value == 214748364 && *c > '7')) + return stbi__err("integer parse overflow", "Parsing an integer in the PPM header overflowed a 32-bit int"); + } + + return value; +} + +static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp) +{ + int maxv, dummy; + char c, p, t; + + if (!x) x = &dummy; + if (!y) y = &dummy; + if (!comp) comp = &dummy; + + stbi__rewind(s); + + // Get identifier + p = (char) stbi__get8(s); + t = (char) stbi__get8(s); + if (p != 'P' || (t != '5' && t != '6')) { + stbi__rewind(s); + return 0; + } + + *comp = (t == '6') ? 
3 : 1; // '5' is 1-component .pgm; '6' is 3-component .ppm + + c = (char) stbi__get8(s); + stbi__pnm_skip_whitespace(s, &c); + + *x = stbi__pnm_getinteger(s, &c); // read width + if(*x == 0) + return stbi__err("invalid width", "PPM image header had zero or overflowing width"); + stbi__pnm_skip_whitespace(s, &c); + + *y = stbi__pnm_getinteger(s, &c); // read height + if (*y == 0) + return stbi__err("invalid width", "PPM image header had zero or overflowing width"); + stbi__pnm_skip_whitespace(s, &c); + + maxv = stbi__pnm_getinteger(s, &c); // read max value + if (maxv > 65535) + return stbi__err("max value > 65535", "PPM image supports only 8-bit and 16-bit images"); + else if (maxv > 255) + return 16; + else + return 8; +} + +static int stbi__pnm_is16(stbi__context *s) +{ + if (stbi__pnm_info(s, NULL, NULL, NULL) == 16) + return 1; + return 0; +} +#endif + +static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp) +{ + #ifndef STBI_NO_JPEG + if (stbi__jpeg_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_PNG + if (stbi__png_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_GIF + if (stbi__gif_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_BMP + if (stbi__bmp_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_PSD + if (stbi__psd_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_PIC + if (stbi__pic_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_PNM + if (stbi__pnm_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_HDR + if (stbi__hdr_info(s, x, y, comp)) return 1; + #endif + + // test tga last because it's a crappy test! + #ifndef STBI_NO_TGA + if (stbi__tga_info(s, x, y, comp)) + return 1; + #endif + return stbi__err("unknown image type", "Image not of any known type, or corrupt"); +} + +static int stbi__is_16_main(stbi__context *s) +{ + #ifndef STBI_NO_PNG + if (stbi__png_is16(s)) return 1; + #endif + + #ifndef STBI_NO_PSD + if (stbi__psd_is16(s)) return 1; + #endif + + #ifndef STBI_NO_PNM + if (stbi__pnm_is16(s)) return 1; + #endif + return 0; +} + +#ifndef STBI_NO_STDIO +STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp) +{ + FILE *f = stbi__fopen(filename, "rb"); + int result; + if (!f) return stbi__err("can't fopen", "Unable to open file"); + result = stbi_info_from_file(f, x, y, comp); + fclose(f); + return result; +} + +STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp) +{ + int r; + stbi__context s; + long pos = ftell(f); + stbi__start_file(&s, f); + r = stbi__info_main(&s,x,y,comp); + fseek(f,pos,SEEK_SET); + return r; +} + +STBIDEF int stbi_is_16_bit(char const *filename) +{ + FILE *f = stbi__fopen(filename, "rb"); + int result; + if (!f) return stbi__err("can't fopen", "Unable to open file"); + result = stbi_is_16_bit_from_file(f); + fclose(f); + return result; +} + +STBIDEF int stbi_is_16_bit_from_file(FILE *f) +{ + int r; + stbi__context s; + long pos = ftell(f); + stbi__start_file(&s, f); + r = stbi__is_16_main(&s); + fseek(f,pos,SEEK_SET); + return r; +} +#endif // !STBI_NO_STDIO + +STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp) +{ + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__info_main(&s,x,y,comp); +} + +STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int *x, int *y, int *comp) +{ + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user); + return stbi__info_main(&s,x,y,comp); +} + +STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const 
*buffer, int len) +{ + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__is_16_main(&s); +} + +STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user) +{ + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user); + return stbi__is_16_main(&s); +} + +#endif // STB_IMAGE_IMPLEMENTATION + +/* + revision history: + 2.20 (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs + 2.19 (2018-02-11) fix warning + 2.18 (2018-01-30) fix warnings + 2.17 (2018-01-29) change sbti__shiftsigned to avoid clang -O2 bug + 1-bit BMP + *_is_16_bit api + avoid warnings + 2.16 (2017-07-23) all functions have 16-bit variants; + STBI_NO_STDIO works again; + compilation fixes; + fix rounding in unpremultiply; + optimize vertical flip; + disable raw_len validation; + documentation fixes + 2.15 (2017-03-18) fix png-1,2,4 bug; now all Imagenet JPGs decode; + warning fixes; disable run-time SSE detection on gcc; + uniform handling of optional "return" values; + thread-safe initialization of zlib tables + 2.14 (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs + 2.13 (2016-11-29) add 16-bit API, only supported for PNG right now + 2.12 (2016-04-02) fix typo in 2.11 PSD fix that caused crashes + 2.11 (2016-04-02) allocate large structures on the stack + remove white matting for transparent PSD + fix reported channel count for PNG & BMP + re-enable SSE2 in non-gcc 64-bit + support RGB-formatted JPEG + read 16-bit PNGs (only as 8-bit) + 2.10 (2016-01-22) avoid warning introduced in 2.09 by STBI_REALLOC_SIZED + 2.09 (2016-01-16) allow comments in PNM files + 16-bit-per-pixel TGA (not bit-per-component) + info() for TGA could break due to .hdr handling + info() for BMP to shares code instead of sloppy parse + can use STBI_REALLOC_SIZED if allocator doesn't support realloc + code cleanup + 2.08 (2015-09-13) fix to 2.07 cleanup, reading RGB PSD as RGBA + 2.07 (2015-09-13) fix compiler warnings + partial animated GIF support + limited 16-bpc PSD support + #ifdef unused functions + bug with < 92 byte PIC,PNM,HDR,TGA + 2.06 (2015-04-19) fix bug where PSD returns wrong '*comp' value + 2.05 (2015-04-19) fix bug in progressive JPEG handling, fix warning + 2.04 (2015-04-15) try to re-enable SIMD on MinGW 64-bit + 2.03 (2015-04-12) extra corruption checking (mmozeiko) + stbi_set_flip_vertically_on_load (nguillemot) + fix NEON support; fix mingw support + 2.02 (2015-01-19) fix incorrect assert, fix warning + 2.01 (2015-01-17) fix various warnings; suppress SIMD on gcc 32-bit without -msse2 + 2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG + 2.00 (2014-12-25) optimize JPG, including x86 SSE2 & NEON SIMD (ryg) + progressive JPEG (stb) + PGM/PPM support (Ken Miller) + STBI_MALLOC,STBI_REALLOC,STBI_FREE + GIF bugfix -- seemingly never worked + STBI_NO_*, STBI_ONLY_* + 1.48 (2014-12-14) fix incorrectly-named assert() + 1.47 (2014-12-14) 1/2/4-bit PNG support, both direct and paletted (Omar Cornut & stb) + optimize PNG (ryg) + fix bug in interlaced PNG with user-specified channel count (stb) + 1.46 (2014-08-26) + fix broken tRNS chunk (colorkey-style transparency) in non-paletted PNG + 1.45 (2014-08-16) + fix MSVC-ARM internal compiler error by wrapping malloc + 1.44 (2014-08-07) + various warning fixes from Ronny Chevalier + 1.43 (2014-07-15) + fix MSVC-only compiler problem in code changed in 1.42 + 1.42 (2014-07-09) + don't define _CRT_SECURE_NO_WARNINGS (affects user code) + fixes to stbi__cleanup_jpeg path + added STBI_ASSERT 
to avoid requiring assert.h + 1.41 (2014-06-25) + fix search&replace from 1.36 that messed up comments/error messages + 1.40 (2014-06-22) + fix gcc struct-initialization warning + 1.39 (2014-06-15) + fix to TGA optimization when req_comp != number of components in TGA; + fix to GIF loading because BMP wasn't rewinding (whoops, no GIFs in my test suite) + add support for BMP version 5 (more ignored fields) + 1.38 (2014-06-06) + suppress MSVC warnings on integer casts truncating values + fix accidental rename of 'skip' field of I/O + 1.37 (2014-06-04) + remove duplicate typedef + 1.36 (2014-06-03) + convert to header file single-file library + if de-iphone isn't set, load iphone images color-swapped instead of returning NULL + 1.35 (2014-05-27) + various warnings + fix broken STBI_SIMD path + fix bug where stbi_load_from_file no longer left file pointer in correct place + fix broken non-easy path for 32-bit BMP (possibly never used) + TGA optimization by Arseny Kapoulkine + 1.34 (unknown) + use STBI_NOTUSED in stbi__resample_row_generic(), fix one more leak in tga failure case + 1.33 (2011-07-14) + make stbi_is_hdr work in STBI_NO_HDR (as specified), minor compiler-friendly improvements + 1.32 (2011-07-13) + support for "info" function for all supported filetypes (SpartanJ) + 1.31 (2011-06-20) + a few more leak fixes, bug in PNG handling (SpartanJ) + 1.30 (2011-06-11) + added ability to load files via callbacks to accomidate custom input streams (Ben Wenger) + removed deprecated format-specific test/load functions + removed support for installable file formats (stbi_loader) -- would have been broken for IO callbacks anyway + error cases in bmp and tga give messages and don't leak (Raymond Barbiero, grisha) + fix inefficiency in decoding 32-bit BMP (David Woo) + 1.29 (2010-08-16) + various warning fixes from Aurelien Pocheville + 1.28 (2010-08-01) + fix bug in GIF palette transparency (SpartanJ) + 1.27 (2010-08-01) + cast-to-stbi_uc to fix warnings + 1.26 (2010-07-24) + fix bug in file buffering for PNG reported by SpartanJ + 1.25 (2010-07-17) + refix trans_data warning (Won Chun) + 1.24 (2010-07-12) + perf improvements reading from files on platforms with lock-heavy fgetc() + minor perf improvements for jpeg + deprecated type-specific functions so we'll get feedback if they're needed + attempt to fix trans_data warning (Won Chun) + 1.23 fixed bug in iPhone support + 1.22 (2010-07-10) + removed image *writing* support + stbi_info support from Jetro Lauha + GIF support from Jean-Marc Lienher + iPhone PNG-extensions from James Brown + warning-fixes from Nicolas Schulz and Janez Zemva (i.stbi__err. Janez (U+017D)emva) + 1.21 fix use of 'stbi_uc' in header (reported by jon blow) + 1.20 added support for Softimage PIC, by Tom Seddon + 1.19 bug in interlaced PNG corruption check (found by ryg) + 1.18 (2008-08-02) + fix a threading bug (local mutable static) + 1.17 support interlaced PNG + 1.16 major bugfix - stbi__convert_format converted one too many pixels + 1.15 initialize some fields for thread safety + 1.14 fix threadsafe conversion bug + header-file-only version (#define STBI_HEADER_FILE_ONLY before including) + 1.13 threadsafe + 1.12 const qualifiers in the API + 1.11 Support installable IDCT, colorspace conversion routines + 1.10 Fixes for 64-bit (don't use "unsigned long") + optimized upsampling by Fabian "ryg" Giesen + 1.09 Fix format-conversion for PSD code (bad global variables!) 
+ 1.08 Thatcher Ulrich's PSD code integrated by Nicolas Schulz + 1.07 attempt to fix C++ warning/errors again + 1.06 attempt to fix C++ warning/errors again + 1.05 fix TGA loading to return correct *comp and use good luminance calc + 1.04 default float alpha is 1, not 255; use 'void *' for stbi_image_free + 1.03 bugfixes to STBI_NO_STDIO, STBI_NO_HDR + 1.02 support for (subset of) HDR files, float interface for preferred access to them + 1.01 fix bug: possible bug in handling right-side up bmps... not sure + fix bug: the stbi__bmp_load() and stbi__tga_load() functions didn't work at all + 1.00 interface to zlib that skips zlib header + 0.99 correct handling of alpha in palette + 0.98 TGA loader by lonesock; dynamically add loaders (untested) + 0.97 jpeg errors on too large a file; also catch another malloc failure + 0.96 fix detection of invalid v value - particleman@mollyrocket forum + 0.95 during header scan, seek to markers in case of padding + 0.94 STBI_NO_STDIO to disable stdio usage; rename all #defines the same + 0.93 handle jpegtran output; verbose errors + 0.92 read 4,8,16,24,32-bit BMP files of several formats + 0.91 output 24-bit Windows 3.0 BMP files + 0.90 fix a few more warnings; bump version number to approach 1.0 + 0.61 bugfixes due to Marc LeBlanc, Christopher Lloyd + 0.60 fix compiling as c++ + 0.59 fix warnings: merge Dave Moore's -Wall fixes + 0.58 fix bug: zlib uncompressed mode len/nlen was wrong endian + 0.57 fix bug: jpg last huffman symbol before marker was >9 bits but less than 16 available + 0.56 fix bug: zlib uncompressed mode len vs. nlen + 0.55 fix bug: restart_interval not initialized to 0 + 0.54 allow NULL for 'int *comp' + 0.53 fix bug in png 3->4; speedup png decoding + 0.52 png handles req_comp=3,4 directly; minor cleanup; jpeg comments + 0.51 obey req_comp requests, 1-component jpegs return as 1-component, + on 'test' only check type, not whether we support this variant + 0.50 (2006-11-19) + first released version +*/ + + +/* +------------------------------------------------------------------------------ +This software is available under 2 licenses -- choose whichever you prefer. +------------------------------------------------------------------------------ +ALTERNATIVE A - MIT License +Copyright (c) 2017 Sean Barrett +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +------------------------------------------------------------------------------ +ALTERNATIVE B - Public Domain (www.unlicense.org) +This is free and unencumbered software released into the public domain. 
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this +software, either in source code form or as a compiled binary, for any purpose, +commercial or non-commercial, and by any means. +In jurisdictions that recognize copyright laws, the author or authors of this +software dedicate any and all copyright interest in the software to the public +domain. We make this dedication for the benefit of the public at large and to +the detriment of our heirs and successors. We intend this dedication to be an +overt act of relinquishment in perpetuity of all present and future rights to +this software under copyright law. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +------------------------------------------------------------------------------ +*/ diff --git a/external/xxhash/LICENSE b/external/xxhash/LICENSE new file mode 100644 index 0000000..e4c5da7 --- /dev/null +++ b/external/xxhash/LICENSE @@ -0,0 +1,26 @@ +xxHash Library +Copyright (c) 2012-2021 Yann Collet +All rights reserved. + +BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/external/xxhash/README.md b/external/xxhash/README.md new file mode 100644 index 0000000..1ff987b --- /dev/null +++ b/external/xxhash/README.md @@ -0,0 +1,274 @@ + +xxHash - Extremely fast hash algorithm +====================================== + +xxHash is an Extremely fast Hash algorithm, processing at RAM speed limits. +Code is highly portable, and produces hashes identical across all platforms (little / big endian). +The library includes the following algorithms : +- XXH32 : generates 32-bit hashes, using 32-bit arithmetic +- XXH64 : generates 64-bit hashes, using 64-bit arithmetic +- XXH3 (since `v0.8.0`): generates 64 or 128-bit hashes, using vectorized arithmetic. + The 128-bit variant is called XXH128. 
+ +All variants successfully complete the [SMHasher](https://code.google.com/p/smhasher/wiki/SMHasher) test suite +which evaluates the quality of hash functions (collision, dispersion and randomness). +Additional tests, which evaluate more thoroughly speed and collision properties of 64-bit hashes, [are also provided](https://github.com/Cyan4973/xxHash/tree/dev/tests). + +|Branch |Status | +|------------|---------| +|release | [![Build Status](https://github.com/Cyan4973/xxHash/actions/workflows/ci.yml/badge.svg?branch=release)](https://github.com/Cyan4973/xxHash/actions?query=branch%3Arelease+) | +|dev | [![Build Status](https://github.com/Cyan4973/xxHash/actions/workflows/ci.yml/badge.svg?branch=dev)](https://github.com/Cyan4973/xxHash/actions?query=branch%3Adev+) | + + +Benchmarks +------------------------- + +The benchmarked reference system uses an Intel i7-9700K cpu, and runs Ubuntu x64 20.04. +The [open source benchmark program] is compiled with `clang` v10.0 using `-O3` flag. + +| Hash Name | Width | Bandwidth (GB/s) | Small Data Velocity | Quality | Comment | +| --------- | ----- | ---------------- | ----- | --- | --- | +| __XXH3__ (SSE2) | 64 | 31.5 GB/s | 133.1 | 10 +| __XXH128__ (SSE2) | 128 | 29.6 GB/s | 118.1 | 10 +| _RAM sequential read_ | N/A | 28.0 GB/s | N/A | N/A | _for reference_ +| City64 | 64 | 22.0 GB/s | 76.6 | 10 +| T1ha2 | 64 | 22.0 GB/s | 99.0 | 9 | Slightly worse [collisions] +| City128 | 128 | 21.7 GB/s | 57.7 | 10 +| __XXH64__ | 64 | 19.4 GB/s | 71.0 | 10 +| SpookyHash | 64 | 19.3 GB/s | 53.2 | 10 +| Mum | 64 | 18.0 GB/s | 67.0 | 9 | Slightly worse [collisions] +| __XXH32__ | 32 | 9.7 GB/s | 71.9 | 10 +| City32 | 32 | 9.1 GB/s | 66.0 | 10 +| Murmur3 | 32 | 3.9 GB/s | 56.1 | 10 +| SipHash | 64 | 3.0 GB/s | 43.2 | 10 +| FNV64 | 64 | 1.2 GB/s | 62.7 | 5 | Poor avalanche properties +| Blake2 | 256 | 1.1 GB/s | 5.1 | 10 | Cryptographic +| SHA1 | 160 | 0.8 GB/s | 5.6 | 10 | Cryptographic but broken +| MD5 | 128 | 0.6 GB/s | 7.8 | 10 | Cryptographic but broken + +[open source benchmark program]: https://github.com/Cyan4973/xxHash/tree/release/tests/bench +[collisions]: https://github.com/Cyan4973/xxHash/wiki/Collision-ratio-comparison#collision-study + +note 1: Small data velocity is a _rough_ evaluation of algorithm's efficiency on small data. For more detailed analysis, please refer to next paragraph. + +note 2: some algorithms feature _faster than RAM_ speed. In which case, they can only reach their full speed potential when input is already in CPU cache (L3 or better). Otherwise, they max out on RAM speed limit. + +### Small data + +Performance on large data is only one part of the picture. +Hashing is also very useful in constructions like hash tables and bloom filters. +In these use cases, it's frequent to hash a lot of small data (starting at a few bytes). +Algorithm's performance can be very different for such scenarios, since parts of the algorithm, +such as initialization or finalization, become fixed cost. +The impact of branch mis-prediction also becomes much more present. 
+ +XXH3 has been designed for excellent performance on both long and small inputs, +which can be observed in the following graph: + +![XXH3, latency, random size](https://user-images.githubusercontent.com/750081/61976089-aedeab00-af9f-11e9-9239-e5375d6c080f.png) + +For a more detailed analysis, please visit the wiki : +https://github.com/Cyan4973/xxHash/wiki/Performance-comparison#benchmarks-concentrating-on-small-data- + +Quality +------------------------- + +Speed is not the only property that matters. +Produced hash values must respect excellent dispersion and randomness properties, +so that any sub-section of it can be used to maximally spread out a table or index, +as well as reduce the amount of collisions to the minimal theoretical level, following the [birthday paradox]. + +`xxHash` has been tested with Austin Appleby's excellent SMHasher test suite, +and passes all tests, ensuring reasonable quality levels. +It also passes extended tests from [newer forks of SMHasher], featuring additional scenarios and conditions. + +Finally, xxHash provides its own [massive collision tester](https://github.com/Cyan4973/xxHash/tree/dev/tests/collisions), +able to generate and compare billions of hashes to test the limits of 64-bit hash algorithms. +On this front too, xxHash features good results, in line with the [birthday paradox]. +A more detailed analysis is documented [in the wiki](https://github.com/Cyan4973/xxHash/wiki/Collision-ratio-comparison). + +[birthday paradox]: https://en.wikipedia.org/wiki/Birthday_problem +[newer forks of SMHasher]: https://github.com/rurban/smhasher + + +### Build modifiers + +The following macros can be set at compilation time to modify `libxxhash`'s behavior. They are generally disabled by default. + +- `XXH_INLINE_ALL`: Make all functions `inline`, implementation is directly included within `xxhash.h`. + Inlining functions is beneficial for speed, notably for small keys. + It's _extremely effective_ when key's length is expressed as _a compile time constant_, + with performance improvements observed in the +200% range . + See [this article](https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html) for details. +- `XXH_PRIVATE_API`: same outcome as `XXH_INLINE_ALL`. Still available for legacy support. + The name underlines that `XXH_*` symbol names will not be exported. +- `XXH_STATIC_LINKING_ONLY`: gives access to internal state declaration, required for static allocation. + Incompatible with dynamic linking, due to risks of ABI changes. +- `XXH_NAMESPACE`: Prefixes all symbols with the value of `XXH_NAMESPACE`. + This macro can only use compilable character set. + Useful to evade symbol naming collisions, + in case of multiple inclusions of xxHash's source code. + Client applications still use the regular function names, + as symbols are automatically translated through `xxhash.h`. +- `XXH_FORCE_ALIGN_CHECK`: Use a faster direct read path when input is aligned. + This option can result in dramatic performance improvement on architectures unable to load memory from unaligned addresses + when input to hash happens to be aligned on 32 or 64-bit boundaries. + It is (slightly) detrimental on platform with good unaligned memory access performance (same instruction for both aligned and unaligned accesses). + This option is automatically disabled on `x86`, `x64` and `aarch64`, and enabled on all other platforms. +- `XXH_FORCE_MEMORY_ACCESS`: The default method `0` uses a portable `memcpy()` notation. 
+ Method `1` uses a gcc-specific `packed` attribute, which can provide better performance for some targets. + Method `2` forces unaligned reads, which is not standard compliant, but might sometimes be the only way to extract better read performance. + Method `3` uses a byteshift operation, which is best for old compilers which don't inline `memcpy()` or big-endian systems without a byteswap instruction. +- `XXH_CPU_LITTLE_ENDIAN`: By default, endianness is determined by a runtime test resolved at compile time. + If, for some reason, the compiler cannot simplify the runtime test, it can cost performance. + It's possible to skip auto-detection and simply state that the architecture is little-endian by setting this macro to 1. + Setting it to 0 states big-endian. +- `XXH_ENABLE_AUTOVECTORIZE`: Auto-vectorization may be triggered for XXH32 and XXH64, depending on cpu vector capabilities and compiler version. + Note: auto-vectorization tends to be triggered more easily with recent versions of `clang`. + For XXH32, SSE4.1 or equivalent (NEON) is enough, while XXH64 requires AVX512. + Unfortunately, auto-vectorization is generally detrimental to XXH performance. + For this reason, the xxhash source code tries to prevent auto-vectorization by default. + That being said, systems evolve, and this conclusion is not forthcoming. + For example, it has been reported that recent Zen4 cpus are more likely to improve performance with vectorization. + Therefore, should you prefer or want to test vectorized code, you can enable this flag: + it will remove the no-vectorization protection code, thus making it more likely for XXH32 and XXH64 to be auto-vectorized. +- `XXH32_ENDJMP`: Switch multi-branch finalization stage of XXH32 by a single jump. + This is generally undesirable for performance, especially when hashing inputs of random sizes. + But depending on exact architecture and compiler, a jump might provide slightly better performance on small inputs. Disabled by default. +- `XXH_IMPORT`: MSVC specific: should only be defined for dynamic linking, as it prevents linkage errors. +- `XXH_NO_STDLIB`: Disable invocation of `` functions, notably `malloc()` and `free()`. + `libxxhash`'s `XXH*_createState()` will always fail and return `NULL`. + But one-shot hashing (like `XXH32()`) or streaming using statically allocated states + still work as expected. + This build flag is useful for embedded environments without dynamic allocation. +- `XXH_memcpy`, `XXH_memset`, `XXH_memcmp` : redirect `memcpy()`, `memset()` and `memcmp()` to some user-selected symbol at compile time. + Redirecting all 3 removes the need to include `` standard library. +- `XXH_NO_EXTERNC_GUARD`: When `xxhash.h` is compiled in C++ mode, removes the `extern "C" { .. }` block guard. +- `XXH_DEBUGLEVEL` : When set to any value >= 1, enables `assert()` statements. + This (slightly) slows down execution, but may help finding bugs during debugging sessions. + +#### Binary size control +- `XXH_NO_XXH3` : removes symbols related to `XXH3` (both 64 & 128 bits) from generated binary. + `XXH3` is by far the largest contributor to `libxxhash` size, + so it's useful to reduce binary size for applications which do not employ `XXH3`. +- `XXH_NO_LONG_LONG`: removes compilation of algorithms relying on 64-bit `long long` types + which include `XXH3` and `XXH64`. + Only `XXH32` will be compiled. + Useful for targets (architectures and compilers) without 64-bit support. 
+- `XXH_NO_STREAM`: Disables the streaming API, limiting the library to single shot variants only. +- `XXH_NO_INLINE_HINTS`: By default, xxHash uses `__attribute__((always_inline))` and `__forceinline` to improve performance at the cost of code size. + Defining this macro to 1 will mark all internal functions as `static`, allowing the compiler to decide whether to inline a function or not. + This is very useful when optimizing for smallest binary size, + and is automatically defined when compiling with `-O0`, `-Os`, `-Oz`, or `-fno-inline` on GCC and Clang. + It may also be required to successfully compile using `-Og`, depending on compiler version. +- `XXH_SIZE_OPT`: `0`: default, optimize for speed + `1`: default for `-Os` and `-Oz`: disables some speed hacks for size optimization + `2`: makes code as small as possible, performance may cry + +#### Build modifiers specific for XXH3 +- `XXH_VECTOR` : manually select a vector instruction set (default: auto-selected at compilation time). Available instruction sets are `XXH_SCALAR`, `XXH_SSE2`, `XXH_AVX2`, `XXH_AVX512`, `XXH_NEON` and `XXH_VSX`. Compiler may require additional flags to ensure proper support (for example, `gcc` on x86_64 requires `-mavx2` for `AVX2`, or `-mavx512f` for `AVX512`). +- `XXH_PREFETCH_DIST` : select prefetching distance. For close-to-metal adaptation to specific hardware platforms. XXH3 only. +- `XXH_NO_PREFETCH` : disable prefetching. Some platforms or situations may perform better without prefetching. XXH3 only. + +#### Makefile variables +When compiling the Command Line Interface `xxhsum` using `make`, the following environment variables can also be set : +- `DISPATCH=1` : use `xxh_x86dispatch.c`, select at runtime between `scalar`, `sse2`, `avx2` or `avx512` instruction set. This option is only valid for `x86`/`x64` systems. It is enabled by default when target `x86`/`x64` is detected. It can be forcefully turned off using `DISPATCH=0`. +- `LIBXXH_DISPATCH=1` : same idea, implemented a runtime vector extension detector, but within `libxxhash`. This parameter is disabled by default. When enabled (only valid for `x86`/`x64` systems), new symbols published in `xxh_x86dispatch.h` become accessible. At the time of this writing, it's required to include `xxh_x86dispatch.h` in order to access the symbols with runtime vector extension detection. +- `XXH_1ST_SPEED_TARGET` : select an initial speed target, expressed in MB/s, for the first speed test in benchmark mode. Benchmark will adjust the target at subsequent iterations, but the first test is made "blindly" by targeting this speed. Currently conservatively set to 10 MB/s, to support very slow (emulated) platforms. +- `NODE_JS=1` : When compiling `xxhsum` for Node.js with Emscripten, this links the `NODERAWFS` library for unrestricted filesystem access and patches `isatty` to make the command line utility correctly detect the terminal. This does make the binary specific to Node.js. + +### Building xxHash - Using vcpkg + +You can download and install xxHash using the [vcpkg](https://github.com/Microsoft/vcpkg) dependency manager: + + git clone https://github.com/Microsoft/vcpkg.git + cd vcpkg + ./bootstrap-vcpkg.sh + ./vcpkg integrate install + ./vcpkg install xxhash + +The xxHash port in vcpkg is kept up to date by Microsoft team members and community contributors. If the version is out of date, please [create an issue or pull request](https://github.com/Microsoft/vcpkg) on the vcpkg repository. 
+ +### Example + +The simplest example calls xxhash 64-bit variant as a one-shot function +generating a hash value from a single buffer, and invoked from a C/C++ program: + +```C +#include "xxhash.h" + + (...) + XXH64_hash_t hash = XXH64(buffer, size, seed); +} +``` + +Streaming variant is more involved, but makes it possible to provide data incrementally: + +```C +#include "stdlib.h" /* abort() */ +#include "xxhash.h" + + +XXH64_hash_t calcul_hash_streaming(FileHandler fh) +{ + /* create a hash state */ + XXH64_state_t* const state = XXH64_createState(); + if (state==NULL) abort(); + + size_t const bufferSize = SOME_SIZE; + void* const buffer = malloc(bufferSize); + if (buffer==NULL) abort(); + + /* Initialize state with selected seed */ + XXH64_hash_t const seed = 0; /* or any other value */ + if (XXH64_reset(state, seed) == XXH_ERROR) abort(); + + /* Feed the state with input data, any size, any number of times */ + (...) + while ( /* some data left */ ) { + size_t const length = get_more_data(buffer, bufferSize, fh); + if (XXH64_update(state, buffer, length) == XXH_ERROR) abort(); + (...) + } + (...) + + /* Produce the final hash value */ + XXH64_hash_t const hash = XXH64_digest(state); + + /* State could be re-used; but in this example, it is simply freed */ + free(buffer); + XXH64_freeState(state); + + return hash; +} +``` + + +### License + +The library files `xxhash.c` and `xxhash.h` are BSD licensed. +The utility `xxhsum` is GPL licensed. + + +### Other programming languages + +Beyond the C reference version, +xxHash is also available from many different programming languages, +thanks to great contributors. +They are [listed here](http://www.xxhash.com/#other-languages). + + +### Packaging status + +Many distributions bundle a package manager +which allows easy xxhash installation as both a `libxxhash` library +and `xxhsum` command line interface. + +[![Packaging status](https://repology.org/badge/vertical-allrepos/xxhash.svg)](https://repology.org/project/xxhash/versions) + + +### Special Thanks + +- Takayuki Matsuoka, aka @t-mat, for creating `xxhsum -c` and great support during early xxh releases +- Mathias Westerdahl, aka @JCash, for introducing the first version of `XXH64` +- Devin Hussey, aka @easyaspi314, for incredible low-level optimizations on `XXH3` and `XXH128` diff --git a/external/xxhash/SECURITY.md b/external/xxhash/SECURITY.md new file mode 100644 index 0000000..2a8b4c8 --- /dev/null +++ b/external/xxhash/SECURITY.md @@ -0,0 +1,13 @@ +# Security Policy + +## Supported Versions + +Security updates are applied only to the latest release. + +## Reporting a Vulnerability + +If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released. + +Please disclose it at [security advisory](https://github.com/Cyan4973/xxHash/security/advisories/new). + +This project is maintained by a team of volunteers on a reasonable-effort basis. As such, please give us at least 90 days to work on a fix before public exposure. 
diff --git a/external/xxhash/xxh3.h b/external/xxhash/xxh3.h new file mode 100644 index 0000000..7e3ce68 --- /dev/null +++ b/external/xxhash/xxh3.h @@ -0,0 +1,55 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Development source file for `xxh3` + * Copyright (C) 2019-2021 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +/* + * Note: This file used to host the source code of XXH3_* variants. + * during the development period. + * The source code is now properly integrated within xxhash.h. + * + * xxh3.h is no longer useful, + * but it is still provided for compatibility with source code + * which used to include it directly. + * + * Programs are now highly discouraged to include xxh3.h. + * Include `xxhash.h` instead, which is the officially supported interface. + * + * In the future, xxh3.h will start to generate warnings, then errors, + * then it will be removed from source package and from include directory. + */ + +/* Simulate the same impact as including the old xxh3.h source file */ + +#define XXH_INLINE_ALL +#include "xxhash.h" diff --git a/external/xxhash/xxh_x86dispatch.c b/external/xxhash/xxh_x86dispatch.c new file mode 100644 index 0000000..0c15820 --- /dev/null +++ b/external/xxhash/xxh_x86dispatch.c @@ -0,0 +1,821 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Copyright (C) 2020-2021 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + + +/*! + * @file xxh_x86dispatch.c + * + * Automatic dispatcher code for the @ref XXH3_family on x86-based targets. + * + * Optional add-on. + * + * **Compile this file with the default flags for your target.** + * Note that compiling with flags like `-mavx*`, `-march=native`, or `/arch:AVX*` + * will make the resulting binary incompatible with cpus not supporting the requested instruction set. + * + * @defgroup dispatch x86 Dispatcher + * @{ + */ + +#if defined (__cplusplus) +extern "C" { +#endif + +#if !(defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)) +# error "Dispatching is currently only supported on x86 and x86_64." +#endif + +/*! @cond Doxygen ignores this part */ +#ifndef XXH_HAS_INCLUDE +# ifdef __has_include +/* + * Not defined as XXH_HAS_INCLUDE(x) (function-like) because + * this causes segfaults in Apple Clang 4.2 (on Mac OS X 10.7 Lion) + */ +# define XXH_HAS_INCLUDE __has_include +# else +# define XXH_HAS_INCLUDE(x) 0 +# endif +#endif +/*! @endcond */ + +/*! + * @def XXH_DISPATCH_SCALAR + * @brief Enables/dispatching the scalar code path. + * + * If this is defined to 0, SSE2 support is assumed. This reduces code size + * when the scalar path is not needed. + * + * This is automatically defined to 0 when... + * - SSE2 support is enabled in the compiler + * - Targeting x86_64 + * - Targeting Android x86 + * - Targeting macOS + */ +#ifndef XXH_DISPATCH_SCALAR +# if defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) /* SSE2 on by default */ \ + || defined(__x86_64__) || defined(_M_X64) /* x86_64 */ \ + || defined(__ANDROID__) || defined(__APPLE__) /* Android or macOS */ +# define XXH_DISPATCH_SCALAR 0 /* disable */ +# else +# define XXH_DISPATCH_SCALAR 1 +# endif +#endif +/*! + * @def XXH_DISPATCH_AVX2 + * @brief Enables/disables dispatching for AVX2. + * + * This is automatically detected if it is not defined. + * - GCC 4.7 and later are known to support AVX2, but >4.9 is required for + * to get the AVX2 intrinsics and typedefs without -mavx -mavx2. + * - Visual Studio 2013 Update 2 and later are known to support AVX2. + * - The GCC/Clang internal header `` is detected. While this is + * not allowed to be included directly, it still appears in the builtin + * include path and is detectable with `__has_include`. 
+ * + * @see XXH_AVX2 + */ +#ifndef XXH_DISPATCH_AVX2 +# if (defined(__GNUC__) && (__GNUC__ > 4)) /* GCC 5.0+ */ \ + || (defined(_MSC_VER) && _MSC_VER >= 1900) /* VS 2015+ */ \ + || (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 180030501) /* VS 2013 Update 2 */ \ + || XXH_HAS_INCLUDE() /* GCC/Clang internal header */ +# define XXH_DISPATCH_AVX2 1 /* enable dispatch towards AVX2 */ +# else +# define XXH_DISPATCH_AVX2 0 +# endif +#endif /* XXH_DISPATCH_AVX2 */ + +/*! + * @def XXH_DISPATCH_AVX512 + * @brief Enables/disables dispatching for AVX512. + * + * Automatically detected if one of the following conditions is met: + * - GCC 4.9 and later are known to support AVX512. + * - Visual Studio 2017 and later are known to support AVX2. + * - The GCC/Clang internal header `` is detected. While this + * is not allowed to be included directly, it still appears in the builtin + * include path and is detectable with `__has_include`. + * + * @see XXH_AVX512 + */ +#ifndef XXH_DISPATCH_AVX512 +# if (defined(__GNUC__) \ + && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9))) /* GCC 4.9+ */ \ + || (defined(_MSC_VER) && _MSC_VER >= 1910) /* VS 2017+ */ \ + || XXH_HAS_INCLUDE() /* GCC/Clang internal header */ +# define XXH_DISPATCH_AVX512 1 /* enable dispatch towards AVX512 */ +# else +# define XXH_DISPATCH_AVX512 0 +# endif +#endif /* XXH_DISPATCH_AVX512 */ + +/*! + * @def XXH_TARGET_SSE2 + * @brief Allows a function to be compiled with SSE2 intrinsics. + * + * Uses `__attribute__((__target__("sse2")))` on GCC to allow SSE2 to be used + * even with `-mno-sse2`. + * + * @def XXH_TARGET_AVX2 + * @brief Like @ref XXH_TARGET_SSE2, but for AVX2. + * + * @def XXH_TARGET_AVX512 + * @brief Like @ref XXH_TARGET_SSE2, but for AVX512. + * + */ +#if defined(__GNUC__) +# include /* SSE2 */ +# if XXH_DISPATCH_AVX2 || XXH_DISPATCH_AVX512 +# include /* AVX2, AVX512F */ +# endif +# define XXH_TARGET_SSE2 __attribute__((__target__("sse2"))) +# define XXH_TARGET_AVX2 __attribute__((__target__("avx2"))) +# define XXH_TARGET_AVX512 __attribute__((__target__("avx512f"))) +#elif defined(__clang__) && defined(_MSC_VER) /* clang-cl.exe */ +# include /* SSE2 */ +# if XXH_DISPATCH_AVX2 || XXH_DISPATCH_AVX512 +# include /* AVX2, AVX512F */ +# include +# include +# include +# include +# endif +# define XXH_TARGET_SSE2 __attribute__((__target__("sse2"))) +# define XXH_TARGET_AVX2 __attribute__((__target__("avx2"))) +# define XXH_TARGET_AVX512 __attribute__((__target__("avx512f"))) +#elif defined(_MSC_VER) +# include +# define XXH_TARGET_SSE2 +# define XXH_TARGET_AVX2 +# define XXH_TARGET_AVX512 +#else +# error "Dispatching is currently not supported for your compiler." +#endif + +/*! @cond Doxygen ignores this part */ +#ifdef XXH_DISPATCH_DEBUG +/* debug logging */ +# include +# define XXH_debugPrint(str) { fprintf(stderr, "DEBUG: xxHash dispatch: %s \n", str); fflush(NULL); } +#else +# define XXH_debugPrint(str) ((void)0) +# undef NDEBUG /* avoid redefinition */ +# define NDEBUG +#endif +/*! @endcond */ +#include + +#ifndef XXH_DOXYGEN +#define XXH_INLINE_ALL +#define XXH_X86DISPATCH +#include "xxhash.h" +#endif + +/*! @cond Doxygen ignores this part */ +#ifndef XXH_HAS_ATTRIBUTE +# ifdef __has_attribute +# define XXH_HAS_ATTRIBUTE(...) __has_attribute(__VA_ARGS__) +# else +# define XXH_HAS_ATTRIBUTE(...) 0 +# endif +#endif +/*! @endcond */ + +/*! 
@cond Doxygen ignores this part */ +#if XXH_HAS_ATTRIBUTE(constructor) +# define XXH_CONSTRUCTOR __attribute__((constructor)) +# define XXH_DISPATCH_MAYBE_NULL 0 +#else +# define XXH_CONSTRUCTOR +# define XXH_DISPATCH_MAYBE_NULL 1 +#endif +/*! @endcond */ + + +/*! @cond Doxygen ignores this part */ +/* + * Support both AT&T and Intel dialects + * + * GCC doesn't convert AT&T syntax to Intel syntax, and will error out if + * compiled with -masm=intel. Instead, it supports dialect switching with + * curly braces: { AT&T syntax | Intel syntax } + * + * Clang's integrated assembler automatically converts AT&T syntax to Intel if + * needed, making the dialect switching useless (it isn't even supported). + * + * Note: Comments are written in the inline assembly itself. + */ +#ifdef __clang__ +# define XXH_I_ATT(intel, att) att "\n\t" +#else +# define XXH_I_ATT(intel, att) "{" att "|" intel "}\n\t" +#endif +/*! @endcond */ + +/*! + * @private + * @brief Runs CPUID. + * + * @param eax , ecx The parameters to pass to CPUID, %eax and %ecx respectively. + * @param abcd The array to store the result in, `{ eax, ebx, ecx, edx }` + */ +static void XXH_cpuid(xxh_u32 eax, xxh_u32 ecx, xxh_u32* abcd) +{ +#if defined(_MSC_VER) + __cpuidex((int*)abcd, eax, ecx); +#else + xxh_u32 ebx, edx; +# if defined(__i386__) && defined(__PIC__) + __asm__( + "# Call CPUID\n\t" + "#\n\t" + "# On 32-bit x86 with PIC enabled, we are not allowed to overwrite\n\t" + "# EBX, so we use EDI instead.\n\t" + XXH_I_ATT("mov edi, ebx", "movl %%ebx, %%edi") + XXH_I_ATT("cpuid", "cpuid" ) + XXH_I_ATT("xchg edi, ebx", "xchgl %%ebx, %%edi") + : "=D" (ebx), +# else + __asm__( + "# Call CPUID\n\t" + XXH_I_ATT("cpuid", "cpuid") + : "=b" (ebx), +# endif + "+a" (eax), "+c" (ecx), "=d" (edx)); + abcd[0] = eax; + abcd[1] = ebx; + abcd[2] = ecx; + abcd[3] = edx; +#endif +} + +/* + * Modified version of Intel's guide + * https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family + */ + +#if XXH_DISPATCH_AVX2 || XXH_DISPATCH_AVX512 +/*! + * @private + * @brief Runs `XGETBV`. + * + * While the CPU may support AVX2, the operating system might not properly save + * the full YMM/ZMM registers. + * + * xgetbv is used for detecting this: Any compliant operating system will define + * a set of flags in the xcr0 register indicating how it saves the AVX registers. + * + * You can manually disable this flag on Windows by running, as admin: + * + * bcdedit.exe /set xsavedisable 1 + * + * and rebooting. Run the same command with 0 to re-enable it. + */ +static xxh_u64 XXH_xgetbv(void) +{ +#if defined(_MSC_VER) + return _xgetbv(0); /* min VS2010 SP1 compiler is required */ +#else + xxh_u32 xcr0_lo, xcr0_hi; + __asm__( + "# Call XGETBV\n\t" + "#\n\t" + "# Older assemblers (e.g. macOS's ancient GAS version) don't support\n\t" + "# the XGETBV opcode, so we encode it by hand instead.\n\t" + "# See for details.\n\t" + ".byte 0x0f, 0x01, 0xd0\n\t" + : "=a" (xcr0_lo), "=d" (xcr0_hi) : "c" (0)); + return xcr0_lo | ((xxh_u64)xcr0_hi << 32); +#endif +} +#endif + +/*! @cond Doxygen ignores this part */ +#define XXH_SSE2_CPUID_MASK (1 << 26) +#define XXH_OSXSAVE_CPUID_MASK ((1 << 26) | (1 << 27)) +#define XXH_AVX2_CPUID_MASK (1 << 5) +#define XXH_AVX2_XGETBV_MASK ((1 << 2) | (1 << 1)) +#define XXH_AVX512F_CPUID_MASK (1 << 16) +#define XXH_AVX512F_XGETBV_MASK ((7 << 5) | (1 << 2) | (1 << 1)) +/*! @endcond */ + +/*! + * @private + * @brief Returns the best XXH3 implementation. 
+ * + * Runs various CPUID/XGETBV tests to try and determine the best implementation. + * + * @return The best @ref XXH_VECTOR implementation. + * @see XXH_VECTOR_TYPES + */ +int XXH_featureTest(void) +{ + xxh_u32 abcd[4]; + xxh_u32 max_leaves; + int best = XXH_SCALAR; +#if XXH_DISPATCH_AVX2 || XXH_DISPATCH_AVX512 + xxh_u64 xgetbv_val; +#endif +#if defined(__GNUC__) && defined(__i386__) + xxh_u32 cpuid_supported; + __asm__( + "# For the sake of ruthless backwards compatibility, check if CPUID\n\t" + "# is supported in the EFLAGS on i386.\n\t" + "# This is not necessary on x86_64 - CPUID is mandatory.\n\t" + "# The ID flag (bit 21) in the EFLAGS register indicates support\n\t" + "# for the CPUID instruction. If a software procedure can set and\n\t" + "# clear this flag, the processor executing the procedure supports\n\t" + "# the CPUID instruction.\n\t" + "# \n\t" + "#\n\t" + "# Routine is from .\n\t" + + "# Save EFLAGS\n\t" + XXH_I_ATT("pushfd", "pushfl" ) + "# Store EFLAGS\n\t" + XXH_I_ATT("pushfd", "pushfl" ) + "# Invert the ID bit in stored EFLAGS\n\t" + XXH_I_ATT("xor dword ptr[esp], 0x200000", "xorl $0x200000, (%%esp)") + "# Load stored EFLAGS (with ID bit inverted)\n\t" + XXH_I_ATT("popfd", "popfl" ) + "# Store EFLAGS again (ID bit may or not be inverted)\n\t" + XXH_I_ATT("pushfd", "pushfl" ) + "# eax = modified EFLAGS (ID bit may or may not be inverted)\n\t" + XXH_I_ATT("pop eax", "popl %%eax" ) + "# eax = whichever bits were changed\n\t" + XXH_I_ATT("xor eax, dword ptr[esp]", "xorl (%%esp), %%eax" ) + "# Restore original EFLAGS\n\t" + XXH_I_ATT("popfd", "popfl" ) + "# eax = zero if ID bit can't be changed, else non-zero\n\t" + XXH_I_ATT("and eax, 0x200000", "andl $0x200000, %%eax" ) + : "=a" (cpuid_supported) :: "cc"); + + if (XXH_unlikely(!cpuid_supported)) { + XXH_debugPrint("CPUID support is not detected!"); + return best; + } + +#endif + /* Check how many CPUID pages we have */ + XXH_cpuid(0, 0, abcd); + max_leaves = abcd[0]; + + /* Shouldn't happen on hardware, but happens on some QEMU configs. */ + if (XXH_unlikely(max_leaves == 0)) { + XXH_debugPrint("Max CPUID leaves == 0!"); + return best; + } + + /* Check for SSE2, OSXSAVE and xgetbv */ + XXH_cpuid(1, 0, abcd); + + /* + * Test for SSE2. The check is redundant on x86_64, but it doesn't hurt. 
+ */ + if (XXH_unlikely((abcd[3] & XXH_SSE2_CPUID_MASK) != XXH_SSE2_CPUID_MASK)) + return best; + + XXH_debugPrint("SSE2 support detected."); + + best = XXH_SSE2; +#if XXH_DISPATCH_AVX2 || XXH_DISPATCH_AVX512 + /* Make sure we have enough leaves */ + if (XXH_unlikely(max_leaves < 7)) + return best; + + /* Test for OSXSAVE and XGETBV */ + if ((abcd[2] & XXH_OSXSAVE_CPUID_MASK) != XXH_OSXSAVE_CPUID_MASK) + return best; + + /* CPUID check for AVX features */ + XXH_cpuid(7, 0, abcd); + + xgetbv_val = XXH_xgetbv(); +#if XXH_DISPATCH_AVX2 + /* Validate that AVX2 is supported by the CPU */ + if ((abcd[1] & XXH_AVX2_CPUID_MASK) != XXH_AVX2_CPUID_MASK) + return best; + + /* Validate that the OS supports YMM registers */ + if ((xgetbv_val & XXH_AVX2_XGETBV_MASK) != XXH_AVX2_XGETBV_MASK) { + XXH_debugPrint("AVX2 supported by the CPU, but not the OS."); + return best; + } + + /* AVX2 supported */ + XXH_debugPrint("AVX2 support detected."); + best = XXH_AVX2; +#endif +#if XXH_DISPATCH_AVX512 + /* Check if AVX512F is supported by the CPU */ + if ((abcd[1] & XXH_AVX512F_CPUID_MASK) != XXH_AVX512F_CPUID_MASK) { + XXH_debugPrint("AVX512F not supported by CPU"); + return best; + } + + /* Validate that the OS supports ZMM registers */ + if ((xgetbv_val & XXH_AVX512F_XGETBV_MASK) != XXH_AVX512F_XGETBV_MASK) { + XXH_debugPrint("AVX512F supported by the CPU, but not the OS."); + return best; + } + + /* AVX512F supported */ + XXH_debugPrint("AVX512F support detected."); + best = XXH_AVX512; +#endif +#endif + return best; +} + + +/* === Vector implementations === */ + +/*! @cond PRIVATE */ +/*! + * @private + * @brief Defines the various dispatch functions. + * + * TODO: Consolidate? + * + * @param suffix The suffix for the functions, e.g. sse2 or scalar + * @param target XXH_TARGET_* or empty. 
+ */ + +#define XXH_DEFINE_DISPATCH_FUNCS(suffix, target) \ + \ +/* === XXH3, default variants === */ \ + \ +XXH_NO_INLINE target XXH64_hash_t \ +XXHL64_default_##suffix(XXH_NOESCAPE const void* XXH_RESTRICT input, \ + size_t len) \ +{ \ + return XXH3_hashLong_64b_internal( \ + input, len, XXH3_kSecret, sizeof(XXH3_kSecret), \ + XXH3_accumulate_##suffix, XXH3_scrambleAcc_##suffix \ + ); \ +} \ + \ +/* === XXH3, Seeded variants === */ \ + \ +XXH_NO_INLINE target XXH64_hash_t \ +XXHL64_seed_##suffix(XXH_NOESCAPE const void* XXH_RESTRICT input, size_t len, \ + XXH64_hash_t seed) \ +{ \ + return XXH3_hashLong_64b_withSeed_internal( \ + input, len, seed, XXH3_accumulate_##suffix, \ + XXH3_scrambleAcc_##suffix, XXH3_initCustomSecret_##suffix \ + ); \ +} \ + \ +/* === XXH3, Secret variants === */ \ + \ +XXH_NO_INLINE target XXH64_hash_t \ +XXHL64_secret_##suffix(XXH_NOESCAPE const void* XXH_RESTRICT input, \ + size_t len, XXH_NOESCAPE const void* secret, \ + size_t secretLen) \ +{ \ + return XXH3_hashLong_64b_internal( \ + input, len, secret, secretLen, \ + XXH3_accumulate_##suffix, XXH3_scrambleAcc_##suffix \ + ); \ +} \ + \ +/* === XXH3 update variants === */ \ + \ +XXH_NO_INLINE target XXH_errorcode \ +XXH3_update_##suffix(XXH_NOESCAPE XXH3_state_t* state, \ + XXH_NOESCAPE const void* input, size_t len) \ +{ \ + return XXH3_update(state, (const xxh_u8*)input, len, \ + XXH3_accumulate_##suffix, XXH3_scrambleAcc_##suffix); \ +} \ + \ +/* === XXH128 default variants === */ \ + \ +XXH_NO_INLINE target XXH128_hash_t \ +XXHL128_default_##suffix(XXH_NOESCAPE const void* XXH_RESTRICT input, \ + size_t len) \ +{ \ + return XXH3_hashLong_128b_internal( \ + input, len, XXH3_kSecret, sizeof(XXH3_kSecret), \ + XXH3_accumulate_##suffix, XXH3_scrambleAcc_##suffix \ + ); \ +} \ + \ +/* === XXH128 Secret variants === */ \ + \ +XXH_NO_INLINE target XXH128_hash_t \ +XXHL128_secret_##suffix(XXH_NOESCAPE const void* XXH_RESTRICT input, \ + size_t len, \ + XXH_NOESCAPE const void* XXH_RESTRICT secret, \ + size_t secretLen) \ +{ \ + return XXH3_hashLong_128b_internal( \ + input, len, (const xxh_u8*)secret, secretLen, \ + XXH3_accumulate_##suffix, XXH3_scrambleAcc_##suffix); \ +} \ + \ +/* === XXH128 Seeded variants === */ \ + \ +XXH_NO_INLINE target XXH128_hash_t \ +XXHL128_seed_##suffix(XXH_NOESCAPE const void* XXH_RESTRICT input, size_t len,\ + XXH64_hash_t seed) \ +{ \ + return XXH3_hashLong_128b_withSeed_internal(input, len, seed, \ + XXH3_accumulate_##suffix, XXH3_scrambleAcc_##suffix, \ + XXH3_initCustomSecret_##suffix); \ +} + +/*! @endcond */ +/* End XXH_DEFINE_DISPATCH_FUNCS */ + +/*! @cond Doxygen ignores this part */ +#if XXH_DISPATCH_SCALAR +XXH_DEFINE_DISPATCH_FUNCS(scalar, /* nothing */) +#endif +XXH_DEFINE_DISPATCH_FUNCS(sse2, XXH_TARGET_SSE2) +#if XXH_DISPATCH_AVX2 +XXH_DEFINE_DISPATCH_FUNCS(avx2, XXH_TARGET_AVX2) +#endif +#if XXH_DISPATCH_AVX512 +XXH_DEFINE_DISPATCH_FUNCS(avx512, XXH_TARGET_AVX512) +#endif +#undef XXH_DEFINE_DISPATCH_FUNCS +/*! @endcond */ + +/* ==== Dispatchers ==== */ + +/*! 
@cond Doxygen ignores this part */ +typedef XXH64_hash_t (*XXH3_dispatchx86_hashLong64_default)(XXH_NOESCAPE const void* XXH_RESTRICT, size_t); + +typedef XXH64_hash_t (*XXH3_dispatchx86_hashLong64_withSeed)(XXH_NOESCAPE const void* XXH_RESTRICT, size_t, XXH64_hash_t); + +typedef XXH64_hash_t (*XXH3_dispatchx86_hashLong64_withSecret)(XXH_NOESCAPE const void* XXH_RESTRICT, size_t, XXH_NOESCAPE const void* XXH_RESTRICT, size_t); + +typedef XXH_errorcode (*XXH3_dispatchx86_update)(XXH_NOESCAPE XXH3_state_t*, XXH_NOESCAPE const void*, size_t); + +typedef struct { + XXH3_dispatchx86_hashLong64_default hashLong64_default; + XXH3_dispatchx86_hashLong64_withSeed hashLong64_seed; + XXH3_dispatchx86_hashLong64_withSecret hashLong64_secret; + XXH3_dispatchx86_update update; +} XXH_dispatchFunctions_s; + +#define XXH_NB_DISPATCHES 4 +/*! @endcond */ + +/*! + * @private + * @brief Table of dispatchers for @ref XXH3_64bits(). + * + * @pre The indices must match @ref XXH_VECTOR_TYPE. + */ +static const XXH_dispatchFunctions_s XXH_kDispatch[XXH_NB_DISPATCHES] = { +#if XXH_DISPATCH_SCALAR + /* Scalar */ { XXHL64_default_scalar, XXHL64_seed_scalar, XXHL64_secret_scalar, XXH3_update_scalar }, +#else + /* Scalar */ { NULL, NULL, NULL, NULL }, +#endif + /* SSE2 */ { XXHL64_default_sse2, XXHL64_seed_sse2, XXHL64_secret_sse2, XXH3_update_sse2 }, +#if XXH_DISPATCH_AVX2 + /* AVX2 */ { XXHL64_default_avx2, XXHL64_seed_avx2, XXHL64_secret_avx2, XXH3_update_avx2 }, +#else + /* AVX2 */ { NULL, NULL, NULL, NULL }, +#endif +#if XXH_DISPATCH_AVX512 + /* AVX512 */ { XXHL64_default_avx512, XXHL64_seed_avx512, XXHL64_secret_avx512, XXH3_update_avx512 } +#else + /* AVX512 */ { NULL, NULL, NULL, NULL } +#endif +}; +/*! + * @private + * @brief The selected dispatch table for @ref XXH3_64bits(). + */ +static XXH_dispatchFunctions_s XXH_g_dispatch = { NULL, NULL, NULL, NULL }; + + +/*! @cond Doxygen ignores this part */ +typedef XXH128_hash_t (*XXH3_dispatchx86_hashLong128_default)(XXH_NOESCAPE const void* XXH_RESTRICT, size_t); + +typedef XXH128_hash_t (*XXH3_dispatchx86_hashLong128_withSeed)(XXH_NOESCAPE const void* XXH_RESTRICT, size_t, XXH64_hash_t); + +typedef XXH128_hash_t (*XXH3_dispatchx86_hashLong128_withSecret)(XXH_NOESCAPE const void* XXH_RESTRICT, size_t, XXH_NOESCAPE const void* XXH_RESTRICT, size_t); + +typedef struct { + XXH3_dispatchx86_hashLong128_default hashLong128_default; + XXH3_dispatchx86_hashLong128_withSeed hashLong128_seed; + XXH3_dispatchx86_hashLong128_withSecret hashLong128_secret; + XXH3_dispatchx86_update update; +} XXH_dispatch128Functions_s; +/*! @endcond */ + + +/*! + * @private + * @brief Table of dispatchers for @ref XXH3_128bits(). + * + * @pre The indices must match @ref XXH_VECTOR_TYPE. + */ +static const XXH_dispatch128Functions_s XXH_kDispatch128[XXH_NB_DISPATCHES] = { +#if XXH_DISPATCH_SCALAR + /* Scalar */ { XXHL128_default_scalar, XXHL128_seed_scalar, XXHL128_secret_scalar, XXH3_update_scalar }, +#else + /* Scalar */ { NULL, NULL, NULL, NULL }, +#endif + /* SSE2 */ { XXHL128_default_sse2, XXHL128_seed_sse2, XXHL128_secret_sse2, XXH3_update_sse2 }, +#if XXH_DISPATCH_AVX2 + /* AVX2 */ { XXHL128_default_avx2, XXHL128_seed_avx2, XXHL128_secret_avx2, XXH3_update_avx2 }, +#else + /* AVX2 */ { NULL, NULL, NULL, NULL }, +#endif +#if XXH_DISPATCH_AVX512 + /* AVX512 */ { XXHL128_default_avx512, XXHL128_seed_avx512, XXHL128_secret_avx512, XXH3_update_avx512 } +#else + /* AVX512 */ { NULL, NULL, NULL, NULL } +#endif +}; + +/*! 
+ * @private + * @brief The selected dispatch table for @ref XXH3_64bits(). + */ +static XXH_dispatch128Functions_s XXH_g_dispatch128 = { NULL, NULL, NULL, NULL }; + +/*! + * @private + * @brief Runs a CPUID check and sets the correct dispatch tables. + */ +static XXH_CONSTRUCTOR void XXH_setDispatch(void) +{ + int vecID = XXH_featureTest(); + XXH_STATIC_ASSERT(XXH_AVX512 == XXH_NB_DISPATCHES-1); + assert(XXH_SCALAR <= vecID && vecID <= XXH_AVX512); +#if !XXH_DISPATCH_SCALAR + assert(vecID != XXH_SCALAR); +#endif +#if !XXH_DISPATCH_AVX512 + assert(vecID != XXH_AVX512); +#endif +#if !XXH_DISPATCH_AVX2 + assert(vecID != XXH_AVX2); +#endif + XXH_g_dispatch = XXH_kDispatch[vecID]; + XXH_g_dispatch128 = XXH_kDispatch128[vecID]; +} + + +/* ==== XXH3 public functions ==== */ +/*! @cond Doxygen ignores this part */ + +static XXH64_hash_t +XXH3_hashLong_64b_defaultSecret_selection(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + if (XXH_DISPATCH_MAYBE_NULL && XXH_g_dispatch.hashLong64_default == NULL) + XXH_setDispatch(); + return XXH_g_dispatch.hashLong64_default(input, len); +} + +XXH64_hash_t XXH3_64bits_dispatch(XXH_NOESCAPE const void* input, size_t len) +{ + return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_defaultSecret_selection); +} + +static XXH64_hash_t +XXH3_hashLong_64b_withSeed_selection(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)secret; (void)secretLen; + if (XXH_DISPATCH_MAYBE_NULL && XXH_g_dispatch.hashLong64_seed == NULL) + XXH_setDispatch(); + return XXH_g_dispatch.hashLong64_seed(input, len, seed64); +} + +XXH64_hash_t XXH3_64bits_withSeed_dispatch(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed_selection); +} + +static XXH64_hash_t +XXH3_hashLong_64b_withSecret_selection(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; + if (XXH_DISPATCH_MAYBE_NULL && XXH_g_dispatch.hashLong64_secret == NULL) + XXH_setDispatch(); + return XXH_g_dispatch.hashLong64_secret(input, len, secret, secretLen); +} + +XXH64_hash_t XXH3_64bits_withSecret_dispatch(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretLen) +{ + return XXH3_64bits_internal(input, len, 0, secret, secretLen, XXH3_hashLong_64b_withSecret_selection); +} + +XXH_errorcode +XXH3_64bits_update_dispatch(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) +{ + if (XXH_DISPATCH_MAYBE_NULL && XXH_g_dispatch.update == NULL) + XXH_setDispatch(); + + return XXH_g_dispatch.update(state, (const xxh_u8*)input, len); +} + +/*! @endcond */ + + +/* ==== XXH128 public functions ==== */ +/*! 
@cond Doxygen ignores this part */ + +static XXH128_hash_t +XXH3_hashLong_128b_defaultSecret_selection(const void* input, size_t len, + XXH64_hash_t seed64, const void* secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + if (XXH_DISPATCH_MAYBE_NULL && XXH_g_dispatch128.hashLong128_default == NULL) + XXH_setDispatch(); + return XXH_g_dispatch128.hashLong128_default(input, len); +} + +XXH128_hash_t XXH3_128bits_dispatch(XXH_NOESCAPE const void* input, size_t len) +{ + return XXH3_128bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_128b_defaultSecret_selection); +} + +static XXH128_hash_t +XXH3_hashLong_128b_withSeed_selection(const void* input, size_t len, + XXH64_hash_t seed64, const void* secret, size_t secretLen) +{ + (void)secret; (void)secretLen; + if (XXH_DISPATCH_MAYBE_NULL && XXH_g_dispatch128.hashLong128_seed == NULL) + XXH_setDispatch(); + return XXH_g_dispatch128.hashLong128_seed(input, len, seed64); +} + +XXH128_hash_t XXH3_128bits_withSeed_dispatch(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_128b_withSeed_selection); +} + +static XXH128_hash_t +XXH3_hashLong_128b_withSecret_selection(const void* input, size_t len, + XXH64_hash_t seed64, const void* secret, size_t secretLen) +{ + (void)seed64; + if (XXH_DISPATCH_MAYBE_NULL && XXH_g_dispatch128.hashLong128_secret == NULL) + XXH_setDispatch(); + return XXH_g_dispatch128.hashLong128_secret(input, len, secret, secretLen); +} + +XXH128_hash_t XXH3_128bits_withSecret_dispatch(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretLen) +{ + return XXH3_128bits_internal(input, len, 0, secret, secretLen, XXH3_hashLong_128b_withSecret_selection); +} + +XXH_errorcode +XXH3_128bits_update_dispatch(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) +{ + if (XXH_DISPATCH_MAYBE_NULL && XXH_g_dispatch128.update == NULL) + XXH_setDispatch(); + return XXH_g_dispatch128.update(state, (const xxh_u8*)input, len); +} + +/*! @endcond */ + +#if defined (__cplusplus) +} +#endif +/*! @} */ diff --git a/external/xxhash/xxh_x86dispatch.h b/external/xxhash/xxh_x86dispatch.h new file mode 100644 index 0000000..7085221 --- /dev/null +++ b/external/xxhash/xxh_x86dispatch.h @@ -0,0 +1,93 @@ +/* + * xxHash - XXH3 Dispatcher for x86-based targets + * Copyright (C) 2020-2024 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +#ifndef XXH_X86DISPATCH_H_13563687684 +#define XXH_X86DISPATCH_H_13563687684 + +#include "xxhash.h" /* XXH64_hash_t, XXH3_state_t */ + +#if defined (__cplusplus) +extern "C" { +#endif + +/*! + * @brief Returns the best XXH3 implementation for x86 + * + * @return The best @ref XXH_VECTOR implementation. + * @see XXH_VECTOR_TYPES + */ +XXH_PUBLIC_API int XXH_featureTest(void); + +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_dispatch(XXH_NOESCAPE const void* input, size_t len); +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed_dispatch(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed); +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret_dispatch(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretLen); +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update_dispatch(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len); + +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_dispatch(XXH_NOESCAPE const void* input, size_t len); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed_dispatch(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret_dispatch(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretLen); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update_dispatch(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len); + +#if defined (__cplusplus) +} +#endif + + +/* automatic replacement of XXH3 functions. 
+ * can be disabled by setting XXH_DISPATCH_DISABLE_REPLACE */ +#ifndef XXH_DISPATCH_DISABLE_REPLACE + +# undef XXH3_64bits +# define XXH3_64bits XXH3_64bits_dispatch +# undef XXH3_64bits_withSeed +# define XXH3_64bits_withSeed XXH3_64bits_withSeed_dispatch +# undef XXH3_64bits_withSecret +# define XXH3_64bits_withSecret XXH3_64bits_withSecret_dispatch +# undef XXH3_64bits_update +# define XXH3_64bits_update XXH3_64bits_update_dispatch + +# undef XXH128 +# define XXH128 XXH3_128bits_withSeed_dispatch +# undef XXH3_128bits +# define XXH3_128bits XXH3_128bits_dispatch +# undef XXH3_128bits_withSeed +# define XXH3_128bits_withSeed XXH3_128bits_withSeed_dispatch +# undef XXH3_128bits_withSecret +# define XXH3_128bits_withSecret XXH3_128bits_withSecret_dispatch +# undef XXH3_128bits_update +# define XXH3_128bits_update XXH3_128bits_update_dispatch + +#endif /* XXH_DISPATCH_DISABLE_REPLACE */ + +#endif /* XXH_X86DISPATCH_H_13563687684 */ diff --git a/external/xxhash/xxhash.c b/external/xxhash/xxhash.c new file mode 100644 index 0000000..e60cc37 --- /dev/null +++ b/external/xxhash/xxhash.c @@ -0,0 +1,42 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Copyright (C) 2012-2023 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +/* + * xxhash.c instantiates functions defined in xxhash.h + */ + +#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */ +#define XXH_IMPLEMENTATION /* access definitions */ + +#include "xxhash.h" diff --git a/external/xxhash/xxhash.d b/external/xxhash/xxhash.d new file mode 100644 index 0000000..cff5682 --- /dev/null +++ b/external/xxhash/xxhash.d @@ -0,0 +1,3332 @@ +/** + * Computes xxHash hashes of arbitrary data. xxHash hashes are either uint32_t, + * uint64_t or uint128_t quantities that are like a + * checksum or CRC, but are more robust and very performant. 
+ * +$(SCRIPT inhibitQuickIndex = 1;) + +$(DIVC quickindex, +$(BOOKTABLE , +$(TR $(TH Category) $(TH Functions) +) +$(TR $(TDNW Template API) $(TD $(MYREF XXHTemplate) +) +) +$(TR $(TDNW OOP API) $(TD $(MYREF XXH32Digest)) +) +$(TR $(TDNW Helpers) $(TD $(MYREF xxh32Of)) +) +) +) + + * This module conforms to the APIs defined in `std.digest`. To understand the + * differences between the template and the OOP API, see $(MREF std, digest). + * + * This module publicly imports $(MREF std, digest) and can be used as a stand-alone + * module. + * + * License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0). + * + * CTFE: + * Digests do not work in CTFE + * + * Authors: + * Carsten Schlote, Piotr Szturmaj, Kai Nacke, Johannes Pfau $(BR) + * The routines and algorithms are provided by the xxhash.[ch] source + * provided at $(I git@github.com:Cyan4973/xxHash.git). + * + * References: + * $(LINK2 https://github.com/Cyan4973/xxHash, GitHub website of project) + * + * Source: $(PHOBOSSRC std/digest/xxh.d) + * + */ + +/* xxh.d - A wrapper for the original C implementation */ +//module std.digest.xxh; +module xxhash3; + +version (X86) + version = HaveUnalignedLoads; +else version (X86_64) + version = HaveUnalignedLoads; + +//TODO: Properly detect this requirement. +//version = CheckACCAlignment; + +//TODO: Check, if this code is an advantage over XXH provided code +// The code from core.int128 doesn't inline. +//version = Have128BitInteger; + +version (Have128BitInteger) +{ + import core.int128; +} + +/// +@safe unittest +{ + //Template API + import xxhash3 : XXH_32; + + //Feeding data + ubyte[1024] data; + XXH_32 xxh; + xxh.start(); + xxh.put(data[]); + xxh.start(); //Start again + xxh.put(data[]); + auto hash = xxh.finish(); +} + +/// +@safe unittest +{ + //OOP API + import xxhash3 : XXH32Digest; + + auto xxh = new XXH32Digest(); + ubyte[] hash = xxh.digest("abc"); + assert(toHexString(hash) == "32D153FF", "Got " ~ toHexString(hash)); + + //Feeding data + ubyte[1024] data; + xxh.put(data[]); + xxh.reset(); //Start again + xxh.put(data[]); + hash = xxh.finish(); +} + +public import std.digest; + +/* --- Port of C sources (release 0.8.1) to D language below ---------------- */ + +enum XXH_NO_STREAM = false; +enum XXH_SIZE_OPT = 0; +enum XXH_FORCE_ALIGN_CHECK = true; +enum XXH32_ENDJMP = false; + +private import core.bitop : rol, bswap; +private import std.exception : enforce; +private import object : Exception; + +/** Thrown on XXH errors. 
*/ +class XXHException : Exception +{ + import std.exception : basicExceptionCtors; + /// + mixin basicExceptionCtors; +} +/// +@safe unittest +{ + import std.exception : enforce, assertThrown; + assertThrown(enforce!XXHException(false, "Throw me...")); +} + +/* ************************************* +* Misc +***************************************/ + +alias XXH32_hash_t = uint; +alias XXH64_hash_t = ulong; +/** Storage for 128bit hash digest */ +align(16) struct XXH128_hash_t +{ + XXH64_hash_t low64; /** `value & 0xFFFFFFFFFFFFFFFF` */ + XXH64_hash_t high64; /** `value >> 64` */ +} + +alias XXH32_canonical_t = ubyte[XXH32_hash_t.sizeof]; +static assert(XXH32_canonical_t.sizeof == 4, "32bit integers should be 4 bytes?"); +alias XXH64_canonical_t = ubyte[XXH64_hash_t.sizeof]; +static assert(XXH64_hash_t.sizeof == 8, "64bit integers should be 8 bytes?"); +alias XXH128_canonical_t = ubyte[XXH128_hash_t.sizeof]; +static assert(XXH128_hash_t.sizeof == 16, "128bit integers should be 16 bytes?"); + +enum XXH_VERSION_MAJOR = 0; /** XXHASH Major version */ +enum XXH_VERSION_MINOR = 8; /** XXHASH Minor version */ +enum XXH_VERSION_RELEASE = 1; /** XXHASH Build/Release version */ + +/** Version number, encoded as two digits each */ +enum XXH_VERSION_NUMBER = + (XXH_VERSION_MAJOR * 100 * 100 + + XXH_VERSION_MINOR * 100 + + XXH_VERSION_RELEASE); + +/** Get version number */ +uint xxh_versionNumber() @safe pure nothrow @nogc +{ + return XXH_VERSION_NUMBER; +} +/// +@safe unittest +{ + assert(XXH_VERSION_NUMBER == xxh_versionNumber(), "Version mismatch"); +} + +/** The error code of public API functions */ +enum XXH_errorcode +{ + XXH_OK = 0, /** OK */ + XXH_ERROR /** Error */ +} + +/** Structure for XXH32 streaming API. + * + * See: XXH64_state_s, XXH3_state_s + */ +struct XXH32_state_t +{ + XXH32_hash_t total_len_32; /** Total length hashed, modulo 2^32 */ + XXH32_hash_t large_len; /** Whether the hash is >= 16 (handles total_len_32 overflow) */ + XXH32_hash_t[4] v; /** Accumulator lanes */ + XXH32_hash_t[4] mem32; /** Internal buffer for partial reads. Treated as unsigned char[16]. */ + XXH32_hash_t memsize; /** Amount of data in mem32 */ + XXH32_hash_t reserved; /** Reserved field. Do not read nor write to it. */ +} + +/* Structure for XXH64 streaming API. + * + * See: XXH32_state_s, XXH3_state_s + */ +struct XXH64_state_t +{ + XXH64_hash_t total_len; /** Total length hashed. This is always 64-bit. */ + XXH64_hash_t[4] v; /** Accumulator lanes */ + XXH64_hash_t[4] mem64; /** Internal buffer for partial reads. Treated as unsigned char[32]. */ + XXH32_hash_t memsize; /** Amount of data in mem64 */ + XXH32_hash_t reserved32; /** Reserved field, needed for padding anyways*/ + XXH64_hash_t reserved64; /** Reserved field. Do not read or write to it. */ +} /* typedef'd to XXH64_state_t */ + + +/* *************************** +* Memory reads +*****************************/ + +/** Enum to indicate whether a pointer is aligned. + */ +private enum XXH_alignment +{ + XXH_aligned, /** Aligned */ + XXH_unaligned /** Possibly unaligned */ +} + +private uint xxh_read32(const void* ptr) @trusted pure nothrow @nogc +{ + uint val; + version (HaveUnalignedLoads) + val = *(cast(uint*) ptr); + else + (cast(ubyte*)&val)[0 .. uint.sizeof] = (cast(ubyte*) ptr)[0 .. 
uint.sizeof]; + return val; +} + +private uint xxh_readLE32(const void* ptr) @safe pure nothrow @nogc +{ + version (LittleEndian) + return xxh_read32(ptr); + else + return bswap(xxh_read32(ptr)); +} + +private uint xxh_readBE32(const void* ptr) @safe pure nothrow @nogc +{ + version (LittleEndian) + return bswap(xxh_read32(ptr)); + else + return xxh_read32(ptr); +} + +private uint xxh_readLE32_align(const void* ptr, XXH_alignment align_) @trusted pure nothrow @nogc +{ + if (align_ == XXH_alignment.XXH_unaligned) + { + return xxh_readLE32(ptr); + } + else + { + version (LittleEndian) + return *cast(const uint*) ptr; + else + return bswap(*cast(const uint*) ptr); + } +} + +/* ******************************************************************* +* 32-bit hash functions +*********************************************************************/ + +enum XXH_PRIME32_1 = 0x9E3779B1U; /** 0b10011110001101110111100110110001 */ +enum XXH_PRIME32_2 = 0x85EBCA77U; /** 0b10000101111010111100101001110111 */ +enum XXH_PRIME32_3 = 0xC2B2AE3DU; /** 0b11000010101100101010111000111101 */ +enum XXH_PRIME32_4 = 0x27D4EB2FU; /** 0b00100111110101001110101100101111 */ +enum XXH_PRIME32_5 = 0x165667B1U; /** 0b00010110010101100110011110110001 */ + +/** Normal stripe processing routine. + * + * This shuffles the bits so that any bit from @p input impacts several bits in + * acc. + * + * Param: acc = The accumulator lane. + * Param: input = The stripe of input to mix. + * Return: The mixed accumulator lane. + */ +private uint xxh32_round(uint acc, uint input) @safe pure nothrow @nogc +{ + acc += input * XXH_PRIME32_2; + acc = rol(acc, 13); + acc *= XXH_PRIME32_1; + return acc; +} + +/** Mixes all bits to finalize the hash. + * + * The final mix ensures that all input bits have a chance to impact any bit in + * the output digest, resulting in an unbiased distribution. + * + * Param: hash = The hash to avalanche. + * Return The avalanched hash. + */ +private uint xxh32_avalanche(uint hash) @safe pure nothrow @nogc +{ + hash ^= hash >> 15; + hash *= XXH_PRIME32_2; + hash ^= hash >> 13; + hash *= XXH_PRIME32_3; + hash ^= hash >> 16; + return hash; +} + +/* Alias wrapper for xxh_readLE32_align() */ +private uint xxh_get32bits(const void* p, XXH_alignment align_) @safe pure nothrow @nogc +{ + return xxh_readLE32_align(p, align_); +} + +/** Processes the last 0-15 bytes of @p ptr. + * + * There may be up to 15 bytes remaining to consume from the input. + * This final stage will digest them to ensure that all input bytes are present + * in the final mix. + * + * Param: hash = The hash to finalize. + * Param: ptr = The pointer to the remaining input. + * Param: len = The remaining length, modulo 16. + * Param: align = Whether @p ptr is aligned. + * Return: The finalized hash. + * See: xxh64_finalize(). 
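+ *
+ * A minimal, slice-based sketch of the compact (non-ENDJMP) path below;
+ * `tail` is a hypothetical helper shown only for clarity, and it receives
+ * just the final len % 16 bytes:
+ * ---
+ * uint tail(uint hash, const(ubyte)[] rest)
+ * {
+ *     while (rest.length >= 4) // XXH_PROCESS4
+ *     {
+ *         hash += xxh_readLE32(rest.ptr) * XXH_PRIME32_3;
+ *         hash = rol(hash, 17) * XXH_PRIME32_4;
+ *         rest = rest[4 .. $];
+ *     }
+ *     foreach (b; rest) // XXH_PROCESS1
+ *     {
+ *         hash += b * XXH_PRIME32_5;
+ *         hash = rol(hash, 11) * XXH_PRIME32_1;
+ *     }
+ *     return xxh32_avalanche(hash);
+ * }
+ * ---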
+ */ +private uint xxh32_finalize(uint hash, const(ubyte)* ptr, size_t len, XXH_alignment align_) + @trusted pure nothrow @nogc +{ + static void XXH_PROCESS1(ref uint hash, ref const(ubyte)* ptr) + { + hash += (*ptr++) * XXH_PRIME32_5; + hash = rol(hash, 11) * XXH_PRIME32_1; + } + + void XXH_PROCESS4(ref uint hash, ref const(ubyte)* ptr) + { + hash += xxh_get32bits(ptr, align_) * XXH_PRIME32_3; + ptr += 4; + hash = rol(hash, 17) * XXH_PRIME32_4; + } + + /* Compact rerolled version; generally faster */ + if (!XXH32_ENDJMP) + { + len &= 15; + while (len >= 4) + { + XXH_PROCESS4(hash, ptr); + len -= 4; + } + while (len > 0) + { + XXH_PROCESS1(hash, ptr); + --len; + } + return xxh32_avalanche(hash); + } + else + { + switch (len & 15) /* or switch (bEnd - p) */ + { + case 12: + XXH_PROCESS4(hash, ptr); + goto case; + case 8: + XXH_PROCESS4(hash, ptr); + goto case; + case 4: + XXH_PROCESS4(hash, ptr); + return xxh32_avalanche(hash); + + case 13: + XXH_PROCESS4(hash, ptr); + goto case; + case 9: + XXH_PROCESS4(hash, ptr); + goto case; + case 5: + XXH_PROCESS4(hash, ptr); + XXH_PROCESS1(hash, ptr); + return xxh32_avalanche(hash); + + case 14: + XXH_PROCESS4(hash, ptr); + goto case; + case 10: + XXH_PROCESS4(hash, ptr); + goto case; + case 6: + XXH_PROCESS4(hash, ptr); + XXH_PROCESS1(hash, ptr); + XXH_PROCESS1(hash, ptr); + return xxh32_avalanche(hash); + + case 15: + XXH_PROCESS4(hash, ptr); + goto case; + case 11: + XXH_PROCESS4(hash, ptr); + goto case; + case 7: + XXH_PROCESS4(hash, ptr); + goto case; + case 3: + XXH_PROCESS1(hash, ptr); + goto case; + case 2: + XXH_PROCESS1(hash, ptr); + goto case; + case 1: + XXH_PROCESS1(hash, ptr); + goto case; + case 0: + return xxh32_avalanche(hash); + default: + assert(0, "Internal error"); + } + return hash; /* reaching this point is deemed impossible */ + } +} + +/** The implementation for XXH32(). + * + * Params: + * input = Directly passed from XXH32(). + * len = Ditto + * seed = Ditto + * align_ = Whether input is aligned. + * Return: The calculated hash. + */ +private uint xxh32_endian_align( + const(ubyte)* input, size_t len, uint seed, XXH_alignment align_) + @trusted pure nothrow @nogc +{ + uint h32; + + if (len >= 16) + { + const ubyte* bEnd = input + len; + const ubyte* limit = bEnd - 15; + uint v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2; + uint v2 = seed + XXH_PRIME32_2; + uint v3 = seed + 0; + uint v4 = seed - XXH_PRIME32_1; + + do + { + v1 = xxh32_round(v1, xxh_get32bits(input, align_)); + input += 4; + v2 = xxh32_round(v2, xxh_get32bits(input, align_)); + input += 4; + v3 = xxh32_round(v3, xxh_get32bits(input, align_)); + input += 4; + v4 = xxh32_round(v4, xxh_get32bits(input, align_)); + input += 4; + } + while (input < limit); + + h32 = rol(v1, 1) + rol(v2, 7) + rol(v3, 12) + rol(v4, 18); + } + else + { + h32 = seed + XXH_PRIME32_5; + } + + h32 += cast(uint) len; + + return xxh32_finalize(h32, input, len & 15, align_); +} + +/* XXH PUBLIC API - hidden in D module ! 
*/ +/** Calculate a XXH32 digest on provided data + * + * Params: + * input = Pointer to data + * len = length of datablock + * seed = seed value + * Returns: a XXH32_hash_t + */ +private XXH32_hash_t XXH32(const void* input, size_t len, XXH32_hash_t seed) + @safe pure nothrow @nogc +{ + static if (!XXH_NO_STREAM && XXH_SIZE_OPT >= 2) + { + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_state_t state; + auto rc = xxh32_reset(&state, seed); + if (rc == XXH_errorcode.XXH_OK) + { + rc = xxh32_update(&state, cast(const(ubyte)*) input, len); + if (rc == XXH_errorcode.XXH_OK) + { + return xxh32_digest(&state); + } + } + return XXH_errorcode.XXH_ERROR; + } + else + { + if (XXH_FORCE_ALIGN_CHECK) + { + if (((cast(size_t) input) & 3) == 0) + { /* Input is 4-bytes aligned, leverage the speed benefit */ + return xxh32_endian_align(cast(const(ubyte)*) input, len, + seed, XXH_alignment.XXH_aligned); + } + } + + return xxh32_endian_align(cast(const(ubyte)*) input, len, seed, + XXH_alignment.XXH_unaligned); + } +} + +/* XXH PUBLIC API - hidden in D module */ +/** Reset state with seed + * + * Params: + * state = Pointer to state structure + * seed = A seed value + * Returns: XXH_errorcode.OK or XXH_errorcode.FAIL + */ +private XXH_errorcode xxh32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) + @safe pure nothrow @nogc +in (statePtr != null, "statePtr is null") +{ + *statePtr = XXH32_state_t.init; + statePtr.v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2; + statePtr.v[1] = seed + XXH_PRIME32_2; + statePtr.v[2] = seed + 0; + statePtr.v[3] = seed - XXH_PRIME32_1; + return XXH_errorcode.XXH_OK; +} + +/* XXH PUBLIC API - hidden in D module */ +/** Update the state with some more data + * + * Params: + * state = Pointer to state structure + * input = A pointer to data + * len = length of input data + * Returns: XXH_errorcode.OK or XXH_errorcode.FAIL + */ +private XXH_errorcode xxh32_update(XXH32_state_t* state, const void* input, size_t len) + @trusted pure nothrow @nogc +in +{ + if (input == null) assert(len == 0, "input null ptr only allowed with len == 0"); +} +do +{ + if (input == null && len == 0) + return XXH_errorcode.XXH_OK; + else if (input == null && len != 0) + return XXH_errorcode.XXH_ERROR; + else + { + const(ubyte)* p = cast(const(ubyte)*) input; + const ubyte* bEnd = p + len; + + state.total_len_32 += cast(XXH32_hash_t) len; + state.large_len |= cast(XXH32_hash_t)((len >= 16) | (state.total_len_32 >= 16)); + + if (state.memsize + len < 16) + { + /* fill in tmp buffer */ + (cast(ubyte*) state.mem32)[state.memsize .. state.memsize + len] = (cast(ubyte*) input)[0 .. len]; + state.memsize += cast(XXH32_hash_t) len; + return XXH_errorcode.XXH_OK; + } + + if (state.memsize) + { + /* some data left from previous update */ + (cast(ubyte*) state.mem32)[state.memsize .. state.memsize + (16 - state.memsize)] = + (cast(ubyte*) input)[0 .. 
(16 - state.memsize)]; + { + const(uint)* p32 = cast(const(uint)*)&state.mem32[0]; + state.v[0] = xxh32_round(state.v[0], xxh_readLE32(p32)); + p32++; + state.v[1] = xxh32_round(state.v[1], xxh_readLE32(p32)); + p32++; + state.v[2] = xxh32_round(state.v[2], xxh_readLE32(p32)); + p32++; + state.v[3] = xxh32_round(state.v[3], xxh_readLE32(p32)); + } + p += 16 - state.memsize; + state.memsize = 0; + } + + if (p <= bEnd - 16) + { + const ubyte* limit = bEnd - 16; + + do + { + state.v[0] = xxh32_round(state.v[0], xxh_readLE32(p)); + p += 4; + state.v[1] = xxh32_round(state.v[1], xxh_readLE32(p)); + p += 4; + state.v[2] = xxh32_round(state.v[2], xxh_readLE32(p)); + p += 4; + state.v[3] = xxh32_round(state.v[3], xxh_readLE32(p)); + p += 4; + } + while (p <= limit); + + } + + if (p < bEnd) + { + (cast(ubyte*) state.mem32)[0 .. cast(size_t)(bEnd - p) ] = + (cast(ubyte*) p)[0 .. cast(size_t)(bEnd - p) ]; + state.memsize = cast(XXH32_hash_t)(bEnd - p); + } + } + + return XXH_errorcode.XXH_OK; +} + +/* XXH PUBLIC API - hidden in D module */ +/** Finalize state and return the final XXH32 digest + * + * Params: + * state = Pointer to state structure + * Returns: the final XXH32 digest + */ +private XXH32_hash_t xxh32_digest(const XXH32_state_t* state) + @trusted pure nothrow @nogc +{ + uint h32; + + if (state.large_len) + { + h32 = rol(state.v[0], 1) + rol(state.v[1], 7) + + rol(state.v[2], 12) + rol(state.v[3], 18); + } + else + { + h32 = state.v[2] /* == seed */ + XXH_PRIME32_5; + } + + h32 += state.total_len_32; + + return xxh32_finalize(h32, cast(const ubyte*) state.mem32, state.memsize, + XXH_alignment.XXH_aligned); +} + +/* XXH PUBLIC API - hidden in D module */ +/** Covert the XXH32_hash_t to a byte array of same size + * + * Params: + * dst = Pointer to target storage + * hash = a XXH32_hash_t value + * Returns: nothing + */ +private void xxh32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) + @trusted pure nothrow @nogc +{ + static assert((XXH32_canonical_t).sizeof == (XXH32_hash_t).sizeof, + "(XXH32_canonical_t).sizeof != (XXH32_hash_t).sizeof"); + version (LittleEndian) + hash = bswap(hash); + (cast(ubyte*) dst) [0 .. dst.sizeof] = (cast(ubyte*) &hash) [0 .. dst.sizeof]; +} + +/* XXH PUBLIC API - hidden in D module */ +/** Covert the XXH32_hash_t to a byte array of same size + * + * Params: + * src = Pointer to source storage + * Returns: the converted value as XXH32_hash_t + */ + private XXH32_hash_t xxh32_hashFromCanonical(const XXH32_canonical_t* src) @safe pure nothrow @nogc +{ + return xxh_readBE32(src); +} + +/* ----------------------------------------------------------------------------------------*/ + +/* Helper functions to read 64bit data quantities from memory follow */ + +private ulong xxh_read64(const void* ptr) + @trusted pure nothrow @nogc +{ + ulong val; + version (HaveUnalignedLoads) + val = *(cast(ulong*) ptr); + else + (cast(ubyte*)&val)[0 .. ulong.sizeof] = (cast(ubyte*) ptr)[0 .. 
ulong.sizeof]; + return val; +} + +private ulong xxh_readLE64(const void* ptr) + @safe pure nothrow @nogc +{ + version (LittleEndian) + return xxh_read64(ptr); + else + return bswap(xxh_read64(ptr)); +} + +private ulong xxh_readBE64(const void* ptr) + @safe pure nothrow @nogc +{ + version (LittleEndian) + return bswap(xxh_read64(ptr)); + else + return xxh_read64(ptr); +} + +private ulong xxh_readLE64_align(const void* ptr, XXH_alignment align_) + @trusted pure nothrow @nogc +{ + if (align_ == XXH_alignment.XXH_unaligned) + { + return xxh_readLE64(ptr); + } + else + { + version (LittleEndian) + return *cast(const ulong*) ptr; + else + return bswap(*cast(const ulong*) ptr); + } +} + +enum XXH_PRIME64_1 = 0x9E3779B185EBCA87; /** 0b1001111000110111011110011011000110000101111010111100101010000111 */ +enum XXH_PRIME64_2 = 0xC2B2AE3D27D4EB4F; /** 0b1100001010110010101011100011110100100111110101001110101101001111 */ +enum XXH_PRIME64_3 = 0x165667B19E3779F9; /** 0b0001011001010110011001111011000110011110001101110111100111111001 */ +enum XXH_PRIME64_4 = 0x85EBCA77C2B2AE63; /** 0b1000010111101011110010100111011111000010101100101010111001100011 */ +enum XXH_PRIME64_5 = 0x27D4EB2F165667C5; /** 0b0010011111010100111010110010111100010110010101100110011111000101 */ + +private ulong xxh64_round(ulong acc, ulong input) + @safe pure nothrow @nogc +{ + acc += input * XXH_PRIME64_2; + acc = rol(acc, 31); + acc *= XXH_PRIME64_1; + return acc; +} + +private ulong xxh64_mergeRound(ulong acc, ulong val) + @safe pure nothrow @nogc +{ + val = xxh64_round(0, val); + acc ^= val; + acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4; + return acc; +} + +private ulong xxh64_avalanche(ulong hash) + @safe pure nothrow @nogc +{ + hash ^= hash >> 33; + hash *= XXH_PRIME64_2; + hash ^= hash >> 29; + hash *= XXH_PRIME64_3; + hash ^= hash >> 32; + return hash; +} + +ulong xxh_get64bits(const void* p, XXH_alignment align_) + @safe pure nothrow @nogc +{ + return xxh_readLE64_align(p, align_); +} + +/** Processes the last 0-31 bytes of data at ptr addr. + * + * There may be up to 31 bytes remaining to consume from the input. + * This final stage will digest them to ensure that all input bytes are present + * in the final mix. + * + * Param: hash = The hash to finalize. + * Param: ptr = The pointer to the remaining input. + * Param: len = The remaining length, modulo 32. + * Param: align = Whether @p ptr is aligned. + * Return: The finalized hash + * See: xxh32_finalize(). + */ +private ulong xxh64_finalize(ulong hash, const(ubyte)* ptr, size_t len, XXH_alignment align_) + @trusted pure nothrow @nogc +in +{ + if (ptr == null) assert(len == 0, "input null ptr only allowed with len == 0"); +} +do +{ + len &= 31; + while (len >= 8) + { + const ulong k1 = xxh64_round(0, xxh_get64bits(ptr, align_)); + ptr += 8; + hash ^= k1; + hash = rol(hash, 27) * XXH_PRIME64_1 + XXH_PRIME64_4; + len -= 8; + } + if (len >= 4) + { + hash ^= cast(ulong)(xxh_get32bits(ptr, align_)) * XXH_PRIME64_1; + ptr += 4; + hash = rol(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; + len -= 4; + } + while (len > 0) + { + hash ^= (*ptr++) * XXH_PRIME64_5; + hash = rol(hash, 11) * XXH_PRIME64_1; + --len; + } + return xxh64_avalanche(hash); +} + +/** The implementation for XXH64(). + * + * Params: + * input = pointer to input data, directly passed from XXH64() + * len = length of input data, directly passed from XXH64() + * seed = Seed value, directly passed from XXH64() + * align = Whether input pointer is aligned. + * Return: The calculated hash. 
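+ *
+ * Inputs of 32 bytes or more are consumed in 32-byte stripes across four
+ * accumulator lanes (one xxh64_round() per 8-byte word and lane); the lanes
+ * are then folded together with xxh64_mergeRound() and any remaining tail
+ * bytes are handled by xxh64_finalize().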
+ */ +private ulong xxh64_endian_align( + const(ubyte)* input, size_t len, + ulong seed, XXH_alignment align_) + @trusted pure nothrow @nogc +in +{ + if (input == null) assert(len == 0, "input null ptr only allowed with len == 0"); +} +do +{ + ulong h64; + + if (len >= 32) + { + const ubyte* bEnd = input + len; + const ubyte* limit = bEnd - 31; + ulong v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2; + ulong v2 = seed + XXH_PRIME64_2; + ulong v3 = seed + 0; + ulong v4 = seed - XXH_PRIME64_1; + + do + { + v1 = xxh64_round(v1, xxh_get64bits(input, align_)); + input += 8; + v2 = xxh64_round(v2, xxh_get64bits(input, align_)); + input += 8; + v3 = xxh64_round(v3, xxh_get64bits(input, align_)); + input += 8; + v4 = xxh64_round(v4, xxh_get64bits(input, align_)); + input += 8; + } + while (input < limit); + + h64 = rol(v1, 1) + rol(v2, 7) + rol(v3, 12) + rol(v4, 18); + h64 = xxh64_mergeRound(h64, v1); + h64 = xxh64_mergeRound(h64, v2); + h64 = xxh64_mergeRound(h64, v3); + h64 = xxh64_mergeRound(h64, v4); + + } + else + { + h64 = seed + XXH_PRIME64_5; + } + + h64 += cast(ulong) len; + + return xxh64_finalize(h64, input, len, align_); +} + +/* XXH PUBLIC API - hidden in D module */ + +private XXH64_hash_t XXH64(const void* input, size_t len, XXH64_hash_t seed) + @safe pure nothrow @nogc +{ + static if (!XXH_NO_STREAM && XXH_SIZE_OPT >= 2) + { + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + auto rc = xxh64_reset(&state, seed); + if (rc == XXH_errorcode.XXH_OK) + { + rc = xxh64_update(&state, cast(const(ubyte)*) input, len); + if (rc == XXH_errorcode.XXH_OK) + { + return xxh64_digest(&state); + } + } + return XXH_errorcode.XXH_ERROR; + } + else + { + if (XXH_FORCE_ALIGN_CHECK) + { + if (((cast(size_t) input) & 7) == 0) + { /* Input is aligned, let's leverage the speed advantage */ + return xxh64_endian_align(cast(const(ubyte)*) input, len, + seed, XXH_alignment.XXH_aligned); + } + } + + return xxh64_endian_align(cast(const(ubyte)*) input, len, seed, + XXH_alignment.XXH_unaligned); + } +} + +/* XXH PUBLIC API - hidden in D module */ +private XXH_errorcode xxh64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed) + @safe pure nothrow @nogc +{ + assert(statePtr != null, "statePtr == null"); + *statePtr = XXH64_state_t.init; + statePtr.v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2; + statePtr.v[1] = seed + XXH_PRIME64_2; + statePtr.v[2] = seed + 0; + statePtr.v[3] = seed - XXH_PRIME64_1; + return XXH_errorcode.XXH_OK; +} + +/* XXH PUBLIC API - hidden in D module */ +private XXH_errorcode xxh64_update(XXH64_state_t* state, const void* input, size_t len) + @trusted pure nothrow @nogc +in +{ + if (input == null) assert(len == 0, "input null ptr only allowed with len == 0"); +} +do +{ + if (input == null && len == 0) + return XXH_errorcode.XXH_OK; + else if (input == null && len != 0) + return XXH_errorcode.XXH_ERROR; + else + { + const(ubyte)* p = cast(const(ubyte)*) input; + const ubyte* bEnd = p + len; + + state.total_len += len; + + if (state.memsize + len < 32) + { + /* fill in tmp buffer */ + (cast(ubyte*) state.mem64) [state.memsize .. state.memsize + len] = + (cast(ubyte*) input) [0 .. len]; + state.memsize += cast(uint) len; + return XXH_errorcode.XXH_OK; + } + + if (state.memsize) + { + /* tmp buffer is full */ + (cast(ubyte*) state.mem64) [state.memsize .. state.memsize + (32 - state.memsize)] = + (cast(ubyte*) input) [0 .. 
(32 - state.memsize)]; + state.v[0] = xxh64_round(state.v[0], xxh_readLE64(&state.mem64[0])); + state.v[1] = xxh64_round(state.v[1], xxh_readLE64(&state.mem64[1])); + state.v[2] = xxh64_round(state.v[2], xxh_readLE64(&state.mem64[2])); + state.v[3] = xxh64_round(state.v[3], xxh_readLE64(&state.mem64[3])); + p += 32 - state.memsize; + state.memsize = 0; + } + + if (p + 32 <= bEnd) + { + const ubyte* limit = bEnd - 32; + + do + { + state.v[0] = xxh64_round(state.v[0], xxh_readLE64(p)); + p += 8; + state.v[1] = xxh64_round(state.v[1], xxh_readLE64(p)); + p += 8; + state.v[2] = xxh64_round(state.v[2], xxh_readLE64(p)); + p += 8; + state.v[3] = xxh64_round(state.v[3], xxh_readLE64(p)); + p += 8; + } + while (p <= limit); + + } + + if (p < bEnd) + { + (cast(void*) &state.mem64[0]) [0 .. cast(size_t) (bEnd - p)] = + (cast(void*) p) [0 .. cast(size_t) (bEnd - p)]; + state.memsize = cast(XXH32_hash_t)(bEnd - p); + } + } + + return XXH_errorcode.XXH_OK; +} + +/* XXH PUBLIC API - hidden in D module */ +private XXH64_hash_t xxh64_digest(const XXH64_state_t* state) + @trusted pure nothrow @nogc +{ + ulong h64; + + if (state.total_len >= 32) + { + h64 = rol(state.v[0], 1) + rol(state.v[1], + 7) + rol(state.v[2], 12) + rol(state.v[3], 18); + h64 = xxh64_mergeRound(h64, state.v[0]); + h64 = xxh64_mergeRound(h64, state.v[1]); + h64 = xxh64_mergeRound(h64, state.v[2]); + h64 = xxh64_mergeRound(h64, state.v[3]); + } + else + { + h64 = state.v[2] /*seed*/ + XXH_PRIME64_5; + } + + h64 += cast(ulong) state.total_len; + + return xxh64_finalize(h64, cast(const ubyte*) state.mem64, + cast(size_t) state.total_len, XXH_alignment.XXH_aligned); +} + + +/* XXH PUBLIC API - hidden in D module */ +private void xxh64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) + @trusted pure nothrow @nogc +{ + static assert((XXH64_canonical_t).sizeof == (XXH64_hash_t).sizeof, + "(XXH64_canonical_t).sizeof != (XXH64_hash_t).sizeof"); + version (LittleEndian) + hash = bswap(hash); + (cast(ubyte*) dst) [0 .. dst.sizeof] = (cast(ubyte*) &hash) [0 .. dst.sizeof]; +} + +/* XXH PUBLIC API - hidden in D module */ +private XXH64_hash_t xxh64_hashFromCanonical(const XXH64_canonical_t* src) + @safe pure nothrow @nogc +{ + return xxh_readBE64(src); +} + +/* ********************************************************************* +* XXH3 +* New generation hash designed for speed on small keys and vectorization +************************************************************************ */ + +enum XXH3_SECRET_SIZE_MIN = 136; /// The bare minimum size for a custom secret. +enum XXH3_SECRET_DEFAULT_SIZE = 192; /* minimum XXH3_SECRET_SIZE_MIN */ +enum XXH_SECRET_DEFAULT_SIZE = 192; /* minimum XXH3_SECRET_SIZE_MIN */ +enum XXH3_INTERNALBUFFER_SIZE = 256; ///The size of the internal XXH3 buffer. + +/* Structure for XXH3 streaming API. + * + * Note: ** This structure has a strict alignment requirement of 64 bytes!! ** + * Do not allocate this with `malloc()` or `new`, it will not be sufficiently aligned. + * + * Do never access the members of this struct directly. + * + * See: XXH3_INITSTATE() for stack initialization. + * See: XXH32_state_s, XXH64_state_s + */ +private align(64) struct XXH3_state_t +{ + align(64) XXH64_hash_t[8] acc; + /** The 8 accumulators. See XXH32_state_s::v and XXH64_state_s::v */ + align(64) ubyte[XXH3_SECRET_DEFAULT_SIZE] customSecret; + /** Used to store a custom secret generated from a seed. */ + align(64) ubyte[XXH3_INTERNALBUFFER_SIZE] buffer; + /** The internal buffer. 
See: XXH32_state_s::mem32 */ + XXH32_hash_t bufferedSize; + /** The amount of memory in buffer, See: XXH32_state_s::memsize */ + XXH32_hash_t useSeed; + /** Reserved field. Needed for padding on 64-bit. */ + size_t nbStripesSoFar; + /** Number or stripes processed. */ + XXH64_hash_t totalLen; + /** Total length hashed. 64-bit even on 32-bit targets. */ + size_t nbStripesPerBlock; + /** Number of stripes per block. */ + size_t secretLimit; + /** Size of customSecret or extSecret */ + XXH64_hash_t seed; + /** Seed for _withSeed variants. Must be zero otherwise, See: XXH3_INITSTATE() */ + XXH64_hash_t reserved64; + /** Reserved field. */ + const(ubyte)* extSecret; + /** Reference to an external secret for the _withSecret variants, null + * for other variants. */ + /* note: there may be some padding at the end due to alignment on 64 bytes */ +} /* typedef'd to XXH3_state_t */ + +static assert(XXH_SECRET_DEFAULT_SIZE >= XXH3_SECRET_SIZE_MIN, "default keyset is not large enough"); + +/** Pseudorandom secret taken directly from FARSH. */ +private align(64) immutable ubyte[XXH3_SECRET_DEFAULT_SIZE] xxh3_kSecret = [ + 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, + 0xf7, 0x21, 0xad, 0x1c, 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, + 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, 0xcb, 0x79, 0xe6, 0x4e, + 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, + 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, + 0x81, 0x3a, 0x26, 0x4c, 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, + 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, 0x71, 0x64, 0x48, 0x97, + 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, + 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, + 0xc7, 0x0b, 0x4f, 0x1d, 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, + 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, 0xea, 0xc5, 0xac, 0x83, + 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, + 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, + 0x29, 0xd4, 0x68, 0x9e, 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, + 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, 0x45, 0xcb, 0x3a, 0x8f, + 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, +]; + +/* This performs a 32x32 -> 64 bit multiplikation */ +private ulong xxh_mult32to64(uint x, uint y) @safe pure nothrow @nogc +{ + return (cast(ulong)(x) * cast(ulong)(y)); +} + +/** Calculates a 64 to 128-bit long multiply. + * + * Param: lhs , rhs The 64-bit integers to be multiplied + * Return: The 128-bit result represented in an XXH128_hash_t structure. + */ +private XXH128_hash_t xxh_mult64to128(ulong lhs, ulong rhs) @safe pure nothrow @nogc +{ + version (Have128BitInteger) + { + Cent cent_lhs; cent_lhs.lo = lhs; + Cent cent_rhs; cent_rhs.lo = rhs; + const Cent product = mul(cent_lhs, cent_rhs); + XXH128_hash_t r128; + r128.low64 = product.lo; + r128.high64 = product.hi; + } + else + { + /* First calculate all of the cross products. */ + const ulong lo_lo = xxh_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); + const ulong hi_lo = xxh_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); + const ulong lo_hi = xxh_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); + const ulong hi_hi = xxh_mult32to64(lhs >> 32, rhs >> 32); + + /* Now add the products together. These will never overflow. 
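+         *
+         * Writing lhs = 2^32*hi_l + lo_l and rhs = 2^32*hi_r + lo_r, the
+         * full product decomposes as
+         *     lhs*rhs = 2^64*hi_hi + 2^32*(hi_lo + lo_hi) + lo_lo.
+         * `cross` gathers the two middle terms plus the carry out of lo_lo;
+         * because (2^32 - 1)^2 + 2*(2^32 - 1) == 2^64 - 1, neither `cross`
+         * nor `upper` can wrap around a 64-bit integer.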
*/ + const ulong cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; + const ulong upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; + const ulong lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); + + XXH128_hash_t r128; + r128.low64 = lower; + r128.high64 = upper; + } + return r128; +} + +/** Calculates a 64-bit to 128-bit multiply, then XOR folds it. + * + * The reason for the separate function is to prevent passing too many structs + * around by value. This will hopefully inline the multiply, but we don't force it. + * + * Param: lhs , rhs The 64-bit integers to multiply + * Return: The low 64 bits of the product XOR'd by the high 64 bits. + * See: xxh_mult64to128() + */ +private ulong xxh3_mul128_fold64(ulong lhs, ulong rhs) @safe pure nothrow @nogc +{ + XXH128_hash_t product = xxh_mult64to128(lhs, rhs); + return product.low64 ^ product.high64; +} + +/* Seems to produce slightly better code on GCC for some reason. */ +static private ulong xxh_xorshift64(ulong v64, int shift) @safe pure nothrow @nogc +in(0 <= shift && shift < 64, "shift out of range") +{ + return v64 ^ (v64 >> shift); +} + +/* + * This is a fast avalanche stage, + * suitable when input bits are already partially mixed + */ +static private XXH64_hash_t xxh3_avalanche(ulong h64) @safe pure nothrow @nogc +{ + h64 = xxh_xorshift64(h64, 37); + h64 *= 0x165667919E3779F9; + h64 = xxh_xorshift64(h64, 32); + return h64; +} + +/* + * This is a stronger avalanche, + * inspired by Pelle Evensen's rrmxmx + * preferable when input has not been previously mixed + */ +static private XXH64_hash_t xxh3_rrmxmx(ulong h64, ulong len) @safe pure nothrow @nogc +{ + /* this mix is inspired by Pelle Evensen's rrmxmx */ + h64 ^= rol(h64, 49) ^ rol(h64, 24); + h64 *= 0x9FB21C651E98DF25; + h64 ^= (h64 >> 35) + len; + h64 *= 0x9FB21C651E98DF25; + return xxh_xorshift64(h64, 28); +} + +/* ========================================== + * Short keys + * ========================================== + * One of the shortcomings of XXH32 and XXH64 was that their performance was + * sub-optimal on short lengths. It used an iterative algorithm which strongly + * favored lengths that were a multiple of 4 or 8. + * + * Instead of iterating over individual inputs, we use a set of single shot + * functions which piece together a range of lengths and operate in constant time. + * + * Additionally, the number of multiplies has been significantly reduced. This + * reduces latency, especially when emulating 64-bit multiplies on 32-bit. + * + * Depending on the platform, this may or may not be faster than XXH32, but it + * is almost guaranteed to be faster than XXH64. + */ + +/* + * At very short lengths, there isn't enough input to fully hide secrets, or use + * the entire secret. + * + * There is also only a limited amount of mixing we can do before significantly + * impacting performance. + * + * Therefore, we use different sections of the secret and always mix two secret + * samples with an XOR. This should have no effect on performance on the + * seedless or withSeed variants because everything _should_ be constant folded + * by modern compilers. + * + * The XOR mixing hides individual parts of the secret and increases entropy. + * + * This adds an extra layer of strength for custom secrets. 
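+ *
+ * xxh3_len_1to3_64b() below shows the pattern in its simplest form: two
+ * 32-bit words of the secret are XORed together (and offset by the seed)
+ * into a single bitflip value, the one to three input bytes plus the length
+ * are packed into one 32-bit `combined` word, and `combined ^ bitflip` is
+ * passed through xxh64_avalanche().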
+ */ +private XXH64_hash_t xxh3_len_1to3_64b( + const ubyte* input, size_t len, const ubyte* secret, XXH64_hash_t seed) + @trusted pure nothrow @nogc +in(input != null, "input == null") +in(1 <= len && len <= 3, "len out of range") +in(secret != null, "secret == null") +{ + /* + * len = 1: combined = { input[0], 0x01, input[0], input[0] } + * len = 2: combined = { input[1], 0x02, input[0], input[1] } + * len = 3: combined = { input[2], 0x03, input[0], input[1] } + */ + { + const ubyte c1 = input[0]; + const ubyte c2 = input[len >> 1]; + const ubyte c3 = input[len - 1]; + const uint combined = (cast(uint) c1 << 16) | ( + cast(uint) c2 << 24) | (cast(uint) c3 << 0) | (cast(uint) len << 8); + const ulong bitflip = (xxh_readLE32(secret) ^ xxh_readLE32(secret + 4)) + seed; + const ulong keyed = cast(ulong) combined ^ bitflip; + return xxh64_avalanche(keyed); + } +} + +private XXH64_hash_t xxh3_len_4to8_64b( + const ubyte* input, size_t len, const ubyte* secret, XXH64_hash_t seed) + @trusted pure nothrow @nogc +in(input != null, "input == null") +in(secret != null, "secret == null") +in(4 <= len && len <= 8, "len out of range") +{ + seed ^= cast(ulong) bswap(cast(uint) seed) << 32; + { + const uint input1 = xxh_readLE32(input); + const uint input2 = xxh_readLE32(input + len - 4); + const ulong bitflip = (xxh_readLE64(secret + 8) ^ xxh_readLE64(secret + 16)) - seed; + const ulong input64 = input2 + ((cast(ulong) input1) << 32); + const ulong keyed = input64 ^ bitflip; + return xxh3_rrmxmx(keyed, len); + } +} + +private XXH64_hash_t xxh3_len_9to16_64b( + const ubyte* input, size_t len, const ubyte* secret, XXH64_hash_t seed) + @trusted pure nothrow @nogc +in(input != null, "input == null") +in(secret != null, "secret == null") +in(9 <= len && len <= 16, "len out of range") +{ + { + const ulong bitflip1 = (xxh_readLE64(secret + 24) ^ xxh_readLE64(secret + 32)) + seed; + const ulong bitflip2 = (xxh_readLE64(secret + 40) ^ xxh_readLE64(secret + 48)) - seed; + const ulong input_lo = xxh_readLE64(input) ^ bitflip1; + const ulong input_hi = xxh_readLE64(input + len - 8) ^ bitflip2; + const ulong acc = len + bswap(input_lo) + input_hi + xxh3_mul128_fold64(input_lo, + input_hi); + return xxh3_avalanche(acc); + } +} + +private bool xxh_likely(bool exp) + @safe pure nothrow @nogc +{ + return exp; +} + +private bool xxh_unlikely(bool exp) + @safe pure nothrow @nogc +{ + return exp; +} + +private XXH64_hash_t xxh3_len_0to16_64b( + const ubyte* input, size_t len, const ubyte* secret, XXH64_hash_t seed) + @trusted pure nothrow @nogc +in(len <= 16, "len > 16") +{ + { + if (xxh_likely(len > 8)) + return xxh3_len_9to16_64b(input, len, secret, seed); + if (xxh_likely(len >= 4)) + return xxh3_len_4to8_64b(input, len, secret, seed); + if (len) + return xxh3_len_1to3_64b(input, len, secret, seed); + return xxh64_avalanche(seed ^ (xxh_readLE64(secret + 56) ^ xxh_readLE64(secret + 64))); + } +} + +/* + * DISCLAIMER: There are known *seed-dependent* multicollisions here due to + * multiplication by zero, affecting hashes of lengths 17 to 240. + * + * However, they are very unlikely. + * + * Keep this in mind when using the unseeded xxh3_64bits() variant: As with all + * unseeded non-cryptographic hashes, it does not attempt to defend itself + * against specially crafted inputs, only random inputs. 
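+ *
+ * Concretely, the zero product arises in xxh3_mix16B() when an 8-byte input
+ * word happens to equal the corresponding seed-adjusted secret word: the XOR
+ * makes one operand of xxh3_mul128_fold64() zero, so the result of that
+ * xxh3_mix16B() call collapses to zero regardless of the other 8 bytes.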
+ * + * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes + * cancelling out the secret is taken an arbitrary number of times (addressed + * in xxh3_accumulate_512), this collision is very unlikely with random inputs + * and/or proper seeding: + * + * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a + * function that is only called up to 16 times per hash with up to 240 bytes of + * input. + * + * This is not too bad for a non-cryptographic hash function, especially with + * only 64 bit outputs. + * + * The 128-bit variant (which trades some speed for strength) is NOT affected + * by this, although it is always a good idea to use a proper seed if you care + * about strength. + */ +private ulong xxh3_mix16B(const(ubyte)* input, const(ubyte)* secret, ulong seed64) + @trusted pure nothrow @nogc +{ + { + const ulong input_lo = xxh_readLE64(input); + const ulong input_hi = xxh_readLE64(input + 8); + return xxh3_mul128_fold64( + input_lo ^ (xxh_readLE64(secret) + seed64), + input_hi ^ (xxh_readLE64(secret + 8) - seed64)); + } +} + +/* For mid range keys, XXH3 uses a Mum-hash variant. */ +private XXH64_hash_t xxh3_len_17to128_64b( + const(ubyte)* input, size_t len, const(ubyte)* secret, size_t secretSize, XXH64_hash_t seed) + @trusted pure nothrow @nogc +in(secretSize >= XXH3_SECRET_SIZE_MIN, "secretSize < XXH3_SECRET_SIZE_MIN") +in(16 < len && len <= 128, "len out of range") +{ + ulong acc = len * XXH_PRIME64_1; + if (len > 32) + { + if (len > 64) + { + if (len > 96) + { + acc += xxh3_mix16B(input + 48, secret + 96, seed); + acc += xxh3_mix16B(input + len - 64, secret + 112, seed); + } + acc += xxh3_mix16B(input + 32, secret + 64, seed); + acc += xxh3_mix16B(input + len - 48, secret + 80, seed); + } + acc += xxh3_mix16B(input + 16, secret + 32, seed); + acc += xxh3_mix16B(input + len - 32, secret + 48, seed); + } + acc += xxh3_mix16B(input + 0, secret + 0, seed); + acc += xxh3_mix16B(input + len - 16, secret + 16, seed); + + return xxh3_avalanche(acc); +} + +enum XXH3_MIDSIZE_MAX = 240; +enum XXH3_MIDSIZE_STARTOFFSET = 3; +enum XXH3_MIDSIZE_LASTOFFSET = 17; + +private XXH64_hash_t xxh3_len_129to240_64b( + const(ubyte)* input, size_t len, const(ubyte)* secret, size_t secretSize, XXH64_hash_t seed) + @trusted pure nothrow @nogc +in(secretSize >= XXH3_SECRET_SIZE_MIN, "secretSize < XXH3_SECRET_SIZE_MIN") +in { const int nbRounds = cast(int) len / 16; assert(nbRounds >= 8, "nbRounds < 8"); } +in(128 < len && len <= XXH3_MIDSIZE_MAX, "128 >= len || len > XXH3_MIDSIZE_MAX") +{ + ulong acc = len * XXH_PRIME64_1; + const int nbRounds = cast(int) len / 16; + int i; + for (i = 0; i < 8; i++) + { + acc += xxh3_mix16B(input + (16 * i), secret + (16 * i), seed); + } + acc = xxh3_avalanche(acc); + for (i = 8; i < nbRounds; i++) + { + acc += xxh3_mix16B(input + (16 * i), + secret + (16 * (i - 8)) + XXH3_MIDSIZE_STARTOFFSET, seed); + } + /* last bytes */ + acc += xxh3_mix16B(input + len - 16, + secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); + return xxh3_avalanche(acc); +} + +/* ======= Long Keys ======= */ + +enum XXH_STRIPE_LEN = 64; +enum XXH_SECRET_CONSUME_RATE = 8; /* nb of secret bytes consumed at each accumulation */ +enum XXH_ACC_NB = (XXH_STRIPE_LEN / (ulong).sizeof); + +private void xxh_writeLE64(void* dst, ulong v64) + @trusted pure nothrow @nogc +{ + version (LittleEndian) {} + else + v64 = bswap(v64); + (cast(ubyte*) dst) [0 .. v64.sizeof] = (cast(ubyte*) &v64) [0 .. 
v64.sizeof]; +} + +/* scalar variants - universal */ + +enum XXH_ACC_ALIGN = 8; + +/* Scalar round for xxh3_accumulate_512_scalar(). */ +private void xxh3_scalarRound(void* acc, const(void)* input, const(void)* secret, size_t lane) + @trusted pure nothrow @nogc +in(lane < XXH_ACC_NB, "lane >= XXH_ACC_NB") +{ + version (CheckACCAlignment) + assert((cast(size_t) acc & (XXH_ACC_ALIGN - 1)) == 0, "(cast(size_t) acc & (XXH_ACC_ALIGN - 1)) != 0"); + ulong* xacc = cast(ulong*) acc; + ubyte* xinput = cast(ubyte*) input; + ubyte* xsecret = cast(ubyte*) secret; + { + const ulong data_val = xxh_readLE64(xinput + lane * 8); + const ulong data_key = data_val ^ xxh_readLE64(xsecret + lane * 8); + xacc[lane ^ 1] += data_val; /* swap adjacent lanes */ + xacc[lane] += xxh_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); + } +} + +/* Processes a 64 byte block of data using the scalar path. */ +private void xxh3_accumulate_512_scalar(void* acc, const(void)* input, const(void)* secret) + @safe pure nothrow @nogc +{ + size_t i; + for (i = 0; i < XXH_ACC_NB; i++) + { + xxh3_scalarRound(acc, input, secret, i); + } +} + +/* Scalar scramble step for xxh3_scrambleAcc_scalar(). + * + * This is extracted to its own function because the NEON path uses a combination + * of NEON and scalar. + */ +private void xxh3_scalarScrambleRound(void* acc, const(void)* secret, size_t lane) + @trusted pure nothrow @nogc +in(lane < XXH_ACC_NB, "lane >= XXH_ACC_NB") +{ + version (CheckACCAlignment) + assert(((cast(size_t) acc) & (XXH_ACC_ALIGN - 1)) == 0, "((cast(size_t) acc) & (XXH_ACC_ALIGN - 1)) != 0"); + ulong* xacc = cast(ulong*) acc; /* presumed aligned */ + const ubyte* xsecret = cast(const ubyte*) secret; /* no alignment restriction */ + { + const ulong key64 = xxh_readLE64(xsecret + lane * 8); + ulong acc64 = xacc[lane]; + acc64 = xxh_xorshift64(acc64, 47); + acc64 ^= key64; + acc64 *= XXH_PRIME32_1; + xacc[lane] = acc64; + } +} + +/* Scrambles the accumulators after a large chunk has been read */ +private void xxh3_scrambleAcc_scalar(void* acc, const(void)* secret) + @safe pure nothrow @nogc +{ + size_t i; + for (i = 0; i < XXH_ACC_NB; i++) + { + xxh3_scalarScrambleRound(acc, secret, i); + } +} + +private void xxh3_initCustomSecret_scalar(void* customSecret, ulong seed64) + @trusted pure nothrow @nogc +{ + /* + * We need a separate pointer for the hack below, + * which requires a non-const pointer. + * Any decent compiler will optimize this out otherwise. + */ + const ubyte* kSecretPtr = cast(ubyte*) xxh3_kSecret; + static assert((XXH_SECRET_DEFAULT_SIZE & 15) == 0, "(XXH_SECRET_DEFAULT_SIZE & 15) != 0"); + + /* + * Note: in debug mode, this overrides the asm optimization + * and Clang will emit MOVK chains again. + */ + //assert(kSecretPtr == xxh3_kSecret); + + { + const int nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; + int i; + for (i = 0; i < nbRounds; i++) + { + /* + * The asm hack causes Clang to assume that kSecretPtr aliases with + * customSecret, and on aarch64, this prevented LDP from merging two + * loads together for free. Putting the loads together before the stores + * properly generates LDP. 
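+     *
+     * Functionally, each 16-byte block of the default secret is split into
+     * two 64-bit halves: seed64 is added to the low half and subtracted from
+     * the high half, so the sum of the two halves of every block is
+     * preserved (mod 2^64) while the individual words differ from
+     * xxh3_kSecret whenever seed64 != 0.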
+ */ + const ulong lo = xxh_readLE64(kSecretPtr + 16 * i) + seed64; + const ulong hi = xxh_readLE64(kSecretPtr + 16 * i + 8) - seed64; + xxh_writeLE64(cast(ubyte*) customSecret + 16 * i, lo); + xxh_writeLE64(cast(ubyte*) customSecret + 16 * i + 8, hi); + } + } +} + +alias XXH3_f_accumulate_512 = void function(void*, const(void)*, const(void)*) @safe pure nothrow @nogc; +alias XXH3_f_scrambleAcc = void function(void*, const void*) @safe pure nothrow @nogc; +alias XXH3_f_initCustomSecret = void function(void*, ulong) @safe pure nothrow @nogc; + +immutable XXH3_f_accumulate_512 xxh3_accumulate_512 = &xxh3_accumulate_512_scalar; +immutable XXH3_f_scrambleAcc xxh3_scrambleAcc = &xxh3_scrambleAcc_scalar; +immutable XXH3_f_initCustomSecret xxh3_initCustomSecret = &xxh3_initCustomSecret_scalar; + +enum XXH_PREFETCH_DIST = 384; +/* TODO: Determine how to implement prefetching in D! Disabled for now */ +private void XXH_PREFETCH(const ubyte* ptr) @safe pure nothrow @nogc +{ +// cast(void)(ptr); /* DISABLED prefetch and do nothing here */ + +// In C it is done with the following code lines: +// if XXH_SIZE_OPT >= 1 +// define XXH_PREFETCH(ptr) (void)(ptr) +// elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */ +// include /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ +// define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) +// elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) +// define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw == read */, 3 /* locality */) +// else +// define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +// endif +} + +/* + * xxh3_accumulate() + * Loops over xxh3_accumulate_512(). + * Assumption: nbStripes will not overflow the secret size + */ +private void xxh3_accumulate( + ulong* acc, const ubyte* input, + const ubyte* secret, size_t nbStripes, XXH3_f_accumulate_512 f_acc512) + @trusted pure nothrow @nogc +{ + size_t n; + for (n = 0; n < nbStripes; n++) + { + const ubyte* in_ = input + n * XXH_STRIPE_LEN; + XXH_PREFETCH(in_ + XXH_PREFETCH_DIST); + f_acc512(acc, in_, secret + n * XXH_SECRET_CONSUME_RATE); + } +} + +private void xxh3_hashLong_internal_loop( + ulong* acc, const ubyte* input, size_t len, const ubyte* secret, + size_t secretSize, XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble) + @trusted pure nothrow @nogc +in(secretSize >= XXH3_SECRET_SIZE_MIN, "secretSize < XXH3_SECRET_SIZE_MIN") +in(len > XXH_STRIPE_LEN, "len <= XXH_STRIPE_LEN") +{ + const size_t nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; + const size_t block_len = XXH_STRIPE_LEN * nbStripesPerBlock; + const size_t nb_blocks = (len - 1) / block_len; + + size_t n; + + for (n = 0; n < nb_blocks; n++) + { + xxh3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock, f_acc512); + f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN); + } + + /* last partial block */ + { + const size_t nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN; + assert(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE), + "nbStripes > (secretSize / XXH_SECRET_CONSUME_RATE)"); + xxh3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes, f_acc512); + + /* last stripe */ + { + const ubyte* p = input + len - XXH_STRIPE_LEN; + f_acc512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START); + } + } +} + +enum XXH_SECRET_LASTACC_START = 7; /* not aligned on 8, last secret is different 
from acc & scrambler */ + +private ulong xxh3_mix2Accs(const(ulong)* acc, const(ubyte)* secret) + @trusted pure nothrow @nogc +{ + return xxh3_mul128_fold64(acc[0] ^ xxh_readLE64(secret), acc[1] ^ xxh_readLE64(secret + 8)); +} + +private XXH64_hash_t xxh3_mergeAccs(const(ulong)* acc, const(ubyte)* secret, ulong start) + @trusted pure nothrow @nogc +{ + ulong result64 = start; + size_t i = 0; + + for (i = 0; i < 4; i++) + { + result64 += xxh3_mix2Accs(acc + 2 * i, secret + 16 * i); + } + + return xxh3_avalanche(result64); +} + +static immutable XXH3_INIT_ACC = [ + XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, + XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 + ]; + +private XXH64_hash_t xxh3_hashLong_64b_internal( + const(void)* input, size_t len, const(void)* secret, size_t secretSize, + XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble) + @trusted pure nothrow @nogc +{ + align(XXH_ACC_ALIGN) ulong[XXH_ACC_NB] acc = XXH3_INIT_ACC; /* NOTE: This doesn't work in D, fails on 32bit NetBSD */ + + xxh3_hashLong_internal_loop(&acc[0], cast(const(ubyte)*) input, len, + cast(const(ubyte)*) secret, secretSize, f_acc512, f_scramble); + + /* converge into final hash */ + static assert(acc.sizeof == 64, "acc.sizeof != 64"); + /* do not align on 8, so that the secret is different from the accumulator */ + assert(secretSize >= acc.sizeof + XXH_SECRET_MERGEACCS_START, + "secretSize < acc.sizeof + XXH_SECRET_MERGEACCS_START"); + return xxh3_mergeAccs(&acc[0], cast(const(ubyte)*) secret + XXH_SECRET_MERGEACCS_START, + cast(ulong) len * XXH_PRIME64_1); +} + +enum XXH_SECRET_MERGEACCS_START = 11; + +/* + * It's important for performance to transmit secret's size (when it's static) + * so that the compiler can properly optimize the vectorized loop. + * This makes a big performance difference for "medium" keys (<1 KB) when using AVX instruction set. + */ +private XXH64_hash_t xxh3_hashLong_64b_withSecret( + const(void)* input, size_t len, XXH64_hash_t seed64, const(ubyte)* secret, size_t secretLen) + @safe pure nothrow @nogc +{ + return xxh3_hashLong_64b_internal(input, len, secret, secretLen, + xxh3_accumulate_512, xxh3_scrambleAcc); +} + +/* + * It's preferable for performance that XXH3_hashLong is not inlined, + * as it results in a smaller function for small data, easier to the instruction cache. + * Note that inside this no_inline function, we do inline the internal loop, + * and provide a statically defined secret size to allow optimization of vector loop. + */ +private XXH64_hash_t xxh3_hashLong_64b_default( + const(void)* input, size_t len, XXH64_hash_t seed64, const(ubyte)* secret, size_t secretLen) + @safe pure nothrow @nogc +{ + return xxh3_hashLong_64b_internal(input, len, &xxh3_kSecret[0], + (xxh3_kSecret).sizeof, xxh3_accumulate_512, xxh3_scrambleAcc); +} + +enum XXH_SEC_ALIGN = 8; + +/* + * xxh3_hashLong_64b_withSeed(): + * Generate a custom key based on alteration of default xxh3_kSecret with the seed, + * and then use this key for long mode hashing. + * + * This operation is decently fast but nonetheless costs a little bit of time. + * Try to avoid it whenever possible (typically when seed == 0). + * + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. 
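As a concrete illustration of that alteration: xxh3_initCustomSecret_scalar above adds the seed to the low eight bytes and subtracts it from the high eight bytes of every 16-byte block of the default secret. A minimal sketch of the same derivation, written against this module's xxh_readLE64/xxh_writeLE64 helpers (the name deriveCustomSecret is hypothetical and not part of the port):

void deriveCustomSecret(scope ubyte[] dst, scope const(ubyte)[] defaultSecret, ulong seed)
    @trusted pure nothrow @nogc
in (dst.length == defaultSecret.length, "dst must match the secret size")
in (defaultSecret.length % 16 == 0, "secret size must be a multiple of 16")
{
    foreach (i; 0 .. defaultSecret.length / 16)
    {
        /* +seed on the low half, -seed on the high half, both little-endian */
        const ulong lo = xxh_readLE64(&defaultSecret[16 * i]) + seed;
        const ulong hi = xxh_readLE64(&defaultSecret[16 * i + 8]) - seed;
        xxh_writeLE64(&dst[16 * i], lo);
        xxh_writeLE64(&dst[16 * i + 8], hi);
    }
}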
+ */ +private XXH64_hash_t xxh3_hashLong_64b_withSeed_internal( + const(void)* input, size_t len, XXH64_hash_t seed, + XXH3_f_accumulate_512 f_acc512, + XXH3_f_scrambleAcc f_scramble, + XXH3_f_initCustomSecret f_initSec) + @trusted pure nothrow @nogc +{ + //#if XXH_SIZE_OPT <= 0 + if (seed == 0) + return xxh3_hashLong_64b_internal(input, len, &xxh3_kSecret[0], + (xxh3_kSecret).sizeof, f_acc512, f_scramble); + //#endif + else + { + align(XXH_SEC_ALIGN) ubyte[XXH_SECRET_DEFAULT_SIZE] secret; + f_initSec(&secret[0], seed); + return xxh3_hashLong_64b_internal(input, len, &secret[0], + (secret).sizeof, f_acc512, f_scramble); + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. + */ +private XXH64_hash_t xxh3_hashLong_64b_withSeed(const(void)* input, size_t len, + XXH64_hash_t seed, const(ubyte)* secret, size_t secretLen) + @safe pure nothrow @nogc +{ + return xxh3_hashLong_64b_withSeed_internal(input, len, seed, + xxh3_accumulate_512, xxh3_scrambleAcc, xxh3_initCustomSecret); +} + +alias XXH3_hashLong64_f = XXH64_hash_t function( + const(void)*, size_t, XXH64_hash_t, const(ubyte)*, size_t) + @safe pure nothrow @nogc; + +private XXH64_hash_t xxh3_64bits_internal(const(void)* input, size_t len, + XXH64_hash_t seed64, const(void)* secret, size_t secretLen, XXH3_hashLong64_f f_hashLong) + @safe pure nothrow @nogc +in(secretLen >= XXH3_SECRET_SIZE_MIN, "secretLen < XXH3_SECRET_SIZE_MIN") +{ + /* + * If an action is to be taken if `secretLen` condition is not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + * Also, note that function signature doesn't offer room to return an error. + */ + if (len <= 16) + return xxh3_len_0to16_64b(cast(const(ubyte)*) input, len, + cast(const(ubyte)*) secret, seed64); + if (len <= 128) + return xxh3_len_17to128_64b(cast(const(ubyte)*) input, len, + cast(const(ubyte)*) secret, secretLen, seed64); + if (len <= XXH3_MIDSIZE_MAX) + return xxh3_len_129to240_64b(cast(const(ubyte)*) input, len, + cast(const(ubyte)*) secret, secretLen, seed64); + return f_hashLong(input, len, seed64, cast(const(ubyte)*) secret, secretLen); +} + +/* === Public entry point === */ + +/* XXH PUBLIC API - hidden in D module */ +private XXH64_hash_t xxh3_64bits(const(void)* input, size_t length) + @safe pure nothrow @nogc +{ + return xxh3_64bits_internal(input, length, 0, &xxh3_kSecret[0], + (xxh3_kSecret).sizeof, &xxh3_hashLong_64b_default); +} + +/* XXH PUBLIC API - hidden in D module */ +private XXH64_hash_t xxh3_64bits_withSecret( + const(void)* input, size_t length, const(void)* secret, size_t secretSize) + @safe pure nothrow @nogc +{ + return xxh3_64bits_internal(input, length, 0, secret, secretSize, + &xxh3_hashLong_64b_withSecret); +} + +XXH64_hash_t xxh3_64bits_withSeed(const(void)* input, size_t length, XXH64_hash_t seed) + @safe pure nothrow @nogc +{ + return xxh3_64bits_internal(input, length, seed, &xxh3_kSecret[0], + (xxh3_kSecret).sizeof, &xxh3_hashLong_64b_withSeed); +} + +/* XXH PUBLIC API - hidden in D module */ private +XXH64_hash_t xxh3_64bits_withSecretandSeed( + const(void)* input, size_t length, const(void)* secret, size_t secretSize, XXH64_hash_t seed) + @safe pure nothrow @nogc +{ + if (length <= XXH3_MIDSIZE_MAX) + return xxh3_64bits_internal(input, length, seed, &xxh3_kSecret[0], + (xxh3_kSecret).sizeof, null); + return xxh3_hashLong_64b_withSecret(input, length, seed, + cast(const(ubyte)*) secret, secretSize); +} + +/* === XXH3 
streaming === */ + +private void XXH3_INITSTATE(XXH3_state_t* XXH3_state_ptr) + @safe nothrow @nogc +{ + (XXH3_state_ptr).seed = 0; +} + +private void xxh3_reset_internal( + scope XXH3_state_t* statePtr, XXH64_hash_t seed, const void* secret, size_t secretSize) + @trusted pure nothrow @nogc +in +{ + const size_t initStart = XXH3_state_t.bufferedSize.offsetof; + assert(XXH3_state_t.nbStripesPerBlock.offsetof > initStart, + "(XXH3_state_t.nbStripesPerBlock.offsetof <= initStart"); +} +in(statePtr != null, "statePtr == null") +in(secretSize >= XXH3_SECRET_SIZE_MIN, "secretSize < XXH3_SECRET_SIZE_MIN") +{ + /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */ + *statePtr = XXH3_state_t.init; + statePtr.acc[0] = XXH_PRIME32_3; + statePtr.acc[1] = XXH_PRIME64_1; + statePtr.acc[2] = XXH_PRIME64_2; + statePtr.acc[3] = XXH_PRIME64_3; + statePtr.acc[4] = XXH_PRIME64_4; + statePtr.acc[5] = XXH_PRIME32_2; + statePtr.acc[6] = XXH_PRIME64_5; + statePtr.acc[7] = XXH_PRIME32_1; + statePtr.seed = seed; + statePtr.useSeed = (seed != 0); + statePtr.extSecret = cast(const(ubyte)*) secret; + statePtr.secretLimit = secretSize - XXH_STRIPE_LEN; + statePtr.nbStripesPerBlock = statePtr.secretLimit / XXH_SECRET_CONSUME_RATE; +} + +/* XXH PUBLIC API - hidden in D module */ private +XXH_errorcode xxh3_64bits_reset(scope XXH3_state_t* statePtr) + @safe pure nothrow @nogc +{ + if (statePtr == null) + return XXH_errorcode.XXH_ERROR; + xxh3_reset_internal(statePtr, 0, &xxh3_kSecret[0], XXH_SECRET_DEFAULT_SIZE); + return XXH_errorcode.XXH_OK; +} + +/* XXH PUBLIC API - hidden in D module */ +private XXH_errorcode xxh3_64bits_reset_withSecret( + XXH3_state_t* statePtr, const void* secret, size_t secretSize) + @safe pure nothrow @nogc +{ + if (statePtr == null) + return XXH_errorcode.XXH_ERROR; + xxh3_reset_internal(statePtr, 0, secret, secretSize); + if (secret == null) + return XXH_errorcode.XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) + return XXH_errorcode.XXH_ERROR; + return XXH_errorcode.XXH_OK; +} + +/* XXH PUBLIC API - hidden in D module */ +private XXH_errorcode xxh3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) + @safe pure nothrow @nogc +{ + if (statePtr == null) + return XXH_errorcode.XXH_ERROR; + if (seed == 0) + return xxh3_64bits_reset(statePtr); + if ((seed != statePtr.seed) || (statePtr.extSecret != null)) + xxh3_initCustomSecret(&statePtr.customSecret[0], seed); + xxh3_reset_internal(statePtr, seed, null, XXH_SECRET_DEFAULT_SIZE); + return XXH_errorcode.XXH_OK; +} + +/* XXH PUBLIC API - hidden in D module */ +private XXH_errorcode xxh3_64bits_reset_withSecretandSeed( + XXH3_state_t* statePtr, const(void)* secret, size_t secretSize, XXH64_hash_t seed64) + @safe pure nothrow @nogc +{ + if (statePtr == null) + return XXH_errorcode.XXH_ERROR; + if (secret == null) + return XXH_errorcode.XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) + return XXH_errorcode.XXH_ERROR; + xxh3_reset_internal(statePtr, seed64, secret, secretSize); + statePtr.useSeed = 1; /* always, even if seed64 == 0 */ + return XXH_errorcode.XXH_OK; +} + +/* Note : when xxh3_consumeStripes() is invoked, + * there must be a guarantee that at least one more byte must be consumed from input + * so that the function can blindly consume all stripes using the "normal" secret segment */ +private void xxh3_consumeStripes( + ulong* acc, size_t* nbStripesSoFarPtr, size_t nbStripesPerBlock, const ubyte* input, size_t nbStripes, + const ubyte* secret, size_t secretLimit, + XXH3_f_accumulate_512 f_acc512, 
XXH3_f_scrambleAcc f_scramble) + @trusted pure nothrow @nogc +in(nbStripes <= nbStripesPerBlock, "nbStripes > nbStripesPerBlock") /* can handle max 1 scramble per invocation */ +in(*nbStripesSoFarPtr < nbStripesPerBlock, "*nbStripesSoFarPtr >= nbStripesPerBlock") +{ + if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) + { + /* need a scrambling operation */ + const size_t nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr; + const size_t nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock; + xxh3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, + nbStripesToEndofBlock, f_acc512); + f_scramble(acc, secret + secretLimit); + xxh3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, + secret, nbStripesAfterBlock, f_acc512); + *nbStripesSoFarPtr = nbStripesAfterBlock; + } + else + { + xxh3_accumulate(acc, input, + secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, f_acc512); + *nbStripesSoFarPtr += nbStripes; + } +} + +enum XXH3_STREAM_USE_STACK = 1; +/* + * Both xxh3_64bits_update and xxh3_128bits_update use this routine. + */ +private XXH_errorcode xxh3_update( + scope XXH3_state_t* state, scope const(ubyte)* input, size_t len, + XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble) + @trusted pure nothrow @nogc +in(state != null, "state == null") +in +{ + if (input == null) assert(len == 0, "input null ptr only allowed with len == 0"); +} +do +{ + if (input == null && len == 0) + return XXH_errorcode.XXH_OK; + else if (input == null && len != 0) + return XXH_errorcode.XXH_ERROR; + else + { + const ubyte* bEnd = input + len; + const(ubyte)* secret = (state.extSecret == null) ? &state.customSecret[0] : &state.extSecret[0]; + static if (XXH3_STREAM_USE_STACK >= 1) + { + /* For some reason, gcc and MSVC seem to suffer greatly + * when operating accumulators directly into state. + * Operating into stack space seems to enable proper optimization. + * clang, on the other hand, doesn't seem to need this trick */ + align(XXH_ACC_ALIGN) ulong[8] acc; + (cast(ubyte*) &acc[0]) [0 .. acc.sizeof] = (cast(ubyte*) &state.acc[0]) [0 .. acc.sizeof]; + } + else + { + ulong* acc = state.acc; + } + state.totalLen += len; + assert(state.bufferedSize <= XXH3_INTERNALBUFFER_SIZE, "state.bufferedSize > XXH3_INTERNALBUFFER_SIZE"); + + /* small input : just fill in tmp buffer */ + if (state.bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) + { + (cast(ubyte*) &state.buffer[0]) [state.bufferedSize .. state.bufferedSize + len] = + (cast(ubyte*) input) [0 .. len]; + state.bufferedSize += cast(XXH32_hash_t) len; + return XXH_errorcode.XXH_OK; + } + + /* total input is now > XXH3_INTERNALBUFFER_SIZE */ + enum XXH3_INTERNALBUFFER_STRIPES = (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN); + static assert(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0, "XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN != 0"); /* clean multiple */ + + /* + * Internal buffer is partially filled (always, except at beginning) + * Complete it, then consume it. + */ + if (state.bufferedSize) + { + const size_t loadSize = XXH3_INTERNALBUFFER_SIZE - state.bufferedSize; + (cast(ubyte*)&state.buffer[0]) [state.bufferedSize .. state.bufferedSize + loadSize] = + (cast(ubyte*) input) [0 .. 
loadSize]; + input += loadSize; + xxh3_consumeStripes(&acc[0], &state.nbStripesSoFar, state.nbStripesPerBlock, + &state.buffer[0], XXH3_INTERNALBUFFER_STRIPES, secret, + state.secretLimit, f_acc512, f_scramble); + state.bufferedSize = 0; + } + assert(input < bEnd, "input >= bEnd"); + + /* large input to consume : ingest per full block */ + if (cast(size_t)(bEnd - input) > state.nbStripesPerBlock * XXH_STRIPE_LEN) + { + size_t nbStripes = cast(size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN; + assert(state.nbStripesPerBlock >= state.nbStripesSoFar, "state.nbStripesPerBlock < state.nbStripesSoFar"); + /* join to current block's end */ + { + const size_t nbStripesToEnd = state.nbStripesPerBlock - state.nbStripesSoFar; + assert(nbStripesToEnd <= nbStripes, "nbStripesToEnd > nbStripes"); + xxh3_accumulate(&acc[0], input, + secret + state.nbStripesSoFar * XXH_SECRET_CONSUME_RATE, + nbStripesToEnd, f_acc512); + f_scramble(&acc[0], secret + state.secretLimit); + state.nbStripesSoFar = 0; + input += nbStripesToEnd * XXH_STRIPE_LEN; + nbStripes -= nbStripesToEnd; + } + /* consume per entire blocks */ + while (nbStripes >= state.nbStripesPerBlock) + { + xxh3_accumulate(&acc[0], input, secret, state.nbStripesPerBlock, f_acc512); + f_scramble(&acc[0], secret + state.secretLimit); + input += state.nbStripesPerBlock * XXH_STRIPE_LEN; + nbStripes -= state.nbStripesPerBlock; + } + /* consume last partial block */ + xxh3_accumulate(&acc[0], input, secret, nbStripes, f_acc512); + input += nbStripes * XXH_STRIPE_LEN; + assert(input < bEnd, "input exceeds buffer, no bytes left"); /* at least some bytes left */ + state.nbStripesSoFar = nbStripes; + /* buffer predecessor of last partial stripe */ + (cast(ubyte*) &state.buffer[0]) + [state.buffer.sizeof - XXH_STRIPE_LEN .. state.buffer.sizeof - XXH_STRIPE_LEN + XXH_STRIPE_LEN] = + (cast(ubyte*) input - XXH_STRIPE_LEN) [0 .. XXH_STRIPE_LEN]; + assert(bEnd - input <= XXH_STRIPE_LEN, "input exceed strip length"); + } + else + { + /* content to consume <= block size */ + /* Consume input by a multiple of internal buffer size */ + if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) + { + const ubyte* limit = bEnd - XXH3_INTERNALBUFFER_SIZE; + do + { + xxh3_consumeStripes(&acc[0], &state.nbStripesSoFar, state.nbStripesPerBlock, input, + XXH3_INTERNALBUFFER_STRIPES, secret, + state.secretLimit, f_acc512, f_scramble); + input += XXH3_INTERNALBUFFER_SIZE; + } + while (input < limit); + /* buffer predecessor of last partial stripe */ + (cast(ubyte*) &state.buffer[0]) + [state.buffer.sizeof - XXH_STRIPE_LEN .. state.buffer.sizeof - XXH_STRIPE_LEN + XXH_STRIPE_LEN] = + (cast(ubyte*) input - XXH_STRIPE_LEN) [0 .. XXH_STRIPE_LEN]; + } + } + + /* Some remaining input (always) : buffer it */ + assert(input < bEnd, "input exceeds buffer"); + assert(bEnd - input <= XXH3_INTERNALBUFFER_SIZE, "input outside buffer"); + assert(state.bufferedSize == 0, "bufferedSize != 0"); + (cast(ubyte*) &state.buffer[0]) [0 .. cast(size_t)(bEnd - input)] = + (cast(ubyte*) input) [0 .. cast(size_t)(bEnd - input)]; + state.bufferedSize = cast(XXH32_hash_t)(bEnd - input); + static if (XXH3_STREAM_USE_STACK >= 1) + { + /* save stack accumulators into state */ + (cast(ubyte*) &state.acc[0]) [0 .. acc.sizeof] = (cast(ubyte*) &acc[0]) [0 .. 
acc.sizeof]; + } + } + + return XXH_errorcode.XXH_OK; +} + +/* XXH PUBLIC API - hidden in D module */ +private XXH_errorcode xxh3_64bits_update(scope XXH3_state_t* state, scope const(void)* input, size_t len) + @safe pure nothrow @nogc +{ + return xxh3_update(state, cast(const(ubyte)*) input, len, + xxh3_accumulate_512, xxh3_scrambleAcc); +} + +private void xxh3_digest_long(XXH64_hash_t* acc, const XXH3_state_t* state, const ubyte* secret) + @trusted pure nothrow @nogc +{ + /* + * Digest on a local copy. This way, the state remains unaltered, and it can + * continue ingesting more input afterwards. + */ + (cast(ubyte*) &acc[0]) [0 .. state.acc.sizeof] = (cast(ubyte*) &state.acc[0]) [0 .. state.acc.sizeof]; + if (state.bufferedSize >= XXH_STRIPE_LEN) + { + const size_t nbStripes = (state.bufferedSize - 1) / XXH_STRIPE_LEN; + size_t nbStripesSoFar = state.nbStripesSoFar; + xxh3_consumeStripes(acc, &nbStripesSoFar, state.nbStripesPerBlock, &state.buffer[0], + nbStripes, secret, state.secretLimit, xxh3_accumulate_512, xxh3_scrambleAcc); + /* last stripe */ + xxh3_accumulate_512(acc, &state.buffer[0] + state.bufferedSize - XXH_STRIPE_LEN, + secret + state.secretLimit - XXH_SECRET_LASTACC_START); + } + else + { /* bufferedSize < XXH_STRIPE_LEN */ + ubyte[XXH_STRIPE_LEN] lastStripe; + const size_t catchupSize = XXH_STRIPE_LEN - state.bufferedSize; + assert(state.bufferedSize > 0, "bufferedSize <= 0"); /* there is always some input buffered */ + (cast(ubyte*) &lastStripe[0]) [0 .. catchupSize] = + (cast(ubyte*) &state.buffer[0]) [state.buffer.sizeof - catchupSize .. state.buffer.sizeof]; + (cast(ubyte*) &lastStripe[0]) [catchupSize .. catchupSize + state.bufferedSize] = + (cast(ubyte*) &state.buffer[0]) [0 .. state.buffer.sizeof]; + xxh3_accumulate_512(&acc[0], &lastStripe[0], &secret[0] + state.secretLimit - XXH_SECRET_LASTACC_START); + } +} + +/* XXH PUBLIC API - hidden in D module */ +private XXH64_hash_t xxh3_64bits_digest(const XXH3_state_t* state) + @trusted pure nothrow @nogc +{ + const ubyte* secret = (state.extSecret == null) ? &state.customSecret[0] : &state.extSecret[0]; + if (state.totalLen > XXH3_MIDSIZE_MAX) + { + align(XXH_ACC_ALIGN) XXH64_hash_t[XXH_ACC_NB] acc; + xxh3_digest_long(&acc[0], state, secret); + return xxh3_mergeAccs(&acc[0], secret + XXH_SECRET_MERGEACCS_START, + cast(ulong) state.totalLen * XXH_PRIME64_1); + } + /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */ + if (state.useSeed) + return xxh3_64bits_withSeed(&state.buffer[0], cast(size_t) state.totalLen, state.seed); + return xxh3_64bits_withSecret(&state.buffer[0], + cast(size_t)(state.totalLen), secret, state.secretLimit + XXH_STRIPE_LEN); +} + +/* ========================================== + * XXH3 128 bits (a.k.a XXH128) + * ========================================== + * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant, + * even without counting the significantly larger output size. + * + * For example, extra steps are taken to avoid the seed-dependent collisions + * in 17-240 byte inputs (See xxh3_mix16B and xxh128_mix32B). + * + * This strength naturally comes at the cost of some speed, especially on short + * lengths. Note that longer hashes are about as fast as the 64-bit version + * due to it using only a slight modification of the 64-bit loop. + * + * XXH128 is also more oriented towards 64-bit machines. It is still extremely + * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). 
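For orientation before the implementation details, a short usage sketch: the one-shot convenience helpers xxh3_64Of and xxh128Of defined near the end of this module hash the same input with the 64-bit and 128-bit variants (the expected strings are the reference vectors also asserted in the unittests further down):

@safe unittest
{
    auto d64  = xxh3_64Of("abc");   // ubyte[8]
    auto d128 = xxh128Of("abc");    // ubyte[16]
    assert(toHexString(d64)  == "78AF5F94892F3950");
    assert(toHexString(d128) == "06B05AB6733A618578AF5F94892F3950");
}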
+ */ +private XXH128_hash_t xxh3_len_1to3_128b( + const ubyte* input, size_t len, const ubyte* secret, XXH64_hash_t seed) + @trusted pure nothrow @nogc +{ + /* A doubled version of 1to3_64b with different constants. */ + assert(input != null, "input is null"); + assert(1 <= len && len <= 3, "len is out of range"); + assert(secret != null, "secret is null"); + /* + * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } + * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } + * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } + */ + { + const ubyte c1 = input[0]; + const ubyte c2 = input[len >> 1]; + const ubyte c3 = input[len - 1]; + const uint combinedl = (cast(uint) c1 << 16) | ( + cast(uint) c2 << 24) | (cast(uint) c3 << 0) | (cast(uint) len << 8); + const uint combinedh = rol(bswap(combinedl), 13); + const ulong bitflipl = (xxh_readLE32(secret) ^ xxh_readLE32(secret + 4)) + seed; + const ulong bitfliph = (xxh_readLE32(secret + 8) ^ xxh_readLE32(secret + 12)) - seed; + const ulong keyed_lo = cast(ulong) combinedl ^ bitflipl; + const ulong keyed_hi = cast(ulong) combinedh ^ bitfliph; + XXH128_hash_t h128; + h128.low64 = xxh64_avalanche(keyed_lo); + h128.high64 = xxh64_avalanche(keyed_hi); + return h128; + } +} + +private XXH128_hash_t xxh3_len_4to8_128b( + const ubyte* input, size_t len, const ubyte* secret, XXH64_hash_t seed) + @trusted pure nothrow @nogc +{ + assert(input != null, "input is null"); + assert(secret != null, "secret is null"); + assert(4 <= len && len <= 8, "len is out of range"); + seed ^= cast(ulong) bswap(cast(uint) seed) << 32; + { + const uint input_lo = xxh_readLE32(input); + const uint input_hi = xxh_readLE32(input + len - 4); + const ulong input_64 = input_lo + (cast(ulong) input_hi << 32); + const ulong bitflip = (xxh_readLE64(secret + 16) ^ xxh_readLE64(secret + 24)) + seed; + const ulong keyed = input_64 ^ bitflip; + + /* Shift len to the left to ensure it is even, this avoids even multiplies. */ + XXH128_hash_t m128 = xxh_mult64to128(keyed, XXH_PRIME64_1 + (len << 2)); + + m128.high64 += (m128.low64 << 1); + m128.low64 ^= (m128.high64 >> 3); + + m128.low64 = xxh_xorshift64(m128.low64, 35); + m128.low64 *= 0x9FB21C651E98DF25; + m128.low64 = xxh_xorshift64(m128.low64, 28); + m128.high64 = xxh3_avalanche(m128.high64); + return m128; + } +} + +private XXH128_hash_t xxh3_len_9to16_128b( + const ubyte* input, size_t len, const ubyte* secret, XXH64_hash_t seed) + @trusted pure nothrow @nogc +{ + assert(input != null, "input is null"); + assert(secret != null, "secret is null"); + assert(9 <= len && len <= 16, "len out of range"); + { + const ulong bitflipl = (xxh_readLE64(secret + 32) ^ xxh_readLE64(secret + 40)) - seed; + const ulong bitfliph = (xxh_readLE64(secret + 48) ^ xxh_readLE64(secret + 56)) + seed; + const ulong input_lo = xxh_readLE64(input); + ulong input_hi = xxh_readLE64(input + len - 8); + XXH128_hash_t m128 = xxh_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1); + /* + * Put len in the middle of m128 to ensure that the length gets mixed to + * both the low and high bits in the 128x64 multiply below. + */ + m128.low64 += cast(ulong)(len - 1) << 54; + input_hi ^= bitfliph; + /* + * Add the high 32 bits of input_hi to the high 32 bits of m128, then + * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to + * the high 64 bits of m128. + * + * The best approach to this operation is different on 32-bit and 64-bit. 
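The 64-bit branch below drops that mask via the identity hi + lo*c == (hi + lo) + lo*(c - 1). A tiny self-contained check of the rewrite (illustrative only; any 32-bit constant works, and wrapping modulo 2^64 keeps the identity intact):

@safe pure nothrow @nogc unittest
{
    const ulong x  = 0x0123_4567_89AB_CDEF;        // stand-in for input_hi
    const ulong lo = x & 0x0000_0000_FFFF_FFFFUL;  // input_hi.lo
    const ulong hi = x & 0xFFFF_FFFF_0000_0000UL;  // input_hi.hi
    const ulong c  = 0x85EB_CA77;                  // arbitrary 32-bit multiplier
    assert(hi + lo * c == x + lo * (c - 1));       // holds because hi + lo == x
}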
+ */ + if ((void*).sizeof < (ulong).sizeof) + { /* 32-bit */ + /* + * 32-bit optimized version, which is more readable. + * + * On 32-bit, it removes an ADC and delays a dependency between the two + * halves of m128.high64, but it generates an extra mask on 64-bit. + */ + m128.high64 += (input_hi & 0xFFFFFFFF00000000) + xxh_mult32to64(cast(uint) input_hi, + XXH_PRIME32_2); + } + else + { + /* + * 64-bit optimized (albeit more confusing) version. + * + * Uses some properties of addition and multiplication to remove the mask: + * + * Let: + * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) + * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) + * c = XXH_PRIME32_2 + * + * a + (b * c) + * Inverse Property: x + y - x == y + * a + (b * (1 + c - 1)) + * Distributive Property: x * (y + z) == (x * y) + (x * z) + * a + (b * 1) + (b * (c - 1)) + * Identity Property: x * 1 == x + * a + b + (b * (c - 1)) + * + * Substitute a, b, and c: + * input_hi.hi + input_hi.lo + ((ulong)input_hi.lo * (XXH_PRIME32_2 - 1)) + * + * Since input_hi.hi + input_hi.lo == input_hi, we get this: + * input_hi + ((ulong)input_hi.lo * (XXH_PRIME32_2 - 1)) + */ + m128.high64 += input_hi + xxh_mult32to64(cast(uint) input_hi, XXH_PRIME32_2 - 1); + } + /* m128 ^= bswap(m128 >> 64); */ + m128.low64 ^= bswap(m128.high64); + + { /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */ + XXH128_hash_t h128 = xxh_mult64to128(m128.low64, XXH_PRIME64_2); + h128.high64 += m128.high64 * XXH_PRIME64_2; + + h128.low64 = xxh3_avalanche(h128.low64); + h128.high64 = xxh3_avalanche(h128.high64); + return h128; + } + } +} + +private XXH128_hash_t xxh3_len_0to16_128b( + const ubyte* input, size_t len, const ubyte* secret, XXH64_hash_t seed) + @trusted pure nothrow @nogc +{ + assert(len <= 16, "len > 16"); + { + if (len > 8) + return xxh3_len_9to16_128b(input, len, secret, seed); + if (len >= 4) + return xxh3_len_4to8_128b(input, len, secret, seed); + if (len) + return xxh3_len_1to3_128b(input, len, secret, seed); + { + XXH128_hash_t h128; + const ulong bitflipl = xxh_readLE64(secret + 64) ^ xxh_readLE64(secret + 72); + const ulong bitfliph = xxh_readLE64(secret + 80) ^ xxh_readLE64(secret + 88); + h128.low64 = xxh64_avalanche(seed ^ bitflipl); + h128.high64 = xxh64_avalanche(seed ^ bitfliph); + return h128; + } + } +} + +private XXH128_hash_t xxh128_mix32B( + XXH128_hash_t acc, const ubyte* input_1, const ubyte* input_2, const ubyte* secret, XXH64_hash_t seed) + @trusted pure nothrow @nogc +{ + acc.low64 += xxh3_mix16B(input_1, secret + 0, seed); + acc.low64 ^= xxh_readLE64(input_2) + xxh_readLE64(input_2 + 8); + acc.high64 += xxh3_mix16B(input_2, secret + 16, seed); + acc.high64 ^= xxh_readLE64(input_1) + xxh_readLE64(input_1 + 8); + return acc; +} + +private XXH128_hash_t xxh3_len_17to128_128b( + const ubyte* input, size_t len, const ubyte* secret, size_t secretSize, XXH64_hash_t seed) + @trusted pure nothrow @nogc +in(secretSize >= XXH3_SECRET_SIZE_MIN, "secretSie < XXH3_SECRET_SIZE_MIN") +in(16 < len && len <= 128, "len out of range") +{ + XXH128_hash_t acc; + acc.low64 = len * XXH_PRIME64_1; + acc.high64 = 0; + + static if (XXH_SIZE_OPT >= 1) + { + /* Smaller, but slightly slower. 
*/ + size_t i = (len - 1) / 32; + do + { + acc = xxh128_mix32B(acc, input + 16 * i, + input + len - 16 * (i + 1), secret + 32 * i, seed); + } + while (i-- != 0); + } + else + { + if (len > 32) + { + if (len > 64) + { + if (len > 96) + { + acc = xxh128_mix32B(acc, input + 48, input + len - 64, secret + 96, seed); + } + acc = xxh128_mix32B(acc, input + 32, input + len - 48, secret + 64, seed); + } + acc = xxh128_mix32B(acc, input + 16, input + len - 32, secret + 32, seed); + } + acc = xxh128_mix32B(acc, input, input + len - 16, secret, seed); + } + { + XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * XXH_PRIME64_1) + ( + acc.high64 * XXH_PRIME64_4) + ((len - seed) * XXH_PRIME64_2); + h128.low64 = xxh3_avalanche(h128.low64); + h128.high64 = cast(XXH64_hash_t) 0 - xxh3_avalanche(h128.high64); + return h128; + } +} + +private XXH128_hash_t xxh3_len_129to240_128b( + const ubyte* input, size_t len, const ubyte* secret, size_t secretSize, XXH64_hash_t seed) + @trusted pure nothrow @nogc +in(secretSize >= XXH3_SECRET_SIZE_MIN, "secretSize < XXH3_SECRET_SIZE_MIN") +in(128 < len && len <= XXH3_MIDSIZE_MAX, "len > 128 or len > XXH3_MIDSIZE_MAX") +{ + XXH128_hash_t acc; + const int nbRounds = cast(int) len / 32; + int i; + acc.low64 = len * XXH_PRIME64_1; + acc.high64 = 0; + for (i = 0; i < 4; i++) + { + acc = xxh128_mix32B(acc, input + (32 * i), input + (32 * i) + 16, secret + (32 * i), + seed); + } + acc.low64 = xxh3_avalanche(acc.low64); + acc.high64 = xxh3_avalanche(acc.high64); + assert(nbRounds >= 4, "nbRounds < 4"); + for (i = 4; i < nbRounds; i++) + { + acc = xxh128_mix32B(acc, input + (32 * i), input + (32 * i) + 16, + secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)), seed); + } + /* last bytes */ + acc = xxh128_mix32B(acc, input + len - 16, input + len - 32, + secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, 0 - seed); + + { + XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * XXH_PRIME64_1) + ( + acc.high64 * XXH_PRIME64_4) + ((len - seed) * XXH_PRIME64_2); + h128.low64 = xxh3_avalanche(h128.low64); + h128.high64 = cast(XXH64_hash_t) 0 - xxh3_avalanche(h128.high64); + return h128; + } +} + +private XXH128_hash_t xxh3_hashLong_128b_internal( + const void* input, size_t len, const ubyte* secret, size_t secretSize, + XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble) + @trusted pure nothrow @nogc +{ + align(XXH_ACC_ALIGN) ulong[XXH_ACC_NB] acc = XXH3_INIT_ACC; + + xxh3_hashLong_internal_loop(&acc[0], cast(const ubyte*) input, len, + secret, secretSize, f_acc512, f_scramble); + + /* converge into final hash */ + static assert(acc.sizeof == 64, "acc isn't 64 bytes long"); + assert(secretSize >= acc.sizeof + XXH_SECRET_MERGEACCS_START, "secretSze < allowed limit."); + { + XXH128_hash_t h128; + h128.low64 = xxh3_mergeAccs(&acc[0], + secret + XXH_SECRET_MERGEACCS_START, cast(ulong) len * XXH_PRIME64_1); + h128.high64 = xxh3_mergeAccs(&acc[0], secret + secretSize - (acc) + .sizeof - XXH_SECRET_MERGEACCS_START, ~(cast(ulong) len * XXH_PRIME64_2)); + return h128; + } +} + +private XXH128_hash_t xxh3_hashLong_128b_default( + const void* input, size_t len, XXH64_hash_t seed64, const void* secret, size_t secretLen) + @safe pure nothrow @nogc +{ + return xxh3_hashLong_128b_internal(input, len, &xxh3_kSecret[0], + (xxh3_kSecret).sizeof, xxh3_accumulate_512, xxh3_scrambleAcc); +} + +/* + * It's important for performance to pass @p secretLen (when it's static) + * to the compiler, so that it can properly 
optimize the vectorized loop. + */ +private XXH128_hash_t xxh3_hashLong_128b_withSecret( + const void* input, size_t len, XXH64_hash_t seed64, const void* secret, size_t secretLen) + @safe pure nothrow @nogc +{ + return xxh3_hashLong_128b_internal(input, len, cast(const ubyte*) secret, + secretLen, xxh3_accumulate_512, xxh3_scrambleAcc); +} + +private XXH128_hash_t xxh3_hashLong_128b_withSeed_internal( + const void* input, size_t len, XXH64_hash_t seed64, + XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble, + XXH3_f_initCustomSecret f_initSec) + @trusted pure nothrow @nogc +{ + if (seed64 == 0) + return xxh3_hashLong_128b_internal(input, len, &xxh3_kSecret[0], + (xxh3_kSecret).sizeof, f_acc512, f_scramble); + { + align(XXH_SEC_ALIGN) ubyte[XXH_SECRET_DEFAULT_SIZE] secret; + f_initSec(&secret[0], seed64); + return xxh3_hashLong_128b_internal(input, len, + cast(const ubyte*)&secret[0], (secret).sizeof, f_acc512, f_scramble); + } +} +/* + * It's important for performance that XXH3_hashLong is not inlined. + */ +private XXH128_hash_t xxh3_hashLong_128b_withSeed( + const void* input, size_t len, XXH64_hash_t seed64, const void* secret, size_t secretLen) + @safe pure nothrow @nogc +{ + return xxh3_hashLong_128b_withSeed_internal(input, len, seed64, + xxh3_accumulate_512, xxh3_scrambleAcc, xxh3_initCustomSecret); +} + +alias XXH3_hashLong128_f = XXH128_hash_t function(const void*, size_t, + XXH64_hash_t, const void*, size_t) @safe pure nothrow @nogc; + +private XXH128_hash_t xxh3_128bits_internal( + const void* input, size_t len, XXH64_hash_t seed64, const void* secret, size_t secretLen, + XXH3_hashLong128_f f_hl128) + @safe pure nothrow @nogc +in(secretLen >= XXH3_SECRET_SIZE_MIN, "Secret length is < XXH3_SECRET_SIZE_MIN") +{ + /* + * If an action is to be taken if `secret` conditions are not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. 
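For reference, the dispatch below uses the same three length thresholds as the 64-bit front-end. A hypothetical helper (not used by the port) that names the path taken for a given input length:

string xxh3PathFor(size_t len) @safe pure nothrow @nogc
{
    if (len <= 16)               return "len_0to16";    // specialised short-input paths
    if (len <= 128)              return "len_17to128";  // paired 16/32-byte mixes
    if (len <= XXH3_MIDSIZE_MAX) return "len_129to240"; // mid-size loop (240 bytes max)
    return "hashLong";                                  // striped long-input loop
}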
+ */ + if (len <= 16) + return xxh3_len_0to16_128b(cast(const ubyte*) input, len, + cast(const ubyte*) secret, seed64); + if (len <= 128) + return xxh3_len_17to128_128b(cast(const ubyte*) input, len, + cast(const ubyte*) secret, secretLen, seed64); + if (len <= XXH3_MIDSIZE_MAX) + return xxh3_len_129to240_128b(cast(const ubyte*) input, len, + cast(const ubyte*) secret, secretLen, seed64); + return f_hl128(input, len, seed64, secret, secretLen); +} + +/* === Public XXH128 API === */ + +/* XXH PUBLIC API - hidden in D module */ +private XXH128_hash_t xxh3_128bits(const void* input, size_t len) + @safe pure nothrow @nogc +{ + return xxh3_128bits_internal(input, len, 0, &xxh3_kSecret[0], + (xxh3_kSecret).sizeof, &xxh3_hashLong_128b_default); +} + +/* XXH PUBLIC API - hidden in D module */ +private XXH128_hash_t xxh3_128bits_withSecret( + const void* input, size_t len, const void* secret, size_t secretSize) + @safe pure nothrow @nogc +{ + return xxh3_128bits_internal(input, len, 0, cast(const ubyte*) secret, + secretSize, &xxh3_hashLong_128b_withSecret); +} + +/* XXH PUBLIC API - hidden in D module */ +private XXH128_hash_t xxh3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) + @safe pure nothrow @nogc +{ + return xxh3_128bits_internal(input, len, seed, &xxh3_kSecret[0], + (xxh3_kSecret).sizeof, &xxh3_hashLong_128b_withSeed); +} + +/* XXH PUBLIC API - hidden in D module */ +private XXH128_hash_t xxh3_128bits_withSecretandSeed( + const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed) + @safe pure nothrow @nogc +{ + if (len <= XXH3_MIDSIZE_MAX) + return xxh3_128bits_internal(input, len, seed, &xxh3_kSecret[0], + (xxh3_kSecret).sizeof, null); + return xxh3_hashLong_128b_withSecret(input, len, seed, secret, secretSize); +} + +/* XXH PUBLIC API - hidden in D module */ +private XXH128_hash_t XXH128(const void* input, size_t len, XXH64_hash_t seed) + @safe pure nothrow @nogc +{ + return xxh3_128bits_withSeed(input, len, seed); +} + +/* XXH PUBLIC API - hidden in D module */ +private XXH_errorcode xxh3_128bits_reset(scope XXH3_state_t* statePtr) + @safe pure nothrow @nogc +{ + return xxh3_64bits_reset(statePtr); +} + +/* XXH PUBLIC API - hidden in D module */ +private XXH_errorcode xxh3_128bits_reset_withSecret( + XXH3_state_t* statePtr, const void* secret, size_t secretSize) + @safe pure nothrow @nogc +{ + return xxh3_64bits_reset_withSecret(statePtr, secret, secretSize); +} + +/* XXH PUBLIC API - hidden in D module */ private +XXH_errorcode xxh3_128bits_reset_withSeed( + XXH3_state_t* statePtr, XXH64_hash_t seed) + @safe pure nothrow @nogc +{ + return xxh3_64bits_reset_withSeed(statePtr, seed); +} + +/* XXH PUBLIC API - hidden in D module */ private +XXH_errorcode xxh3_128bits_reset_withSecretandSeed( + XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed) + @safe pure nothrow @nogc +{ + return xxh3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed); +} + +/* XXH PUBLIC API - hidden in D module */ private +XXH_errorcode xxh3_128bits_update( + scope XXH3_state_t* state, scope const void* input, size_t len) + @safe pure nothrow @nogc +{ + return xxh3_update(state, cast(const ubyte*) input, len, + xxh3_accumulate_512, xxh3_scrambleAcc); +} + +/* XXH PUBLIC API - hidden in D module */ private +XXH128_hash_t xxh3_128bits_digest(const XXH3_state_t* state) + @trusted pure nothrow @nogc +{ + const ubyte* secret = (state.extSecret == null) ? 
&state.customSecret[0] : &state.extSecret[0]; + if (state.totalLen > XXH3_MIDSIZE_MAX) + { + align(XXH_ACC_ALIGN) XXH64_hash_t[XXH_ACC_NB] acc; + xxh3_digest_long(&acc[0], state, secret); + assert(state.secretLimit + XXH_STRIPE_LEN >= acc.sizeof + XXH_SECRET_MERGEACCS_START, "Internal error"); + { + XXH128_hash_t h128; + h128.low64 = xxh3_mergeAccs(&acc[0], secret + XXH_SECRET_MERGEACCS_START, + cast(ulong) state.totalLen * XXH_PRIME64_1); + h128.high64 = xxh3_mergeAccs(&acc[0], secret + state.secretLimit + XXH_STRIPE_LEN - (acc) + .sizeof - XXH_SECRET_MERGEACCS_START, + ~(cast(ulong) state.totalLen * XXH_PRIME64_2)); + return h128; + } + } + /* len <= XXH3_MIDSIZE_MAX : short code */ + if (state.seed) + return xxh3_128bits_withSeed(&state.buffer[0], cast(size_t) state.totalLen, state.seed); + return xxh3_128bits_withSecret(&state.buffer[0], + cast(size_t)(state.totalLen), secret, state.secretLimit + XXH_STRIPE_LEN); +} + +/* ----------------------------------------------------------------------------------------*/ + +import core.bitop; + +public import std.digest; + +/* + * Helper methods for encoding the buffer. + * Can be removed if the optimizer can inline the methods from std.bitmanip. + */ +version (LittleEndian) +{ + private alias nativeToBigEndian = bswap; + private alias bigEndianToNative = bswap; +} +else +pragma(inline, true) private pure @nogc nothrow @safe +{ + uint nativeToBigEndian(uint val) + { + return val; + } + + ulong nativeToBigEndian(ulong val) + { + return val; + } + + alias bigEndianToNative = nativeToBigEndian; +} + +/** + * Template API XXHTemplate implementation. Uses parameters to configure for number of bits and XXH variant (classic or XXH3) + * See `std.digest` for differences between template and OOP API. + */ +struct XXHTemplate(HASH, STATE, bool useXXH3) +{ +private: + HASH hash; + STATE state; + HASH seed = HASH.init; + +public: + enum digestSize = HASH.sizeof * 8; + + /** + * Use this to feed the digest with data. + * Also implements the $(REF isOutputRange, std,range,primitives) + * interface for `ubyte` and `const(ubyte)[]`. + * + * Example: + * ---- + * XXHTemplate!(hashtype,statetype,useXXH3) dig; + * dig.put(cast(ubyte) 0); //single ubyte + * dig.put(cast(ubyte) 0, cast(ubyte) 0); //variadic + * ubyte[10] buf; + * dig.put(buf); //buffer + * ---- + */ + void put(scope const(ubyte)[] data...) @safe nothrow @nogc + { + XXH_errorcode ec = XXH_errorcode.XXH_OK; + if (data.length > 0) // digest will only change, when there is data! + { + static if (digestSize == 32) + ec = xxh32_update(&state, &data[0], data.length); + else static if (digestSize == 64 && !useXXH3) + ec = xxh64_update(&state, &data[0], data.length); + else static if (digestSize == 64 && useXXH3) + ec = xxh3_64bits_update(&state, &data[0], data.length); + else static if (digestSize == 128) + ec = xxh3_128bits_update(&state, &data[0], data.length); + else + assert(false, "Unknown XXH bitdeep or variant"); + } + assert(ec == XXH_errorcode.XXH_OK, "Update failed"); + } + + /** + * Used to (re)initialize the XXHTemplate digest. 
+ * + * Example: + * -------- + * XXHTemplate!(hashtype,statetype,useXXH3) digest; + * digest.start(); + * digest.put(0); + * -------- + */ + void start() @safe nothrow @nogc + { + this = typeof(this).init; + XXH_errorcode ec; + static if (digestSize == 32) + { + assert(state.alignof == uint.alignof, "Wrong alignment for state structure"); + ec = xxh32_reset(&state, seed); + } + else static if (digestSize == 64 && !useXXH3) + { + assert(state.alignof == ulong.alignof, "Wrong alignment for state structure"); + ec = xxh64_reset(&state, seed); + } + else static if (digestSize == 64 && useXXH3) + { + assert(state.alignof == 64, "Wrong alignment for state structure"); + ec = xxh3_64bits_reset(&state); + } + else static if (digestSize == 128) + { + assert(state.alignof == 64, "Wrong alignment for state structure"); + ec = xxh3_128bits_reset(&state); + } + else + assert(false, "Unknown XXH bitdeep or variant"); + //assert(ec == XXH_errorcode.XXH_OK, "reset failed"); + } + + /** + * Returns the finished XXH hash. This also calls $(LREF start) to + * reset the internal state. + */ + ubyte[digestSize / 8] finish() @trusted nothrow @nogc + { + static if (digestSize == 32) + { + hash = xxh32_digest(&state); + const auto rc = nativeToBigEndian(hash); + } + else static if (digestSize == 64 && !useXXH3) + { + hash = xxh64_digest(&state); + const auto rc = nativeToBigEndian(hash); + } + else static if (digestSize == 64 && useXXH3) + { + hash = xxh3_64bits_digest(&state); + const auto rc = nativeToBigEndian(hash); + } + else static if (digestSize == 128) + { + hash = xxh3_128bits_digest(&state); + HASH rc; + // Note: low64 and high64 are intentionally exchanged! + rc.low64 = nativeToBigEndian(hash.high64); + rc.high64 = nativeToBigEndian(hash.low64); + } + + return (cast(ubyte*)&rc)[0 .. 
rc.sizeof]; + } +} +/// +@safe unittest +{ + // Simple example using the XXH_64 digest + XXHTemplate!(XXH64_hash_t, XXH64_state_t, false) hash1; + hash1.start(); + hash1.put(cast(ubyte) 0); + auto result = hash1.finish(); +} + +alias XXH_32 = XXHTemplate!(XXH32_hash_t, XXH32_state_t, false); /// XXH_32 for XXH, 32bit, hash is ubyte[4] +alias XXH_64 = XXHTemplate!(XXH64_hash_t, XXH64_state_t, false); /// XXH_64 for XXH, 64bit, hash is ubyte[8] +alias XXH3_64 = XXHTemplate!(XXH64_hash_t, XXH3_state_t, true); /// XXH3_64 for XXH3, 64bits, hash is ubyte[8] +alias XXH3_128 = XXHTemplate!(XXH128_hash_t, XXH3_state_t, true); /// XXH3_128 for XXH3, 128bits, hash is ubyte[16] + +/// +@safe unittest +{ + //Simple example + XXH_32 hash1; + hash1.start(); + hash1.put(cast(ubyte) 0); + auto result = hash1.finish(); +} +/// +@safe unittest +{ + //Simple example + XXH_64 hash1; + hash1.start(); + hash1.put(cast(ubyte) 0); + auto result = hash1.finish(); +} +/// +@safe unittest +{ + //Simple example + XXH3_64 hash1; + hash1.start(); + hash1.put(cast(ubyte) 0); + auto result = hash1.finish(); +} +/// +@safe unittest +{ + //Simple example + XXH3_128 hash1; + hash1.start(); + hash1.put(cast(ubyte) 0); + auto result = hash1.finish(); +} + +/// +@safe unittest +{ + //Simple example, hashing a string using xxh32Of helper function + auto hash = xxh32Of("abc"); + //Let's get a hash string + assert(toHexString(hash) == "32D153FF"); +} +/// +@safe unittest +{ + //Simple example, hashing a string using xxh32Of helper function + auto hash = xxh64Of("abc"); + //Let's get a hash string + assert(toHexString(hash) == "44BC2CF5AD770999"); // XXH64 +} +/// +@safe unittest +{ + //Simple example, hashing a string using xxh32Of helper function + auto hash = xxh3_64Of("abc"); + //Let's get a hash string + assert(toHexString(hash) == "78AF5F94892F3950"); // XXH3/64 +} +/// +@safe unittest +{ + //Simple example, hashing a string using xxh32Of helper function + auto hash = xxh128Of("abc"); + //Let's get a hash string + assert(toHexString(hash) == "06B05AB6733A618578AF5F94892F3950"); + +} + +/// +@safe unittest +{ + //Using the basic API + XXH_32 hash; + hash.start(); + ubyte[1024] data; + //Initialize data here... + hash.put(data); + ubyte[4] result = hash.finish(); +} +/// +@safe unittest +{ + //Using the basic API + XXH_64 hash; + hash.start(); + ubyte[1024] data; + //Initialize data here... + hash.put(data); + ubyte[8] result = hash.finish(); +} +/// +@safe unittest +{ + //Using the basic API + XXH3_64 hash; + hash.start(); + ubyte[1024] data; + //Initialize data here... + hash.put(data); + ubyte[8] result = hash.finish(); +} +/// +@safe unittest +{ + //Using the basic API + XXH3_128 hash; + hash.start(); + ubyte[1024] data; + //Initialize data here... 
+ hash.put(data); + ubyte[16] result = hash.finish(); +} + +/// +@safe unittest +{ + //Let's use the template features: + void doSomething(T)(ref T hash) + if (isDigest!T) + { + hash.put(cast(ubyte) 0); + } + + XXH_32 xxh; + xxh.start(); + doSomething(xxh); + auto hash = xxh.finish; + assert(toHexString(hash) == "CF65B03E", "Got " ~ toHexString(hash)); +} +/// +@safe unittest +{ + //Let's use the template features: + void doSomething(T)(ref T hash) + if (isDigest!T) + { + hash.put(cast(ubyte) 0); + } + + XXH_64 xxh; + xxh.start(); + doSomething(xxh); + auto hash = xxh.finish; + assert(toHexString(hash) == "E934A84ADB052768", "Got " ~ toHexString(hash)); +} +/// +@safe unittest +{ + //Let's use the template features: + void doSomething(T)(ref T hash) + if (isDigest!T) + { + hash.put(cast(ubyte) 0); + } + + XXH3_64 xxh; + xxh.start(); + doSomething(xxh); + auto hash = xxh.finish; + assert(toHexString(hash) == "C44BDFF4074EECDB", "Got " ~ toHexString(hash)); +} +/// +@safe unittest +{ + //Let's use the template features: + void doSomething(T)(ref T hash) + if (isDigest!T) + { + hash.put(cast(ubyte) 0); + } + + XXH3_128 xxh; + xxh.start(); + doSomething(xxh); + auto hash = xxh.finish; + assert(toHexString(hash) == "A6CD5E9392000F6AC44BDFF4074EECDB", "Got " ~ toHexString(hash)); +} + +/// +@safe unittest +{ + assert(isDigest!XXH_32); + assert(isDigest!XXH_64); + assert(isDigest!XXH3_64); + assert(isDigest!XXH3_128); +} + +@system unittest +{ + import std.range; + import std.conv : hexString; + + ubyte[4] digest32; + ubyte[8] digest64; + ubyte[16] digest128; + + XXH_32 xxh; + xxh.put(cast(ubyte[]) "abcdef"); + xxh.start(); + xxh.put(cast(ubyte[]) ""); + assert(xxh.finish() == cast(ubyte[]) hexString!"02cc5d05"); + + digest32 = xxh32Of(""); + assert(digest32 == cast(ubyte[]) hexString!"02cc5d05"); + digest64 = xxh64Of(""); + assert(digest64 == cast(ubyte[]) hexString!"EF46DB3751D8E999", "Got " ~ toHexString(digest64)); + digest64 = xxh3_64Of(""); + assert(digest64 == cast(ubyte[]) hexString!"2D06800538D394C2", "Got " ~ toHexString(digest64)); + digest128 = xxh128Of(""); + assert(digest128 == cast(ubyte[]) hexString!"99AA06D3014798D86001C324468D497F", + "Got " ~ toHexString(digest128)); + + digest32 = xxh32Of("a"); + assert(digest32 == cast(ubyte[]) hexString!"550d7456"); + digest64 = xxh64Of("a"); + assert(digest64 == cast(ubyte[]) hexString!"D24EC4F1A98C6E5B", "Got " ~ toHexString(digest64)); + digest64 = xxh3_64Of("a"); + assert(digest64 == cast(ubyte[]) hexString!"E6C632B61E964E1F", "Got " ~ toHexString(digest64)); + digest128 = xxh128Of("a"); + assert(digest128 == cast(ubyte[]) hexString!"A96FAF705AF16834E6C632B61E964E1F", + "Got " ~ toHexString(digest128)); + + digest32 = xxh32Of("abc"); + assert(digest32 == cast(ubyte[]) hexString!"32D153FF"); + digest64 = xxh64Of("abc"); + assert(digest64 == cast(ubyte[]) hexString!"44BC2CF5AD770999"); + digest64 = xxh3_64Of("abc"); + assert(digest64 == cast(ubyte[]) hexString!"78AF5F94892F3950"); + digest128 = xxh128Of("abc"); + assert(digest128 == cast(ubyte[]) hexString!"06B05AB6733A618578AF5F94892F3950"); + + digest32 = xxh32Of("abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"); + assert(digest32 == cast(ubyte[]) hexString!"89ea60c3"); + digest64 = xxh64Of("abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"); + assert(digest64 == cast(ubyte[]) hexString!"F06103773E8585DF", "Got " ~ toHexString(digest64)); + digest64 = xxh3_64Of("abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"); + assert(digest64 == cast(ubyte[]) 
hexString!"5BBCBBABCDCC3D3F", "Got " ~ toHexString(digest64)); + digest128 = xxh128Of("abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"); + assert(digest128 == cast(ubyte[]) hexString!"3D62D22A5169B016C0D894FD4828A1A7", + "Got " ~ toHexString(digest128)); + + digest32 = xxh32Of("message digest"); + assert(digest32 == cast(ubyte[]) hexString!"7c948494"); + digest64 = xxh64Of("message digest"); + assert(digest64 == cast(ubyte[]) hexString!"066ED728FCEEB3BE", "Got " ~ toHexString(digest64)); + digest64 = xxh3_64Of("message digest"); + assert(digest64 == cast(ubyte[]) hexString!"160D8E9329BE94F9", "Got " ~ toHexString(digest64)); + digest128 = xxh128Of("message digest"); + assert(digest128 == cast(ubyte[]) hexString!"34AB715D95E3B6490ABFABECB8E3A424", + "Got " ~ toHexString(digest128)); + + digest32 = xxh32Of("abcdefghijklmnopqrstuvwxyz"); + assert(digest32 == cast(ubyte[]) hexString!"63a14d5f"); + digest64 = xxh64Of("abcdefghijklmnopqrstuvwxyz"); + assert(digest64 == cast(ubyte[]) hexString!"CFE1F278FA89835C", "Got " ~ toHexString(digest64)); + digest64 = xxh3_64Of("abcdefghijklmnopqrstuvwxyz"); + assert(digest64 == cast(ubyte[]) hexString!"810F9CA067FBB90C", "Got " ~ toHexString(digest64)); + digest128 = xxh128Of("abcdefghijklmnopqrstuvwxyz"); + assert(digest128 == cast(ubyte[]) hexString!"DB7CA44E84843D67EBE162220154E1E6", + "Got " ~ toHexString(digest128)); + + digest32 = xxh32Of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"); + assert(digest32 == cast(ubyte[]) hexString!"9c285e64"); + digest64 = xxh64Of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"); + assert(digest64 == cast(ubyte[]) hexString!"AAA46907D3047814", "Got " ~ toHexString(digest64)); + digest64 = xxh3_64Of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"); + assert(digest64 == cast(ubyte[]) hexString!"643542BB51639CB2", "Got " ~ toHexString(digest64)); + digest128 = xxh128Of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"); + assert(digest128 == cast(ubyte[]) hexString!"5BCB80B619500686A3C0560BD47A4FFB", + "Got " ~ toHexString(digest128)); + + digest32 = xxh32Of( + "1234567890123456789012345678901234567890" ~ "1234567890123456789012345678901234567890"); + assert(digest32 == cast(ubyte[]) hexString!"9c05f475"); + digest64 = xxh64Of( + "1234567890123456789012345678901234567890" ~ "1234567890123456789012345678901234567890"); + assert(digest64 == cast(ubyte[]) hexString!"E04A477F19EE145D", "Got " ~ toHexString(digest64)); + digest64 = xxh3_64Of( + "1234567890123456789012345678901234567890" ~ "1234567890123456789012345678901234567890"); + assert(digest64 == cast(ubyte[]) hexString!"7F58AA2520C681F9", "Got " ~ toHexString(digest64)); + digest128 = xxh128Of( + "1234567890123456789012345678901234567890" ~ "1234567890123456789012345678901234567890"); + assert(digest128 == cast(ubyte[]) hexString!"08DD22C3DDC34CE640CB8D6AC672DCB8", + "Got " ~ toHexString(digest128)); + + enum ubyte[16] input = cast(ubyte[16]) hexString!"c3fcd3d76192e4007dfb496cca67e13b"; + assert(toHexString(input) == "C3FCD3D76192E4007DFB496CCA67E13B"); + + ubyte[] onemilliona = new ubyte[1_000_000]; + onemilliona[] = 'a'; + digest32 = xxh32Of(onemilliona); + assert(digest32 == cast(ubyte[]) hexString!"E1155920", "Got " ~ toHexString(digest32)); + digest64 = xxh64Of(onemilliona); + assert(digest64 == cast(ubyte[]) hexString!"DC483AAA9B4FDC40", "Got " ~ toHexString(digest64)); + digest64 = xxh3_64Of(onemilliona); + assert(digest64 == cast(ubyte[]) hexString!"B1FD6FAE5285C4EB", "Got " ~ 
toHexString(digest64)); + digest128 = xxh128Of(onemilliona); + assert(digest128 == cast(ubyte[]) hexString!"A545DF8E384A9579B1FD6FAE5285C4EB", + "Got " ~ toHexString(digest128)); + + auto oneMillionRange = repeat!ubyte(cast(ubyte) 'a', 1_000_000); + digest32 = xxh32Of(oneMillionRange); + assert(digest32 == cast(ubyte[]) hexString!"E1155920", "Got " ~ toHexString(digest32)); + digest64 = xxh64Of(oneMillionRange); + assert(digest64 == cast(ubyte[]) hexString!"DC483AAA9B4FDC40", "Got " ~ toHexString(digest64)); + digest64 = xxh3_64Of(oneMillionRange); + assert(digest64 == cast(ubyte[]) hexString!"B1FD6FAE5285C4EB", "Got " ~ toHexString(digest64)); + digest128 = xxh128Of(oneMillionRange); + assert(digest128 == cast(ubyte[]) hexString!"A545DF8E384A9579B1FD6FAE5285C4EB", + "Got " ~ toHexString(digest128)); +} + +/** + * This is a convenience alias for $(REF digest, std,digest) using the + * XXH implementation. + */ +//simple alias doesn't work here, hope this gets inlined... +auto xxh32Of(T...)(T data) +{ + return digest!(XXH_32, T)(data); +} +/// Ditto +auto xxh64Of(T...)(T data) +{ + return digest!(XXH_64, T)(data); +} +/// Ditto +auto xxh3_64Of(T...)(T data) +{ + return digest!(XXH3_64, T)(data); +} +/// Ditto +auto xxh128Of(T...)(T data) +{ + return digest!(XXH3_128, T)(data); +} + +/// +@safe unittest +{ + auto hash = xxh32Of("abc"); + assert(hash == digest!XXH_32("abc")); + auto hash1 = xxh64Of("abc"); + assert(hash1 == digest!XXH_64("abc")); + auto hash2 = xxh3_64Of("abc"); + assert(hash2 == digest!XXH3_64("abc")); + auto hash3 = xxh128Of("abc"); + assert(hash3 == digest!XXH3_128("abc")); +} + +/** + * OOP API XXH implementation. + * See `std.digest` for differences between template and OOP API. + * + * This is an alias for $(D $(REF WrapperDigest, std,digest)!XXH_32), see + * there for more information. 
+ */ +alias XXH32Digest = WrapperDigest!XXH_32; +alias XXH64Digest = WrapperDigest!XXH_64; ///ditto +alias XXH3_64Digest = WrapperDigest!XXH3_64; ///ditto +alias XXH3_128Digest = WrapperDigest!XXH3_128; ///ditto + +/// +@safe unittest +{ + //Simple example, hashing a string using Digest.digest helper function + auto xxh = new XXH32Digest(); + ubyte[] hash = xxh.digest("abc"); + //Let's get a hash string + assert(toHexString(hash) == "32D153FF"); +} +/// +@safe unittest +{ + //Simple example, hashing a string using Digest.digest helper function + auto xxh = new XXH64Digest(); + ubyte[] hash = xxh.digest("abc"); + //Let's get a hash string + assert(toHexString(hash) == "44BC2CF5AD770999"); +} +/// +@safe unittest +{ + //Simple example, hashing a string using Digest.digest helper function + auto xxh = new XXH3_64Digest(); + ubyte[] hash = xxh.digest("abc"); + //Let's get a hash string + assert(toHexString(hash) == "78AF5F94892F3950"); +} +/// +@safe unittest +{ + //Simple example, hashing a string using Digest.digest helper function + auto xxh = new XXH3_128Digest(); + ubyte[] hash = xxh.digest("abc"); + //Let's get a hash string + assert(toHexString(hash) == "06B05AB6733A618578AF5F94892F3950"); +} + +/// +@system unittest +{ + //Let's use the OOP features: + void test(Digest dig) + { + dig.put(cast(ubyte) 0); + } + + auto xxh = new XXH32Digest(); + test(xxh); + + //Let's use a custom buffer: + ubyte[16] buf; + ubyte[] result = xxh.finish(buf[]); + assert(toHexString(result) == "CF65B03E", "Got " ~ toHexString(result)); +} +/// +@system unittest +{ + //Let's use the OOP features: + void test(Digest dig) + { + dig.put(cast(ubyte) 0); + } + + auto xxh = new XXH64Digest(); + test(xxh); + + //Let's use a custom buffer: + ubyte[16] buf; + ubyte[] result = xxh.finish(buf[]); + assert(toHexString(result) == "E934A84ADB052768", "Got " ~ toHexString(result)); +} +/// +@system unittest +{ + //Let's use the OOP features: + void test(Digest dig) + { + dig.put(cast(ubyte) 0); + } + + auto xxh = new XXH3_64Digest(); + test(xxh); + + //Let's use a custom buffer: + ubyte[16] buf; + ubyte[] result = xxh.finish(buf[]); + assert(toHexString(result) == "C44BDFF4074EECDB", "Got " ~ toHexString(result)); +} +/// +@system unittest +{ + //Let's use the OOP features: + void test(Digest dig) + { + dig.put(cast(ubyte) 0); + } + + auto xxh = new XXH3_128Digest(); + test(xxh); + + //Let's use a custom buffer: + ubyte[16] buf; + ubyte[] result = xxh.finish(buf[]); + assert(toHexString(result) == "A6CD5E9392000F6AC44BDFF4074EECDB", "Got " ~ toHexString(result)); +} + +@system unittest +{ + import std.conv : hexString; + + auto xxh = new XXH32Digest(); + auto xxh64 = new XXH64Digest(); + auto xxh3_64 = new XXH3_64Digest(); + auto xxh128 = new XXH3_128Digest(); + + xxh.put(cast(ubyte[]) "abcdef"); + xxh.reset(); + xxh.put(cast(ubyte[]) ""); + assert(xxh.finish() == cast(ubyte[]) hexString!"02cc5d05"); + + xxh.put(cast(ubyte[]) "abcdefghijklmnopqrstuvwxyz"); + ubyte[20] result; + auto result2 = xxh.finish(result[]); + assert(result[0 .. 4] == result2 + && result2 == cast(ubyte[]) hexString!"63a14d5f", "Got " ~ toHexString(result)); + + debug + { + import std.exception; + + assertThrown!Error(xxh.finish(result[0 .. 
3])); + } + + assert(xxh.length == 4); + assert(xxh64.length == 8); + assert(xxh3_64.length == 8); + assert(xxh128.length == 16); + + assert(xxh.digest("") == cast(ubyte[]) hexString!"02cc5d05"); + assert(xxh64.digest("") == cast(ubyte[]) hexString!"EF46DB3751D8E999"); + assert(xxh3_64.digest("") == cast(ubyte[]) hexString!"2D06800538D394C2"); + assert(xxh128.digest("") == cast(ubyte[]) hexString!"99AA06D3014798D86001C324468D497F"); + + assert(xxh.digest("a") == cast(ubyte[]) hexString!"550d7456"); + assert(xxh64.digest("a") == cast(ubyte[]) hexString!"D24EC4F1A98C6E5B"); + assert(xxh3_64.digest("a") == cast(ubyte[]) hexString!"E6C632B61E964E1F"); + assert(xxh128.digest("a") == cast(ubyte[]) hexString!"A96FAF705AF16834E6C632B61E964E1F"); + + assert(xxh.digest("abc") == cast(ubyte[]) hexString!"32D153FF"); + assert(xxh64.digest("abc") == cast(ubyte[]) hexString!"44BC2CF5AD770999"); + assert(xxh3_64.digest("abc") == cast(ubyte[]) hexString!"78AF5F94892F3950"); + assert(xxh128.digest("abc") == cast(ubyte[]) hexString!"06B05AB6733A618578AF5F94892F3950"); + + assert(xxh.digest("abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq") == cast( + ubyte[]) hexString!"89ea60c3"); + assert(xxh64.digest("abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq") == cast( + ubyte[]) hexString!"F06103773E8585DF"); + assert(xxh3_64.digest("abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq") == cast( + ubyte[]) hexString!"5BBCBBABCDCC3D3F"); + assert(xxh128.digest("abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq") == cast( + ubyte[]) hexString!"3D62D22A5169B016C0D894FD4828A1A7"); + + assert(xxh.digest("message digest") == cast(ubyte[]) hexString!"7c948494"); + assert(xxh64.digest("message digest") == cast(ubyte[]) hexString!"066ED728FCEEB3BE"); + assert(xxh3_64.digest("message digest") == cast(ubyte[]) hexString!"160D8E9329BE94F9"); + assert(xxh128.digest("message digest") == cast( + ubyte[]) hexString!"34AB715D95E3B6490ABFABECB8E3A424"); + + assert(xxh.digest("abcdefghijklmnopqrstuvwxyz") == cast(ubyte[]) hexString!"63a14d5f"); + assert(xxh64.digest("abcdefghijklmnopqrstuvwxyz") == cast(ubyte[]) hexString!"CFE1F278FA89835C"); + assert(xxh3_64.digest("abcdefghijklmnopqrstuvwxyz") == cast( + ubyte[]) hexString!"810F9CA067FBB90C"); + assert(xxh128.digest("abcdefghijklmnopqrstuvwxyz") == cast( + ubyte[]) hexString!"DB7CA44E84843D67EBE162220154E1E6"); + + assert(xxh.digest("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789") == cast( + ubyte[]) hexString!"9c285e64"); + assert(xxh64.digest("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789") == cast( + ubyte[]) hexString!"AAA46907D3047814"); + assert(xxh3_64.digest("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789") == cast( + ubyte[]) hexString!"643542BB51639CB2"); + assert(xxh128.digest("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789") == cast( + ubyte[]) hexString!"5BCB80B619500686A3C0560BD47A4FFB"); + + assert(xxh.digest("1234567890123456789012345678901234567890", + "1234567890123456789012345678901234567890") == cast(ubyte[]) hexString!"9c05f475"); + assert(xxh64.digest("1234567890123456789012345678901234567890", + "1234567890123456789012345678901234567890") == cast(ubyte[]) hexString!"E04A477F19EE145D"); + assert(xxh3_64.digest("1234567890123456789012345678901234567890", + "1234567890123456789012345678901234567890") == cast(ubyte[]) hexString!"7F58AA2520C681F9"); + assert(xxh128.digest("1234567890123456789012345678901234567890", + "1234567890123456789012345678901234567890") == 
cast(ubyte[]) hexString!"08DD22C3DDC34CE640CB8D6AC672DCB8"); +} diff --git a/external/xxhash/xxhash.h b/external/xxhash/xxhash.h new file mode 100644 index 0000000..c4cb3ba --- /dev/null +++ b/external/xxhash/xxhash.h @@ -0,0 +1,7343 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Header File + * Copyright (C) 2012-2023 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +/*! + * @mainpage xxHash + * + * xxHash is an extremely fast non-cryptographic hash algorithm, working at RAM speed + * limits. + * + * It is proposed in four flavors, in three families: + * 1. @ref XXH32_family + * - Classic 32-bit hash function. Simple, compact, and runs on almost all + * 32-bit and 64-bit systems. + * 2. @ref XXH64_family + * - Classic 64-bit adaptation of XXH32. Just as simple, and runs well on most + * 64-bit systems (but _not_ 32-bit systems). + * 3. @ref XXH3_family + * - Modern 64-bit and 128-bit hash function family which features improved + * strength and performance across the board, especially on smaller data. + * It benefits greatly from SIMD and 64-bit without requiring it. + * + * Benchmarks + * --- + * The reference system uses an Intel i7-9700K CPU, and runs Ubuntu x64 20.04. + * The open source benchmark program is compiled with clang v10.0 using -O3 flag. 
+ * + * | Hash Name | ISA ext | Width | Large Data Speed | Small Data Velocity | + * | -------------------- | ------- | ----: | ---------------: | ------------------: | + * | XXH3_64bits() | @b AVX2 | 64 | 59.4 GB/s | 133.1 | + * | MeowHash | AES-NI | 128 | 58.2 GB/s | 52.5 | + * | XXH3_128bits() | @b AVX2 | 128 | 57.9 GB/s | 118.1 | + * | CLHash | PCLMUL | 64 | 37.1 GB/s | 58.1 | + * | XXH3_64bits() | @b SSE2 | 64 | 31.5 GB/s | 133.1 | + * | XXH3_128bits() | @b SSE2 | 128 | 29.6 GB/s | 118.1 | + * | RAM sequential read | | N/A | 28.0 GB/s | N/A | + * | ahash | AES-NI | 64 | 22.5 GB/s | 107.2 | + * | City64 | | 64 | 22.0 GB/s | 76.6 | + * | T1ha2 | | 64 | 22.0 GB/s | 99.0 | + * | City128 | | 128 | 21.7 GB/s | 57.7 | + * | FarmHash | AES-NI | 64 | 21.3 GB/s | 71.9 | + * | XXH64() | | 64 | 19.4 GB/s | 71.0 | + * | SpookyHash | | 64 | 19.3 GB/s | 53.2 | + * | Mum | | 64 | 18.0 GB/s | 67.0 | + * | CRC32C | SSE4.2 | 32 | 13.0 GB/s | 57.9 | + * | XXH32() | | 32 | 9.7 GB/s | 71.9 | + * | City32 | | 32 | 9.1 GB/s | 66.0 | + * | Blake3* | @b AVX2 | 256 | 4.4 GB/s | 8.1 | + * | Murmur3 | | 32 | 3.9 GB/s | 56.1 | + * | SipHash* | | 64 | 3.0 GB/s | 43.2 | + * | Blake3* | @b SSE2 | 256 | 2.4 GB/s | 8.1 | + * | HighwayHash | | 64 | 1.4 GB/s | 6.0 | + * | FNV64 | | 64 | 1.2 GB/s | 62.7 | + * | Blake2* | | 256 | 1.1 GB/s | 5.1 | + * | SHA1* | | 160 | 0.8 GB/s | 5.6 | + * | MD5* | | 128 | 0.6 GB/s | 7.8 | + * @note + * - Hashes which require a specific ISA extension are noted. SSE2 is also noted, + * even though it is mandatory on x64. + * - Hashes with an asterisk are cryptographic. Note that MD5 is non-cryptographic + * by modern standards. + * - Small data velocity is a rough average of algorithm's efficiency for small + * data. For more accurate information, see the wiki. + * - More benchmarks and strength tests are found on the wiki: + * https://github.com/Cyan4973/xxHash/wiki + * + * Usage + * ------ + * All xxHash variants use a similar API. Changing the algorithm is a trivial + * substitution. + * + * @pre + * For functions which take an input and length parameter, the following + * requirements are assumed: + * - The range from [`input`, `input + length`) is valid, readable memory. + * - The only exception is if the `length` is `0`, `input` may be `NULL`. + * - For C++, the objects must have the *TriviallyCopyable* property, as the + * functions access bytes directly as if it was an array of `unsigned char`. + * + * @anchor single_shot_example + * **Single Shot** + * + * These functions are stateless functions which hash a contiguous block of memory, + * immediately returning the result. They are the easiest and usually the fastest + * option. + * + * XXH32(), XXH64(), XXH3_64bits(), XXH3_128bits() + * + * @code{.c} + * #include + * #include "xxhash.h" + * + * // Example for a function which hashes a null terminated string with XXH32(). + * XXH32_hash_t hash_string(const char* string, XXH32_hash_t seed) + * { + * // NULL pointers are only valid if the length is zero + * size_t length = (string == NULL) ? 0 : strlen(string); + * return XXH32(string, length, seed); + * } + * @endcode + * + * + * @anchor streaming_example + * **Streaming** + * + * These groups of functions allow incremental hashing of unknown size, even + * more than what would fit in a size_t. + * + * XXH32_reset(), XXH64_reset(), XXH3_64bits_reset(), XXH3_128bits_reset() + * + * @code{.c} + * #include + * #include + * #include "xxhash.h" + * // Example for a function which hashes a FILE incrementally with XXH3_64bits(). 
+ * XXH64_hash_t hashFile(FILE* f) + * { + * // Allocate a state struct. Do not just use malloc() or new. + * XXH3_state_t* state = XXH3_createState(); + * assert(state != NULL && "Out of memory!"); + * // Reset the state to start a new hashing session. + * XXH3_64bits_reset(state); + * char buffer[4096]; + * size_t count; + * // Read the file in chunks + * while ((count = fread(buffer, 1, sizeof(buffer), f)) != 0) { + * // Run update() as many times as necessary to process the data + * XXH3_64bits_update(state, buffer, count); + * } + * // Retrieve the finalized hash. This will not change the state. + * XXH64_hash_t result = XXH3_64bits_digest(state); + * // Free the state. Do not use free(). + * XXH3_freeState(state); + * return result; + * } + * @endcode + * + * Streaming functions generate the xxHash value from an incremental input. + * This method is slower than single-call functions, due to state management. + * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. + * + * An XXH state must first be allocated using `XXH*_createState()`. + * + * Start a new hash by initializing the state with a seed using `XXH*_reset()`. + * + * Then, feed the hash state by calling `XXH*_update()` as many times as necessary. + * + * The function returns an error code, with 0 meaning OK, and any other value + * meaning there is an error. + * + * Finally, a hash value can be produced anytime, by using `XXH*_digest()`. + * This function returns the nn-bits hash as an int or long long. + * + * It's still possible to continue inserting input into the hash state after a + * digest, and generate new hash values later on by invoking `XXH*_digest()`. + * + * When done, release the state using `XXH*_freeState()`. + * + * + * @anchor canonical_representation_example + * **Canonical Representation** + * + * The default return values from XXH functions are unsigned 32, 64 and 128 bit + * integers. + * This the simplest and fastest format for further post-processing. + * + * However, this leaves open the question of what is the order on the byte level, + * since little and big endian conventions will store the same number differently. + * + * The canonical representation settles this issue by mandating big-endian + * convention, the same convention as human-readable numbers (large digits first). + * + * When writing hash values to storage, sending them over a network, or printing + * them, it's highly recommended to use the canonical representation to ensure + * portability across a wider range of systems, present and future. + * + * The following functions allow transformation of hash values to and from + * canonical format. 
+ * + * XXH32_canonicalFromHash(), XXH32_hashFromCanonical(), + * XXH64_canonicalFromHash(), XXH64_hashFromCanonical(), + * XXH128_canonicalFromHash(), XXH128_hashFromCanonical(), + * + * @code{.c} + * #include + * #include "xxhash.h" + * + * // Example for a function which prints XXH32_hash_t in human readable format + * void printXxh32(XXH32_hash_t hash) + * { + * XXH32_canonical_t cano; + * XXH32_canonicalFromHash(&cano, hash); + * size_t i; + * for(i = 0; i < sizeof(cano.digest); ++i) { + * printf("%02x", cano.digest[i]); + * } + * printf("\n"); + * } + * + * // Example for a function which converts XXH32_canonical_t to XXH32_hash_t + * XXH32_hash_t convertCanonicalToXxh32(XXH32_canonical_t cano) + * { + * XXH32_hash_t hash = XXH32_hashFromCanonical(&cano); + * return hash; + * } + * @endcode + * + * + * @file xxhash.h + * xxHash prototypes and implementation + */ + +#if defined(__cplusplus) && !defined(XXH_NO_EXTERNC_GUARD) +extern "C" { +#endif + +/* **************************** + * INLINE mode + ******************************/ +/*! + * @defgroup public Public API + * Contains details on the public xxHash functions. + * @{ + */ +#ifdef XXH_DOXYGEN +/*! + * @brief Gives access to internal state declaration, required for static allocation. + * + * Incompatible with dynamic linking, due to risks of ABI changes. + * + * Usage: + * @code{.c} + * #define XXH_STATIC_LINKING_ONLY + * #include "xxhash.h" + * @endcode + */ +# define XXH_STATIC_LINKING_ONLY +/* Do not undef XXH_STATIC_LINKING_ONLY for Doxygen */ + +/*! + * @brief Gives access to internal definitions. + * + * Usage: + * @code{.c} + * #define XXH_STATIC_LINKING_ONLY + * #define XXH_IMPLEMENTATION + * #include "xxhash.h" + * @endcode + */ +# define XXH_IMPLEMENTATION +/* Do not undef XXH_IMPLEMENTATION for Doxygen */ + +/*! + * @brief Exposes the implementation and marks all functions as `inline`. + * + * Use these build macros to inline xxhash into the target unit. + * Inlining improves performance on small inputs, especially when the length is + * expressed as a compile-time constant: + * + * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html + * + * It also keeps xxHash symbols private to the unit, so they are not exported. + * + * Usage: + * @code{.c} + * #define XXH_INLINE_ALL + * #include "xxhash.h" + * @endcode + * Do not compile and link xxhash.o as a separate object, as it is not useful. + */ +# define XXH_INLINE_ALL +# undef XXH_INLINE_ALL +/*! + * @brief Exposes the implementation without marking functions as inline. + */ +# define XXH_PRIVATE_API +# undef XXH_PRIVATE_API +/*! + * @brief Emulate a namespace by transparently prefixing all symbols. + * + * If you want to include _and expose_ xxHash functions from within your own + * library, but also want to avoid symbol collisions with other libraries which + * may also include xxHash, you can use @ref XXH_NAMESPACE to automatically prefix + * any public symbol from xxhash library with the value of @ref XXH_NAMESPACE + * (therefore, avoid empty or numeric values). + * + * Note that no change is required within the calling program as long as it + * includes `xxhash.h`: Regular symbol names will be automatically translated + * by this header. 
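+ *
+ * A minimal sketch of typical usage (the `mylib_` prefix and `hash_bytes`
+ * function below are only illustrative values, not required names):
+ * @code{.c}
+ * // Define the same value when building the library and in every consumer
+ * // that includes the header, e.g. via -DXXH_NAMESPACE=mylib_ or:
+ * #define XXH_NAMESPACE mylib_
+ * #include "xxhash.h"
+ *
+ * // Calling code stays the same; this XXH64 call resolves to the
+ * // exported symbol mylib_XXH64.
+ * XXH64_hash_t hash_bytes(const void* data, size_t length)
+ * {
+ *     return XXH64(data, length, 0);
+ * }
+ * @endcode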
+ */ +# define XXH_NAMESPACE /* YOUR NAME HERE */ +# undef XXH_NAMESPACE +#endif + +#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \ + && !defined(XXH_INLINE_ALL_31684351384) + /* this section should be traversed only once */ +# define XXH_INLINE_ALL_31684351384 + /* give access to the advanced API, required to compile implementations */ +# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */ +# define XXH_STATIC_LINKING_ONLY + /* make all functions private */ +# undef XXH_PUBLIC_API +# if defined(__GNUC__) +# define XXH_PUBLIC_API static __inline __attribute__((__unused__)) +# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define XXH_PUBLIC_API static inline +# elif defined(_MSC_VER) +# define XXH_PUBLIC_API static __inline +# else + /* note: this version may generate warnings for unused static functions */ +# define XXH_PUBLIC_API static +# endif + + /* + * This part deals with the special case where a unit wants to inline xxHash, + * but "xxhash.h" has previously been included without XXH_INLINE_ALL, + * such as part of some previously included *.h header file. + * Without further action, the new include would just be ignored, + * and functions would effectively _not_ be inlined (silent failure). + * The following macros solve this situation by prefixing all inlined names, + * avoiding naming collision with previous inclusions. + */ + /* Before that, we unconditionally #undef all symbols, + * in case they were already defined with XXH_NAMESPACE. + * They will then be redefined for XXH_INLINE_ALL + */ +# undef XXH_versionNumber + /* XXH32 */ +# undef XXH32 +# undef XXH32_createState +# undef XXH32_freeState +# undef XXH32_reset +# undef XXH32_update +# undef XXH32_digest +# undef XXH32_copyState +# undef XXH32_canonicalFromHash +# undef XXH32_hashFromCanonical + /* XXH64 */ +# undef XXH64 +# undef XXH64_createState +# undef XXH64_freeState +# undef XXH64_reset +# undef XXH64_update +# undef XXH64_digest +# undef XXH64_copyState +# undef XXH64_canonicalFromHash +# undef XXH64_hashFromCanonical + /* XXH3_64bits */ +# undef XXH3_64bits +# undef XXH3_64bits_withSecret +# undef XXH3_64bits_withSeed +# undef XXH3_64bits_withSecretandSeed +# undef XXH3_createState +# undef XXH3_freeState +# undef XXH3_copyState +# undef XXH3_64bits_reset +# undef XXH3_64bits_reset_withSeed +# undef XXH3_64bits_reset_withSecret +# undef XXH3_64bits_update +# undef XXH3_64bits_digest +# undef XXH3_generateSecret + /* XXH3_128bits */ +# undef XXH128 +# undef XXH3_128bits +# undef XXH3_128bits_withSeed +# undef XXH3_128bits_withSecret +# undef XXH3_128bits_reset +# undef XXH3_128bits_reset_withSeed +# undef XXH3_128bits_reset_withSecret +# undef XXH3_128bits_reset_withSecretandSeed +# undef XXH3_128bits_update +# undef XXH3_128bits_digest +# undef XXH128_isEqual +# undef XXH128_cmp +# undef XXH128_canonicalFromHash +# undef XXH128_hashFromCanonical + /* Finally, free the namespace itself */ +# undef XXH_NAMESPACE + + /* employ the namespace for XXH_INLINE_ALL */ +# define XXH_NAMESPACE XXH_INLINE_ + /* + * Some identifiers (enums, type names) are not symbols, + * but they must nonetheless be renamed to avoid redeclaration. + * Alternative solution: do not redeclare them. + * However, this requires some #ifdefs, and has a more dispersed impact. + * Meanwhile, renaming can be achieved in a single place. 
+ */ +# define XXH_IPREF(Id) XXH_NAMESPACE ## Id +# define XXH_OK XXH_IPREF(XXH_OK) +# define XXH_ERROR XXH_IPREF(XXH_ERROR) +# define XXH_errorcode XXH_IPREF(XXH_errorcode) +# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t) +# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t) +# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t) +# define XXH32_state_s XXH_IPREF(XXH32_state_s) +# define XXH32_state_t XXH_IPREF(XXH32_state_t) +# define XXH64_state_s XXH_IPREF(XXH64_state_s) +# define XXH64_state_t XXH_IPREF(XXH64_state_t) +# define XXH3_state_s XXH_IPREF(XXH3_state_s) +# define XXH3_state_t XXH_IPREF(XXH3_state_t) +# define XXH128_hash_t XXH_IPREF(XXH128_hash_t) + /* Ensure the header is parsed again, even if it was previously included */ +# undef XXHASH_H_5627135585666179 +# undef XXHASH_H_STATIC_13879238742 +#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ + +/* **************************************************************** + * Stable API + *****************************************************************/ +#ifndef XXHASH_H_5627135585666179 +#define XXHASH_H_5627135585666179 1 + +/*! @brief Marks a global symbol. */ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(_WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +#ifdef XXH_NAMESPACE +# define XXH_CAT(A,B) A##B +# define XXH_NAME2(A,B) XXH_CAT(A,B) +# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) +/* XXH32 */ +# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) +# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) +# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) +# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) +# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) +# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) +# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) +# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) +# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) +/* XXH64 */ +# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) +# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) +# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) +# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) +# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) +# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) +# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) +# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) +# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) +/* XXH3_64bits */ +# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) +# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret) +# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed) +# define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed) +# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState) +# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState) +# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState) +# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset) +# define 
XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed) +# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret) +# define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed) +# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update) +# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest) +# define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret) +# define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed) +/* XXH3_128bits */ +# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) +# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits) +# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) +# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret) +# define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed) +# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset) +# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed) +# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret) +# define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed) +# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update) +# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest) +# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual) +# define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp) +# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash) +# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical) +#endif + + +/* ************************************* +* Compiler specifics +***************************************/ + +/* specific declaration modes for Windows */ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(_WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +#if defined (__GNUC__) +# define XXH_CONSTF __attribute__((__const__)) +# define XXH_PUREF __attribute__((__pure__)) +# define XXH_MALLOCF __attribute__((__malloc__)) +#else +# define XXH_CONSTF /* disable */ +# define XXH_PUREF +# define XXH_MALLOCF +#endif + +/* ************************************* +* Version +***************************************/ +#define XXH_VERSION_MAJOR 0 +#define XXH_VERSION_MINOR 8 +#define XXH_VERSION_RELEASE 3 +/*! @brief Version number, encoded as two digits each */ +#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) + +/*! + * @brief Obtains the xxHash version. + * + * This is mostly useful when xxHash is compiled as a shared library, + * since the returned value comes from the library, as opposed to header file. + * + * @return @ref XXH_VERSION_NUMBER of the invoked library. + */ +XXH_PUBLIC_API XXH_CONSTF unsigned XXH_versionNumber (void); + + +/* **************************** +* Common basic types +******************************/ +#include /* size_t */ +/*! + * @brief Exit code for the streaming API. 
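+ *
+ * A small illustrative sketch of checking the returned code from a streaming
+ * call (`state`, `buffer` and `count` are assumed to exist in the caller):
+ * @code{.c}
+ * if (XXH32_update(state, buffer, count) != XXH_OK) {
+ *     // handle the failure, e.g. abort the hashing session
+ * }
+ * @endcode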
+ */ +typedef enum { + XXH_OK = 0, /*!< OK */ + XXH_ERROR /*!< Error */ +} XXH_errorcode; + + +/*-********************************************************************** +* 32-bit hash +************************************************************************/ +#if defined(XXH_DOXYGEN) /* Don't show include */ +/*! + * @brief An unsigned 32-bit integer. + * + * Not necessarily defined to `uint32_t` but functionally equivalent. + */ +typedef uint32_t XXH32_hash_t; + +#elif !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# ifdef _AIX +# include +# else +# include +# endif + typedef uint32_t XXH32_hash_t; + +#else +# include +# if UINT_MAX == 0xFFFFFFFFUL + typedef unsigned int XXH32_hash_t; +# elif ULONG_MAX == 0xFFFFFFFFUL + typedef unsigned long XXH32_hash_t; +# else +# error "unsupported platform: need a 32-bit type" +# endif +#endif + +/*! + * @} + * + * @defgroup XXH32_family XXH32 family + * @ingroup public + * Contains functions used in the classic 32-bit xxHash algorithm. + * + * @note + * XXH32 is useful for older platforms, with no or poor 64-bit performance. + * Note that the @ref XXH3_family provides competitive speed for both 32-bit + * and 64-bit systems, and offers true 64/128 bit hash results. + * + * @see @ref XXH64_family, @ref XXH3_family : Other xxHash families + * @see @ref XXH32_impl for implementation details + * @{ + */ + +/*! + * @brief Calculates the 32-bit hash of @p input using xxHash32. + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param seed The 32-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 32-bit xxHash32 value. + * + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed); + +#ifndef XXH_NO_STREAM +/*! + * @typedef struct XXH32_state_s XXH32_state_t + * @brief The opaque state struct for the XXH32 streaming API. + * + * @see XXH32_state_s for details. + * @see @ref streaming_example "Streaming Example" + */ +typedef struct XXH32_state_s XXH32_state_t; + +/*! + * @brief Allocates an @ref XXH32_state_t. + * + * @return An allocated pointer of @ref XXH32_state_t on success. + * @return `NULL` on failure. + * + * @note Must be freed with XXH32_freeState(). + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_MALLOCF XXH32_state_t* XXH32_createState(void); +/*! + * @brief Frees an @ref XXH32_state_t. + * + * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState(). + * + * @return @ref XXH_OK. + * + * @note @p statePtr must be allocated with XXH32_createState(). + * + * @see @ref streaming_example "Streaming Example" + * + */ +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); +/*! + * @brief Copies one @ref XXH32_state_t to another. + * + * @param dst_state The state to copy to. + * @param src_state The state to copy from. + * @pre + * @p dst_state and @p src_state must not be `NULL` and must not overlap. + */ +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state); + +/*! 
+ * @brief Resets an @ref XXH32_state_t to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param seed The 32-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note This function resets and seeds a state. Call it before @ref XXH32_update(). + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed); + +/*! + * @brief Consumes a block of @p input to an @ref XXH32_state_t. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note Call this to incrementally consume blocks of data. + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); + +/*! + * @brief Returns the calculated hash value from an @ref XXH32_state_t. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated 32-bit xxHash32 value from that state. + * + * @note + * Calling XXH32_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ + +/******* Canonical representation *******/ + +/*! + * @brief Canonical (big endian) representation of @ref XXH32_hash_t. + */ +typedef struct { + unsigned char digest[4]; /*!< Hash bytes, big endian */ +} XXH32_canonical_t; + +/*! + * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t. + * + * @param dst The @ref XXH32_canonical_t pointer to be stored to. + * @param hash The @ref XXH32_hash_t to be converted. + * + * @pre + * @p dst must not be `NULL`. + * + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); + +/*! + * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t. + * + * @param src The @ref XXH32_canonical_t to convert. + * + * @pre + * @p src must not be `NULL`. + * + * @return The converted hash. + * + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); + + +/*! @cond Doxygen ignores this part */ +#ifdef __has_attribute +# define XXH_HAS_ATTRIBUTE(x) __has_attribute(x) +#else +# define XXH_HAS_ATTRIBUTE(x) 0 +#endif +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +/* + * C23 __STDC_VERSION__ number hasn't been specified yet. For now + * leave as `201711L` (C17 + 1). + * TODO: Update to correct value when its been specified. + */ +#define XXH_C23_VN 201711L +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +/* C-language Attributes are added in C23. 
*/ +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) && defined(__has_c_attribute) +# define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x) +#else +# define XXH_HAS_C_ATTRIBUTE(x) 0 +#endif +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +#if defined(__cplusplus) && defined(__has_cpp_attribute) +# define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) +#else +# define XXH_HAS_CPP_ATTRIBUTE(x) 0 +#endif +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +/* + * Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute + * introduced in CPP17 and C23. + * CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough + * C23 : https://en.cppreference.com/w/c/language/attributes/fallthrough + */ +#if XXH_HAS_C_ATTRIBUTE(fallthrough) || XXH_HAS_CPP_ATTRIBUTE(fallthrough) +# define XXH_FALLTHROUGH [[fallthrough]] +#elif XXH_HAS_ATTRIBUTE(__fallthrough__) +# define XXH_FALLTHROUGH __attribute__ ((__fallthrough__)) +#else +# define XXH_FALLTHROUGH /* fallthrough */ +#endif +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +/* + * Define XXH_NOESCAPE for annotated pointers in public API. + * https://clang.llvm.org/docs/AttributeReference.html#noescape + * As of writing this, only supported by clang. + */ +#if XXH_HAS_ATTRIBUTE(noescape) +# define XXH_NOESCAPE __attribute__((__noescape__)) +#else +# define XXH_NOESCAPE +#endif +/*! @endcond */ + + +/*! + * @} + * @ingroup public + * @{ + */ + +#ifndef XXH_NO_LONG_LONG +/*-********************************************************************** +* 64-bit hash +************************************************************************/ +#if defined(XXH_DOXYGEN) /* don't include */ +/*! + * @brief An unsigned 64-bit integer. + * + * Not necessarily defined to `uint64_t` but functionally equivalent. + */ +typedef uint64_t XXH64_hash_t; +#elif !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# ifdef _AIX +# include +# else +# include +# endif + typedef uint64_t XXH64_hash_t; +#else +# include +# if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL + /* LP64 ABI says uint64_t is unsigned long */ + typedef unsigned long XXH64_hash_t; +# else + /* the following type must have a width of 64-bit */ + typedef unsigned long long XXH64_hash_t; +# endif +#endif + +/*! + * @} + * + * @defgroup XXH64_family XXH64 family + * @ingroup public + * @{ + * Contains functions used in the classic 64-bit xxHash algorithm. + * + * @note + * XXH3 provides competitive speed for both 32-bit and 64-bit systems, + * and offers true 64/128 bit hash results. + * It provides better speed for systems with vector processing capabilities. + */ + +/*! + * @brief Calculates the 64-bit hash of @p input using xxHash64. + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param seed The 64-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 64-bit xxHash64 value. + * + * @see @ref single_shot_example "Single Shot Example" for an example. 
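+ *
+ * A short sketch mirroring the single-shot XXH32 example above, here with a
+ * 64-bit seed (the `hash_string64` name is illustrative only):
+ * @code{.c}
+ * #include <string.h>
+ * #include "xxhash.h"
+ *
+ * // Hash a null terminated string with XXH64() and a caller supplied seed.
+ * XXH64_hash_t hash_string64(const char* string, XXH64_hash_t seed)
+ * {
+ *     size_t length = (string == NULL) ? 0 : strlen(string);
+ *     return XXH64(string, length, seed);
+ * }
+ * @endcode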
+ */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed); + +/******* Streaming *******/ +#ifndef XXH_NO_STREAM +/*! + * @brief The opaque state struct for the XXH64 streaming API. + * + * @see XXH64_state_s for details. + * @see @ref streaming_example "Streaming Example" + */ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ + +/*! + * @brief Allocates an @ref XXH64_state_t. + * + * @return An allocated pointer of @ref XXH64_state_t on success. + * @return `NULL` on failure. + * + * @note Must be freed with XXH64_freeState(). + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void); + +/*! + * @brief Frees an @ref XXH64_state_t. + * + * @param statePtr A pointer to an @ref XXH64_state_t allocated with @ref XXH64_createState(). + * + * @return @ref XXH_OK. + * + * @note @p statePtr must be allocated with XXH64_createState(). + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); + +/*! + * @brief Copies one @ref XXH64_state_t to another. + * + * @param dst_state The state to copy to. + * @param src_state The state to copy from. + * @pre + * @p dst_state and @p src_state must not be `NULL` and must not overlap. + */ +XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const XXH64_state_t* src_state); + +/*! + * @brief Resets an @ref XXH64_state_t to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note This function resets and seeds a state. Call it before @ref XXH64_update(). + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed); + +/*! + * @brief Consumes a block of @p input to an @ref XXH64_state_t. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note Call this to incrementally consume blocks of data. + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); + +/*! + * @brief Returns the calculated hash value from an @ref XXH64_state_t. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated 64-bit xxHash64 value from that state. + * + * @note + * Calling XXH64_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ +/******* Canonical representation *******/ + +/*! 
+ * @brief Canonical (big endian) representation of @ref XXH64_hash_t. + */ +typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t; + +/*! + * @brief Converts an @ref XXH64_hash_t to a big endian @ref XXH64_canonical_t. + * + * @param dst The @ref XXH64_canonical_t pointer to be stored to. + * @param hash The @ref XXH64_hash_t to be converted. + * + * @pre + * @p dst must not be `NULL`. + * + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash); + +/*! + * @brief Converts an @ref XXH64_canonical_t to a native @ref XXH64_hash_t. + * + * @param src The @ref XXH64_canonical_t to convert. + * + * @pre + * @p src must not be `NULL`. + * + * @return The converted hash. + * + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src); + +#ifndef XXH_NO_XXH3 + +/*! + * @} + * ************************************************************************ + * @defgroup XXH3_family XXH3 family + * @ingroup public + * @{ + * + * XXH3 is a more recent hash algorithm featuring: + * - Improved speed for both small and large inputs + * - True 64-bit and 128-bit outputs + * - SIMD acceleration + * - Improved 32-bit viability + * + * Speed analysis methodology is explained here: + * + * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html + * + * Compared to XXH64, expect XXH3 to run approximately + * ~2x faster on large inputs and >3x faster on small ones, + * exact differences vary depending on platform. + * + * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic, + * but does not require it. + * Most 32-bit and 64-bit targets that can run XXH32 smoothly can run XXH3 + * at competitive speeds, even without vector support. Further details are + * explained in the implementation. + * + * XXH3 has a fast scalar implementation, but it also includes accelerated SIMD + * implementations for many common platforms: + * - AVX512 + * - AVX2 + * - SSE2 + * - ARM NEON + * - WebAssembly SIMD128 + * - POWER8 VSX + * - s390x ZVector + * This can be controlled via the @ref XXH_VECTOR macro, but it automatically + * selects the best version according to predefined macros. For the x86 family, an + * automatic runtime dispatcher is included separately in @ref xxh_x86dispatch.c. + * + * XXH3 implementation is portable: + * it has a generic C90 formulation that can be compiled on any platform, + * all implementations generate exactly the same hash value on all platforms. + * Starting from v0.8.0, it's also labelled "stable", meaning that + * any future version will also generate the same hash value. + * + * XXH3 offers 2 variants, _64bits and _128bits. + * + * When only 64 bits are needed, prefer invoking the _64bits variant, as it + * reduces the amount of mixing, resulting in faster speed on small inputs. + * It's also generally simpler to manipulate a scalar return type than a struct. + * + * The API supports one-shot hashing, streaming mode, and custom secrets. + */ + +/*! + * @ingroup tuning + * @brief Possible values for @ref XXH_VECTOR. + * + * Unless set explicitly, determined automatically. + */ +# define XXH_SCALAR 0 /*!< Portable scalar version */ +# define XXH_SSE2 1 /*!< SSE2 for Pentium 4, Opteron, all x86_64. 
*/ +# define XXH_AVX2 2 /*!< AVX2 for Haswell and Bulldozer */ +# define XXH_AVX512 3 /*!< AVX512 for Skylake and Icelake */ +# define XXH_NEON 4 /*!< NEON for most ARMv7-A, all AArch64, and WASM SIMD128 */ +# define XXH_VSX 5 /*!< VSX and ZVector for POWER8/z13 (64-bit) */ +# define XXH_SVE 6 /*!< SVE for some ARMv8-A and ARMv9-A */ +# define XXH_LSX 7 /*!< LSX (128-bit SIMD) for LoongArch64 */ +# define XXH_LASX 8 /*!< LASX (256-bit SIMD) for LoongArch64 */ + + +/*-********************************************************************** +* XXH3 64-bit variant +************************************************************************/ + +/*! + * @brief Calculates 64-bit unseeded variant of XXH3 hash of @p input. + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 64-bit XXH3 hash value. + * + * @note + * This is equivalent to @ref XXH3_64bits_withSeed() with a seed of `0`, however + * it may have slightly better performance due to constant propagation of the + * defaults. + * + * @see + * XXH3_64bits_withSeed(), XXH3_64bits_withSecret(): other seeding variants + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length); + +/*! + * @brief Calculates 64-bit seeded variant of XXH3 hash of @p input. + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 64-bit XXH3 hash value. + * + * @note + * seed == 0 produces the same results as @ref XXH3_64bits(). + * + * This variant generates a custom secret on the fly based on default secret + * altered using the @p seed value. + * + * While this operation is decently fast, note that it's not completely free. + * + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed); + +/*! + * The bare minimum size for a custom secret. + * + * @see + * XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(), + * XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret(). + */ +#define XXH3_SECRET_SIZE_MIN 136 + +/*! + * @brief Calculates 64-bit variant of XXH3 with a custom "secret". + * + * @param data The block of data to be hashed, at least @p len bytes in size. + * @param len The length of @p data, in bytes. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * + * @return The calculated 64-bit XXH3 hash value. + * + * @pre + * The memory between @p data and @p data + @p len must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p data may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * It's possible to provide any blob of bytes as a "secret" to generate the hash. 
+ * This makes it more difficult for an external actor to prepare an intentional collision. + * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN). + * However, the quality of the secret impacts the dispersion of the hash algorithm. + * Therefore, the secret _must_ look like a bunch of random bytes. + * Avoid "trivial" or structured data such as repeated sequences or a text document. + * Whenever in doubt about the "randomness" of the blob of bytes, + * consider employing @ref XXH3_generateSecret() instead (see below). + * It will generate a proper high entropy secret derived from the blob of bytes. + * Another advantage of using XXH3_generateSecret() is that + * it guarantees that all bits within the initial blob of bytes + * will impact every bit of the output. + * This is not necessarily the case when using the blob of bytes directly + * because, when hashing _small_ inputs, only a portion of the secret is employed. + * + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize); + + +/******* Streaming *******/ +#ifndef XXH_NO_STREAM +/* + * Streaming requires state maintenance. + * This operation costs memory and CPU. + * As a consequence, streaming is slower than one-shot hashing. + * For better performance, prefer one-shot functions whenever applicable. + */ + +/*! + * @brief The opaque state struct for the XXH3 streaming API. + * + * @see XXH3_state_s for details. + * @see @ref streaming_example "Streaming Example" + */ +typedef struct XXH3_state_s XXH3_state_t; +XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr); + +/*! + * @brief Copies one @ref XXH3_state_t to another. + * + * @param dst_state The state to copy to. + * @param src_state The state to copy from. + * @pre + * @p dst_state and @p src_state must not be `NULL` and must not overlap. + */ +XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state); + +/*! + * @brief Resets an @ref XXH3_state_t to begin a new hash. + * + * @param statePtr The state struct to reset. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * - This function resets `statePtr` and generate a secret with default parameters. + * - Call this function before @ref XXH3_64bits_update(). + * - Digest will be equivalent to `XXH3_64bits()`. + * + * @see @ref streaming_example "Streaming Example" + * + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr); + +/*! + * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * - This function resets `statePtr` and generate a secret from `seed`. + * - Call this function before @ref XXH3_64bits_update(). + * - Digest will be equivalent to `XXH3_64bits_withSeed()`. + * + * @see @ref streaming_example "Streaming Example" + * + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed); + +/*! 
+ * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * `secret` is referenced, it _must outlive_ the hash streaming session. + * + * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN, + * and the quality of produced hash values depends on secret's entropy + * (secret's content should look like a bunch of random bytes). + * When in doubt about the randomness of a candidate `secret`, + * consider employing `XXH3_generateSecret()` instead (see below). + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize); + +/*! + * @brief Consumes a block of @p input to an @ref XXH3_state_t. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note Call this to incrementally consume blocks of data. + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); + +/*! + * @brief Returns the calculated XXH3 64-bit hash value from an @ref XXH3_state_t. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated XXH3 64-bit hash value from that state. + * + * @note + * Calling XXH3_64bits_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ + +/* note : canonical representation of XXH3 is the same as XXH64 + * since they both produce XXH64_hash_t values */ + + +/*-********************************************************************** +* XXH3 128-bit variant +************************************************************************/ + +/*! + * @brief The return value from 128-bit hashes. + * + * Stored in little endian order, although the fields themselves are in native + * endianness. + */ +typedef struct { + XXH64_hash_t low64; /*!< `value & 0xFFFFFFFFFFFFFFFF` */ + XXH64_hash_t high64; /*!< `value >> 64` */ +} XXH128_hash_t; + +/*! + * @brief Calculates 128-bit unseeded variant of XXH3 of @p data. + * + * @param data The block of data to be hashed, at least @p length bytes in size. + * @param len The length of @p data, in bytes. + * + * @return The calculated 128-bit variant of XXH3 value. + * + * The 128-bit variant of XXH3 has more strength, but it has a bit of overhead + * for shorter inputs. 
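+ *
+ * A small sketch of consuming the two 64-bit halves of the result; printing
+ * @ref XXH128_hash_t::high64 before @ref XXH128_hash_t::low64 matches the
+ * canonical (big endian) ordering (`data` and `len` are assumed inputs):
+ * @code{.c}
+ * XXH128_hash_t h = XXH3_128bits(data, len);
+ * printf("%016llX%016llX\n",
+ *        (unsigned long long)h.high64, (unsigned long long)h.low64);
+ * @endcode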
+ * + * This is equivalent to @ref XXH3_128bits_withSeed() with a seed of `0`, however + * it may have slightly better performance due to constant propagation of the + * defaults. + * + * @see XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len); +/*! @brief Calculates 128-bit seeded variant of XXH3 hash of @p data. + * + * @param data The block of data to be hashed, at least @p length bytes in size. + * @param len The length of @p data, in bytes. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @return The calculated 128-bit variant of XXH3 value. + * + * @note + * seed == 0 produces the same results as @ref XXH3_64bits(). + * + * This variant generates a custom secret on the fly based on default secret + * altered using the @p seed value. + * + * While this operation is decently fast, note that it's not completely free. + * + * @see XXH3_128bits(), XXH3_128bits_withSecret(): other seeding variants + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed); +/*! + * @brief Calculates 128-bit variant of XXH3 with a custom "secret". + * + * @param data The block of data to be hashed, at least @p len bytes in size. + * @param len The length of @p data, in bytes. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * + * @return The calculated 128-bit variant of XXH3 value. + * + * It's possible to provide any blob of bytes as a "secret" to generate the hash. + * This makes it more difficult for an external actor to prepare an intentional collision. + * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN). + * However, the quality of the secret impacts the dispersion of the hash algorithm. + * Therefore, the secret _must_ look like a bunch of random bytes. + * Avoid "trivial" or structured data such as repeated sequences or a text document. + * Whenever in doubt about the "randomness" of the blob of bytes, + * consider employing @ref XXH3_generateSecret() instead (see below). + * It will generate a proper high entropy secret derived from the blob of bytes. + * Another advantage of using XXH3_generateSecret() is that + * it guarantees that all bits within the initial blob of bytes + * will impact every bit of the output. + * This is not necessarily the case when using the blob of bytes directly + * because, when hashing _small_ inputs, only a portion of the secret is employed. + * + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize); + +/******* Streaming *******/ +#ifndef XXH_NO_STREAM +/* + * Streaming requires state maintenance. + * This operation costs memory and CPU. + * As a consequence, streaming is slower than one-shot hashing. + * For better performance, prefer one-shot functions whenever applicable. + * + * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits(). + * Use already declared XXH3_createState() and XXH3_freeState(). + * + * All reset and streaming functions have same meaning as their 64-bit counterpart. + */ + +/*! 
+ * @brief Resets an @ref XXH3_state_t to begin a new hash. + * + * @param statePtr The state struct to reset. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * - This function resets `statePtr` and generate a secret with default parameters. + * - Call it before @ref XXH3_128bits_update(). + * - Digest will be equivalent to `XXH3_128bits()`. + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr); + +/*! + * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * - This function resets `statePtr` and generate a secret from `seed`. + * - Call it before @ref XXH3_128bits_update(). + * - Digest will be equivalent to `XXH3_128bits_withSeed()`. + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed); +/*! + * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * `secret` is referenced, it _must outlive_ the hash streaming session. + * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN, + * and the quality of produced hash values depends on secret's entropy + * (secret's content should look like a bunch of random bytes). + * When in doubt about the randomness of a candidate `secret`, + * consider employing `XXH3_generateSecret()` instead (see below). + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize); + +/*! + * @brief Consumes a block of @p input to an @ref XXH3_state_t. + * + * Call this to incrementally consume blocks of data. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + */ +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); + +/*! + * @brief Returns the calculated XXH3 128-bit hash value from an @ref XXH3_state_t. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated XXH3 128-bit hash value from that state. + * + * @note + * Calling XXH3_128bits_digest() will not affect @p statePtr, so you can update, + * digest, and update again. 
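+ *
+ * As an illustration (not part of the upstream documentation), a complete
+ * 128-bit streaming session follows the usual create/reset/update/digest
+ * pattern; `readChunk()` and `buf` below are hypothetical helpers.
+ * @code{.c}
+ *     XXH3_state_t* const state = XXH3_createState();
+ *     if (state == NULL) { abort(); }
+ *     if (XXH3_128bits_reset(state) == XXH_ERROR) { abort(); }
+ *     size_t n;
+ *     while ((n = readChunk(buf, sizeof(buf))) != 0) {
+ *         if (XXH3_128bits_update(state, buf, n) == XXH_ERROR) { abort(); }
+ *     }
+ *     XXH128_hash_t const h = XXH3_128bits_digest(state);
+ *     XXH3_freeState(state);
+ * @endcode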
+ * + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ + +/* Following helper functions make it possible to compare XXH128_hast_t values. + * Since XXH128_hash_t is a structure, this capability is not offered by the language. + * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */ + +/*! + * @brief Check equality of two XXH128_hash_t values + * + * @param h1 The 128-bit hash value. + * @param h2 Another 128-bit hash value. + * + * @return `1` if `h1` and `h2` are equal. + * @return `0` if they are not. + */ +XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2); + +/*! + * @brief Compares two @ref XXH128_hash_t + * + * This comparator is compatible with stdlib's `qsort()`/`bsearch()`. + * + * @param h128_1 Left-hand side value + * @param h128_2 Right-hand side value + * + * @return >0 if @p h128_1 > @p h128_2 + * @return =0 if @p h128_1 == @p h128_2 + * @return <0 if @p h128_1 < @p h128_2 + */ +XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2); + + +/******* Canonical representation *******/ +typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t; + + +/*! + * @brief Converts an @ref XXH128_hash_t to a big endian @ref XXH128_canonical_t. + * + * @param dst The @ref XXH128_canonical_t pointer to be stored to. + * @param hash The @ref XXH128_hash_t to be converted. + * + * @pre + * @p dst must not be `NULL`. + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash); + +/*! + * @brief Converts an @ref XXH128_canonical_t to a native @ref XXH128_hash_t. + * + * @param src The @ref XXH128_canonical_t to convert. + * + * @pre + * @p src must not be `NULL`. + * + * @return The converted hash. + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src); + + +#endif /* !XXH_NO_XXH3 */ +#endif /* XXH_NO_LONG_LONG */ + +/*! + * @} + */ +#endif /* XXHASH_H_5627135585666179 */ + + + +#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) +#define XXHASH_H_STATIC_13879238742 +/* **************************************************************************** + * This section contains declarations which are not guaranteed to remain stable. + * They may change in future versions, becoming incompatible with a different + * version of the library. + * These declarations should only be used with static linking. + * Never use them in association with dynamic linking! + ***************************************************************************** */ + +/* + * These definitions are only present to allow static allocation + * of XXH states, on stack or in a struct, for example. + * Never **ever** access their members directly. + */ + +/*! + * @internal + * @brief Structure for XXH32 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is + * an opaque type. This allows fields to safely be changed. + * + * Typedef'd to @ref XXH32_state_t. + * Do not access the members of this struct directly. 
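+ *
+ * As an illustration (not part of the upstream documentation), the intended
+ * use is static or stack allocation, still driven through the public API;
+ * `data` and `dataLen` below are assumed to be in scope.
+ * @code{.c}
+ *     XXH32_state_t state;                  // no XXH32_createState()/malloc()
+ *     XXH32_reset(&state, 0);               // always reset before first use
+ *     XXH32_update(&state, data, dataLen);
+ *     XXH32_hash_t const h32 = XXH32_digest(&state);
+ * @endcode
+ *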
+ * @see XXH64_state_s, XXH3_state_s + */ +struct XXH32_state_s { + XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */ + XXH32_hash_t large_len; /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */ + XXH32_hash_t acc[4]; /*!< Accumulator lanes */ + unsigned char buffer[16]; /*!< Internal buffer for partial reads. */ + XXH32_hash_t bufferedSize; /*!< Amount of data in @ref buffer */ + XXH32_hash_t reserved; /*!< Reserved field. Do not read nor write to it. */ +}; /* typedef'd to XXH32_state_t */ + + +#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ + +/*! + * @internal + * @brief Structure for XXH64 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is + * an opaque type. This allows fields to safely be changed. + * + * Typedef'd to @ref XXH64_state_t. + * Do not access the members of this struct directly. + * @see XXH32_state_s, XXH3_state_s + */ +struct XXH64_state_s { + XXH64_hash_t total_len; /*!< Total length hashed. This is always 64-bit. */ + XXH64_hash_t acc[4]; /*!< Accumulator lanes */ + unsigned char buffer[32]; /*!< Internal buffer for partial reads.. */ + XXH32_hash_t bufferedSize; /*!< Amount of data in @ref buffer */ + XXH32_hash_t reserved32; /*!< Reserved field, needed for padding anyways*/ + XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it. */ +}; /* typedef'd to XXH64_state_t */ + +#ifndef XXH_NO_XXH3 + +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */ +# define XXH_ALIGN(n) _Alignas(n) +#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */ +/* In C++ alignas() is a keyword */ +# define XXH_ALIGN(n) alignas(n) +#elif defined(__GNUC__) +# define XXH_ALIGN(n) __attribute__ ((aligned(n))) +#elif defined(_MSC_VER) +# define XXH_ALIGN(n) __declspec(align(n)) +#else +# define XXH_ALIGN(n) /* disabled */ +#endif + +/* Old GCC versions only accept the attribute after the type in structures. */ +#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) /* C11+ */ \ + && ! (defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \ + && defined(__GNUC__) +# define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align) +#else +# define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type +#endif + +/*! + * @internal + * @brief The size of the internal XXH3 buffer. + * + * This is the optimal update size for incremental hashing. + * + * @see XXH3_64b_update(), XXH3_128b_update(). + */ +#define XXH3_INTERNALBUFFER_SIZE 256 + +/*! + * @def XXH3_SECRET_DEFAULT_SIZE + * @brief Default Secret's size + * + * This is the size of internal XXH3_kSecret + * and is needed by XXH3_generateSecret_fromSeed(). + * + * Not to be confused with @ref XXH3_SECRET_SIZE_MIN. + */ +#define XXH3_SECRET_DEFAULT_SIZE 192 + +/*! + * @internal + * @brief Structure for XXH3 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. + * Otherwise it is an opaque type. + * Never use this definition in combination with dynamic library. + * This allows fields to safely be changed in the future. + * + * @note ** This structure has a strict alignment requirement of 64 bytes!! ** + * Do not allocate this with `malloc()` or `new`, + * it will not be sufficiently aligned. + * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation. + * + * Typedef'd to @ref XXH3_state_t. 
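+ *
+ * As an illustrative sketch (not upstream documentation), a stack-allocated
+ * state that respects the alignment and initialization rules looks like this;
+ * `data` and `dataLen` are assumed to be in scope.
+ * @code{.c}
+ *     XXH3_state_t state;              // automatic storage, not malloc()'d
+ *     XXH3_INITSTATE(&state);          // required before a _withSeed reset
+ *     XXH3_64bits_reset_withSeed(&state, 42);
+ *     XXH3_64bits_update(&state, data, dataLen);
+ *     XXH64_hash_t const h = XXH3_64bits_digest(&state);
+ * @endcode
+ *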
+ * Do never access the members of this struct directly. + * + * @see XXH3_INITSTATE() for stack initialization. + * @see XXH3_createState(), XXH3_freeState(). + * @see XXH32_state_s, XXH64_state_s + */ +struct XXH3_state_s { + XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]); + /*!< The 8 accumulators. See @ref XXH32_state_s::acc and @ref XXH64_state_s::acc */ + XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]); + /*!< Used to store a custom secret generated from a seed. */ + XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]); + /*!< The internal buffer. @see XXH32_state_s::mem32 */ + XXH32_hash_t bufferedSize; + /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */ + XXH32_hash_t useSeed; + /*!< Reserved field. Needed for padding on 64-bit. */ + size_t nbStripesSoFar; + /*!< Number or stripes processed. */ + XXH64_hash_t totalLen; + /*!< Total length hashed. 64-bit even on 32-bit targets. */ + size_t nbStripesPerBlock; + /*!< Number of stripes per block. */ + size_t secretLimit; + /*!< Size of @ref customSecret or @ref extSecret */ + XXH64_hash_t seed; + /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */ + XXH64_hash_t reserved64; + /*!< Reserved field. */ + const unsigned char* extSecret; + /*!< Reference to an external secret for the _withSecret variants, NULL + * for other variants. */ + /* note: there may be some padding at the end due to alignment on 64 bytes */ +}; /* typedef'd to XXH3_state_t */ + +#undef XXH_ALIGN_MEMBER + +/*! + * @brief Initializes a stack-allocated `XXH3_state_s`. + * + * When the @ref XXH3_state_t structure is merely emplaced on stack, + * it should be initialized with XXH3_INITSTATE() or a memset() + * in case its first reset uses XXH3_NNbits_reset_withSeed(). + * This init can be omitted if the first reset uses default or _withSecret mode. + * This operation isn't necessary when the state is created with XXH3_createState(). + * Note that this doesn't prepare the state for a streaming operation, + * it's still necessary to use XXH3_NNbits_reset*() afterwards. + */ +#define XXH3_INITSTATE(XXH3_state_ptr) \ + do { \ + XXH3_state_t* tmp_xxh3_state_ptr = (XXH3_state_ptr); \ + tmp_xxh3_state_ptr->seed = 0; \ + tmp_xxh3_state_ptr->extSecret = NULL; \ + } while(0) + + +/*! + * @brief Calculates the 128-bit hash of @p data using XXH3. + * + * @param data The block of data to be hashed, at least @p len bytes in size. + * @param len The length of @p data, in bytes. + * @param seed The 64-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p data and @p data + @p len must be valid, + * readable, contiguous memory. However, if @p len is `0`, @p data may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 128-bit XXH3 value. + * + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed); + + +/* === Experimental API === */ +/* Symbols defined below must be considered tied to a specific library version. */ + +/*! + * @brief Derive a high-entropy secret from any user-defined content, named customSeed. + * + * @param secretBuffer A writable buffer for derived high-entropy secret data. + * @param secretSize Size of secretBuffer, in bytes. Must be >= XXH3_SECRET_SIZE_MIN. + * @param customSeed A user-defined content. + * @param customSeedSize Size of customSeed, in bytes. 
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * The generated secret can be used in combination with `*_withSecret()` functions.
+ * The `_withSecret()` variants are useful to provide a higher level of protection
+ * than a 64-bit seed, as it becomes much more difficult for an external actor to
+ * guess how to impact the calculation logic.
+ *
+ * The function accepts as input a custom seed of any length and any content,
+ * and derives from it a high-entropy secret of length @p secretSize into an
+ * already allocated buffer @p secretBuffer.
+ *
+ * The generated secret can then be used with any `*_withSecret()` variant.
+ * The functions @ref XXH3_128bits_withSecret(), @ref XXH3_64bits_withSecret(),
+ * @ref XXH3_128bits_reset_withSecret() and @ref XXH3_64bits_reset_withSecret()
+ * are part of this list. They all accept a `secret` parameter
+ * which must be large enough for implementation reasons (>= @ref XXH3_SECRET_SIZE_MIN)
+ * _and_ feature very high entropy (consist of random-looking bytes).
+ * These conditions can be a high bar to meet, so @ref XXH3_generateSecret() can
+ * be employed to ensure proper quality.
+ *
+ * @p customSeed can be anything. It can have any size, even small ones,
+ * and its content can be anything, even "poor entropy" sources such as a bunch
+ * of zeroes. The resulting `secret` will nonetheless provide all required qualities.
+ *
+ * @pre
+ *   - @p secretSize must be >= @ref XXH3_SECRET_SIZE_MIN
+ *   - When @p customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
+ *
+ * Example code:
+ * @code{.c}
+ * #include <stdio.h>
+ * #include <string.h>
+ * #include <assert.h>
+ * #define XXH_STATIC_LINKING_ONLY // expose unstable API
+ * #include "xxhash.h"
+ * // Hashes argv[2] using the entropy from argv[1].
+ * int main(int argc, char* argv[])
+ * {
+ *     char secret[XXH3_SECRET_SIZE_MIN];
+ *     if (argc != 3) { return 1; }
+ *     XXH3_generateSecret(secret, sizeof(secret), argv[1], strlen(argv[1]));
+ *     XXH64_hash_t h = XXH3_64bits_withSecret(
+ *          argv[2], strlen(argv[2]),
+ *          secret, sizeof(secret)
+ *     );
+ *     printf("%016llx\n", (unsigned long long) h);
+ * }
+ * @endcode
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize);
+
+/*!
+ * @brief Generate the same secret as the _withSeed() variants.
+ *
+ * @param secretBuffer A writable buffer of @ref XXH3_SECRET_DEFAULT_SIZE bytes
+ * @param seed The 64-bit seed to alter the hash result predictably.
+ *
+ * The generated secret can be used in combination with
+ * `*_withSecret()` and `_withSecretandSeed()` variants.
+ *
+ * Example C++ `std::string` hash class:
+ * @code{.cpp}
+ * #include <string>
+ * #define XXH_STATIC_LINKING_ONLY // expose unstable API
+ * #include "xxhash.h"
+ * // Slow, seeds each time
+ * class HashSlow {
+ *     XXH64_hash_t seed;
+ * public:
+ *     HashSlow(XXH64_hash_t s) : seed{s} {}
+ *     size_t operator()(const std::string& x) const {
+ *         return size_t{XXH3_64bits_withSeed(x.c_str(), x.length(), seed)};
+ *     }
+ * };
+ * // Fast, caches the seeded secret for future uses.
+ * class HashFast {
+ *     unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];
+ * public:
+ *     HashFast(XXH64_hash_t s) {
+ *         XXH3_generateSecret_fromSeed(secret, s);
+ *     }
+ *     size_t operator()(const std::string& x) const {
+ *         return size_t{
+ *             XXH3_64bits_withSecret(x.c_str(), x.length(), secret, sizeof(secret))
+ *         };
+ *     }
+ * };
+ * @endcode
+ */
+XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed);
+
+/*!
+ * @brief Maximum size of "short" key in bytes.
+ */
+#define XXH3_MIDSIZE_MAX 240
+
+/*!
+ * @brief Calculates 64/128-bit seeded variant of XXH3 hash of @p data.
+ *
+ * @param data The block of data to be hashed, at least @p len bytes in size.
+ * @param len The length of @p data, in bytes.
+ * @param secret The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ * @param seed The 64-bit seed to alter the hash result predictably.
+ *
+ * These variants generate hash values using either:
+ * - @p seed for "short" keys (< @ref XXH3_MIDSIZE_MAX = 240 bytes)
+ * - @p secret for "large" keys (>= @ref XXH3_MIDSIZE_MAX).
+ *
+ * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`.
+ * `_withSeed()` has to generate the secret on the fly for "large" keys.
+ * It's fast, but can be perceptible for "not so large" keys (< 1 KB).
+ * `_withSecret()` has to generate the masks on the fly for "small" keys,
+ * which requires more instructions than the `_withSeed()` variants.
+ * Therefore, the `_withSecretandSeed()` variant combines the best of both worlds.
+ *
+ * When @p secret has been generated by XXH3_generateSecret_fromSeed(),
+ * this variant produces *exactly* the same results as the `_withSeed()` variant,
+ * hence offering only a pure speed benefit on "large" input,
+ * by skipping the need to regenerate the secret for every large input.
+ *
+ * Another usage scenario is to hash the secret to a 64-bit hash value,
+ * for example with XXH3_64bits(), which then becomes the seed,
+ * and then employ both the seed and the secret in _withSecretandSeed().
+ * On top of speed, an added benefit is that each bit in the secret
+ * has a 50% chance to swap each bit in the output, via its impact on the seed.
+ *
+ * This is not guaranteed when using the secret directly in "small data" scenarios,
+ * because only portions of the secret are employed for small data.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t
+XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len,
+                              XXH_NOESCAPE const void* secret, size_t secretSize,
+                              XXH64_hash_t seed);
+
+/*!
+ * @brief Calculates 128-bit seeded variant of XXH3 hash of @p data.
+ *
+ * @param input The memory segment to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ * @param secret The secret used to alter the hash result predictably.
+ * @param secretSize The length of @p secret, in bytes (must be >= XXH3_SECRET_SIZE_MIN).
+ * @param seed64 The 64-bit seed to alter the hash result predictably.
+ *
+ * @return The calculated 128-bit variant of XXH3 value.
+ *
+ * @see XXH3_64bits_withSecretandSeed(): contract is the same.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t
+XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length,
+                               XXH_NOESCAPE const void* secret, size_t secretSize,
+                               XXH64_hash_t seed64);
+
+#ifndef XXH_NO_STREAM
+/*!
+ * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
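+ *
+ * As an illustration (not part of the upstream documentation), this reset is
+ * typically paired with @ref XXH3_generateSecret_fromSeed(); `buf` and `n`
+ * below are hypothetical.
+ * @code{.c}
+ *     unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];
+ *     XXH64_hash_t const seed = 1234;   // any value
+ *     XXH3_generateSecret_fromSeed(secret, seed);
+ *
+ *     XXH3_state_t* const state = XXH3_createState();
+ *     XXH3_64bits_reset_withSecretandSeed(state, secret, sizeof(secret), seed);
+ *     XXH3_64bits_update(state, buf, n);           // repeat for each chunk
+ *     XXH64_hash_t const h = XXH3_64bits_digest(state);
+ *     XXH3_freeState(state);
+ * @endcode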
+ * + * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState(). + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * @param seed64 The 64-bit seed to alter the hash result predictably. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @see XXH3_64bits_withSecretandSeed(). Contract is identical. + */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, + XXH_NOESCAPE const void* secret, size_t secretSize, + XXH64_hash_t seed64); + +/*! + * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash. + * + * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState(). + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * @param seed64 The 64-bit seed to alter the hash result predictably. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @see XXH3_64bits_withSecretandSeed(). Contract is identical. + * + * Note: there was a bug in an earlier version of this function (<= v0.8.2) + * that would make it generate an incorrect hash value + * when @p seed == 0 and @p length < XXH3_MIDSIZE_MAX + * and @p secret is different from XXH3_generateSecret_fromSeed(). + * As stated in the contract, the correct hash result must be + * the same as XXH3_128bits_withSeed() when @p length <= XXH3_MIDSIZE_MAX. + * Results generated by this older version are wrong, hence not comparable. + */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, + XXH_NOESCAPE const void* secret, size_t secretSize, + XXH64_hash_t seed64); + +#endif /* !XXH_NO_STREAM */ + +#endif /* !XXH_NO_XXH3 */ +#endif /* XXH_NO_LONG_LONG */ +#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) +# define XXH_IMPLEMENTATION +#endif + +#endif /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */ + + +/* ======================================================================== */ +/* ======================================================================== */ +/* ======================================================================== */ + + +/*-********************************************************************** + * xxHash implementation + *-********************************************************************** + * xxHash's implementation used to be hosted inside xxhash.c. + * + * However, inlining requires implementation to be visible to the compiler, + * hence be included alongside the header. + * Previously, implementation was hosted inside xxhash.c, + * which was then #included when inlining was activated. + * This construction created issues with a few build and install systems, + * as it required xxhash.c to be stored in /include directory. + * + * xxHash implementation is now directly integrated within xxhash.h. + * As a consequence, xxhash.c is no longer needed in /include. + * + * xxhash.c is still available and is still useful. + * In a "normal" setup, when xxhash is not inlined, + * xxhash.h only exposes the prototypes and public symbols, + * while xxhash.c can be built into an object file xxhash.o + * which can then be linked into the final binary. 
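+ *
+ * As an illustration (not part of the upstream text), the two common setups
+ * look like this from a consumer's point of view:
+ *
+ *     // (a) classic setup: compile and link xxhash.c / xxhash.o,
+ *     //     this header only provides prototypes
+ *     #include "xxhash.h"
+ *
+ *     // (b) inlined setup: define the switch before inclusion, which pulls
+ *     //     the implementation below into the including translation unit
+ *     #define XXH_INLINE_ALL
+ *     #include "xxhash.h"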
+ ************************************************************************/ + +#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \ + || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387) +# define XXH_IMPLEM_13a8737387 + +/* ************************************* +* Tuning parameters +***************************************/ + +/*! + * @defgroup tuning Tuning parameters + * @{ + * + * Various macros to control xxHash's behavior. + */ +#ifdef XXH_DOXYGEN +/*! + * @brief Define this to disable 64-bit code. + * + * Useful if only using the @ref XXH32_family and you have a strict C90 compiler. + */ +# define XXH_NO_LONG_LONG +# undef XXH_NO_LONG_LONG /* don't actually */ +/*! + * @brief Controls how unaligned memory is accessed. + * + * By default, access to unaligned memory is controlled by `memcpy()`, which is + * safe and portable. + * + * Unfortunately, on some target/compiler combinations, the generated assembly + * is sub-optimal. + * + * The below switch allow selection of a different access method + * in the search for improved performance. + * + * @par Possible options: + * + * - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy` + * @par + * Use `memcpy()`. Safe and portable. Note that most modern compilers will + * eliminate the function call and treat it as an unaligned access. + * + * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))` + * @par + * Depends on compiler extensions and is therefore not portable. + * This method is safe _if_ your compiler supports it, + * and *generally* as fast or faster than `memcpy`. + * + * - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast + * @par + * Casts directly and dereferences. This method doesn't depend on the + * compiler, but it violates the C standard as it directly dereferences an + * unaligned pointer. It can generate buggy code on targets which do not + * support unaligned memory accesses, but in some circumstances, it's the + * only known way to get the most performance. + * + * - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift + * @par + * Also portable. This can generate the best code on old compilers which don't + * inline small `memcpy()` calls, and it might also be faster on big-endian + * systems which lack a native byteswap instruction. However, some compilers + * will emit literal byteshifts even if the target supports unaligned access. + * + * + * @warning + * Methods 1 and 2 rely on implementation-defined behavior. Use these with + * care, as what works on one compiler/platform/optimization level may cause + * another to read garbage data or even crash. + * + * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details. + * + * Prefer these methods in priority order (0 > 3 > 1 > 2) + */ +# define XXH_FORCE_MEMORY_ACCESS 0 + +/*! + * @def XXH_SIZE_OPT + * @brief Controls how much xxHash optimizes for size. + * + * xxHash, when compiled, tends to result in a rather large binary size. This + * is mostly due to heavy usage to forced inlining and constant folding of the + * @ref XXH3_family to increase performance. + * + * However, some developers prefer size over speed. This option can + * significantly reduce the size of the generated code. When using the `-Os` + * or `-Oz` options on GCC or Clang, this is defined to 1 by default, + * otherwise it is defined to 0. + * + * Most of these size optimizations can be controlled manually. + * + * This is a number from 0-2. + * - `XXH_SIZE_OPT` == 0: Default. xxHash makes no size optimizations. Speed + * comes first. 
+ * - `XXH_SIZE_OPT` == 1: Default for `-Os` and `-Oz`. xxHash is more + * conservative and disables hacks that increase code size. It implies the + * options @ref XXH_NO_INLINE_HINTS == 1, @ref XXH_FORCE_ALIGN_CHECK == 0, + * and @ref XXH3_NEON_LANES == 8 if they are not already defined. + * - `XXH_SIZE_OPT` == 2: xxHash tries to make itself as small as possible. + * Performance may cry. For example, the single shot functions just use the + * streaming API. + */ +# define XXH_SIZE_OPT 0 + +/*! + * @def XXH_FORCE_ALIGN_CHECK + * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32() + * and XXH64() only). + * + * This is an important performance trick for architectures without decent + * unaligned memory access performance. + * + * It checks for input alignment, and when conditions are met, uses a "fast + * path" employing direct 32-bit/64-bit reads, resulting in _dramatically + * faster_ read speed. + * + * The check costs one initial branch per hash, which is generally negligible, + * but not zero. + * + * Moreover, it's not useful to generate an additional code path if memory + * access uses the same instruction for both aligned and unaligned + * addresses (e.g. x86 and aarch64). + * + * In these cases, the alignment check can be removed by setting this macro to 0. + * Then the code will always use unaligned memory access. + * Align check is automatically disabled on x86, x64, ARM64, and some ARM chips + * which are platforms known to offer good unaligned memory accesses performance. + * + * It is also disabled by default when @ref XXH_SIZE_OPT >= 1. + * + * This option does not affect XXH3 (only XXH32 and XXH64). + */ +# define XXH_FORCE_ALIGN_CHECK 0 + +/*! + * @def XXH_NO_INLINE_HINTS + * @brief When non-zero, sets all functions to `static`. + * + * By default, xxHash tries to force the compiler to inline almost all internal + * functions. + * + * This can usually improve performance due to reduced jumping and improved + * constant folding, but significantly increases the size of the binary which + * might not be favorable. + * + * Additionally, sometimes the forced inlining can be detrimental to performance, + * depending on the architecture. + * + * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the + * compiler full control on whether to inline or not. + * + * When not optimizing (-O0), using `-fno-inline` with GCC or Clang, or if + * @ref XXH_SIZE_OPT >= 1, this will automatically be defined. + */ +# define XXH_NO_INLINE_HINTS 0 + +/*! + * @def XXH3_INLINE_SECRET + * @brief Determines whether to inline the XXH3 withSecret code. + * + * When the secret size is known, the compiler can improve the performance + * of XXH3_64bits_withSecret() and XXH3_128bits_withSecret(). + * + * However, if the secret size is not known, it doesn't have any benefit. This + * happens when xxHash is compiled into a global symbol. Therefore, if + * @ref XXH_INLINE_ALL is *not* defined, this will be defined to 0. + * + * Additionally, this defaults to 0 on GCC 12+, which has an issue with function pointers + * that are *sometimes* force inline on -Og, and it is impossible to automatically + * detect this optimization level. + */ +# define XXH3_INLINE_SECRET 0 + +/*! + * @def XXH32_ENDJMP + * @brief Whether to use a jump for `XXH32_finalize`. + * + * For performance, `XXH32_finalize` uses multiple branches in the finalizer. + * This is generally preferable for performance, + * but depending on exact architecture, a jmp may be preferable. 
+ * + * This setting is only possibly making a difference for very small inputs. + */ +# define XXH32_ENDJMP 0 + +/*! + * @internal + * @brief Redefines old internal names. + * + * For compatibility with code that uses xxHash's internals before the names + * were changed to improve namespacing. There is no other reason to use this. + */ +# define XXH_OLD_NAMES +# undef XXH_OLD_NAMES /* don't actually use, it is ugly. */ + +/*! + * @def XXH_NO_STREAM + * @brief Disables the streaming API. + * + * When xxHash is not inlined and the streaming functions are not used, disabling + * the streaming functions can improve code size significantly, especially with + * the @ref XXH3_family which tends to make constant folded copies of itself. + */ +# define XXH_NO_STREAM +# undef XXH_NO_STREAM /* don't actually */ +#endif /* XXH_DOXYGEN */ +/*! + * @} + */ + +#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ + /* prefer __packed__ structures (method 1) for GCC + * < ARMv7 with unaligned access (e.g. Raspbian armhf) still uses byte shifting, so we use memcpy + * which for some reason does unaligned loads. */ +# if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED)) +# define XXH_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +#ifndef XXH_SIZE_OPT + /* default to 1 for -Os or -Oz */ +# if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE_SIZE__) +# define XXH_SIZE_OPT 1 +# else +# define XXH_SIZE_OPT 0 +# endif +#endif + +#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ + /* don't check on sizeopt, x86, aarch64, or arm when unaligned access is available */ +# if XXH_SIZE_OPT >= 1 || \ + defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \ + || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM) /* visual */ +# define XXH_FORCE_ALIGN_CHECK 0 +# else +# define XXH_FORCE_ALIGN_CHECK 1 +# endif +#endif + +#ifndef XXH_NO_INLINE_HINTS +# if XXH_SIZE_OPT >= 1 || defined(__NO_INLINE__) /* -O0, -fno-inline */ +# define XXH_NO_INLINE_HINTS 1 +# else +# define XXH_NO_INLINE_HINTS 0 +# endif +#endif + +#ifndef XXH3_INLINE_SECRET +# if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 12) \ + || !defined(XXH_INLINE_ALL) +# define XXH3_INLINE_SECRET 0 +# else +# define XXH3_INLINE_SECRET 1 +# endif +#endif + +#ifndef XXH32_ENDJMP +/* generally preferable for performance */ +# define XXH32_ENDJMP 0 +#endif + +/*! + * @defgroup impl Implementation + * @{ + */ + + +/* ************************************* +* Includes & Memory related functions +***************************************/ +#if defined(XXH_NO_STREAM) +/* nothing */ +#elif defined(XXH_NO_STDLIB) + +/* When requesting to disable any mention of stdlib, + * the library loses the ability to invoked malloc / free. + * In practice, it means that functions like `XXH*_createState()` + * will always fail, and return NULL. + * This flag is useful in situations where + * xxhash.h is integrated into some kernel, embedded or limited environment + * without access to dynamic allocation. + */ + +static XXH_CONSTF void* XXH_malloc(size_t s) { (void)s; return NULL; } +static void XXH_free(void* p) { (void)p; } + +#else + +/* + * Modify the local functions below should you wish to use + * different memory routines for malloc() and free() + */ +#include + +/*! + * @internal + * @brief Modify this function to use a different routine than malloc(). 
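+ *
+ * For illustration only (not upstream documentation): a project with its own
+ * allocator would keep this signature and swap the body, e.g.
+ * `return my_pool_alloc(s);` (a hypothetical allocator), together with a
+ * matching change in XXH_free().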
+ */ +static XXH_MALLOCF void* XXH_malloc(size_t s) { return malloc(s); } + +/*! + * @internal + * @brief Modify this function to use a different routine than free(). + */ +static void XXH_free(void* p) { free(p); } + +#endif /* XXH_NO_STDLIB */ + +#ifndef XXH_memcpy +/*! + * @internal + * @brief XXH_memcpy() macro can be redirected at compile time + */ +# include +# define XXH_memcpy memcpy +#endif + +#ifndef XXH_memset +/*! + * @internal + * @brief XXH_memset() macro can be redirected at compile time + */ +# include +# define XXH_memset memset +#endif + +#ifndef XXH_memcmp +/*! + * @internal + * @brief XXH_memcmp() macro can be redirected at compile time + * Note: only needed by XXH128. + */ +# include +# define XXH_memcmp memcmp +#endif + + + +#include /* ULLONG_MAX */ + + +/* ************************************* +* Compiler Specific Options +***************************************/ +#ifdef _MSC_VER /* Visual Studio warning fix */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +#endif + +#if XXH_NO_INLINE_HINTS /* disable inlining hints */ +# if defined(__GNUC__) || defined(__clang__) +# define XXH_FORCE_INLINE static __attribute__((__unused__)) +# else +# define XXH_FORCE_INLINE static +# endif +# define XXH_NO_INLINE static +/* enable inlining hints */ +#elif defined(__GNUC__) || defined(__clang__) +# define XXH_FORCE_INLINE static __inline__ __attribute__((__always_inline__, __unused__)) +# define XXH_NO_INLINE static __attribute__((__noinline__)) +#elif defined(_MSC_VER) /* Visual Studio */ +# define XXH_FORCE_INLINE static __forceinline +# define XXH_NO_INLINE static __declspec(noinline) +#elif defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */ +# define XXH_FORCE_INLINE static inline +# define XXH_NO_INLINE static +#else +# define XXH_FORCE_INLINE static +# define XXH_NO_INLINE static +#endif + +#if defined(XXH_INLINE_ALL) +# define XXH_STATIC XXH_FORCE_INLINE +#else +# define XXH_STATIC static +#endif + +#if XXH3_INLINE_SECRET +# define XXH3_WITH_SECRET_INLINE XXH_FORCE_INLINE +#else +# define XXH3_WITH_SECRET_INLINE XXH_NO_INLINE +#endif + +#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */ +# define XXH_RESTRICT /* disable */ +#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ +# define XXH_RESTRICT restrict +#elif (defined (__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \ + || (defined (__clang__)) \ + || (defined (_MSC_VER) && (_MSC_VER >= 1400)) \ + || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300)) +/* + * There are a LOT more compilers that recognize __restrict but this + * covers the major ones. + */ +# define XXH_RESTRICT __restrict +#else +# define XXH_RESTRICT /* disable */ +#endif + +/* ************************************* +* Debug +***************************************/ +/*! + * @ingroup tuning + * @def XXH_DEBUGLEVEL + * @brief Sets the debugging level. + * + * XXH_DEBUGLEVEL is expected to be defined externally, typically via the + * compiler's command line options. The value must be a number. 
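+ *
+ * For example (illustrative, not upstream documentation), a debug build can
+ * enable the internal assertions with a command-line define such as
+ * `cc -O2 -DXXH_DEBUGLEVEL=1 -c xxhash.c`; the tuning macros documented above
+ * can be set externally the same way.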
+ */ +#ifndef XXH_DEBUGLEVEL +# ifdef DEBUGLEVEL /* backwards compat */ +# define XXH_DEBUGLEVEL DEBUGLEVEL +# else +# define XXH_DEBUGLEVEL 0 +# endif +#endif + +#if (XXH_DEBUGLEVEL>=1) +# include /* note: can still be disabled with NDEBUG */ +# define XXH_ASSERT(c) assert(c) +#else +# if defined(__INTEL_COMPILER) +# define XXH_ASSERT(c) XXH_ASSUME((unsigned char) (c)) +# else +# define XXH_ASSERT(c) XXH_ASSUME(c) +# endif +#endif + +/* note: use after variable declarations */ +#ifndef XXH_STATIC_ASSERT +# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */ +# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { _Static_assert((c),m); } while(0) +# elif defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11 */ +# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0) +# else +# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0) +# endif +# define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c) +#endif + +/*! + * @internal + * @def XXH_COMPILER_GUARD(var) + * @brief Used to prevent unwanted optimizations for @p var. + * + * It uses an empty GCC inline assembly statement with a register constraint + * which forces @p var into a general purpose register (eg eax, ebx, ecx + * on x86) and marks it as modified. + * + * This is used in a few places to avoid unwanted autovectorization (e.g. + * XXH32_round()). All vectorization we want is explicit via intrinsics, + * and _usually_ isn't wanted elsewhere. + * + * We also use it to prevent unwanted constant folding for AArch64 in + * XXH3_initCustomSecret_scalar(). + */ +#if defined(__GNUC__) || defined(__clang__) +# define XXH_COMPILER_GUARD(var) __asm__("" : "+r" (var)) +#else +# define XXH_COMPILER_GUARD(var) ((void)0) +#endif + +/* Specifically for NEON vectors which use the "w" constraint, on + * Clang. */ +#if defined(__clang__) && defined(__ARM_ARCH) && !defined(__wasm__) +# define XXH_COMPILER_GUARD_CLANG_NEON(var) __asm__("" : "+w" (var)) +#else +# define XXH_COMPILER_GUARD_CLANG_NEON(var) ((void)0) +#endif + +/* ************************************* +* Basic Types +***************************************/ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# ifdef _AIX +# include +# else +# include +# endif + typedef uint8_t xxh_u8; +#else + typedef unsigned char xxh_u8; +#endif +typedef XXH32_hash_t xxh_u32; + +#ifdef XXH_OLD_NAMES +# warning "XXH_OLD_NAMES is planned to be removed starting v0.9. If the program depends on it, consider moving away from it by employing newer type names directly" +# define BYTE xxh_u8 +# define U8 xxh_u8 +# define U32 xxh_u32 +#endif + +/* *** Memory access *** */ + +/*! + * @internal + * @fn xxh_u32 XXH_read32(const void* ptr) + * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from. + * @return The 32-bit native endian integer from the bytes at @p ptr. + */ + +/*! + * @internal + * @fn xxh_u32 XXH_readLE32(const void* ptr) + * @brief Reads an unaligned 32-bit little endian integer from @p ptr. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from. + * @return The 32-bit little endian integer from the bytes at @p ptr. + */ + +/*! + * @internal + * @fn xxh_u32 XXH_readBE32(const void* ptr) + * @brief Reads an unaligned 32-bit big endian integer from @p ptr. 
+ * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from. + * @return The 32-bit big endian integer from the bytes at @p ptr. + */ + +/*! + * @internal + * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align) + * @brief Like @ref XXH_readLE32(), but has an option for aligned reads. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is + * always @ref XXH_alignment::XXH_unaligned. + * + * @param ptr The pointer to read from. + * @param align Whether @p ptr is aligned. + * @pre + * If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte + * aligned. + * @return The 32-bit little endian integer from the bytes at @p ptr. + */ + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE32 and XXH_readBE32. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* + * Force direct memory access. Only works on CPU which support unaligned memory + * access in hardware. + */ +static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __attribute__((aligned(1))) is supported by gcc and clang. Originally the + * documentation claimed that it only increased the alignment, but actually it + * can decrease it on gcc, clang, and icc: + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502, + * https://gcc.godbolt.org/z/xYez1j67Y. + */ +#ifdef XXH_OLD_NAMES +typedef union { xxh_u32 u32; } __attribute__((__packed__)) unalign; +#endif +static xxh_u32 XXH_read32(const void* ptr) +{ + typedef __attribute__((__aligned__(1))) xxh_u32 xxh_unalign32; + return *((const xxh_unalign32*)ptr); +} + +#else + +/* + * Portable and safe solution. Generally efficient. + * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html + */ +static xxh_u32 XXH_read32(const void* memPtr) +{ + xxh_u32 val; + XXH_memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + + +/* *** Endianness *** */ + +/*! + * @ingroup tuning + * @def XXH_CPU_LITTLE_ENDIAN + * @brief Whether the target is little endian. + * + * Defined to 1 if the target is little endian, or 0 if it is big endian. + * It can be defined externally, for example on the compiler command line. + * + * If it is not defined, + * a runtime check (which is usually constant folded) is used instead. + * + * @note + * This is not necessarily defined to an integer constant. + * + * @see XXH_isLittleEndian() for the runtime check. + */ +#ifndef XXH_CPU_LITTLE_ENDIAN +/* + * Try to detect endianness automatically, to avoid the nonstandard behavior + * in `XXH_isLittleEndian()` + */ +# if defined(_WIN32) /* Windows is always little endian */ \ + || defined(__LITTLE_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 1 +# elif defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 0 +# else +/*! + * @internal + * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN. + * + * Most compilers will constant fold this. + */ +static int XXH_isLittleEndian(void) +{ + /* + * Portable and well-defined behavior. + * Don't use static: it is detrimental to performance. 
+ */ + const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 }; + return one.c[0]; +} +# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() +# endif +#endif + + + + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#ifdef __has_builtin +# define XXH_HAS_BUILTIN(x) __has_builtin(x) +#else +# define XXH_HAS_BUILTIN(x) 0 +#endif + + + +/* + * C23 and future versions have standard "unreachable()". + * Once it has been implemented reliably we can add it as an + * additional case: + * + * ``` + * #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) + * # include + * # ifdef unreachable + * # define XXH_UNREACHABLE() unreachable() + * # endif + * #endif + * ``` + * + * Note C++23 also has std::unreachable() which can be detected + * as follows: + * ``` + * #if defined(__cpp_lib_unreachable) && (__cpp_lib_unreachable >= 202202L) + * # include + * # define XXH_UNREACHABLE() std::unreachable() + * #endif + * ``` + * NB: `__cpp_lib_unreachable` is defined in the `` header. + * We don't use that as including `` in `extern "C"` blocks + * doesn't work on GCC12 + */ + +#if XXH_HAS_BUILTIN(__builtin_unreachable) +# define XXH_UNREACHABLE() __builtin_unreachable() + +#elif defined(_MSC_VER) +# define XXH_UNREACHABLE() __assume(0) + +#else +# define XXH_UNREACHABLE() +#endif + +#if XXH_HAS_BUILTIN(__builtin_assume) +# define XXH_ASSUME(c) __builtin_assume(c) +#else +# define XXH_ASSUME(c) if (!(c)) { XXH_UNREACHABLE(); } +#endif + +/*! + * @internal + * @def XXH_rotl32(x,r) + * @brief 32-bit rotate left. + * + * @param x The 32-bit integer to be rotated. + * @param r The number of bits to rotate. + * @pre + * @p r > 0 && @p r < 32 + * @note + * @p x and @p r may be evaluated multiple times. + * @return The rotated result. + */ +#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \ + && XXH_HAS_BUILTIN(__builtin_rotateleft64) +# define XXH_rotl32 __builtin_rotateleft32 +# define XXH_rotl64 __builtin_rotateleft64 +#elif XXH_HAS_BUILTIN(__builtin_stdc_rotate_left) +# define XXH_rotl32 __builtin_stdc_rotate_left +# define XXH_rotl64 __builtin_stdc_rotate_left +/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */ +#elif defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +# define XXH_rotl64(x,r) _rotl64(x,r) +#else +# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) +# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) +#endif + +/*! + * @internal + * @fn xxh_u32 XXH_swap32(xxh_u32 x) + * @brief A 32-bit byteswap. + * + * @param x The 32-bit integer to byteswap. + * @return @p x, byteswapped. + */ +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static xxh_u32 XXH_swap32 (xxh_u32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +#endif + + +/* *************************** +* Memory reads +*****************************/ + +/*! + * @internal + * @brief Enum to indicate whether a pointer is aligned. + */ +typedef enum { + XXH_aligned, /*!< Aligned */ + XXH_unaligned /*!< Possibly unaligned */ +} XXH_alignment; + +/* + * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. + * + * This is ideal for older compilers which don't inline memcpy. 
+ */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u32)bytePtr[1] << 8) + | ((xxh_u32)bytePtr[2] << 16) + | ((xxh_u32)bytePtr[3] << 24); +} + +XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[3] + | ((xxh_u32)bytePtr[2] << 8) + | ((xxh_u32)bytePtr[1] << 16) + | ((xxh_u32)bytePtr[0] << 24); +} + +#else +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); +} + +static xxh_u32 XXH_readBE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u32 +XXH_readLE32_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) { + return XXH_readLE32(ptr); + } else { + return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr); + } +} + + +/* ************************************* +* Misc +***************************************/ +/*! @ingroup public */ +XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } + + +/* ******************************************************************* +* 32-bit hash functions +*********************************************************************/ +/*! + * @} + * @defgroup XXH32_impl XXH32 implementation + * @ingroup impl + * + * Details on the XXH32 implementation. + * @{ + */ + /* #define instead of static const, to be used as initializers */ +#define XXH_PRIME32_1 0x9E3779B1U /*!< 0b10011110001101110111100110110001 */ +#define XXH_PRIME32_2 0x85EBCA77U /*!< 0b10000101111010111100101001110111 */ +#define XXH_PRIME32_3 0xC2B2AE3DU /*!< 0b11000010101100101010111000111101 */ +#define XXH_PRIME32_4 0x27D4EB2FU /*!< 0b00100111110101001110101100101111 */ +#define XXH_PRIME32_5 0x165667B1U /*!< 0b00010110010101100110011110110001 */ + +#ifdef XXH_OLD_NAMES +# define PRIME32_1 XXH_PRIME32_1 +# define PRIME32_2 XXH_PRIME32_2 +# define PRIME32_3 XXH_PRIME32_3 +# define PRIME32_4 XXH_PRIME32_4 +# define PRIME32_5 XXH_PRIME32_5 +#endif + +/*! + * @internal + * @brief Normal stripe processing routine. + * + * This shuffles the bits so that any bit from @p input impacts several bits in + * @p acc. + * + * @param acc The accumulator lane. + * @param input The stripe of input to mix. + * @return The mixed accumulator lane. + */ +static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) +{ + acc += input * XXH_PRIME32_2; + acc = XXH_rotl32(acc, 13); + acc *= XXH_PRIME32_1; +#if (defined(__SSE4_1__) || defined(__aarch64__) || defined(__wasm_simd128__)) && !defined(XXH_ENABLE_AUTOVECTORIZE) + /* + * UGLY HACK: + * A compiler fence is used to prevent GCC and Clang from + * autovectorizing the XXH32 loop (pragmas and attributes don't work for some + * reason) without globally disabling SSE4.1. + * + * The reason we want to avoid vectorization is because despite working on + * 4 integers at a time, there are multiple factors slowing XXH32 down on + * SSE4: + * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on + * newer chips!) making it slightly slower to multiply four integers at + * once compared to four integers independently. Even when pmulld was + * fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE + * just to multiply unless doing a long operation. 
+ * + * - Four instructions are required to rotate, + * movqda tmp, v // not required with VEX encoding + * pslld tmp, 13 // tmp <<= 13 + * psrld v, 19 // x >>= 19 + * por v, tmp // x |= tmp + * compared to one for scalar: + * roll v, 13 // reliably fast across the board + * shldl v, v, 13 // Sandy Bridge and later prefer this for some reason + * + * - Instruction level parallelism is actually more beneficial here because + * the SIMD actually serializes this operation: While v1 is rotating, v2 + * can load data, while v3 can multiply. SSE forces them to operate + * together. + * + * This is also enabled on AArch64, as Clang is *very aggressive* in vectorizing + * the loop. NEON is only faster on the A53, and with the newer cores, it is less + * than half the speed. + * + * Additionally, this is used on WASM SIMD128 because it JITs to the same + * SIMD instructions and has the same issue. + */ + XXH_COMPILER_GUARD(acc); +#endif + return acc; +} + +/*! + * @internal + * @brief Mixes all bits to finalize the hash. + * + * The final mix ensures that all input bits have a chance to impact any bit in + * the output digest, resulting in an unbiased distribution. + * + * @param hash The hash to avalanche. + * @return The avalanched hash. + */ +static xxh_u32 XXH32_avalanche(xxh_u32 hash) +{ + hash ^= hash >> 15; + hash *= XXH_PRIME32_2; + hash ^= hash >> 13; + hash *= XXH_PRIME32_3; + hash ^= hash >> 16; + return hash; +} + +#define XXH_get32bits(p) XXH_readLE32_align(p, align) + +/*! + * @internal + * @brief Sets up the initial accumulator state for XXH32(). + */ +XXH_FORCE_INLINE void +XXH32_initAccs(xxh_u32 *acc, xxh_u32 seed) +{ + XXH_ASSERT(acc != NULL); + acc[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2; + acc[1] = seed + XXH_PRIME32_2; + acc[2] = seed + 0; + acc[3] = seed - XXH_PRIME32_1; +} + +/*! + * @internal + * @brief Consumes a block of data for XXH32(). + * + * @return the end input pointer. + */ +XXH_FORCE_INLINE const xxh_u8 * +XXH32_consumeLong( + xxh_u32 *XXH_RESTRICT acc, + xxh_u8 const *XXH_RESTRICT input, + size_t len, + XXH_alignment align +) +{ + const xxh_u8* const bEnd = input + len; + const xxh_u8* const limit = bEnd - 15; + XXH_ASSERT(acc != NULL); + XXH_ASSERT(input != NULL); + XXH_ASSERT(len >= 16); + do { + acc[0] = XXH32_round(acc[0], XXH_get32bits(input)); input += 4; + acc[1] = XXH32_round(acc[1], XXH_get32bits(input)); input += 4; + acc[2] = XXH32_round(acc[2], XXH_get32bits(input)); input += 4; + acc[3] = XXH32_round(acc[3], XXH_get32bits(input)); input += 4; + } while (input < limit); + + return input; +} + +/*! + * @internal + * @brief Merges the accumulator lanes together for XXH32() + */ +XXH_FORCE_INLINE XXH_PUREF xxh_u32 +XXH32_mergeAccs(const xxh_u32 *acc) +{ + XXH_ASSERT(acc != NULL); + return XXH_rotl32(acc[0], 1) + XXH_rotl32(acc[1], 7) + + XXH_rotl32(acc[2], 12) + XXH_rotl32(acc[3], 18); +} + +/*! + * @internal + * @brief Processes the last 0-15 bytes of @p ptr. + * + * There may be up to 15 bytes remaining to consume from the input. + * This final stage will digest them to ensure that all input bytes are present + * in the final mix. + * + * @param hash The hash to finalize. + * @param ptr The pointer to the remaining input. + * @param len The remaining length, modulo 16. + * @param align Whether @p ptr is aligned. + * @return The finalized hash. + * @see XXH64_finalize(). 
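+ *
+ * Worked example (illustrative, not upstream documentation): with `len == 7`,
+ * the compact loop below performs one 4-byte step (XXH_PROCESS4) followed by
+ * three 1-byte steps (XXH_PROCESS1), which matches the `case 7` path of the
+ * unrolled switch.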
+ */ +static XXH_PUREF xxh_u32 +XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define XXH_PROCESS1 do { \ + hash += (*ptr++) * XXH_PRIME32_5; \ + hash = XXH_rotl32(hash, 11) * XXH_PRIME32_1; \ +} while (0) + +#define XXH_PROCESS4 do { \ + hash += XXH_get32bits(ptr) * XXH_PRIME32_3; \ + ptr += 4; \ + hash = XXH_rotl32(hash, 17) * XXH_PRIME32_4; \ +} while (0) + + if (ptr==NULL) XXH_ASSERT(len == 0); + + /* Compact rerolled version; generally faster */ + if (!XXH32_ENDJMP) { + len &= 15; + while (len >= 4) { + XXH_PROCESS4; + len -= 4; + } + while (len > 0) { + XXH_PROCESS1; + --len; + } + return XXH32_avalanche(hash); + } else { + switch(len&15) /* or switch(bEnd - p) */ { + case 12: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 8: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 4: XXH_PROCESS4; + return XXH32_avalanche(hash); + + case 13: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 9: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 5: XXH_PROCESS4; + XXH_PROCESS1; + return XXH32_avalanche(hash); + + case 14: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 10: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 6: XXH_PROCESS4; + XXH_PROCESS1; + XXH_PROCESS1; + return XXH32_avalanche(hash); + + case 15: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 11: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 7: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 3: XXH_PROCESS1; + XXH_FALLTHROUGH; /* fallthrough */ + case 2: XXH_PROCESS1; + XXH_FALLTHROUGH; /* fallthrough */ + case 1: XXH_PROCESS1; + XXH_FALLTHROUGH; /* fallthrough */ + case 0: return XXH32_avalanche(hash); + } + XXH_ASSERT(0); + return hash; /* reaching this point is deemed impossible */ + } +} + +#ifdef XXH_OLD_NAMES +# define PROCESS1 XXH_PROCESS1 +# define PROCESS4 XXH_PROCESS4 +#else +# undef XXH_PROCESS1 +# undef XXH_PROCESS4 +#endif + +/*! + * @internal + * @brief The implementation for @ref XXH32(). + * + * @param input , len , seed Directly passed from @ref XXH32(). + * @param align Whether @p input is aligned. + * @return The calculated hash. + */ +XXH_FORCE_INLINE XXH_PUREF xxh_u32 +XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) +{ + xxh_u32 h32; + + if (input==NULL) XXH_ASSERT(len == 0); + + if (len>=16) { + xxh_u32 acc[4]; + XXH32_initAccs(acc, seed); + + input = XXH32_consumeLong(acc, input, len, align); + + h32 = XXH32_mergeAccs(acc); + } else { + h32 = seed + XXH_PRIME32_5; + } + + h32 += (xxh_u32)len; + + return XXH32_finalize(h32, input, len&15, align); +} + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) +{ +#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_state_t state; + XXH32_reset(&state, seed); + XXH32_update(&state, (const xxh_u8*)input, len); + return XXH32_digest(&state); +#else + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); +#endif +} + + + +/******* Hash streaming *******/ +#ifndef XXH_NO_STREAM +/*! 
@ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) +{ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) +{ + XXH_memcpy(dstState, srcState, sizeof(*dstState)); +} + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) +{ + XXH_ASSERT(statePtr != NULL); + XXH_memset(statePtr, 0, sizeof(*statePtr)); + XXH32_initAccs(statePtr->acc, seed); + return XXH_OK; +} + + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode +XXH32_update(XXH32_state_t* state, const void* input, size_t len) +{ + if (input==NULL) { + XXH_ASSERT(len == 0); + return XXH_OK; + } + + state->total_len_32 += (XXH32_hash_t)len; + state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); + + XXH_ASSERT(state->bufferedSize < sizeof(state->buffer)); + if (len < sizeof(state->buffer) - state->bufferedSize) { /* fill in tmp buffer */ + XXH_memcpy(state->buffer + state->bufferedSize, input, len); + state->bufferedSize += (XXH32_hash_t)len; + return XXH_OK; + } + + { const xxh_u8* xinput = (const xxh_u8*)input; + const xxh_u8* const bEnd = xinput + len; + + if (state->bufferedSize) { /* non-empty buffer: complete first */ + XXH_memcpy(state->buffer + state->bufferedSize, xinput, sizeof(state->buffer) - state->bufferedSize); + xinput += sizeof(state->buffer) - state->bufferedSize; + /* then process one round */ + (void)XXH32_consumeLong(state->acc, state->buffer, sizeof(state->buffer), XXH_aligned); + state->bufferedSize = 0; + } + + XXH_ASSERT(xinput <= bEnd); + if ((size_t)(bEnd - xinput) >= sizeof(state->buffer)) { + /* Process the remaining data */ + xinput = XXH32_consumeLong(state->acc, xinput, (size_t)(bEnd - xinput), XXH_unaligned); + } + + if (xinput < bEnd) { + /* Copy the leftover to the tmp buffer */ + XXH_memcpy(state->buffer, xinput, (size_t)(bEnd-xinput)); + state->bufferedSize = (unsigned)(bEnd-xinput); + } + } + + return XXH_OK; +} + + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state) +{ + xxh_u32 h32; + + if (state->large_len) { + h32 = XXH32_mergeAccs(state->acc); + } else { + h32 = state->acc[2] /* == seed */ + XXH_PRIME32_5; + } + + h32 += state->total_len_32; + + return XXH32_finalize(h32, state->buffer, state->bufferedSize, XXH_aligned); +} +#endif /* !XXH_NO_STREAM */ + +/******* Canonical representation *******/ + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); + XXH_memcpy(dst, &hash, sizeof(*dst)); +} +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) +{ + return XXH_readBE32(src); +} + + +#ifndef XXH_NO_LONG_LONG + +/* ******************************************************************* +* 64-bit hash functions +*********************************************************************/ +/*! 
+ * @} + * @ingroup impl + * @{ + */ +/******* Memory access *******/ + +typedef XXH64_hash_t xxh_u64; + +#ifdef XXH_OLD_NAMES +# define U64 xxh_u64 +#endif + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE64 and XXH_readBE64. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static xxh_u64 XXH_read64(const void* memPtr) +{ + return *(const xxh_u64*) memPtr; +} + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __attribute__((aligned(1))) is supported by gcc and clang. Originally the + * documentation claimed that it only increased the alignment, but actually it + * can decrease it on gcc, clang, and icc: + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502, + * https://gcc.godbolt.org/z/xYez1j67Y. + */ +#ifdef XXH_OLD_NAMES +typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((__packed__)) unalign64; +#endif +static xxh_u64 XXH_read64(const void* ptr) +{ + typedef __attribute__((__aligned__(1))) xxh_u64 xxh_unalign64; + return *((const xxh_unalign64*)ptr); +} + +#else + +/* + * Portable and safe solution. Generally efficient. + * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html + */ +static xxh_u64 XXH_read64(const void* memPtr) +{ + xxh_u64 val; + XXH_memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap64 _byteswap_uint64 +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap64 __builtin_bswap64 +#else +static xxh_u64 XXH_swap64(xxh_u64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + + +/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u64)bytePtr[1] << 8) + | ((xxh_u64)bytePtr[2] << 16) + | ((xxh_u64)bytePtr[3] << 24) + | ((xxh_u64)bytePtr[4] << 32) + | ((xxh_u64)bytePtr[5] << 40) + | ((xxh_u64)bytePtr[6] << 48) + | ((xxh_u64)bytePtr[7] << 56); +} + +XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[7] + | ((xxh_u64)bytePtr[6] << 8) + | ((xxh_u64)bytePtr[5] << 16) + | ((xxh_u64)bytePtr[4] << 24) + | ((xxh_u64)bytePtr[3] << 32) + | ((xxh_u64)bytePtr[2] << 40) + | ((xxh_u64)bytePtr[1] << 48) + | ((xxh_u64)bytePtr[0] << 56); +} + +#else +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); +} + +static xxh_u64 XXH_readBE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u64 +XXH_readLE64_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) + return XXH_readLE64(ptr); + else + return XXH_CPU_LITTLE_ENDIAN ? 
*(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr); +} + + +/******* xxh64 *******/ +/*! + * @} + * @defgroup XXH64_impl XXH64 implementation + * @ingroup impl + * + * Details on the XXH64 implementation. + * @{ + */ +/* #define rather that static const, to be used as initializers */ +#define XXH_PRIME64_1 0x9E3779B185EBCA87ULL /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */ +#define XXH_PRIME64_2 0xC2B2AE3D27D4EB4FULL /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */ +#define XXH_PRIME64_3 0x165667B19E3779F9ULL /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */ +#define XXH_PRIME64_4 0x85EBCA77C2B2AE63ULL /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */ +#define XXH_PRIME64_5 0x27D4EB2F165667C5ULL /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */ + +#ifdef XXH_OLD_NAMES +# define PRIME64_1 XXH_PRIME64_1 +# define PRIME64_2 XXH_PRIME64_2 +# define PRIME64_3 XXH_PRIME64_3 +# define PRIME64_4 XXH_PRIME64_4 +# define PRIME64_5 XXH_PRIME64_5 +#endif + +/*! @copydoc XXH32_round */ +static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) +{ + acc += input * XXH_PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= XXH_PRIME64_1; +#if (defined(__AVX512F__)) && !defined(XXH_ENABLE_AUTOVECTORIZE) + /* + * DISABLE AUTOVECTORIZATION: + * A compiler fence is used to prevent GCC and Clang from + * autovectorizing the XXH64 loop (pragmas and attributes don't work for some + * reason) without globally disabling AVX512. + * + * Autovectorization of XXH64 tends to be detrimental, + * though the exact outcome may change depending on exact cpu and compiler version. + * For information, it has been reported as detrimental for Skylake-X, + * but possibly beneficial for Zen4. + * + * The default is to disable auto-vectorization, + * but you can select to enable it instead using `XXH_ENABLE_AUTOVECTORIZE` build variable. + */ + XXH_COMPILER_GUARD(acc); +#endif + return acc; +} + +static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) +{ + val = XXH64_round(0, val); + acc ^= val; + acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4; + return acc; +} + +/*! @copydoc XXH32_avalanche */ +static xxh_u64 XXH64_avalanche(xxh_u64 hash) +{ + hash ^= hash >> 33; + hash *= XXH_PRIME64_2; + hash ^= hash >> 29; + hash *= XXH_PRIME64_3; + hash ^= hash >> 32; + return hash; +} + + +#define XXH_get64bits(p) XXH_readLE64_align(p, align) + +/*! + * @internal + * @brief Sets up the initial accumulator state for XXH64(). + */ +XXH_FORCE_INLINE void +XXH64_initAccs(xxh_u64 *acc, xxh_u64 seed) +{ + XXH_ASSERT(acc != NULL); + acc[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2; + acc[1] = seed + XXH_PRIME64_2; + acc[2] = seed + 0; + acc[3] = seed - XXH_PRIME64_1; +} + +/*! + * @internal + * @brief Consumes a block of data for XXH64(). + * + * @return the end input pointer. 
+ */ +XXH_FORCE_INLINE const xxh_u8 * +XXH64_consumeLong( + xxh_u64 *XXH_RESTRICT acc, + xxh_u8 const *XXH_RESTRICT input, + size_t len, + XXH_alignment align +) +{ + const xxh_u8* const bEnd = input + len; + const xxh_u8* const limit = bEnd - 31; + XXH_ASSERT(acc != NULL); + XXH_ASSERT(input != NULL); + XXH_ASSERT(len >= 32); + do { + /* reroll on 32-bit */ + if (sizeof(void *) < sizeof(xxh_u64)) { + size_t i; + for (i = 0; i < 4; i++) { + acc[i] = XXH64_round(acc[i], XXH_get64bits(input)); + input += 8; + } + } else { + acc[0] = XXH64_round(acc[0], XXH_get64bits(input)); input += 8; + acc[1] = XXH64_round(acc[1], XXH_get64bits(input)); input += 8; + acc[2] = XXH64_round(acc[2], XXH_get64bits(input)); input += 8; + acc[3] = XXH64_round(acc[3], XXH_get64bits(input)); input += 8; + } + } while (input < limit); + + return input; +} + +/*! + * @internal + * @brief Merges the accumulator lanes together for XXH64() + */ +XXH_FORCE_INLINE XXH_PUREF xxh_u64 +XXH64_mergeAccs(const xxh_u64 *acc) +{ + XXH_ASSERT(acc != NULL); + { + xxh_u64 h64 = XXH_rotl64(acc[0], 1) + XXH_rotl64(acc[1], 7) + + XXH_rotl64(acc[2], 12) + XXH_rotl64(acc[3], 18); + /* reroll on 32-bit */ + if (sizeof(void *) < sizeof(xxh_u64)) { + size_t i; + for (i = 0; i < 4; i++) { + h64 = XXH64_mergeRound(h64, acc[i]); + } + } else { + h64 = XXH64_mergeRound(h64, acc[0]); + h64 = XXH64_mergeRound(h64, acc[1]); + h64 = XXH64_mergeRound(h64, acc[2]); + h64 = XXH64_mergeRound(h64, acc[3]); + } + return h64; + } +} + +/*! + * @internal + * @brief Processes the last 0-31 bytes of @p ptr. + * + * There may be up to 31 bytes remaining to consume from the input. + * This final stage will digest them to ensure that all input bytes are present + * in the final mix. + * + * @param hash The hash to finalize. + * @param ptr The pointer to the remaining input. + * @param len The remaining length, modulo 32. + * @param align Whether @p ptr is aligned. + * @return The finalized hash + * @see XXH32_finalize(). + */ +XXH_STATIC XXH_PUREF xxh_u64 +XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ + if (ptr==NULL) XXH_ASSERT(len == 0); + len &= 31; + while (len >= 8) { + xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); + ptr += 8; + hash ^= k1; + hash = XXH_rotl64(hash,27) * XXH_PRIME64_1 + XXH_PRIME64_4; + len -= 8; + } + if (len >= 4) { + hash ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1; + ptr += 4; + hash = XXH_rotl64(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; + len -= 4; + } + while (len > 0) { + hash ^= (*ptr++) * XXH_PRIME64_5; + hash = XXH_rotl64(hash, 11) * XXH_PRIME64_1; + --len; + } + return XXH64_avalanche(hash); +} + +#ifdef XXH_OLD_NAMES +# define PROCESS1_64 XXH_PROCESS1_64 +# define PROCESS4_64 XXH_PROCESS4_64 +# define PROCESS8_64 XXH_PROCESS8_64 +#else +# undef XXH_PROCESS1_64 +# undef XXH_PROCESS4_64 +# undef XXH_PROCESS8_64 +#endif + +/*! + * @internal + * @brief The implementation for @ref XXH64(). + * + * @param input , len , seed Directly passed from @ref XXH64(). + * @param align Whether @p input is aligned. + * @return The calculated hash. 
+ */ +XXH_FORCE_INLINE XXH_PUREF xxh_u64 +XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) +{ + xxh_u64 h64; + if (input==NULL) XXH_ASSERT(len == 0); + + if (len>=32) { /* Process a large block of data */ + xxh_u64 acc[4]; + XXH64_initAccs(acc, seed); + + input = XXH64_consumeLong(acc, input, len, align); + + h64 = XXH64_mergeAccs(acc); + } else { + h64 = seed + XXH_PRIME64_5; + } + + h64 += (xxh_u64) len; + + return XXH64_finalize(h64, input, len, align); +} + + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64 (XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) +{ +#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, (const xxh_u8*)input, len); + return XXH64_digest(&state); +#else + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */ + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); + +#endif +} + +/******* Hash Streaming *******/ +#ifndef XXH_NO_STREAM +/*! @ingroup XXH64_family*/ +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const XXH64_state_t* srcState) +{ + XXH_memcpy(dstState, srcState, sizeof(*dstState)); +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed) +{ + XXH_ASSERT(statePtr != NULL); + XXH_memset(statePtr, 0, sizeof(*statePtr)); + XXH64_initAccs(statePtr->acc, seed); + return XXH_OK; +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH_errorcode +XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len) +{ + if (input==NULL) { + XXH_ASSERT(len == 0); + return XXH_OK; + } + + state->total_len += len; + + XXH_ASSERT(state->bufferedSize <= sizeof(state->buffer)); + if (len < sizeof(state->buffer) - state->bufferedSize) { /* fill in tmp buffer */ + XXH_memcpy(state->buffer + state->bufferedSize, input, len); + state->bufferedSize += (XXH32_hash_t)len; + return XXH_OK; + } + + { const xxh_u8* xinput = (const xxh_u8*)input; + const xxh_u8* const bEnd = xinput + len; + + if (state->bufferedSize) { /* non-empty buffer => complete first */ + XXH_memcpy(state->buffer + state->bufferedSize, xinput, sizeof(state->buffer) - state->bufferedSize); + xinput += sizeof(state->buffer) - state->bufferedSize; + /* and process one round */ + (void)XXH64_consumeLong(state->acc, state->buffer, sizeof(state->buffer), XXH_aligned); + state->bufferedSize = 0; + } + + XXH_ASSERT(xinput <= bEnd); + if ((size_t)(bEnd - xinput) >= sizeof(state->buffer)) { + /* Process the remaining data */ + xinput = XXH64_consumeLong(state->acc, xinput, (size_t)(bEnd - xinput), XXH_unaligned); + } + + if (xinput < bEnd) { + /* Copy the leftover to the tmp buffer */ + XXH_memcpy(state->buffer, xinput, (size_t)(bEnd-xinput)); + state->bufferedSize = (unsigned)(bEnd-xinput); + } + } + + return XXH_OK; +} + + +/*! 
@ingroup XXH64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state) +{ + xxh_u64 h64; + + if (state->total_len >= 32) { + h64 = XXH64_mergeAccs(state->acc); + } else { + h64 = state->acc[2] /*seed*/ + XXH_PRIME64_5; + } + + h64 += (xxh_u64) state->total_len; + + return XXH64_finalize(h64, state->buffer, (size_t)state->total_len, XXH_aligned); +} +#endif /* !XXH_NO_STREAM */ + +/******* Canonical representation *******/ + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); + XXH_memcpy(dst, &hash, sizeof(*dst)); +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src) +{ + return XXH_readBE64(src); +} + +#ifndef XXH_NO_XXH3 + +/* ********************************************************************* +* XXH3 +* New generation hash designed for speed on small keys and vectorization +************************************************************************ */ +/*! + * @} + * @defgroup XXH3_impl XXH3 implementation + * @ingroup impl + * @{ + */ + +/* === Compiler specifics === */ + + +#if (defined(__GNUC__) && (__GNUC__ >= 3)) \ + || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \ + || defined(__clang__) +# define XXH_likely(x) __builtin_expect(x, 1) +# define XXH_unlikely(x) __builtin_expect(x, 0) +#else +# define XXH_likely(x) (x) +# define XXH_unlikely(x) (x) +#endif + +#ifndef XXH_HAS_INCLUDE +# ifdef __has_include +/* + * Not defined as XXH_HAS_INCLUDE(x) (function-like) because + * this causes segfaults in Apple Clang 4.2 (on Mac OS X 10.7 Lion) + */ +# define XXH_HAS_INCLUDE __has_include +# else +# define XXH_HAS_INCLUDE(x) 0 +# endif +#endif + +#if defined(__GNUC__) || defined(__clang__) +# if defined(__ARM_FEATURE_SVE) +# include +# endif +# if defined(__ARM_NEON__) || defined(__ARM_NEON) \ + || (defined(_M_ARM) && _M_ARM >= 7) \ + || defined(_M_ARM64) || defined(_M_ARM64EC) \ + || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE()) /* WASM SIMD128 via SIMDe */ +# define inline __inline__ /* circumvent a clang bug */ +# include +# undef inline +# elif defined(__AVX2__) +# include +# elif defined(__SSE2__) +# include +# elif defined(__loongarch_asx) +# include +# include +# elif defined(__loongarch_sx) +# include +# endif +#endif + +#if defined(_MSC_VER) +# include +#endif + +/* + * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while + * remaining a true 64-bit/128-bit hash function. + * + * This is done by prioritizing a subset of 64-bit operations that can be + * emulated without too many steps on the average 32-bit machine. + * + * For example, these two lines seem similar, and run equally fast on 64-bit: + * + * xxh_u64 x; + * x ^= (x >> 47); // good + * x ^= (x >> 13); // bad + * + * However, to a 32-bit machine, there is a major difference. + * + * x ^= (x >> 47) looks like this: + * + * x.lo ^= (x.hi >> (47 - 32)); + * + * while x ^= (x >> 13) looks like this: + * + * // note: funnel shifts are not usually cheap. + * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); + * x.hi ^= (x.hi >> 13); + * + * The first one is significantly faster than the second, simply because the + * shift is larger than 32. This means: + * - All the bits we need are in the upper 32 bits, so we can ignore the lower + * 32 bits in the shift. 
+ * - The shift result will always fit in the lower 32 bits, and therefore, + * we can ignore the upper 32 bits in the xor. + * + * Thanks to this optimization, XXH3 only requires these features to be efficient: + * + * - Usable unaligned access + * - A 32-bit or 64-bit ALU + * - If 32-bit, a decent ADC instruction + * - A 32 or 64-bit multiply with a 64-bit result + * - For the 128-bit variant, a decent byteswap helps short inputs. + * + * The first two are already required by XXH32, and almost all 32-bit and 64-bit + * platforms which can run XXH32 can run XXH3 efficiently. + * + * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one + * notable exception. + * + * First of all, Thumb-1 lacks support for the UMULL instruction which + * performs the important long multiply. This means numerous __aeabi_lmul + * calls. + * + * Second of all, the 8 functional registers are just not enough. + * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need + * Lo registers, and this shuffling results in thousands more MOVs than A32. + * + * A32 and T32 don't have this limitation. They can access all 14 registers, + * do a 32->64 multiply with UMULL, and the flexible operand allowing free + * shifts is helpful, too. + * + * Therefore, we do a quick sanity check. + * + * If compiling Thumb-1 for a target which supports ARM instructions, we will + * emit a warning, as it is not a "sane" platform to compile for. + * + * Usually, if this happens, it is because of an accident and you probably need + * to specify -march, as you likely meant to compile for a newer architecture. + * + * Credit: large sections of the vectorial and asm source code paths + * have been contributed by @easyaspi314 + */ +#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) +# warning "XXH3 is highly inefficient without ARM or Thumb-2." +#endif + +/* ========================================== + * Vectorization detection + * ========================================== */ + +#ifdef XXH_DOXYGEN +/*! + * @ingroup tuning + * @brief Overrides the vectorization implementation chosen for XXH3. + * + * Can be defined to 0 to disable SIMD, + * or any other authorized value of @ref XXH_VECTOR. + * + * If this is not defined, it uses predefined macros to determine the best + * implementation. + */ +# define XXH_VECTOR XXH_SCALAR +/*! + * @ingroup tuning + * @brief Selects the minimum alignment for XXH3's accumulators. + * + * When using SIMD, this should match the alignment required for said vector + * type, so, for example, 32 for AVX2. + * + * Default: Auto detected. 
+ */ +# define XXH_ACC_ALIGN 8 +#endif + +/* Actual definition */ +#ifndef XXH_DOXYGEN +#endif + +#ifndef XXH_VECTOR /* can be defined on command line */ +# if defined(__ARM_FEATURE_SVE) +# define XXH_VECTOR XXH_SVE +# elif ( \ + defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \ + || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \ + || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE()) /* wasm simd128 via SIMDe */ \ + ) && ( \ + defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ + ) +# define XXH_VECTOR XXH_NEON +# elif defined(__AVX512F__) +# define XXH_VECTOR XXH_AVX512 +# elif defined(__AVX2__) +# define XXH_VECTOR XXH_AVX2 +# elif defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) +# define XXH_VECTOR XXH_SSE2 +# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ + || (defined(__s390x__) && defined(__VEC__)) \ + && defined(__GNUC__) /* TODO: IBM XL */ +# define XXH_VECTOR XXH_VSX +# elif defined(__loongarch_asx) +# define XXH_VECTOR XXH_LASX +# elif defined(__loongarch_sx) +# define XXH_VECTOR XXH_LSX +# else +# define XXH_VECTOR XXH_SCALAR +# endif +#endif + +/* __ARM_FEATURE_SVE is only supported by GCC & Clang. */ +#if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE) +# ifdef _MSC_VER +# pragma warning(once : 4606) +# else +# warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead." +# endif +# undef XXH_VECTOR +# define XXH_VECTOR XXH_SCALAR +#endif + +/* + * Controls the alignment of the accumulator, + * for compatibility with aligned vector loads, which are usually faster. + */ +#ifndef XXH_ACC_ALIGN +# if defined(XXH_X86DISPATCH) +# define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */ +# elif XXH_VECTOR == XXH_SCALAR /* scalar */ +# define XXH_ACC_ALIGN 8 +# elif XXH_VECTOR == XXH_SSE2 /* sse2 */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX2 /* avx2 */ +# define XXH_ACC_ALIGN 32 +# elif XXH_VECTOR == XXH_NEON /* neon */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_VSX /* vsx */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX512 /* avx512 */ +# define XXH_ACC_ALIGN 64 +# elif XXH_VECTOR == XXH_SVE /* sve */ +# define XXH_ACC_ALIGN 64 +# elif XXH_VECTOR == XXH_LASX /* lasx */ +# define XXH_ACC_ALIGN 64 +# elif XXH_VECTOR == XXH_LSX /* lsx */ +# define XXH_ACC_ALIGN 64 +# endif +#endif + +#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \ + || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512 +# define XXH_SEC_ALIGN XXH_ACC_ALIGN +#elif XXH_VECTOR == XXH_SVE +# define XXH_SEC_ALIGN XXH_ACC_ALIGN +#else +# define XXH_SEC_ALIGN 8 +#endif + +#if defined(__GNUC__) || defined(__clang__) +# define XXH_ALIASING __attribute__((__may_alias__)) +#else +# define XXH_ALIASING /* nothing */ +#endif + +/* + * UGLY HACK: + * GCC usually generates the best code with -O3 for xxHash. + * + * However, when targeting AVX2, it is overzealous in its unrolling resulting + * in code roughly 3/4 the speed of Clang. + * + * There are other issues, such as GCC splitting _mm256_loadu_si256 into + * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which + * only applies to Sandy and Ivy Bridge... which don't even support AVX2. + * + * That is why when compiling the AVX2 version, it is recommended to use either + * -O2 -mavx2 -march=haswell + * or + * -O2 -mavx2 -mno-avx256-split-unaligned-load + * for decent performance, or to use Clang instead. 
+ * + * Fortunately, we can control the first one with a pragma that forces GCC into + * -O2, but the other one we can't control without "failed to inline always + * inline function due to target mismatch" warnings. + */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */ +# pragma GCC push_options +# pragma GCC optimize("-O2") +#endif + +#if XXH_VECTOR == XXH_NEON + +/* + * UGLY HACK: While AArch64 GCC on Linux does not seem to care, on macOS, GCC -O3 + * optimizes out the entire hashLong loop because of the aliasing violation. + * + * However, GCC is also inefficient at load-store optimization with vld1q/vst1q, + * so the only option is to mark it as aliasing. + */ +typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING; + +/*! + * @internal + * @brief `vld1q_u64` but faster and alignment-safe. + * + * On AArch64, unaligned access is always safe, but on ARMv7-a, it is only + * *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86). + * + * GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it + * prohibits load-store optimizations. Therefore, a direct dereference is used. + * + * Otherwise, `vld1q_u8` is used with `vreinterpretq_u8_u64` to do a safe + * unaligned load. + */ +#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) +XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */ +{ + return *(xxh_aliasing_uint64x2_t const *)ptr; +} +#else +XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) +{ + return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr)); +} +#endif + +/*! + * @internal + * @brief `vmlal_u32` on low and high halves of a vector. + * + * This is a workaround for AArch64 GCC < 11 which implemented arm_neon.h with + * inline assembly and were therefore incapable of merging the `vget_{low, high}_u32` + * with `vmlal_u32`. + */ +#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 11 +XXH_FORCE_INLINE uint64x2_t +XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) +{ + /* Inline assembly is the only way */ + __asm__("umlal %0.2d, %1.2s, %2.2s" : "+w" (acc) : "w" (lhs), "w" (rhs)); + return acc; +} +XXH_FORCE_INLINE uint64x2_t +XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) +{ + /* This intrinsic works as expected */ + return vmlal_high_u32(acc, lhs, rhs); +} +#else +/* Portable intrinsic versions */ +XXH_FORCE_INLINE uint64x2_t +XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) +{ + return vmlal_u32(acc, vget_low_u32(lhs), vget_low_u32(rhs)); +} +/*! @copydoc XXH_vmlal_low_u32 + * Assume the compiler converts this to vmlal_high_u32 on aarch64 */ +XXH_FORCE_INLINE uint64x2_t +XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) +{ + return vmlal_u32(acc, vget_high_u32(lhs), vget_high_u32(rhs)); +} +#endif + +/*! + * @ingroup tuning + * @brief Controls the NEON to scalar ratio for XXH3 + * + * This can be set to 2, 4, 6, or 8. + * + * ARM Cortex CPUs are _very_ sensitive to how their pipelines are used. + * + * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but only 2 of those + * can be NEON. If you are only using NEON instructions, you are only using 2/3 of the CPU + * bandwidth. 
+ * + * This is even more noticeable on the more advanced cores like the Cortex-A76 which + * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once. + * + * Therefore, to make the most out of the pipeline, it is beneficial to run 6 NEON lanes + * and 2 scalar lanes, which is chosen by default. + * + * This does not apply to Apple processors or 32-bit processors, which run better with + * full NEON. These will default to 8. Additionally, size-optimized builds run 8 lanes. + * + * This change benefits CPUs with large micro-op buffers without negatively affecting + * most other CPUs: + * + * | Chipset | Dispatch type | NEON only | 6:2 hybrid | Diff. | + * |:----------------------|:--------------------|----------:|-----------:|------:| + * | Snapdragon 730 (A76) | 2 NEON/8 micro-ops | 8.8 GB/s | 10.1 GB/s | ~16% | + * | Snapdragon 835 (A73) | 2 NEON/3 micro-ops | 5.1 GB/s | 5.3 GB/s | ~5% | + * | Marvell PXA1928 (A53) | In-order dual-issue | 1.9 GB/s | 1.9 GB/s | 0% | + * | Apple M1 | 4 NEON/8 micro-ops | 37.3 GB/s | 36.1 GB/s | ~-3% | + * + * It also seems to fix some bad codegen on GCC, making it almost as fast as clang. + * + * When using WASM SIMD128, if this is 2 or 6, SIMDe will scalarize 2 of the lanes meaning + * it effectively becomes worse 4. + * + * @see XXH3_accumulate_512_neon() + */ +# ifndef XXH3_NEON_LANES +# if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \ + && !defined(__APPLE__) && XXH_SIZE_OPT <= 0 +# define XXH3_NEON_LANES 6 +# else +# define XXH3_NEON_LANES XXH_ACC_NB +# endif +# endif +#endif /* XXH_VECTOR == XXH_NEON */ + +/* + * VSX and Z Vector helpers. + * + * This is very messy, and any pull requests to clean this up are welcome. + * + * There are a lot of problems with supporting VSX and s390x, due to + * inconsistent intrinsics, spotty coverage, and multiple endiannesses. + */ +#if XXH_VECTOR == XXH_VSX +/* Annoyingly, these headers _may_ define three macros: `bool`, `vector`, + * and `pixel`. This is a problem for obvious reasons. + * + * These keywords are unnecessary; the spec literally says they are + * equivalent to `__bool`, `__vector`, and `__pixel` and may be undef'd + * after including the header. + * + * We use pragma push_macro/pop_macro to keep the namespace clean. */ +# pragma push_macro("bool") +# pragma push_macro("vector") +# pragma push_macro("pixel") +/* silence potential macro redefined warnings */ +# undef bool +# undef vector +# undef pixel + +# if defined(__s390x__) +# include +# else +# include +# endif + +/* Restore the original macro values, if applicable. */ +# pragma pop_macro("pixel") +# pragma pop_macro("vector") +# pragma pop_macro("bool") + +typedef __vector unsigned long long xxh_u64x2; +typedef __vector unsigned char xxh_u8x16; +typedef __vector unsigned xxh_u32x4; + +/* + * UGLY HACK: Similar to aarch64 macOS GCC, s390x GCC has the same aliasing issue. + */ +typedef xxh_u64x2 xxh_aliasing_u64x2 XXH_ALIASING; + +# ifndef XXH_VSX_BE +# if defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_VSX_BE 1 +# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ +# warning "-maltivec=be is not recommended. Please use native endianness." +# define XXH_VSX_BE 1 +# else +# define XXH_VSX_BE 0 +# endif +# endif /* !defined(XXH_VSX_BE) */ + +# if XXH_VSX_BE +# if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__)) +# define XXH_vec_revb vec_revb +# else +/*! 
+ * A polyfill for POWER9's vec_revb(). + */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) +{ + xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, + 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; + return vec_perm(val, val, vByteSwap); +} +# endif +# endif /* XXH_VSX_BE */ + +/*! + * Performs an unaligned vector load and byte swaps it on big endian. + */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) +{ + xxh_u64x2 ret; + XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2)); +# if XXH_VSX_BE + ret = XXH_vec_revb(ret); +# endif + return ret; +} + +/* + * vec_mulo and vec_mule are very problematic intrinsics on PowerPC + * + * These intrinsics weren't added until GCC 8, despite existing for a while, + * and they are endian dependent. Also, their meaning swap depending on version. + * */ +# if defined(__s390x__) + /* s390x is always big endian, no issue on this platform */ +# define XXH_vec_mulo vec_mulo +# define XXH_vec_mule vec_mule +# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) && !defined(__ibmxl__) +/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */ + /* The IBM XL Compiler (which defined __clang__) only implements the vec_* operations */ +# define XXH_vec_mulo __builtin_altivec_vmulouw +# define XXH_vec_mule __builtin_altivec_vmuleuw +# else +/* gcc needs inline assembly */ +/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +# endif /* XXH_vec_mulo, XXH_vec_mule */ +#endif /* XXH_VECTOR == XXH_VSX */ + +#if XXH_VECTOR == XXH_SVE +#define ACCRND(acc, offset) \ +do { \ + svuint64_t input_vec = svld1_u64(mask, xinput + offset); \ + svuint64_t secret_vec = svld1_u64(mask, xsecret + offset); \ + svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec); \ + svuint64_t swapped = svtbl_u64(input_vec, kSwap); \ + svuint64_t mixed_lo = svextw_u64_x(mask, mixed); \ + svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32); \ + svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \ + acc = svadd_u64_x(mask, acc, mul); \ +} while (0) +#endif /* XXH_VECTOR == XXH_SVE */ + +/* prefetch + * can be disabled, by declaring XXH_NO_PREFETCH build macro */ +#if defined(XXH_NO_PREFETCH) +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +#else +# if XXH_SIZE_OPT >= 1 +# define XXH_PREFETCH(ptr) (void)(ptr) +# elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */ +# include /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ +# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) +# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) +# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) +# else +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +# endif +#endif /* XXH_NO_PREFETCH */ + + +/* ========================================== + * XXH3 default settings + * ========================================== */ + +#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */ + +#if (XXH_SECRET_DEFAULT_SIZE < 
XXH3_SECRET_SIZE_MIN) +# error "default keyset is not large enough" +#endif + +/*! + * @internal + * @def XXH3_kSecret + * @brief Pseudorandom secret taken directly from FARSH. */ +XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = { + 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, + 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, + 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, + 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, + 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, + 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, + 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, + 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, + 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, + 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, + 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, + 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, +}; + +static const xxh_u64 PRIME_MX1 = 0x165667919E3779F9ULL; /*!< 0b0001011001010110011001111001000110011110001101110111100111111001 */ +static const xxh_u64 PRIME_MX2 = 0x9FB21C651E98DF25ULL; /*!< 0b1001111110110010000111000110010100011110100110001101111100100101 */ + +#ifdef XXH_OLD_NAMES +# define kSecret XXH3_kSecret +#endif + +#ifdef XXH_DOXYGEN +/*! + * @brief Calculates a 32-bit to 64-bit long multiply. + * + * Implemented as a macro. + * + * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't + * need to (but it shouldn't need to anyways, it is about 7 instructions to do + * a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we + * use that instead of the normal method. + * + * If you are compiling for platforms like Thumb-1 and don't have a better option, + * you may also want to write your own long multiply routine here. + * + * @param x, y Numbers to be multiplied + * @return 64-bit product of the low 32 bits of @p x and @p y. + */ +XXH_FORCE_INLINE xxh_u64 +XXH_mult32to64(xxh_u64 x, xxh_u64 y) +{ + return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); +} +#elif defined(_MSC_VER) && defined(_M_IX86) +# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) +#else +/* + * Downcast + upcast is usually better than masking on older compilers like + * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers. + * + * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands + * and perform a full 64x64 multiply -- entirely redundant on 32-bit. + */ +# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y)) +#endif + +/*! + * @brief Calculates a 64->128-bit long multiply. + * + * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar + * version. + * + * @param lhs , rhs The 64-bit integers to be multiplied + * @return The 128-bit result represented in an @ref XXH128_hash_t. + */ +static XXH128_hash_t +XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) +{ + /* + * GCC/Clang __uint128_t method. + * + * On most 64-bit targets, GCC and Clang define a __uint128_t type. 
+ * This is usually the best way as it usually uses a native long 64-bit + * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. + * + * Usually. + * + * Despite being a 32-bit platform, Clang (and emscripten) define this type + * despite not having the arithmetic for it. This results in a laggy + * compiler builtin call which calculates a full 128-bit multiply. + * In that case it is best to use the portable one. + * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 + */ +#if (defined(__GNUC__) || defined(__clang__)) && defined(DISABLED) \ + && defined(__SIZEOF_INT128__) \ + || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + + __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs; + XXH128_hash_t r128; + r128.low64 = (xxh_u64)(product); + r128.high64 = (xxh_u64)(product >> 64); + return r128; + + /* + * MSVC for x64's _umul128 method. + * + * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct); + * + * This compiles to single operand MUL on x64. + */ +#elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC) + +#ifndef _MSC_VER +# pragma intrinsic(_umul128) +#endif + xxh_u64 product_high; + xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); + XXH128_hash_t r128; + r128.low64 = product_low; + r128.high64 = product_high; + return r128; + + /* + * MSVC for ARM64's __umulh method. + * + * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method. + */ +#elif defined(_M_ARM64) || defined(_M_ARM64EC) + +#ifndef _MSC_VER +# pragma intrinsic(__umulh) +#endif + XXH128_hash_t r128; + r128.low64 = lhs * rhs; + r128.high64 = __umulh(lhs, rhs); + return r128; + +#else + /* + * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. + * + * This is a fast and simple grade school multiply, which is shown below + * with base 10 arithmetic instead of base 0x100000000. + * + * 9 3 // D2 lhs = 93 + * x 7 5 // D2 rhs = 75 + * ---------- + * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 + * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 + * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 + * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 + * --------- + * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 + * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67 + * --------- + * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975 + * + * The reasons for adding the products like this are: + * 1. It avoids manual carry tracking. Just like how + * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX. + * This avoids a lot of complexity. + * + * 2. It hints for, and on Clang, compiles to, the powerful UMAAL + * instruction available in ARM's Digital Signal Processing extension + * in 32-bit ARMv6 and later, which is shown below: + * + * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) + * { + * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; + * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); + * *RdHi = (xxh_u32)(product >> 32); + * } + * + * This instruction was designed for efficient long multiplication, and + * allows this to be calculated in only 4 instructions at speeds + * comparable to some 64-bit ALUs. + * + * 3. It isn't terrible on other platforms. Usually this will be a couple + * of 32-bit ADD/ADCs. + */ + + /* First calculate all of the cross products. 
*/ + xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); + xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); + xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); + xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32); + + /* Now add the products together. These will never overflow. */ + xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; + xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; + xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); + + XXH128_hash_t r128; + r128.low64 = lower; + r128.high64 = upper; + return r128; +#endif +} + +/*! + * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it. + * + * The reason for the separate function is to prevent passing too many structs + * around by value. This will hopefully inline the multiply, but we don't force it. + * + * @param lhs , rhs The 64-bit integers to multiply + * @return The low 64 bits of the product XOR'd by the high 64 bits. + * @see XXH_mult64to128() + */ +static xxh_u64 +XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) +{ + XXH128_hash_t product = XXH_mult64to128(lhs, rhs); + return product.low64 ^ product.high64; +} + +/*! Seems to produce slightly better code on GCC for some reason. */ +XXH_FORCE_INLINE XXH_CONSTF xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) +{ + XXH_ASSERT(0 <= shift && shift < 64); + return v64 ^ (v64 >> shift); +} + +/* + * This is a fast avalanche stage, + * suitable when input bits are already partially mixed + */ +static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) +{ + h64 = XXH_xorshift64(h64, 37); + h64 *= PRIME_MX1; + h64 = XXH_xorshift64(h64, 32); + return h64; +} + +/* + * This is a stronger avalanche, + * inspired by Pelle Evensen's rrmxmx + * preferable when input has not been previously mixed + */ +static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len) +{ + /* this mix is inspired by Pelle Evensen's rrmxmx */ + h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24); + h64 *= PRIME_MX2; + h64 ^= (h64 >> 35) + len ; + h64 *= PRIME_MX2; + return XXH_xorshift64(h64, 28); +} + + +/* ========================================== + * Short keys + * ========================================== + * One of the shortcomings of XXH32 and XXH64 was that their performance was + * sub-optimal on short lengths. It used an iterative algorithm which strongly + * favored lengths that were a multiple of 4 or 8. + * + * Instead of iterating over individual inputs, we use a set of single shot + * functions which piece together a range of lengths and operate in constant time. + * + * Additionally, the number of multiplies has been significantly reduced. This + * reduces latency, especially when emulating 64-bit multiplies on 32-bit. + * + * Depending on the platform, this may or may not be faster than XXH32, but it + * is almost guaranteed to be faster than XXH64. + */ + +/* + * At very short lengths, there isn't enough input to fully hide secrets, or use + * the entire secret. + * + * There is also only a limited amount of mixing we can do before significantly + * impacting performance. + * + * Therefore, we use different sections of the secret and always mix two secret + * samples with an XOR. This should have no effect on performance on the + * seedless or withSeed variants because everything _should_ be constant folded + * by modern compilers. + * + * The XOR mixing hides individual parts of the secret and increases entropy. + * + * This adds an extra layer of strength for custom secrets. 
+ */ +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combined = { input[0], 0x01, input[0], input[0] } + * len = 2: combined = { input[1], 0x02, input[0], input[1] } + * len = 3: combined = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; + return XXH64_avalanche(keyed); + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input1 = XXH_readLE32(input); + xxh_u32 const input2 = XXH_readLE32(input + len - 4); + xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; + xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32); + xxh_u64 const keyed = input64 ^ bitflip; + return XXH3_rrmxmx(keyed, len); + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed; + xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; + xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; + xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; + xxh_u64 const acc = len + + XXH_swap64(input_lo) + input_hi + + XXH3_mul128_fold64(input_lo, input_hi); + return XXH3_avalanche(acc); + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); + if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); + if (len) return XXH3_len_1to3_64b(input, len, secret, seed); + return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); + } +} + +/* + * DISCLAIMER: There are known *seed-dependent* multicollisions here due to + * multiplication by zero, affecting hashes of lengths 17 to 240. + * + * However, they are very unlikely. + * + * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all + * unseeded non-cryptographic hashes, it does not attempt to defend itself + * against specially crafted inputs, only random inputs. + * + * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes + * cancelling out the secret is taken an arbitrary number of times (addressed + * in XXH3_accumulate_512), this collision is very unlikely with random inputs + * and/or proper seeding: + * + * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a + * function that is only called up to 16 times per hash with up to 240 bytes of + * input. 
+ * + * This is not too bad for a non-cryptographic hash function, especially with + * only 64 bit outputs. + * + * The 128-bit variant (which trades some speed for strength) is NOT affected + * by this, although it is always a good idea to use a proper seed if you care + * about strength. + */ +XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64) +{ +#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */ + /* + * UGLY HACK: + * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in + * slower code. + * + * By forcing seed64 into a register, we disrupt the cost model and + * cause it to scalarize. See `XXH32_round()` + * + * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600, + * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on + * GCC 9.2, despite both emitting scalar code. + * + * GCC generates much better scalar code than Clang for the rest of XXH3, + * which is why finding a more optimal codepath is an interest. + */ + XXH_COMPILER_GUARD(seed64); +#endif + { xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 const input_hi = XXH_readLE64(input+8); + return XXH3_mul128_fold64( + input_lo ^ (XXH_readLE64(secret) + seed64), + input_hi ^ (XXH_readLE64(secret+8) - seed64) + ); + } +} + +/* For mid range keys, XXH3 uses a Mum-hash variant. */ +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { xxh_u64 acc = len * XXH_PRIME64_1; +#if XXH_SIZE_OPT >= 1 + /* Smaller and cleaner, but slightly slower. 
*/ + unsigned int i = (unsigned int)(len - 1) / 32; + do { + acc += XXH3_mix16B(input+16 * i, secret+32*i, seed); + acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed); + } while (i-- != 0); +#else + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc += XXH3_mix16B(input+48, secret+96, seed); + acc += XXH3_mix16B(input+len-64, secret+112, seed); + } + acc += XXH3_mix16B(input+32, secret+64, seed); + acc += XXH3_mix16B(input+len-48, secret+80, seed); + } + acc += XXH3_mix16B(input+16, secret+32, seed); + acc += XXH3_mix16B(input+len-32, secret+48, seed); + } + acc += XXH3_mix16B(input+0, secret+0, seed); + acc += XXH3_mix16B(input+len-16, secret+16, seed); +#endif + return XXH3_avalanche(acc); + } +} + +XXH_NO_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + + #define XXH3_MIDSIZE_STARTOFFSET 3 + #define XXH3_MIDSIZE_LASTOFFSET 17 + + { xxh_u64 acc = len * XXH_PRIME64_1; + xxh_u64 acc_end; + unsigned int const nbRounds = (unsigned int)len / 16; + unsigned int i; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + for (i=0; i<8; i++) { + acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed); + } + /* last bytes */ + acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); + XXH_ASSERT(nbRounds >= 8); + acc = XXH3_avalanche(acc); +#if defined(__clang__) /* Clang */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86. + * In everywhere else, it uses scalar code. + * + * For 64->128-bit multiplies, even if the NEON was 100% optimal, it + * would still be slower than UMAAL (see XXH_mult64to128). + * + * Unfortunately, Clang doesn't handle the long multiplies properly and + * converts them to the nonexistent "vmulq_u64" intrinsic, which is then + * scalarized into an ugly mess of VMOV.32 instructions. + * + * This mess is difficult to avoid without turning autovectorization + * off completely, but they are usually relatively minor and/or not + * worth it to fix. + * + * This loop is the easiest to fix, as unlike XXH32, this pragma + * _actually works_ because it is a loop vectorization instead of an + * SLP vectorization. + */ + #pragma clang loop vectorize(disable) +#endif + for (i=8 ; i < nbRounds; i++) { + /* + * Prevents clang for unrolling the acc loop and interleaving with this one. + */ + XXH_COMPILER_GUARD(acc); + acc_end += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed); + } + return XXH3_avalanche(acc + acc_end); + } +} + + +/* ======= Long Keys ======= */ + +#define XXH_STRIPE_LEN 64 +#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */ +#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64)) + +#ifdef XXH_OLD_NAMES +# define STRIPE_LEN XXH_STRIPE_LEN +# define ACC_NB XXH_ACC_NB +#endif + +#ifndef XXH_PREFETCH_DIST +# ifdef __clang__ +# define XXH_PREFETCH_DIST 320 +# else +# if (XXH_VECTOR == XXH_AVX512) +# define XXH_PREFETCH_DIST 512 +# else +# define XXH_PREFETCH_DIST 384 +# endif +# endif /* __clang__ */ +#endif /* XXH_PREFETCH_DIST */ + +/* + * These macros are to generate an XXH3_accumulate() function. 
+ * The two arguments select the name suffix and target attribute. + * + * The name of this symbol is XXH3_accumulate_() and it calls + * XXH3_accumulate_512_(). + * + * It may be useful to hand implement this function if the compiler fails to + * optimize the inline function. + */ +#define XXH3_ACCUMULATE_TEMPLATE(name) \ +void \ +XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc, \ + const xxh_u8* XXH_RESTRICT input, \ + const xxh_u8* XXH_RESTRICT secret, \ + size_t nbStripes) \ +{ \ + size_t n; \ + for (n = 0; n < nbStripes; n++ ) { \ + const xxh_u8* const in = input + n*XXH_STRIPE_LEN; \ + XXH_PREFETCH(in + XXH_PREFETCH_DIST); \ + XXH3_accumulate_512_##name( \ + acc, \ + in, \ + secret + n*XXH_SECRET_CONSUME_RATE); \ + } \ +} + + +XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) +{ + if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); + XXH_memcpy(dst, &v64, sizeof(v64)); +} + +/* Several intrinsic functions below are supposed to accept __int64 as argument, + * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ . + * However, several environments do not define __int64 type, + * requiring a workaround. + */ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) + typedef int64_t xxh_i64; +#else + /* the following type must have a width of 64-bit */ + typedef long long xxh_i64; +#endif + + +/* + * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized. + * + * It is a hardened version of UMAC, based off of FARSH's implementation. + * + * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD + * implementations, and it is ridiculously fast. + * + * We harden it by mixing the original input to the accumulators as well as the product. + * + * This means that in the (relatively likely) case of a multiply by zero, the + * original input is preserved. + * + * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve + * cross-pollination, as otherwise the upper and lower halves would be + * essentially independent. + * + * This doesn't matter on 64-bit hashes since they all get merged together in + * the end, so we skip the extra step. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. 
+ */ + +#if (XXH_VECTOR == XXH_AVX512) \ + || (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0) + +#ifndef XXH_TARGET_AVX512 +# define XXH_TARGET_AVX512 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + __m512i* const xacc = (__m512i *) acc; + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); + + { + /* data_vec = input[0]; */ + __m512i const data_vec = _mm512_loadu_si512 (input); + /* key_vec = secret[0]; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + /* data_key = data_vec ^ key_vec; */ + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); + /* xacc[0] += swap(data_vec); */ + __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2)); + __m512i const sum = _mm512_add_epi64(*xacc, data_swap); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } +} +XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512) + +/* + * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. + * + * Multiplication isn't perfect, as explained by Google in HighwayHash: + * + * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to + * // varying degrees. In descending order of goodness, bytes + * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. + * // As expected, the upper and lower bytes are much worse. + * + * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 + * + * Since our algorithm uses a pseudorandom secret to add some variance into the + * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. + * + * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid + * extraction. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. 
+ */ + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); + { __m512i* const xacc = (__m512i*) acc; + const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); + + /* xacc[0] ^= (xacc[0] >> 47) */ + __m512i const acc_vec = *xacc; + __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); + /* xacc[0] ^= secret; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + __m512i const data_key = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */); + + /* xacc[0] *= XXH_PRIME32_1; */ + __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32); + __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); + __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); + *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); + } +} + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0); + XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64); + XXH_ASSERT(((size_t)customSecret & 63) == 0); + (void)(&XXH_writeLE64); + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i); + __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64); + __m512i const seed = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos); + + const __m512i* const src = (const __m512i*) ((const void*) XXH3_kSecret); + __m512i* const dest = ( __m512i*) customSecret; + int i; + XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dest & 63) == 0); + for (i=0; i < nbRounds; ++i) { + dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_AVX2) \ + || (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0) + +#ifndef XXH_TARGET_AVX2 +# define XXH_TARGET_AVX2 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void +XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 31) == 0); + { __m256i* const xacc = (__m256i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xinput = (const __m256i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
*/ + const __m256i* const xsecret = (const __m256i *) secret; + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { + /* data_vec = xinput[i]; */ + __m256i const data_vec = _mm256_loadu_si256 (xinput+i); + /* key_vec = xsecret[i]; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm256_add_epi64(product, sum); + } } +} +XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2) + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void +XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 31) == 0); + { __m256i* const xacc = (__m256i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xsecret = (const __m256i *) secret; + const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1); + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m256i const acc_vec = xacc[i]; + __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); + __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); + /* xacc[i] ^= xsecret; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + + /* xacc[i] *= XXH_PRIME32_1; */ + __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32); + __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); + __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); + } + } +} + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0); + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6); + XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64); + (void)(&XXH_writeLE64); + XXH_PREFETCH(customSecret); + { __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64); + + const __m256i* const src = (const __m256i*) ((const void*) XXH3_kSecret); + __m256i* dest = ( __m256i*) customSecret; + +# if defined(__GNUC__) || defined(__clang__) + /* + * On GCC & Clang, marking 'dest' as modified will cause the compiler: + * - do not extract the secret from sse registers in the internal loop + * - use less common registers, and avoid pushing these reg into stack + */ + XXH_COMPILER_GUARD(dest); +# endif + XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dest & 31) == 0); + + /* GCC -O2 need unroll loop manually */ + dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed); + dest[1] = _mm256_add_epi64(_mm256_load_si256(src+1), seed); + dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed); + dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed); + dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed); + dest[5] 
= _mm256_add_epi64(_mm256_load_si256(src+5), seed); + } +} + +#endif + +/* x86dispatch always generates SSE2 */ +#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH) + +#ifndef XXH_TARGET_SSE2 +# define XXH_TARGET_SSE2 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void +XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + /* SSE2 is just a half-scale version of the AVX2 version. */ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { __m128i* const xacc = (__m128i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xinput = (const __m128i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xsecret = (const __m128i *) secret; + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { + /* data_vec = xinput[i]; */ + __m128i const data_vec = _mm_loadu_si128 (xinput+i); + /* key_vec = xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); + __m128i const sum = _mm_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } } +} +XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2) + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void +XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { __m128i* const xacc = (__m128i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ + const __m128i* const xsecret = (const __m128i *) secret; + const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1); + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m128i const acc_vec = xacc[i]; + __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); + __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); + /* xacc[i] ^= xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + + /* xacc[i] *= XXH_PRIME32_1; */ + __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); + __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); + } + } +} + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + (void)(&XXH_writeLE64); + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i); + +# if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 + /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */ + XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) }; + __m128i const seed = _mm_load_si128((__m128i const*)seed64x2); +# else + __m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64); +# endif + int i; + + const void* const src16 = XXH3_kSecret; + __m128i* dst16 = (__m128i*) customSecret; +# if defined(__GNUC__) || defined(__clang__) + /* + * On GCC & Clang, marking 'dest' as modified will cause the compiler: + * - do not extract the secret from sse registers in the internal loop + * - use less common registers, and avoid pushing these reg into stack + */ + XXH_COMPILER_GUARD(dst16); +# endif + XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dst16 & 15) == 0); + + for (i=0; i < nbRounds; ++i) { + dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_NEON) + +/* forward declarations for the scalar routines */ +XXH_FORCE_INLINE void +XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input, + void const* XXH_RESTRICT secret, size_t lane); + +XXH_FORCE_INLINE void +XXH3_scalarScrambleRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT secret, size_t lane); + +/*! + * @internal + * @brief The bulk processing loop for NEON and WASM SIMD128. + * + * The NEON code path is actually partially scalar when running on AArch64. This + * is to optimize the pipelining and can have up to 15% speedup depending on the + * CPU, and it also mitigates some GCC codegen issues. + * + * @see XXH3_NEON_LANES for configuring this and details about this optimization. + * + * NEON's 32-bit to 64-bit long multiply takes a half vector of 32-bit + * integers instead of the other platforms which mask full 64-bit vectors, + * so the setup is more complicated than just shifting right. + * + * Additionally, there is an optimization for 4 lanes at once noted below. + * + * Since, as stated, the most optimal amount of lanes for Cortexes is 6, + * there needs to be *three* versions of the accumulate operation used + * for the remaining 2 lanes. + * + * WASM's SIMD128 uses SIMDe's arm_neon.h polyfill because the intrinsics overlap + * nearly perfectly. 
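+ *
+ * Lanes [0, XXH3_NEON_LANES) are processed with NEON below; any remaining
+ * lanes up to XXH_ACC_NB are handled by the scalar XXH3_scalarRound() loop.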
+ */ + +XXH_FORCE_INLINE void +XXH3_accumulate_512_neon( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0); + { /* GCC for darwin arm64 does not like aliasing here */ + xxh_aliasing_uint64x2_t* const xacc = (xxh_aliasing_uint64x2_t*) acc; + /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */ + uint8_t const* xinput = (const uint8_t *) input; + uint8_t const* xsecret = (const uint8_t *) secret; + + size_t i; +#ifdef __wasm_simd128__ + /* + * On WASM SIMD128, Clang emits direct address loads when XXH3_kSecret + * is constant propagated, which results in it converting it to this + * inside the loop: + * + * a = v128.load(XXH3_kSecret + 0 + $secret_offset, offset = 0) + * b = v128.load(XXH3_kSecret + 16 + $secret_offset, offset = 0) + * ... + * + * This requires a full 32-bit address immediate (and therefore a 6 byte + * instruction) as well as an add for each offset. + * + * Putting an asm guard prevents it from folding (at the cost of losing + * the alignment hint), and uses the free offset in `v128.load` instead + * of adding secret_offset each time which overall reduces code size by + * about a kilobyte and improves performance. + */ + XXH_COMPILER_GUARD(xsecret); +#endif + /* Scalar lanes use the normal scalarRound routine */ + for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { + XXH3_scalarRound(acc, input, secret, i); + } + i = 0; + /* 4 NEON lanes at a time. */ + for (; i+1 < XXH3_NEON_LANES / 2; i+=2) { + /* data_vec = xinput[i]; */ + uint64x2_t data_vec_1 = XXH_vld1q_u64(xinput + (i * 16)); + uint64x2_t data_vec_2 = XXH_vld1q_u64(xinput + ((i+1) * 16)); + /* key_vec = xsecret[i]; */ + uint64x2_t key_vec_1 = XXH_vld1q_u64(xsecret + (i * 16)); + uint64x2_t key_vec_2 = XXH_vld1q_u64(xsecret + ((i+1) * 16)); + /* data_swap = swap(data_vec) */ + uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1); + uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1); + /* data_key = data_vec ^ key_vec; */ + uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1); + uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2); + + /* + * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a + * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to + * get one vector with the low 32 bits of each lane, and one vector + * with the high 32 bits of each lane. + * + * The intrinsic returns a double vector because the original ARMv7-a + * instruction modified both arguments in place. AArch64 and SIMD128 emit + * two instructions from this intrinsic. + * + * [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ] + * [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ] + */ + uint32x4x2_t unzipped = vuzpq_u32( + vreinterpretq_u32_u64(data_key_1), + vreinterpretq_u32_u64(data_key_2) + ); + /* data_key_lo = data_key & 0xFFFFFFFF */ + uint32x4_t data_key_lo = unzipped.val[0]; + /* data_key_hi = data_key >> 32 */ + uint32x4_t data_key_hi = unzipped.val[1]; + /* + * Then, we can split the vectors horizontally and multiply which, as for most + * widening intrinsics, have a variant that works on both high half vectors + * for free on AArch64. A similar instruction is available on SIMD128. 
+ * + * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi + */ + uint64x2_t sum_1 = XXH_vmlal_low_u32(data_swap_1, data_key_lo, data_key_hi); + uint64x2_t sum_2 = XXH_vmlal_high_u32(data_swap_2, data_key_lo, data_key_hi); + /* + * Clang reorders + * a += b * c; // umlal swap.2d, dkl.2s, dkh.2s + * c += a; // add acc.2d, acc.2d, swap.2d + * to + * c += a; // add acc.2d, acc.2d, swap.2d + * c += b * c; // umlal acc.2d, dkl.2s, dkh.2s + * + * While it would make sense in theory since the addition is faster, + * for reasons likely related to umlal being limited to certain NEON + * pipelines, this is worse. A compiler guard fixes this. + */ + XXH_COMPILER_GUARD_CLANG_NEON(sum_1); + XXH_COMPILER_GUARD_CLANG_NEON(sum_2); + /* xacc[i] = acc_vec + sum; */ + xacc[i] = vaddq_u64(xacc[i], sum_1); + xacc[i+1] = vaddq_u64(xacc[i+1], sum_2); + } + /* Operate on the remaining NEON lanes 2 at a time. */ + for (; i < XXH3_NEON_LANES / 2; i++) { + /* data_vec = xinput[i]; */ + uint64x2_t data_vec = XXH_vld1q_u64(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16)); + /* acc_vec_2 = swap(data_vec) */ + uint64x2_t data_swap = vextq_u64(data_vec, data_vec, 1); + /* data_key = data_vec ^ key_vec; */ + uint64x2_t data_key = veorq_u64(data_vec, key_vec); + /* For two lanes, just use VMOVN and VSHRN. */ + /* data_key_lo = data_key & 0xFFFFFFFF; */ + uint32x2_t data_key_lo = vmovn_u64(data_key); + /* data_key_hi = data_key >> 32; */ + uint32x2_t data_key_hi = vshrn_n_u64(data_key, 32); + /* sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi; */ + uint64x2_t sum = vmlal_u32(data_swap, data_key_lo, data_key_hi); + /* Same Clang workaround as before */ + XXH_COMPILER_GUARD_CLANG_NEON(sum); + /* xacc[i] = acc_vec + sum; */ + xacc[i] = vaddq_u64 (xacc[i], sum); + } + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon) + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { xxh_aliasing_uint64x2_t* xacc = (xxh_aliasing_uint64x2_t*) acc; + uint8_t const* xsecret = (uint8_t const*) secret; + + size_t i; + /* WASM uses operator overloads and doesn't need these. 
*/ +#ifndef __wasm_simd128__ + /* { prime32_1, prime32_1 } */ + uint32x2_t const kPrimeLo = vdup_n_u32(XXH_PRIME32_1); + /* { 0, prime32_1, 0, prime32_1 } */ + uint32x4_t const kPrimeHi = vreinterpretq_u32_u64(vdupq_n_u64((xxh_u64)XXH_PRIME32_1 << 32)); +#endif + + /* AArch64 uses both scalar and neon at the same time */ + for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { + XXH3_scalarScrambleRound(acc, secret, i); + } + for (i=0; i < XXH3_NEON_LANES / 2; i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + uint64x2_t acc_vec = xacc[i]; + uint64x2_t shifted = vshrq_n_u64(acc_vec, 47); + uint64x2_t data_vec = veorq_u64(acc_vec, shifted); + + /* xacc[i] ^= xsecret[i]; */ + uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16)); + uint64x2_t data_key = veorq_u64(data_vec, key_vec); + /* xacc[i] *= XXH_PRIME32_1 */ +#ifdef __wasm_simd128__ + /* SIMD128 has multiply by u64x2, use it instead of expanding and scalarizing */ + xacc[i] = data_key * XXH_PRIME32_1; +#else + /* + * Expanded version with portable NEON intrinsics + * + * lo(x) * lo(y) + (hi(x) * lo(y) << 32) + * + * prod_hi = hi(data_key) * lo(prime) << 32 + * + * Since we only need 32 bits of this multiply a trick can be used, reinterpreting the vector + * as a uint32x4_t and multiplying by { 0, prime, 0, prime } to cancel out the unwanted bits + * and avoid the shift. + */ + uint32x4_t prod_hi = vmulq_u32 (vreinterpretq_u32_u64(data_key), kPrimeHi); + /* Extract low bits for vmlal_u32 */ + uint32x2_t data_key_lo = vmovn_u64(data_key); + /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */ + xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo); +#endif + } + } +} +#endif + +#if (XXH_VECTOR == XXH_VSX) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + /* presumed aligned */ + xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc; + xxh_u8 const* const xinput = (xxh_u8 const*) input; /* no alignment restriction */ + xxh_u8 const* const xsecret = (xxh_u8 const*) secret; /* no alignment restriction */ + xxh_u64x2 const v32 = { 32, 32 }; + size_t i; + for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* data_vec = xinput[i]; */ + xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + 16*i); + /* key_vec = xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + /* shuffled = (data_key << 32) | (data_key >> 32); */ + xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); + /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ + xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); + /* acc_vec = xacc[i]; */ + xxh_u64x2 acc_vec = xacc[i]; + acc_vec += product; + + /* swap high and low halves */ +#ifdef __s390x__ + acc_vec += vec_permi(data_vec, data_vec, 2); +#else + acc_vec += vec_xxpermdi(data_vec, data_vec, 2); +#endif + xacc[i] = acc_vec; + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx) + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc; + const xxh_u8* const xsecret = (const xxh_u8*) secret; + /* constants */ + xxh_u64x2 const v32 = { 32, 32 }; + xxh_u64x2 const v47 = { 47, 47 }; + xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 }; + size_t i; + for (i = 0; i < XXH_STRIPE_LEN / 
sizeof(xxh_u64x2); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + xxh_u64x2 const acc_vec = xacc[i]; + xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); + + /* xacc[i] ^= xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + + /* xacc[i] *= XXH_PRIME32_1 */ + /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ + xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); + /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ + xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); + xacc[i] = prod_odd + (prod_even << v32); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_SVE) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_sve( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + uint64_t *xacc = (uint64_t *)acc; + const uint64_t *xinput = (const uint64_t *)(const void *)input; + const uint64_t *xsecret = (const uint64_t *)(const void *)secret; + svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); + uint64_t element_count = svcntd(); + if (element_count >= 8) { + svbool_t mask = svptrue_pat_b64(SV_VL8); + svuint64_t vacc = svld1_u64(mask, xacc); + ACCRND(vacc, 0); + svst1_u64(mask, xacc, vacc); + } else if (element_count == 2) { /* sve128 */ + svbool_t mask = svptrue_pat_b64(SV_VL2); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 2); + svuint64_t acc2 = svld1_u64(mask, xacc + 4); + svuint64_t acc3 = svld1_u64(mask, xacc + 6); + ACCRND(acc0, 0); + ACCRND(acc1, 2); + ACCRND(acc2, 4); + ACCRND(acc3, 6); + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 2, acc1); + svst1_u64(mask, xacc + 4, acc2); + svst1_u64(mask, xacc + 6, acc3); + } else { + svbool_t mask = svptrue_pat_b64(SV_VL4); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 4); + ACCRND(acc0, 0); + ACCRND(acc1, 4); + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 4, acc1); + } +} + +XXH_FORCE_INLINE void +XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, + size_t nbStripes) +{ + if (nbStripes != 0) { + uint64_t *xacc = (uint64_t *)acc; + const uint64_t *xinput = (const uint64_t *)(const void *)input; + const uint64_t *xsecret = (const uint64_t *)(const void *)secret; + svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); + uint64_t element_count = svcntd(); + if (element_count >= 8) { + svbool_t mask = svptrue_pat_b64(SV_VL8); + svuint64_t vacc = svld1_u64(mask, xacc + 0); + do { + /* svprfd(svbool_t, void *, enum svfprop); */ + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(vacc, 0); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, vacc); + } else if (element_count == 2) { /* sve128 */ + svbool_t mask = svptrue_pat_b64(SV_VL2); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 2); + svuint64_t acc2 = svld1_u64(mask, xacc + 4); + svuint64_t acc3 = svld1_u64(mask, xacc + 6); + do { + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(acc0, 0); + ACCRND(acc1, 2); + ACCRND(acc2, 4); + ACCRND(acc3, 6); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 2, acc1); + svst1_u64(mask, xacc + 4, acc2); + svst1_u64(mask, xacc + 6, acc3); + } else { + svbool_t mask = 
svptrue_pat_b64(SV_VL4); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 4); + do { + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(acc0, 0); + ACCRND(acc1, 4); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 4, acc1); + } + } +} + +#endif + +#if (XXH_VECTOR == XXH_LSX) +#define _LSX_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_lsx( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { + __m128i* const xacc = (__m128i *) acc; + const __m128i* const xinput = (const __m128i *) input; + const __m128i* const xsecret = (const __m128i *) secret; + + for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) { + /* data_vec = xinput[i]; */ + __m128i const data_vec = __lsx_vld(xinput + i, 0); + /* key_vec = xsecret[i]; */ + __m128i const key_vec = __lsx_vld(xsecret + i, 0); + /* data_key = data_vec ^ key_vec; */ + __m128i const data_key = __lsx_vxor_v(data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m128i const data_key_lo = __lsx_vsrli_d(data_key, 32); + // __m128i const data_key_lo = __lsx_vsrli_d(data_key, 32); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m128i const product = __lsx_vmulwev_d_wu(data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m128i const data_swap = __lsx_vshuf4i_w(data_vec, _LSX_SHUFFLE(1, 0, 3, 2)); + __m128i const sum = __lsx_vadd_d(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = __lsx_vadd_d(product, sum); + } + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(lsx) + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_lsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { + __m128i* const xacc = (__m128i*) acc; + const __m128i* const xsecret = (const __m128i *) secret; + const __m128i prime32 = __lsx_vreplgr2vr_d(XXH_PRIME32_1); + + for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m128i const acc_vec = xacc[i]; + __m128i const shifted = __lsx_vsrli_d(acc_vec, 47); + __m128i const data_vec = __lsx_vxor_v(acc_vec, shifted); + /* xacc[i] ^= xsecret[i]; */ + __m128i const key_vec = __lsx_vld(xsecret + i, 0); + __m128i const data_key = __lsx_vxor_v(data_vec, key_vec); + + /* xacc[i] *= XXH_PRIME32_1; */ + xacc[i] = __lsx_vmul_d(data_key, prime32); + } + } +} + +#endif + +#if (XXH_VECTOR == XXH_LASX) +#define _LASX_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_lasx( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 31) == 0); + { + __m256i* const xacc = (__m256i *) acc; + const __m256i* const xinput = (const __m256i *) input; + const __m256i* const xsecret = (const __m256i *) secret; + + for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m256i); i++) { + /* data_vec = xinput[i]; */ + __m256i const data_vec = __lasx_xvld(xinput + i, 0); + /* key_vec = xsecret[i]; */ + __m256i const key_vec = __lasx_xvld(xsecret + i, 0); + /* data_key = data_vec ^ key_vec; */ + __m256i const data_key = __lasx_xvxor_v(data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m256i const data_key_lo = __lasx_xvsrli_d(data_key, 32); + // __m256i const data_key_lo = __lasx_xvsrli_d(data_key, 32); + /* product = 
(data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m256i const product = __lasx_xvmulwev_d_wu(data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m256i const data_swap = __lasx_xvshuf4i_w(data_vec, _LASX_SHUFFLE(1, 0, 3, 2)); + __m256i const sum = __lasx_xvadd_d(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = __lasx_xvadd_d(product, sum); + } + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(lasx) + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_lasx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 31) == 0); + { + __m256i* const xacc = (__m256i*) acc; + const __m256i* const xsecret = (const __m256i *) secret; + const __m256i prime32 = __lasx_xvreplgr2vr_d(XXH_PRIME32_1); + + for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m256i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m256i const acc_vec = xacc[i]; + __m256i const shifted = __lasx_xvsrli_d(acc_vec, 47); + __m256i const data_vec = __lasx_xvxor_v(acc_vec, shifted); + /* xacc[i] ^= xsecret[i]; */ + __m256i const key_vec = __lasx_xvld(xsecret + i, 0); + __m256i const data_key = __lasx_xvxor_v(data_vec, key_vec); + + /* xacc[i] *= XXH_PRIME32_1; */ + xacc[i] = __lasx_xvmul_d(data_key, prime32); + } + } +} + +#endif + +/* scalar variants - universal */ + +#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__)) +/* + * In XXH3_scalarRound(), GCC and Clang have a similar codegen issue, where they + * emit an excess mask and a full 64-bit multiply-add (MADD X-form). + * + * While this might not seem like much, as AArch64 is a 64-bit architecture, only + * big Cortex designs have a full 64-bit multiplier. + * + * On the little cores, the smaller 32-bit multiplier is used, and full 64-bit + * multiplies expand to 2-3 multiplies in microcode. This has a major penalty + * of up to 4 latency cycles and 2 stall cycles in the multiply pipeline. + * + * Thankfully, AArch64 still provides the 32-bit long multiply-add (UMADDL) which does + * not have this penalty and does the mask automatically. + */ +XXH_FORCE_INLINE xxh_u64 +XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc) +{ + xxh_u64 ret; + /* note: %x = 64-bit register, %w = 32-bit register */ + __asm__("umaddl %x0, %w1, %w2, %x3" : "=r" (ret) : "r" (lhs), "r" (rhs), "r" (acc)); + return ret; +} +#else +XXH_FORCE_INLINE xxh_u64 +XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc) +{ + return XXH_mult32to64((xxh_u32)lhs, (xxh_u32)rhs) + acc; +} +#endif + +/*! + * @internal + * @brief Scalar round for @ref XXH3_accumulate_512_scalar(). + * + * This is extracted to its own function because the NEON path uses a combination + * of NEON and scalar. + */ +XXH_FORCE_INLINE void +XXH3_scalarRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT input, + void const* XXH_RESTRICT secret, + size_t lane) +{ + xxh_u64* xacc = (xxh_u64*) acc; + xxh_u8 const* xinput = (xxh_u8 const*) input; + xxh_u8 const* xsecret = (xxh_u8 const*) secret; + XXH_ASSERT(lane < XXH_ACC_NB); + XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); + { + xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8); + xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8); + xacc[lane ^ 1] += data_val; /* swap adjacent lanes */ + xacc[lane] = XXH_mult32to64_add64(data_key /* & 0xFFFFFFFF */, data_key >> 32, xacc[lane]); + } +} + +/*! + * @internal + * @brief Processes a 64 byte block of data using the scalar path. 
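+ *
+ * A stripe is XXH_STRIPE_LEN (64) bytes, i.e. XXH_ACC_NB (8) lanes of 8 bytes,
+ * each folded into its accumulator by XXH3_scalarRound().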
+ */ +XXH_FORCE_INLINE void +XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + size_t i; + /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6. */ +#if defined(__GNUC__) && !defined(__clang__) \ + && (defined(__arm__) || defined(__thumb2__)) \ + && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \ + && XXH_SIZE_OPT <= 0 +# pragma GCC unroll 8 +#endif + for (i=0; i < XXH_ACC_NB; i++) { + XXH3_scalarRound(acc, input, secret, i); + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar) + +/*! + * @internal + * @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar(). + * + * This is extracted to its own function because the NEON path uses a combination + * of NEON and scalar. + */ +XXH_FORCE_INLINE void +XXH3_scalarScrambleRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT secret, + size_t lane) +{ + xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0); + XXH_ASSERT(lane < XXH_ACC_NB); + { + xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8); + xxh_u64 acc64 = xacc[lane]; + acc64 = XXH_xorshift64(acc64, 47); + acc64 ^= key64; + acc64 *= XXH_PRIME32_1; + xacc[lane] = acc64; + } +} + +/*! + * @internal + * @brief Scrambles the accumulators after a large chunk has been read + */ +XXH_FORCE_INLINE void +XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + size_t i; + for (i=0; i < XXH_ACC_NB; i++) { + XXH3_scalarScrambleRound(acc, secret, i); + } +} + +XXH_FORCE_INLINE void +XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + /* + * We need a separate pointer for the hack below, + * which requires a non-const pointer. + * Any decent compiler will optimize this out otherwise. + */ + const xxh_u8* kSecretPtr = XXH3_kSecret; + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + +#if defined(__GNUC__) && defined(__aarch64__) + /* + * UGLY HACK: + * GCC and Clang generate a bunch of MOV/MOVK pairs for aarch64, and they are + * placed sequentially, in order, at the top of the unrolled loop. + * + * While MOVK is great for generating constants (2 cycles for a 64-bit + * constant compared to 4 cycles for LDR), it fights for bandwidth with + * the arithmetic instructions. + * + * I L S + * MOVK + * MOVK + * MOVK + * MOVK + * ADD + * SUB STR + * STR + * By forcing loads from memory (as the asm line causes the compiler to assume + * that XXH3_kSecretPtr has been changed), the pipelines are used more + * efficiently: + * I L S + * LDR + * ADD LDR + * SUB STR + * STR + * + * See XXH3_NEON_LANES for details on the pipsline. + * + * XXH3_64bits_withSeed, len == 256, Snapdragon 835 + * without hack: 2654.4 MB/s + * with hack: 3202.9 MB/s + */ + XXH_COMPILER_GUARD(kSecretPtr); +#endif + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; + int i; + for (i=0; i < nbRounds; i++) { + /* + * The asm hack causes the compiler to assume that kSecretPtr aliases with + * customSecret, and on aarch64, this prevented LDP from merging two + * loads together for free. Putting the loads together before the stores + * properly generates LDP. 
+ */ + xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; + xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; + XXH_writeLE64((xxh_u8*)customSecret + 16*i, lo); + XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi); + } } +} + + +typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t); +typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*); +typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64); + + +#if (XXH_VECTOR == XXH_AVX512) + +#define XXH3_accumulate_512 XXH3_accumulate_512_avx512 +#define XXH3_accumulate XXH3_accumulate_avx512 +#define XXH3_scrambleAcc XXH3_scrambleAcc_avx512 +#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512 + +#elif (XXH_VECTOR == XXH_AVX2) + +#define XXH3_accumulate_512 XXH3_accumulate_512_avx2 +#define XXH3_accumulate XXH3_accumulate_avx2 +#define XXH3_scrambleAcc XXH3_scrambleAcc_avx2 +#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2 + +#elif (XXH_VECTOR == XXH_SSE2) + +#define XXH3_accumulate_512 XXH3_accumulate_512_sse2 +#define XXH3_accumulate XXH3_accumulate_sse2 +#define XXH3_scrambleAcc XXH3_scrambleAcc_sse2 +#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2 + +#elif (XXH_VECTOR == XXH_NEON) + +#define XXH3_accumulate_512 XXH3_accumulate_512_neon +#define XXH3_accumulate XXH3_accumulate_neon +#define XXH3_scrambleAcc XXH3_scrambleAcc_neon +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#elif (XXH_VECTOR == XXH_VSX) + +#define XXH3_accumulate_512 XXH3_accumulate_512_vsx +#define XXH3_accumulate XXH3_accumulate_vsx +#define XXH3_scrambleAcc XXH3_scrambleAcc_vsx +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#elif (XXH_VECTOR == XXH_SVE) +#define XXH3_accumulate_512 XXH3_accumulate_512_sve +#define XXH3_accumulate XXH3_accumulate_sve +#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#elif (XXH_VECTOR == XXH_LASX) +#define XXH3_accumulate_512 XXH3_accumulate_512_lasx +#define XXH3_accumulate XXH3_accumulate_lasx +#define XXH3_scrambleAcc XXH3_scrambleAcc_lasx +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#elif (XXH_VECTOR == XXH_LSX) +#define XXH3_accumulate_512 XXH3_accumulate_512_lsx +#define XXH3_accumulate XXH3_accumulate_lsx +#define XXH3_scrambleAcc XXH3_scrambleAcc_lsx +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#else /* scalar */ + +#define XXH3_accumulate_512 XXH3_accumulate_512_scalar +#define XXH3_accumulate XXH3_accumulate_scalar +#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#endif + +#if XXH_SIZE_OPT >= 1 /* don't do SIMD for initialization */ +# undef XXH3_initCustomSecret +# define XXH3_initCustomSecret XXH3_initCustomSecret_scalar +#endif + +XXH_FORCE_INLINE void +XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; + size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock; + size_t const nb_blocks = (len - 1) / block_len; + + size_t n; + + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + + for (n = 0; n < nb_blocks; n++) { + f_acc(acc, input + n*block_len, secret, nbStripesPerBlock); + f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN); + } + + /* last 
partial block */ + XXH_ASSERT(len > XXH_STRIPE_LEN); + { size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN; + XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); + f_acc(acc, input + nb_blocks*block_len, secret, nbStripes); + + /* last stripe */ + { const xxh_u8* const p = input + len - XXH_STRIPE_LEN; +#define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */ + XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START); + } } +} + +XXH_FORCE_INLINE xxh_u64 +XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret) +{ + return XXH3_mul128_fold64( + acc[0] ^ XXH_readLE64(secret), + acc[1] ^ XXH_readLE64(secret+8) ); +} + +static XXH_PUREF XXH64_hash_t +XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start) +{ + xxh_u64 result64 = start; + size_t i = 0; + + for (i = 0; i < 4; i++) { + result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i); +#if defined(__clang__) /* Clang */ \ + && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Prevent autovectorization on Clang ARMv7-a. Exact same problem as + * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b. + * XXH3_64bits, len == 256, Snapdragon 835: + * without hack: 2063.7 MB/s + * with hack: 2560.7 MB/s + */ + XXH_COMPILER_GUARD(result64); +#endif + } + + return XXH3_avalanche(result64); +} + +/* do not align on 8, so that the secret is different from the accumulator */ +#define XXH_SECRET_MERGEACCS_START 11 + +static XXH_PUREF XXH64_hash_t +XXH3_finalizeLong_64b(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 len) +{ + return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, len * XXH_PRIME64_1); +} + +#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \ + XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 } + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len, + const void* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + return XXH3_finalizeLong_64b(acc, (const xxh_u8*)secret, (xxh_u64)len); +} + +/* + * It's important for performance to transmit secret's size (when it's static) + * so that the compiler can properly optimize the vectorized loop. + * This makes a big performance difference for "medium" keys (<1 KB) when using AVX instruction set. + * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE + * breaks -Og, this is XXH_NO_INLINE. 
+ */ +XXH3_WITH_SECRET_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; + return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc); +} + +/* + * It's preferable for performance that XXH3_hashLong is not inlined, + * as it results in a smaller function for small data, easier to the instruction cache. + * Note that inside this no_inline function, we do inline the internal loop, + * and provide a statically defined secret size to allow optimization of vector loop. + */ +XXH_NO_INLINE XXH_PUREF XXH64_hash_t +XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc); +} + +/* + * XXH3_hashLong_64b_withSeed(): + * Generate a custom key based on alteration of default XXH3_kSecret with the seed, + * and then use this key for long mode hashing. + * + * This operation is decently fast but nonetheless costs a little bit of time. + * Try to avoid it whenever possible (typically when seed==0). + * + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len, + XXH64_hash_t seed, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble, + XXH3_f_initCustomSecret f_initSec) +{ +#if XXH_SIZE_OPT <= 0 + if (seed == 0) + return XXH3_hashLong_64b_internal(input, len, + XXH3_kSecret, sizeof(XXH3_kSecret), + f_acc, f_scramble); +#endif + { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + f_initSec(secret, seed); + return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret), + f_acc, f_scramble); + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)secret; (void)secretLen; + return XXH3_hashLong_64b_withSeed_internal(input, len, seed, + XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret); +} + + +typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t, + XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t); + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, + XXH3_hashLong64_f f_hashLong) +{ + XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secretLen` condition is not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + * Also, note that function signature doesn't offer room to return an error. 
+ */ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen); +} + + +/* === Public entry point === */ + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length) +{ + return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSecret(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize) +{ + return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed) +{ + return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed); +} + +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) +{ + if (length <= XXH3_MIDSIZE_MAX) + return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); + return XXH3_hashLong_64b_withSecret(input, length, seed, (const xxh_u8*)secret, secretSize); +} + + +/* === XXH3 streaming === */ +#ifndef XXH_NO_STREAM +/* + * Malloc's a pointer that is always aligned to @align. + * + * This must be freed with `XXH_alignedFree()`. + * + * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte + * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2 + * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON. + * + * This underalignment previously caused a rather obvious crash which went + * completely unnoticed due to XXH3_createState() not actually being tested. + * Credit to RedSpah for noticing this bug. + * + * The alignment is done manually: Functions like posix_memalign or _mm_malloc + * are avoided: To maintain portability, we would have to write a fallback + * like this anyways, and besides, testing for the existence of library + * functions without relying on external build tools is impossible. + * + * The method is simple: Overallocate, manually align, and store the offset + * to the original behind the returned pointer. + * + * Align must be a power of 2 and 8 <= align <= 128. + */ +static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align) +{ + XXH_ASSERT(align <= 128 && align >= 8); /* range check */ + XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */ + XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */ + { /* Overallocate to make room for manual realignment and an offset byte */ + xxh_u8* base = (xxh_u8*)XXH_malloc(s + align); + if (base != NULL) { + /* + * Get the offset needed to align this pointer. + * + * Even if the returned pointer is aligned, there will always be + * at least one byte to store the offset to the original pointer. 
+ */ + size_t offset = align - ((size_t)base & (align - 1)); /* base % align */ + /* Add the offset for the now-aligned pointer */ + xxh_u8* ptr = base + offset; + + XXH_ASSERT((size_t)ptr % align == 0); + + /* Store the offset immediately before the returned pointer. */ + ptr[-1] = (xxh_u8)offset; + return ptr; + } + return NULL; + } +} +/* + * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass + * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout. + */ +static void XXH_alignedFree(void* p) +{ + if (p != NULL) { + xxh_u8* ptr = (xxh_u8*)p; + /* Get the offset byte we added in XXH_malloc. */ + xxh_u8 offset = ptr[-1]; + /* Free the original malloc'd pointer */ + xxh_u8* base = ptr - offset; + XXH_free(base); + } +} +/*! @ingroup XXH3_family */ +/*! + * @brief Allocate an @ref XXH3_state_t. + * + * @return An allocated pointer of @ref XXH3_state_t on success. + * @return `NULL` on failure. + * + * @note Must be freed with XXH3_freeState(). + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) +{ + XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64); + if (state==NULL) return NULL; + XXH3_INITSTATE(state); + return state; +} + +/*! @ingroup XXH3_family */ +/*! + * @brief Frees an @ref XXH3_state_t. + * + * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState(). + * + * @return @ref XXH_OK. + * + * @note Must be allocated with XXH3_createState(). + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr) +{ + XXH_alignedFree(statePtr); + return XXH_OK; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API void +XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state) +{ + XXH_memcpy(dst_state, src_state, sizeof(*dst_state)); +} + +static void +XXH3_reset_internal(XXH3_state_t* statePtr, + XXH64_hash_t seed, + const void* secret, size_t secretSize) +{ + size_t const initStart = offsetof(XXH3_state_t, bufferedSize); + size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart; + XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart); + XXH_ASSERT(statePtr != NULL); + /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */ + XXH_memset((char*)statePtr + initStart, 0, initLength); + statePtr->acc[0] = XXH_PRIME32_3; + statePtr->acc[1] = XXH_PRIME64_1; + statePtr->acc[2] = XXH_PRIME64_2; + statePtr->acc[3] = XXH_PRIME64_3; + statePtr->acc[4] = XXH_PRIME64_4; + statePtr->acc[5] = XXH_PRIME32_2; + statePtr->acc[6] = XXH_PRIME64_5; + statePtr->acc[7] = XXH_PRIME32_1; + statePtr->seed = seed; + statePtr->useSeed = (seed != 0); + statePtr->extSecret = (const unsigned char*)secret; + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + statePtr->secretLimit = secretSize - XXH_STRIPE_LEN; + statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_reset_internal(statePtr, 0, secret, secretSize); + if (secret == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + return XXH_OK; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + if (statePtr == NULL) return XXH_ERROR; + if (seed==0) return XXH3_64bits_reset(statePtr); + if ((seed != statePtr->seed) || (statePtr->extSecret != NULL)) + XXH3_initCustomSecret(statePtr->customSecret, seed); + XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64) +{ + if (statePtr == NULL) return XXH_ERROR; + if (secret == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + XXH3_reset_internal(statePtr, seed64, secret, secretSize); + statePtr->useSeed = 1; /* always, even if seed64==0 */ + return XXH_OK; +} + +/*! + * @internal + * @brief Processes a large input for XXH3_update() and XXH3_digest_long(). + * + * Unlike XXH3_hashLong_internal_loop(), this can process data that overlaps a block. + * + * @param acc Pointer to the 8 accumulator lanes + * @param nbStripesSoFarPtr In/out pointer to the number of leftover stripes in the block* + * @param nbStripesPerBlock Number of stripes in a block + * @param input Input pointer + * @param nbStripes Number of stripes to process + * @param secret Secret pointer + * @param secretLimit Offset of the last block in @p secret + * @param f_acc Pointer to an XXH3_accumulate implementation + * @param f_scramble Pointer to an XXH3_scrambleAcc implementation + * @return Pointer past the end of @p input after processing + */ +XXH_FORCE_INLINE const xxh_u8 * +XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc, + size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock, + const xxh_u8* XXH_RESTRICT input, size_t nbStripes, + const xxh_u8* XXH_RESTRICT secret, size_t secretLimit, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + const xxh_u8* initialSecret = secret + *nbStripesSoFarPtr * XXH_SECRET_CONSUME_RATE; + /* Process full blocks */ + if (nbStripes >= (nbStripesPerBlock - *nbStripesSoFarPtr)) { + /* Process the initial partial block... 
*/ + size_t nbStripesThisIter = nbStripesPerBlock - *nbStripesSoFarPtr; + + do { + /* Accumulate and scramble */ + f_acc(acc, input, initialSecret, nbStripesThisIter); + f_scramble(acc, secret + secretLimit); + input += nbStripesThisIter * XXH_STRIPE_LEN; + nbStripes -= nbStripesThisIter; + /* Then continue the loop with the full block size */ + nbStripesThisIter = nbStripesPerBlock; + initialSecret = secret; + } while (nbStripes >= nbStripesPerBlock); + *nbStripesSoFarPtr = 0; + } + /* Process a partial block */ + if (nbStripes > 0) { + f_acc(acc, input, initialSecret, nbStripes); + input += nbStripes * XXH_STRIPE_LEN; + *nbStripesSoFarPtr += nbStripes; + } + /* Return end pointer */ + return input; +} + +#ifndef XXH3_STREAM_USE_STACK +# if XXH_SIZE_OPT <= 0 && !defined(__clang__) /* clang doesn't need additional stack space */ +# define XXH3_STREAM_USE_STACK 1 +# endif +#endif +/* This function accepts f_acc and f_scramble as function pointers, + * making it possible to implement multiple variants with different acc & scramble stages. + * This is notably useful to implement multiple vector variants with different intrinsics. + */ +XXH_FORCE_INLINE XXH_errorcode +XXH3_update(XXH3_state_t* XXH_RESTRICT const state, + const xxh_u8* XXH_RESTRICT input, size_t len, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + if (input==NULL) { + XXH_ASSERT(len == 0); + return XXH_OK; + } + + XXH_ASSERT(state != NULL); + { const xxh_u8* const bEnd = input + len; + const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; +#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 + /* For some reason, gcc and MSVC seem to suffer greatly + * when operating accumulators directly into state. + * Operating into stack space seems to enable proper optimization. + * clang, on the other hand, doesn't seem to need this trick */ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; + XXH_memcpy(acc, state->acc, sizeof(acc)); +#else + xxh_u64* XXH_RESTRICT const acc = state->acc; +#endif + state->totalLen += len; + XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE); + + /* small input : just fill in tmp buffer */ + if (len <= XXH3_INTERNALBUFFER_SIZE - state->bufferedSize) { + XXH_memcpy(state->buffer + state->bufferedSize, input, len); + state->bufferedSize += (XXH32_hash_t)len; + return XXH_OK; + } + + /* total input is now > XXH3_INTERNALBUFFER_SIZE */ + #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN) + XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */ + + /* + * Internal buffer is partially filled (always, except at beginning) + * Complete it, then consume it. 
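+ *
+ * Once topped up, the buffer is flushed as XXH3_INTERNALBUFFER_STRIPES stripes;
+ * the bulk of the remaining input is then consumed directly, and the tail is
+ * re-buffered for the next update or the final digest.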
+ */ + if (state->bufferedSize) { + size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; + XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); + input += loadSize; + XXH3_consumeStripes(acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, XXH3_INTERNALBUFFER_STRIPES, + secret, state->secretLimit, + f_acc, f_scramble); + state->bufferedSize = 0; + } + XXH_ASSERT(input < bEnd); + if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) { + size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN; + input = XXH3_consumeStripes(acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + input, nbStripes, + secret, state->secretLimit, + f_acc, f_scramble); + XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN); + + } + /* Some remaining input (always) : buffer it */ + XXH_ASSERT(input < bEnd); + XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE); + XXH_ASSERT(state->bufferedSize == 0); + XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); + state->bufferedSize = (XXH32_hash_t)(bEnd-input); +#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 + /* save stack accumulators into state */ + XXH_memcpy(state->acc, acc, sizeof(acc)); +#endif + } + + return XXH_OK; +} + +/* + * Both XXH3_64bits_update and XXH3_128bits_update use this routine. + */ +XXH_NO_INLINE XXH_errorcode +XXH3_update_regular(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) +{ + return XXH3_update(state, (const xxh_u8*)input, len, + XXH3_accumulate, XXH3_scrambleAcc); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) +{ + return XXH3_update_regular(state, input, len); +} + + +XXH_FORCE_INLINE void +XXH3_digest_long (XXH64_hash_t* acc, + const XXH3_state_t* state, + const unsigned char* secret) +{ + xxh_u8 lastStripe[XXH_STRIPE_LEN]; + const xxh_u8* lastStripePtr; + + /* + * Digest on a local copy. This way, the state remains unaltered, and it can + * continue ingesting more input afterwards. + */ + XXH_memcpy(acc, state->acc, sizeof(state->acc)); + if (state->bufferedSize >= XXH_STRIPE_LEN) { + /* Consume remaining stripes then point to remaining data in buffer */ + size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN; + size_t nbStripesSoFar = state->nbStripesSoFar; + XXH3_consumeStripes(acc, + &nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, nbStripes, + secret, state->secretLimit, + XXH3_accumulate, XXH3_scrambleAcc); + lastStripePtr = state->buffer + state->bufferedSize - XXH_STRIPE_LEN; + } else { /* bufferedSize < XXH_STRIPE_LEN */ + /* Copy to temp buffer */ + size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize; + XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */ + XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); + XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); + lastStripePtr = lastStripe; + } + /* Last stripe */ + XXH3_accumulate_512(acc, + lastStripePtr, + secret + state->secretLimit - XXH_SECRET_LASTACC_START); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* state) +{ + const unsigned char* const secret = (state->extSecret == NULL) ? 
state->customSecret : state->extSecret; + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; + XXH3_digest_long(acc, state, secret); + return XXH3_finalizeLong_64b(acc, secret, (xxh_u64)state->totalLen); + } + /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */ + if (state->useSeed) + return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), + secret, state->secretLimit + XXH_STRIPE_LEN); +} +#endif /* !XXH_NO_STREAM */ + + +/* ========================================== + * XXH3 128 bits (a.k.a XXH128) + * ========================================== + * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant, + * even without counting the significantly larger output size. + * + * For example, extra steps are taken to avoid the seed-dependent collisions + * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). + * + * This strength naturally comes at the cost of some speed, especially on short + * lengths. Note that longer hashes are about as fast as the 64-bit version + * due to it using only a slight modification of the 64-bit loop. + * + * XXH128 is also more oriented towards 64-bit machines. It is still extremely + * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). + */ + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + /* A doubled version of 1to3_64b with different constants. */ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } + * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } + * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); + xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; + xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; + xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; + XXH128_hash_t h128; + h128.low64 = XXH64_avalanche(keyed_lo); + h128.high64 = XXH64_avalanche(keyed_hi); + return h128; + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input_lo = XXH_readLE32(input); + xxh_u32 const input_hi = XXH_readLE32(input + len - 4); + xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); + xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed; + xxh_u64 const keyed = input_64 ^ bitflip; + + /* Shift len to the left to ensure it is even, this avoids even multiplies. 
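+         * Since 4 <= len <= 8 here, (len << 2) is a multiple of 4, so adding
+         * it to the odd constant XXH_PRIME64_1 keeps the multiplier odd and
+         * no low bits are lost in the 64x64 -> 128 multiply.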
*/ + XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2)); + + m128.high64 += (m128.low64 << 1); + m128.low64 ^= (m128.high64 >> 3); + + m128.low64 = XXH_xorshift64(m128.low64, 35); + m128.low64 *= PRIME_MX2; + m128.low64 = XXH_xorshift64(m128.low64, 28); + m128.high64 = XXH3_avalanche(m128.high64); + return m128; + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; + xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; + xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 input_hi = XXH_readLE64(input + len - 8); + XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1); + /* + * Put len in the middle of m128 to ensure that the length gets mixed to + * both the low and high bits in the 128x64 multiply below. + */ + m128.low64 += (xxh_u64)(len - 1) << 54; + input_hi ^= bitfliph; + /* + * Add the high 32 bits of input_hi to the high 32 bits of m128, then + * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to + * the high 64 bits of m128. + * + * The best approach to this operation is different on 32-bit and 64-bit. + */ + if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */ + /* + * 32-bit optimized version, which is more readable. + * + * On 32-bit, it removes an ADC and delays a dependency between the two + * halves of m128.high64, but it generates an extra mask on 64-bit. + */ + m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2); + } else { + /* + * 64-bit optimized (albeit more confusing) version. 
+ * + * Uses some properties of addition and multiplication to remove the mask: + * + * Let: + * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) + * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) + * c = XXH_PRIME32_2 + * + * a + (b * c) + * Inverse Property: x + y - x == y + * a + (b * (1 + c - 1)) + * Distributive Property: x * (y + z) == (x * y) + (x * z) + * a + (b * 1) + (b * (c - 1)) + * Identity Property: x * 1 == x + * a + b + (b * (c - 1)) + * + * Substitute a, b, and c: + * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) + * + * Since input_hi.hi + input_hi.lo == input_hi, we get this: + * input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) + */ + m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1); + } + /* m128 ^= XXH_swap64(m128 >> 64); */ + m128.low64 ^= XXH_swap64(m128.high64); + + { /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */ + XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2); + h128.high64 += m128.high64 * XXH_PRIME64_2; + + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = XXH3_avalanche(h128.high64); + return h128; + } } +} + +/* + * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN + */ +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); + if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); + if (len) return XXH3_len_1to3_128b(input, len, secret, seed); + { XXH128_hash_t h128; + xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72); + xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88); + h128.low64 = XXH64_avalanche(seed ^ bitflipl); + h128.high64 = XXH64_avalanche( seed ^ bitfliph); + return h128; + } } +} + +/* + * A bit slower than XXH3_mix16B, but handles multiply by zero better. + */ +XXH_FORCE_INLINE XXH128_hash_t +XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, + const xxh_u8* secret, XXH64_hash_t seed) +{ + acc.low64 += XXH3_mix16B (input_1, secret+0, seed); + acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); + acc.high64 += XXH3_mix16B (input_2, secret+16, seed); + acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); + return acc; +} + + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { XXH128_hash_t acc; + acc.low64 = len * XXH_PRIME64_1; + acc.high64 = 0; + +#if XXH_SIZE_OPT >= 1 + { + /* Smaller, but slightly slower. 
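+         * The rolled loop pairs input+16*i with input+len-16*(i+1) against
+         * secret+32*i, mirroring the unrolled 32/64/96-byte branches of the
+         * default path below with far less emitted code.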
*/ + unsigned int i = (unsigned int)(len - 1) / 32; + do { + acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed); + } while (i-- != 0); + } +#else + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed); + } + acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed); + } + acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); + } + acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); +#endif + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * XXH_PRIME64_1) + + (acc.high64 * XXH_PRIME64_4) + + ((len - seed) * XXH_PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_NO_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + + { XXH128_hash_t acc; + unsigned i; + acc.low64 = len * XXH_PRIME64_1; + acc.high64 = 0; + /* + * We set as `i` as offset + 32. We do this so that unchanged + * `len` can be used as upper bound. This reaches a sweet spot + * where both x86 and aarch64 get simple agen and good codegen + * for the loop. + */ + for (i = 32; i < 160; i += 32) { + acc = XXH128_mix32B(acc, + input + i - 32, + input + i - 16, + secret + i - 32, + seed); + } + acc.low64 = XXH3_avalanche(acc.low64); + acc.high64 = XXH3_avalanche(acc.high64); + /* + * NB: `i <= len` will duplicate the last 32-bytes if + * len % 32 was zero. This is an unfortunate necessity to keep + * the hash result stable. + */ + for (i=160; i <= len; i += 32) { + acc = XXH128_mix32B(acc, + input + i - 32, + input + i - 16, + secret + XXH3_MIDSIZE_STARTOFFSET + i - 160, + seed); + } + /* last bytes */ + acc = XXH128_mix32B(acc, + input + len - 16, + input + len - 32, + secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, + (XXH64_hash_t)0 - seed); + + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * XXH_PRIME64_1) + + (acc.high64 * XXH_PRIME64_4) + + ((len - seed) * XXH_PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +static XXH_PUREF XXH128_hash_t +XXH3_finalizeLong_128b(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, xxh_u64 len) +{ + XXH128_hash_t h128; + h128.low64 = XXH3_finalizeLong_64b(acc, secret, len); + h128.high64 = XXH3_mergeAccs(acc, secret + secretSize + - XXH_STRIPE_LEN - XXH_SECRET_MERGEACCS_START, + ~(len * XXH_PRIME64_2)); + return h128; +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + return XXH3_finalizeLong_128b(acc, secret, secretSize, (xxh_u64)len); +} + +/* + * It's important for performance that XXH3_hashLong() is not inlined. 
+ */ +XXH_NO_INLINE XXH_PUREF XXH128_hash_t +XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_accumulate, XXH3_scrambleAcc); +} + +/* + * It's important for performance to pass @p secretLen (when it's static) + * to the compiler, so that it can properly optimize the vectorized loop. + * + * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE + * breaks -Og, this is XXH_NO_INLINE. + */ +XXH3_WITH_SECRET_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; + return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen, + XXH3_accumulate, XXH3_scrambleAcc); +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble, + XXH3_f_initCustomSecret f_initSec) +{ + if (seed64 == 0) + return XXH3_hashLong_128b_internal(input, len, + XXH3_kSecret, sizeof(XXH3_kSecret), + f_acc, f_scramble); + { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + f_initSec(secret, seed64); + return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret), + f_acc, f_scramble); + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. + */ +XXH_NO_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSeed(const void* input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)secret; (void)secretLen; + return XXH3_hashLong_128b_withSeed_internal(input, len, seed64, + XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret); +} + +typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t, + XXH64_hash_t, const void* XXH_RESTRICT, size_t); + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_128bits_internal(const void* input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, + XXH3_hashLong128_f f_hl128) +{ + XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secret` conditions are not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + */ + if (len <= 16) + return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); + if (len <= 128) + return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + return f_hl128(input, len, seed64, secret, secretLen); +} + + +/* === Public XXH128 API === */ + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* input, size_t len) +{ + return XXH3_128bits_internal(input, len, 0, + XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_hashLong_128b_default); +} + +/*! 
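+ * The caller must supply a secret of at least XXH3_SECRET_SIZE_MIN bytes; as
+ * noted in XXH3_128bits_internal() above, this is a contract pre-condition
+ * that is only asserted, not re-checked, on the one-shot path.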
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSecret(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize) +{ + return XXH3_128bits_internal(input, len, 0, + (const xxh_u8*)secret, secretSize, + XXH3_hashLong_128b_withSecret); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSeed(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_128bits_internal(input, len, seed, + XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_hashLong_128b_withSeed); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) +{ + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); + return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH128(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_128bits_withSeed(input, len, seed); +} + + +/* === XXH3 128-bit streaming === */ +#ifndef XXH_NO_STREAM +/* + * All initialization and update functions are identical to 64-bit streaming variant. + * The only difference is the finalization routine. + */ + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr) +{ + return XXH3_64bits_reset(statePtr); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize) +{ + return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + return XXH3_64bits_reset_withSeed(statePtr, seed); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) +{ + return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) +{ + return XXH3_update_regular(state, input, len); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* state) +{ + const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; + XXH3_digest_long(acc, state, secret); + XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + return XXH3_finalizeLong_128b(acc, secret, state->secretLimit + XXH_STRIPE_LEN, (xxh_u64)state->totalLen); + } + /* len <= XXH3_MIDSIZE_MAX : short code */ + if (state->useSeed) + return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen), + secret, state->secretLimit + XXH_STRIPE_LEN); +} +#endif /* !XXH_NO_STREAM */ +/* 128-bit utility functions */ + +/* return : 1 is equal, 0 if different */ +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) +{ + /* note : XXH128_hash_t is compact, it has no padding byte */ + return !(XXH_memcmp(&h1, &h2, sizeof(h1))); +} + +/* This prototype is compatible with stdlib's qsort(). + * @return : >0 if *h128_1 > *h128_2 + * <0 if *h128_1 < *h128_2 + * =0 if *h128_1 == *h128_2 */ +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2) +{ + XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1; + XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2; + int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64); + /* note : bets that, in most cases, hash values are different */ + if (hcmp) return hcmp; + return (h1.low64 > h2.low64) - (h2.low64 > h1.low64); +} + + +/*====== Canonical representation ======*/ +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API void +XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) { + hash.high64 = XXH_swap64(hash.high64); + hash.low64 = XXH_swap64(hash.low64); + } + XXH_memcpy(dst, &hash.high64, sizeof(hash.high64)); + XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64)); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src) +{ + XXH128_hash_t h; + h.high64 = XXH_readBE64(src); + h.low64 = XXH_readBE64(src->digest + 8); + return h; +} + + + +/* ========================================== + * Secret generators + * ========================================== + */ +#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x)) + +XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128) +{ + XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 ); + XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 ); +} + +/*! 
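+ * Derives a working secret from an arbitrary user seed: the buffer is first
+ * tiled with repeated copies of @p customSeed, then mixed per 16-byte
+ * segment (via XXH3_combine16() above) with material derived from an XXH128
+ * hash of the seed, so short or low-entropy seeds still yield a usable secret.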
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize) +{ +#if (XXH_DEBUGLEVEL >= 1) + XXH_ASSERT(secretBuffer != NULL); + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); +#else + /* production mode, assert() are disabled */ + if (secretBuffer == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; +#endif + + if (customSeedSize == 0) { + customSeed = XXH3_kSecret; + customSeedSize = XXH_SECRET_DEFAULT_SIZE; + } +#if (XXH_DEBUGLEVEL >= 1) + XXH_ASSERT(customSeed != NULL); +#else + if (customSeed == NULL) return XXH_ERROR; +#endif + + /* Fill secretBuffer with a copy of customSeed - repeat as needed */ + { size_t pos = 0; + while (pos < secretSize) { + size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize); + XXH_memcpy((char*)secretBuffer + pos, customSeed, toCopy); + pos += toCopy; + } } + + { size_t const nbSeg16 = secretSize / 16; + size_t n; + XXH128_canonical_t scrambler; + XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0)); + for (n=0; n= 128, "Dimension must be at least 128"); + + FontAtlasBuf atlas = { + data: AllocArray!(u8)(arena, dimension * dimension * 4), + atlas: { + size: size, + width: dimension, + height: dimension, + }, + }; + + // TODO: proper packing algorithm + if (font != null) + { + FT_Set_Pixel_Sizes(font, 0, cast(FT_UInt)((96.0/72.0) * size)); + + i64 f_ascent = cast(i64)(font.size.metrics.ascender >> 6); + i64 f_descent = cast(i64)(font.size.metrics.descender >> 6); + i64 f_height = cast(i64)(font.size.metrics.height >> 6); + + u32 max_w = 0; + u32 max_h = 0; + u32 current_h = 0; + u32 count = 0; + + FT_UInt index; + FT_ULong char_code = FT_Get_First_Char(font, &index); + while (index != 0) + { + FT_Load_Char(font, char_code, cast(FT_Int32)FT_LOAD_RENDER); + + u32 bmp_w = font.glyph.bitmap.width; + u32 bmp_h = font.glyph.bitmap.rows; + if (max_w + bmp_w > dimension) + { + max_h += current_h; + max_w = 0; + } + + assert(max_h < dimension, "Unable to pack atlas within dimensions"); + + max_w += bmp_w; + current_h = bmp_h > current_h ? bmp_h : current_h; + count += 1; + + char_code = FT_Get_Next_Char(font, char_code, &index); + } + + atlas.atlas.glyphs = AllocArray!(Glyph)(arena, count); + + max_w = 0; + max_h = 0; + current_h = 0; + count = 0; + + u32 font_w = font.size.metrics.x_ppem; + u32 font_h = font.size.metrics.y_ppem; + + char_code = FT_Get_First_Char(font, &index); + while (index != 0) + { + FT_Load_Char(font, char_code, cast(FT_Int32)FT_LOAD_RENDER); + + FT_GlyphSlot glyph = font.glyph; + FT_Bitmap* bmp = &font.glyph.bitmap; + i32 top = font.glyph.bitmap_top; + i32 left = font.glyph.bitmap_left; + + if (max_w + bmp.rows > dimension) + { + max_h += current_h; + max_w = 0; + } + + i32 x, y; + foreach(r; 0 .. bmp.rows) + { + y = cast(i32)(max_h + r); + foreach(c; 0 .. 
bmp.width) + { + x = max_w + c; + u64 offset = (y*dimension + x) * 4; + + atlas.data[offset+0] = bmp.buffer[r*bmp.pitch + c]; + atlas.data[offset+1] = bmp.buffer[r*bmp.pitch + c]; + atlas.data[offset+2] = bmp.buffer[r*bmp.pitch + c]; + atlas.data[offset+3] = 255; + } + } + + Glyph* g = atlas.atlas.glyphs.ptr + count; + + g.ch = cast(dchar)char_code; + g.advance = cast(f32)(glyph.advance.x >> 6); + g.plane_left = cast(f32)left; + g.plane_right = g.plane_left + bmp.width; + g.plane_top = cast(f32)top; + g.plane_bottom = g.plane_top + bmp.rows; + + g.atlas_top = max_h; + g.atlas_left = max_w; + g.atlas_bottom = max_h + bmp.rows; + g.atlas_right = max_w + bmp.width; + + max_w += bmp.width; + current_h = bmp.rows > current_h ? bmp.rows : current_h; + + char_code = FT_Get_Next_Char(font, char_code, &index); + + count += 1; + } + + } + + + + return atlas; +} + diff --git a/includes.c b/includes.c new file mode 100644 index 0000000..a2c74e1 --- /dev/null +++ b/includes.c @@ -0,0 +1,28 @@ +#pragma attribute(push, nogc, nothrow) + +#ifdef __linux__ +# include +# include +# include +# include +# include +# include +# include +# include +# include FT_FREETYPE_H +# include FT_GLYPH_H +#endif + +#include + +#define STB_IMAGE_IMPLEMENTATION + +#include "../../external/stb/stb_image.h" + +#define M3D_IMPLEMENTATION + +#include "../../external/m3d/m3d.h" + +#define CGLM_FORCE_DEPTH_ZERO_TO_ONE + +#include "../../external/cglm/cglm.h" diff --git a/math.d b/math.d new file mode 100644 index 0000000..84a0392 --- /dev/null +++ b/math.d @@ -0,0 +1,1086 @@ +import aliases; +import includes; +import util; +import std.math; +import std.math.algebraic; +import core.stdc.math : tanf, cosf, sinf, sqrtf; +import std.traits; +import inteli; +import std.meta; +import std.format; +import std.stdio; + +T AlignPow2(T)(T v, T a) +{ + return (v + a - 1) & ~(a - 1); +} + +f32 Radians(f32 deg) +{ + return deg * (PI / 180.0); +} + +struct Vector(T, int N) +{ + static assert(N > 1 && N <= 4); + + enum _N = N; + alias T _T; + + union + { + T[N] v; + struct + { + T x; + alias x r; + + T y; + alias y g; + + static if (N > 2) + { + T z; + alias z b; + } + + static if (N > 3) + { + T w; + alias w a; + } + }; + } + + this(Vec3, f32)(Vec3 v3, f32 f) if (N == 4 && is(T: f32)) + { + x = v3.x; + y = v3.y; + z = v3.z; + w = f; + } + + this(Arr)(Arr arr) if (is(Arr: typeof(v))) + { + this.v = arr; + } + + this(Args...)(Args args) + { + static if (args.length == 1) + { + opAssign!(Args[0])(args[0]); + } + else static if (args.length == N) + { + mixin(GenerateLoop!("v[@] = args[@];", N)()); + } + else static if (args.length == 2 && N == 4) + { + v[0] = args[0]; + v[1] = args[0]; + v[2] = args[0]; + v[3] = args[1]; + } + else + { + static assert(false, "Invalid Vector constructor"); + } + } + + ref Vector opAssign(U)(U x) if (is(U: T)) + { + mixin(GenerateLoop!("v[@] = x;", N)()); + return this; + } + + ref Vector opAssign(U)(U u) if (is(U : Vector)) + { + v[] = u.v[]; + return this; + } + + inout(T)* ptr() inout @property + { + return v.ptr; + } + + bool opEquals(U)(U other) if (is(U: Vector!(T, N))) + { + bool result = true; + + foreach(i; 0 .. 
N) + { + if (fabs(v[i] - other.v[i]) > 0.0000009) + { + result = false; + break; + } + } + + return result; + } + + int opDollar() + { + return N; + } + + T[] opSlice() + { + return v[]; + } + + Vector opUnary(string op)() if (op == "+" || op == "-" || op == "~" || op == "!") + { + Vector result; + mixin(GenerateLoop!("res.v[@] = " ~ op ~ " v[@];", N)()); + return res; + } + + ref Vector opOpAssign(string op, U)(U value) if (is(U: Vector)) + { + mixin(GenerateLoop!("v[@] " ~ op ~ "= value.v[@];", N)()); + return this; + } + + ref Vector opOpAssign(string op, U)(U value) if (IsConvertible!(U)) + { + Vector conv = value; + return opOpAssign!(op)(conv); + } + + @property auto opDispatch(string op, U = void)() if (ValidSwizzle!(op) && op.length <= 4) + { + Vector!(T, op.length) result; + enum index_tuple = SwizzleTuple!(op); + static foreach(i, index; index_tuple) + { + result.v[i] = v[index]; + } + return result; + } + + @property void opDispatch(string op, U)(U x) if ((op.length > 1) && ValidUniqueSwizzle!(op) && is(typeof(Vector!(T, op.length)(x)))) + { + Vector!(T, op.length) conv = x; + enum index_tuple = SwizzleTuple!(op); + static foreach(i, index; index_tuple) + { + v[index] = conv[i]; + } + } + + static if (N == 4) + { + Vector opBinary(string op, U)(U operand) if ((is(U: Vector!(f32, 4)) && is(T: f32)) && (op == "*" || op == "+" || op == "-" || op == "/")) + { + Vector result; + f32* l = &x; + f32* r = &operand.x; + f32* res = &result.x; + + asm + { + mov R8, l; + mov R9, r; + mov R10, res; + movups XMM0, x.offsetof[R8]; + movups XMM1, operand.x.offsetof[R9]; + } + static if (op == "*") asm { mulps XMM0, XMM1; } + else static if (op == "-") asm { subps XMM0, XMM1; } + else static if (op == "+") asm { addps XMM0, XMM1; } + else static if (op == "/") asm { divps XMM0, XMM1; } + asm + { + movups result.x.offsetof[R10], XMM0; + } + + return result; + } + + Vector opBinary(string op, U)(U operand) if (IsConvertible!(U) && (op == "*" || op == "+" || op == "-" || op == "/")) + { + Vector result; + Vector other = operand; + f32* l = &x; + f32* r = &other.x; + f32* res = &result.x; + + asm + { + mov R8, l; + mov R9, r; + mov R10, res; + movups XMM0, x.offsetof[R8]; + movups XMM1, other.x.offsetof[R9]; + } + static if (op == "*") asm { mulps XMM0, XMM1; } + else static if (op == "-") asm { subps XMM0, XMM1; } + else static if (op == "+") asm { addps XMM0, XMM1; } + else static if (op == "/") asm { divps XMM0, XMM1; } + asm + { + movups result.x.offsetof[R8], XMM0; + } + + return result; + } + } + else + { + Vector opBinary(string op, U)(U operand) if (is(U: Vector) && U._N == N && (op == "*" || op == "+" || op == "-" || op == "/")) + { + Vector res; + mixin(GenerateLoop!("res.v[@] = v[@] " ~ op ~ " operand.v[@];", N)()); + return res; + } + + Vector opBinary(string op, U)(U operand) if (IsConvertible!(U) && (op == "*" || op == "+" || op == "-" || op == "/")) + { + Vector res; + Vector other = operand; + mixin(GenerateLoop!("res.v[@] = v[@] " ~ op ~ " other.v[@];", N)()); + return res; + } + } + + + + ref T opIndex(size_t i) + { + return v[i]; + } + + T opIndexAssign(U : T)(U x, size_t i) + { + return v[i] = x; + } + + U opCast(U)() if (IsVector!(U) && (U._N == _N)) + { + U result; + mixin(GenerateLoop!("res.v[@] = cast(U._T)v[@];", N)()); + return result; + } + + template IsConvertible(T) + { + enum bool IsConvertible = (!is(T : Vector)) && is(typeof({ T x; Vector v = x; }())); + } + + template SwizzleIndex(char c) + { + static if ((c == 'x' || c == 'r') && N > 0) + enum SwizzleIndex = 0; + 
else static if ((c == 'y' || c == 'g') && N > 1) + enum SwizzleIndex = 1; + else static if ((c == 'z' || c == 'b') && N > 2) + enum SwizzleIndex = 2; + else static if ((c == 'w' || c == 'a') && N > 3) + enum SwizzleIndex = 3; + else + enum SwizzleIndex = -1; + } + + template SwizzleSet(char c) + { + static if (c == 'x' || c == 'y' || c == 'z' || c == 'w') + enum SwizzleSet = 0; + else static if (c == 'r' || c == 'g' || c == 'b' || c == 'a') + enum SwizzleSet = 1; + else + enum SwizzleSet = -1; + } + + template SwizzleTuple(string op) + { + enum op_length = op.length; + static if (op.length == 0) + enum SwizzleTuple = []; + else + enum SwizzleTuple = [ SwizzleIndex!(op[0])] ~ SwizzleTuple!(op[1 .. op.length]); + } + + template SearchString(char c, string s) + { + static if (s.length == 0) + { + enum bool result = false; + } + else + { + enum string tail = s[1 .. s.length]; + enum bool result = (s[0] == c) || SearchString!(c, tail).result; + } + } + + template UniqueChars(string s) + { + static if (s.length == 1) + { + enum bool result = true; + } + else + { + enum tail = s[1 .. s.length]; + enum bool result = !(SearchString!(s[0], tail).result) && UniqueChars!(tail).result; + } + } + + template ValidSwizzle(string op, int last_swizzle = -1) + { + static if (op.length == 0) + { + enum bool ValidSwizzle = true; + } + else + { + enum length = op.length; + enum int swizzle_set = SwizzleSet!(op[0]); + enum bool valid_swizzle_set = (last_swizzle == -1 || (swizzle_set == last_swizzle)); + enum bool ValidSwizzle = (SwizzleIndex!(op[0]) != -1) && valid_swizzle_set && ValidSwizzle!(op[1 .. length], swizzle_set); + } + } + + template ValidUniqueSwizzle(string op) + { + static if (ValidSwizzle!(op)) + { + enum ValidUniqueSwizzle = UniqueChars!(op).result; + } + else + { + enum ValidUniqueSwizzle = false; + } + } +} + +// TODO: fix alignment for non mat4s +align(16) struct Matrix(T, int D) +{ + static assert(D > 0 && D <= 4); + + alias Vector!(T, D) MatrixVec; + alias T _T; + alias T[4] Row; + + enum N = D*D; + enum _D = D; + + union + { + T[N] v; + Row[D] rows; + MatrixVec[D] vec; + static if (D == 4) mat4 glm_mat; + static if (D == 3) mat3 glm_mat; + static if (D == 2) mat2 glm_mat; + } + + // TODO: setup + + this(U...)(U values) + { + static if ((U.length == N) && allSatisfy!(IsTypeAssignable, U)) + { + static foreach(i, x; values) + { + v[i] = x; + } + } + else static if ((U.length == 1) && (isAssignable!(U[0])) && (!is(U[0] : Matrix))) + { + v[] = values[0]; + } + else static assert(false, "Cannot construct matrix with provided parameters"); + } + + this(U)(T x) + { + static foreach(i; 0 .. N) + { + v[i] = x; + } + } + + @property inout(T)* ptr() inout + { + return v.ptr; + } + + ref Matrix opAssign(U : T)(U x) + { + static foreach(i; 0 .. N) + { + v[i] = x; + } + + return this; + } + + ref Matrix opAssign(U : Matrix)(U x) + { + static foreach(i; 0 .. N) + { + v[i] = x.v[i]; + } + + return this; + } + + ref Matrix opAssign(U)(U x) if (IsMatrixInstantiation!(U) && is(U._T : _T) && (!is(U: Matrix) && (U.N != N))) + { + static foreach(i; 0 .. N) + { + v[i] = x.v[i]; + } + + return this; + } + + ref T opIndex(size_t i, size_t j) + { + return v[(i * D) + j]; + } + + T opIndexAssign(U: T)(U x, size_t i, size_t j) + { + return v[(i * D) + j] = x; + } + + bool opEquals(U)(U other) if (is(U: Matrix!(T, D))) + { + bool result = true; + + static foreach(i; 0 .. 
N) + { + if (fabs(this.v[i] - other.v[i]) > 0.0000009) + { + result = false; + } + } + + return result; + } + + Matrix opBinary(string op)(T scalar) if (op == "*") + { + Matrix result; + + static foreach(i; 0 .. N) + { + result.v[i] = v[i] * scalar; + } + + return result; + } + + static if (D == 4) + { + Vec4 opBinary(string op, U)(U x) if (is(U: Vec4) && is(T: f32) && (op == "*")) + { + Vec4 result = 0.0; + glm_mat4_mulv(glm_mat.ptr, x.v.ptr, result.v.ptr); + return result; + } + + Matrix opBinary(string op, U)(U x) if (is(U: Matrix!(T, D)) && is(T: f32) && D == 4 && (op == "*")) + { + Matrix result; + MatZero(&result); + + glm_mat4_mul(glm_mat.ptr, x.glm_mat.ptr, result.glm_mat.ptr); + + return result; + } + } + + template IsTypeAssignable(U) + { + enum bool IsTypeAssignable = std.traits.isAssignable!(T, U); + } + + template IsMatrixInstantiation(U) + { + static void IsMatrix(T, int D)(Matrix!(T, D) x) {} + + enum bool IsMatrixInstantiation = is(typeof(IsMatrix(U.init))); + } +} + +struct Quat +{ + union + { + f32[4] v; + Vec4 vec; + struct + { + f32 x; + f32 y; + f32 z; + f32 w; + }; + }; + + this(f32 w, f32 x, f32 y, f32 z) + { + vec.x = x; + vec.y = y; + vec.z = z; + vec.w = w; + } + + U opCast(U)() if (is(U: Mat4)) + { + Mat4 result; + glm_quat_mat4(vec.ptr, result.glm_mat.ptr); + return result; + } + + Quat opBinary(string op, U)(U r) if (op == "*" && is(U: Quat)) + { + Quat q; + + q.x = this.w * r.x + this.x * r.w + this.y * r.z - this.z * r.y; + q.y = this.w * r.y - this.x * r.z + this.y * r.w + this.z * r.x; + q.z = this.w * r.z + this.x * r.y - this.y * r.x + this.z * r.w; + q.w = this.w * r.w - this.x * r.x - this.y * r.y - this.z * r.z; + + return q; + } +} + +Mat4 +Mat4MulASM(Mat4 l, Mat4 r) +{ + Mat4 result; + + auto lp = &l; + auto rp = &r; + auto res = &result; + + // TODO: fix this + asm @trusted + { + mov R8, lp; + mov R9, rp; + mov R10, res; + + movups XMM0, [R8]; + movups XMM1, [R9+00]; + movups XMM2, [R9+16]; + movups XMM3, [R9+32]; + movups XMM4, [R9+48]; + + movups XMM6, XMM1; + shufps XMM6, XMM6, 0; // XMM5 = vec.xxxx; + mulps XMM6, XMM0; // XMM6 = col1; + + movups XMM7, XMM2; + shufps XMM7, XMM7, 0; + mulps XMM7, XMM0; // XMM7 = col2; + + movups XMM8, XMM3; + shufps XMM8, XMM8, 0; + mulps XMM8, XMM0; // XMM8 = col3; + + movups XMM9, XMM3; + shufps XMM9, XMM9, 0; + mulps XMM9, XMM0; // XMM9 = col4; + + movups XMM0, [R8+16]; + + movups XMM5, XMM1; + shufps XMM5, XMM5, 85; // XMM5 = vec.yyyy; + mulps XMM5, XMM0; + addps XMM6, XMM5; + + movups XMM5, XMM2; + shufps XMM5, XMM5, 85; + mulps XMM5, XMM0; + addps XMM7, XMM5; + + movups XMM5, XMM3; + shufps XMM5, XMM5, 85; + mulps XMM5, XMM0; + addps XMM8, XMM5; + + movups XMM5, XMM4; + shufps XMM5, XMM5, 85; + mulps XMM5, XMM0; + addps XMM9, XMM5; + + movups XMM0, [R8+32]; + + movups XMM5, XMM1; + shufps XMM5, XMM5, 170; // XMM5 = vec.zzzz; + mulps XMM5, XMM0; + addps XMM6, XMM5; + + movups XMM5, XMM2; + shufps XMM5, XMM5, 170; + mulps XMM5, XMM0; + addps XMM7, XMM5; + + movups XMM5, XMM3; + shufps XMM5, XMM5, 170; + mulps XMM5, XMM0; + addps XMM8, XMM5; + + movups XMM5, XMM4; + shufps XMM5, XMM5, 170; + mulps XMM5, XMM0; + addps XMM9, XMM5; + + movups XMM0, [R8+48]; + + movups XMM5, XMM1; + shufps XMM5, XMM5, 255; // XMM5 = vec.wwww; + mulps XMM5, XMM0; + addps XMM6, XMM5; + + movups XMM5, XMM2; + shufps XMM5, XMM5, 255; + mulps XMM5, XMM0; + addps XMM7, XMM5; + + movups XMM5, XMM3; + shufps XMM5, XMM5, 255; + mulps XMM5, XMM0; + addps XMM8, XMM5; + + movups XMM5, XMM4; + shufps XMM5, XMM5, 255; + mulps XMM5, XMM0; + addps 
XMM9, XMM5; + + movups [R10+00], XMM6; + movups [R10+16], XMM7; + movups [R10+32], XMM8; + movups [R10+48], XMM9; + } + + return result; +} + +pragma(inline): Mat4 +Mat4Identity() +{ + return Mat4( + 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 1.0 + ); +} + +pragma(inline): void +Mat4Identity(Mat4* mat) +{ + MatZero(mat); + + (*mat)[0, 0] = 1.0; + (*mat)[1, 1] = 1.0; + (*mat)[2, 2] = 1.0; + (*mat)[3, 3] = 1.0; +} + + pragma(inline): Mat3 +Mat3Identity() +{ + return Mat3( + 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0 + ); +} + + pragma(inline): Mat2 +Mat2Identity() +{ + return Mat2( + 1.0, 0.0, + 0.0, 1.0 + ); +} + +pragma(inline): Quat +QuatFromAxis(f32 angle, Vec3 axis) +{ + Quat q; + glm_quatv(q.vec.ptr, angle, axis.v.ptr); + return q; +} + +pragma(inline): f32 +Dot(Vec2* l, Vec2* r) +{ + return l.x * r.x + l.y * r.y; +} + +pragma(inline): f32 +Dot(Vec3* l, Vec3* r) +{ + return l.x * r.x + l.y * r.y + l.z * r.z; +} + +pragma(inline): f32 +Dot(Vec4* l, Vec4* r) +{ + // TODO: SIMD this + return l.x * r.x + l.y * r.y + l.z * r.z + l.w * r.w; +} + +pragma(inline): f32 +Norm(Vec3* v) +{ + return sqrtf(Dot(v, v)); +} + +pragma(inline): f32 +Norm(Vec4* v) +{ + // TODO: SIMD this + return sqrtf(Dot(v, v)); +} + +pragma(inline): void +Normalize(T)(T* vec) if (is(T: Vec2) || is(T: Vec3) || is(T: Vec4)) +{ + f32 length = Norm(vec); + + if (length < f32.epsilon) + { + mixin(GenerateLoop!("vec.v[@] = 0.0;", vec._N)()); + } + else + { + mixin(GenerateLoop!("vec.v[@] *= (1.0 / length);", vec._N)()); + } +} + +pragma(inline): T +Normalize(T)(T vec) if (is(T: Vec2) || is(T: Vec3) || is(T: Vec4)) +{ + Normalize(&vec); + return vec; +} + +pragma(inline): Quat +Normalize(Quat q) +{ + f32 dot = Norm(&q.vec); + + if (dot <= 0.0) + { + q = Quat(1.0, 0.0, 0.0, 0.0); + } + + q.vec *= 1.0 / sqrtf(dot); + + return q; +} + +pragma(inline): Mat4 +Perspective(f32 fov, f32 aspect, f32 near, f32 far) +{ + Mat4 res; + MatZero(&res); + glm_perspective(fov, aspect, near, far, res.glm_mat.ptr); + res[1, 1] *= -1.0; + return res; +} + +pragma(inline): Vec3 +Rotate(Quat q, Vec3 vec) +{ + Quat p = Normalize(q); + + Vec3 i = p.vec.xyz; + f32 r = p.vec.w; + + Vec3 v1 = i * (2.0 * Dot(&i, &vec)); + Vec3 v2 = i * (r * r - Dot(&i, &i)); + v1 += v2; + + v2 = i * vec; + v2 = v2 * (2.0 * r); + + return v1 + v2; +} + +pragma(inline): Mat4 +LookAt(Vec3 eye, Vec3 center, Vec3 up) +{ + Mat4 result; + MatZero(&result); + glm_lookat(eye.v.ptr, center.v.ptr, up.v.ptr, result.glm_mat.ptr); + return result; +} + +pragma(inline): Mat4 +Look(Vec3 eye, Vec3 dir, Vec3 up) +{ + return LookAt(eye, eye + dir, up); +} + +pragma(inline): Vec3 +Cross(Vec3 a, Vec3 b) +{ + Vec3 c; + Cross(a, b, &c); + return c; +} + +pragma(inline): Vec3 +CrossN(Vec3 a, Vec3 b) +{ + Vec3 c; + Cross(a, b, &c); + Normalize(&c); + return c; +} + +pragma(inline): void +CrossN(Vec3 a, Vec3 b, Vec3* dst) +{ + Cross(a, b, dst); + glm_vec3_normalize(dst.v.ptr); +} + +pragma(inline): void +Cross(Vec3 a, Vec3 b, Vec3* dst) +{ + glm_vec3_cross(a.v.ptr, b.v.ptr, dst.v.ptr); +} + +pragma(inline): void +MatZero(Mat4* mat) +{ + auto v = &mat.vec; + asm + { + mov R8, v; + xorps XMM0, XMM0; + movups mat.vec.offsetof[R8]+00, XMM0; + movups mat.vec.offsetof[R8]+16, XMM0; + movups mat.vec.offsetof[R8]+32, XMM0; + movups mat.vec.offsetof[R8]+48, XMM0; + } +} + +pragma(inline): void +Translate(Mat4* mat, Vec3 vec) +{ + glm_translate(mat.glm_mat.ptr, vec.v.ptr); +} + +pragma(inline): Mat4 +Inverse(Mat4 mat) +{ + Mat4 res; + MatZero(&res); + 
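+    // glm_mat4_inv() writes the result straight into res.glm_mat, which
+    // aliases the same 16 floats as res.v / res.rows / res.vec via the union.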
glm_mat4_inv(mat.glm_mat.ptr, res.glm_mat.ptr); + return res; +} + +pragma(inline): f32 +Mix(f32 x, f32 y, f32 a) +{ + return x * (1 - a) + y * a; +} + +pragma(inline): f32 +InverseLerp(f32 v, f32 min, f32 max) +{ + return (v - min) / (max - min); +} + +pragma(inline): f32 +Remap(f32 v, f32 in_min, f32 in_max, f32 out_min, f32 out_max) +{ + f32 t = InverseLerp(v, in_min, in_max); + return Mix(out_min, out_max, t); +} + +unittest +{ + enum FLOAT_MAX = f32.max; + enum FLOAT_MIN = -f32.max; + + import core.stdc.stdio; + import core.stdc.stdlib; + import core.stdc.time; + import std.range : take; + import std.algorithm.iteration : sum; + + void PrintMatrix(Mat4 mat) + { + foreach(i; 0 .. mat.N) + { + if (i % 4 == 0) + { + printf("\n"); + } + printf("%.08f ", mat.v[i]); + } + printf("\n"); + } + + srand(cast(u32)time(null)); + + f32 RandomFloat() + { + return cast(f32)(rand())/cast(f32)(RAND_MAX + 1.0); + } + + { // Vec2 arithmetic + Vec2 v1 = Vec2(5.0, 10.0); + Vec2 v2 = Vec2(2.5, 5.0); + + Vec2 result = v1 * v2; + + assert(result == Vec2(12.5, 50.0), "Vec2 mul failure"); + + result = v1 + v2; + + assert(result == Vec2(7.5, 15.0), "Vec2 add failure"); + + result = v1 - v2; + + assert(result == Vec2(2.5, 5.0), "Vec2 sub failure"); + + result = v1 / v2; + + assert(result == Vec2(2.0), "Vec2 div failure"); + } + + { // Vec3 Arithmetic + Vec3 v1 = Vec3(5.0, 10.0, 15.0); + Vec3 v2 = Vec3(2.5, 5.0, 7.5); + + Vec3 result = v1 * v2; + + assert(result == Vec3(12.5, 50.0, 112.5), "Vec3 mul failure"); + + result = v1 + v2; + + assert(result == Vec3(7.5, 15.0, 22.5), "Vec3 add failure"); + + result = v1 - v2; + + assert(result == Vec3(2.5, 5.0, 7.5), "Vec3 sub failure"); + + result = v1 / v2; + + assert(result == Vec3(2.0), "Vec3 div failure"); + } + + { // Vec3 Arithmetic + Vec4 v1 = Vec4(5.0, 10.0, 15.0, 20.0); + Vec4 v2 = Vec4(2.5, 5.0, 7.5, 10.0); + + Vec4 result = v1 * v2; + + assert(result == Vec4(12.5, 50.0, 112.5, 200.0), "Vec4 mul failure"); + + result = v1 + v2; + + assert(result == Vec4(7.5, 15.0, 22.5, 30.0), "Vec4 add failure"); + + result = v1 - v2; + + assert(result == Vec4(2.5, 5.0, 7.5, 10.0), "Vec4 sub failure"); + + result = v1 / v2; + + assert(result == Vec4(2.0), "Vec4 div failure"); + } + + { // Mat4 Arithmetic + Mat4 m1 = RandomMat4(); + Mat4 m2 = RandomMat4(); + Mat4 m3 = m1 * m2; + Mat4 m4; + + MatZero(&m4); + + for(u32 i = 0; i < 4; i += 1) + { + for(u32 j = 0; j < 4; j += 1) + { + for(u32 k = 0; k < 4; k += 1) + { + m4.rows[i][j] += m1.rows[k][j] * m2.rows[i][k]; + } + } + } + + assert(m3 == m4, "Mat4 mul failure"); + } + + { // Translate + Mat4 mat = Mat4Identity(); + Vec4 vec = Vec4(1.0, 2.0, 3.0, 1.0); + + Translate(&mat, Vec3(13.0, 11.0, 7.0)); + Vec4 result = mat * vec; + + assert(result == Vec4(14.0, 13.0, 10.0, 1.0)); + + mat = Mat4Identity(); + Translate(&mat, Vec3(1.0, -1.0, -5.0)); + result = mat * result; + + assert(result == Vec4(15.0, 12.0, 5.0, 1.0)); + } + + { // Identity + Mat4 identity = Mat4( + 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 1.0 + ); + Mat4 mat = Mat4Identity(); + + assert(identity == mat); + } + + + { // Inverse + foreach(i; 0 .. 
1000) + { + Mat4 m1 = RandomMat4(); + + Mat4 m1_inv = Inverse(m1); + Mat4 m1_reinv = Inverse(m1_inv); + + assert(m1 == m1_reinv, "Inverse test failed"); + } + } + + { // Cross + Vec3 v1 = Vec3(2.0, -3.0, 4.0); + Vec3 v2 = Vec3(12.0, -31.0, 43.0); + + Vec3 v3 = Cross(v1, v2); + + Vec3 v4 = Vec3( + v1.y * v2.z - v1.z * v2.y, + v1.z * v2.x - v1.x * v2.z, + v1.x * v2.y - v1.y * v2.x + ); + + assert(v3 == v4, "Vec3 Cross failure"); + + v3 = CrossN(v1, v2); + + Normalize(&v4); + + assert(v3 == v4, "Vec3 CrossN failure"); + } +} diff --git a/util.d b/util.d new file mode 100644 index 0000000..dfaeffd --- /dev/null +++ b/util.d @@ -0,0 +1,661 @@ +import aliases; +import xxhash3; +import includes; +import std.stdio; +import core.stdc.string : memset; +import alloc; +import core.simd; +import std.conv; +import std.string; + +struct DynSlice(T) +{ + T[][] slices; + u32 length; + u32 capacity; + u32 grow_size; +} + +DynSlice!(T) +CreateDynSlice(T)(u32 size) +{ + DynSlice!(T) dslice = { + slices: MAllocArray!(T[])(size), + length: 0, + capacity: size, + grow_size: size, + }; + + dslice.slices[0] = MAllocArray!(T)(size); + + return dslice; +} + +u32 +Next(T)(DynSlice!(T)* slice) +{ + if (slice.length < slice.capacity) + { + + } + + return 0; +} + +void +Logf(Args...)(string fmt, Args args) +{ + try + { + writefln(fmt, args); + } + catch (Exception e) + { + assert(false, "Incompatible format type"); + } +} + +void +Log(string str) +{ + writeln(str); +} + +void +Log(char* str) +{ + writeln(str); +} + +u64 +KB(u64 v) +{ + return v * 1024; +}; + +u64 +MB(u64 v) +{ + return KB(v) * 1024; +}; + +u64 +GB(u64 v) +{ + return MB(v) * 1024; +}; + +pragma(inline): void +ConvertColor(Vec4 *dst, u32 src) +{ + if (src == 0) + { + dst.rgb = 0.0; + dst.a = 1.0; + } + else + { + Convert(dst, src); + } +} + +pragma(inline): void +Convert(Vec4* dst, u32 src) +{ + dst.r = cast(f32)((src >> 0) & 0xFF) / 255.0; + dst.g = cast(f32)((src >> 8) & 0xFF) / 255.0; + dst.b = cast(f32)((src >> 16) & 0xFF) / 255.0; + dst.a = cast(f32)((src >> 24) & 0xFF) / 255.0; +} + +bool +BitEq(u64 l, u64 r) +{ + return (l & r) == r; +} + +struct Node(T) +{ + Node!(T)* next; + T value; +} + +struct SLList(T) +{ + Node!(T)* first; + Node!(T)* last; +} + +pragma(inline): bool +CheckNil(T)(Node!(T)* nil, Node!(T)* node) +{ + return node == null || node == nil; +} + +pragma(inline): void +ConcatInPlace(T)(SLList!(T)* list, SLList!(T)* to_concat) +{ + if (to_concat.first) + { + if (list.first) + { + list.last.next = to_concat.first; + list.last = to_concat.last; + } + else + { + list.first = to_concat.first; + list.last = to_concat.last; + } + + memset(to_concat, 0, SLList!(T).sizeof); + } +} + +pragma(inline): Node!(T)* +Pop(T)(SLList!(T)*list, Node!(T)* nil) +{ + Node!(T)* node = list.first; + + if (list.first == list.last) + { + list.first = list.last = nil; + } + else + { + list.first = list.first.next; + } + + return node; +} + +pragma(inline): void +Remove(T)(SLList!(T)*list, Node!(T)* node, Node!(T)* prev, Node!(T)* nil) +{ + node.next = nil; + + if (list.first == list.last) + { + list.first = list.last = nil; + } + else if (list.first == node) + { + list.first = node.next; + } + else if (list.last == node) + { + list.last = prev; + prev.next = nil; + } + else + { + prev.next = node.next; + } +} + +pragma(inline): void +PushFront(T)(SLList!(T)*list, Node!(T)* node, Node!(T)* nil) +{ + if (CheckNil(nil, list.first)) + { + list.first = list.last = node; + node.next = nil; + } + else + { + node.next = list.first; + list.first = node; + } +} + 
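+// Usage sketch for the sentinel-based list (names here are illustrative):
+// every list shares one caller-owned `nil` node, so empty/end-of-list checks
+// go through CheckNil() instead of comparing against null.
+//
+//     auto nil = Alloc!(Node!(i32))();
+//     SLList!(i32) list = { first: nil, last: nil };
+//
+//     auto a = Alloc!(Node!(i32))(); a.value = 1;
+//     auto b = Alloc!(Node!(i32))(); b.value = 2;
+//     Push(&list, a, nil);                  // list: a
+//     PushFront(&list, b, nil);             // list: b -> a
+//     Node!(i32)* front = Pop(&list, nil);  // removes b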
+pragma(inline): void +Push(T)(SLList!(T)*list, Node!(T)* node, Node!(T)* nil) +{ + if (CheckNil(nil, list.first)) + { + list.first = list.last = node; + node.next = nil; + } + else + { + list.last.next = node; + list.last = node; + node.next = nil; + } +} + +struct KVPair(K, V) +{ + K key; + V value; +} + +struct Result(V) +{ + V value; + bool ok; +} + +struct HashTable(K, V) +{ + alias P = KVPair!(K, V); + + SLList!(P) free_lists; + SLList!(P)[] lists; + Node!(P)* nil; + u64 node_count; + u64 list_count; + + void opIndexAssign(V value, K key) + { + Push(&this, key, value); + } + + Result!(V) opIndex(K key) + { + P* pair = Search(&this, key); + assert(pair != null, "HashTable key index failure: Result must be present"); + + Result!(V) result = { ok: false }; + if (pair != null) + { + result.value = pair.value; + result.ok = true; + } + + return result; + } + + Result!(V) opIndexUnary(string s: "~")(K key) + { + return Delete(&this, key); + } +} + +HashTable!(K, V) +CreateHashTable(K, V)(u64 size) +{ + auto nil = Alloc!(Node!(KVPair!(K, V))); + auto lists = AllocArray!(SLList!(KVPair!(K, V)))(size); + + HashTable!(K, V) table = { + lists: lists, + list_count: size, + nil: nil, + free_lists: { + first: nil, + last: nil, + }, + }; + + foreach(list; table.lists) + { + list.first = nil; + list.last = nil; + } + + return table; +} + +pragma(inline): void +Clear(K, V)(HashTable!(K, V)* ht) +{ + table.count = 0; + foreach(i, list; ht.lists) + { + ConcatInPlace(&ht.free_lists, ht.lists.ptr + i); + } +} + +pragma(inline): Node!(KVPair!(K, V))* +Push(K, V)(HashTable!(K, V)* ht, K key, V value) +{ + alias P = KVPair!(K, V); + alias N = Node!(P); + + N* node = ht.nil; + + if (ht.free_lists.first != ht.nil) + { + node = Pop(&ht.free_lists, ht.nil); + } + else + { + node = Alloc!(N); + } + + node.next = ht.nil; + node.value.key = key; + node.value.value = value; + + Push(GetList(ht, key), node, ht.nil); + + ht.node_count += 1; + + return node; +} + +pragma(inline): KVPair!(K, V)* +Search(K, V)(HashTable!(K, V)* ht, K key) +{ + KVPair!(K, V)* result = null; + + auto list = GetList(ht, key); + for(auto node = list.first; node != ht.nil; node = node.next) + { + if (node.value.key == key) + { + result = &node.value; + break; + } + } + + return result; +} + +pragma(inline): SLList!(KVPair!(K, V))* +GetList(K, V)(HashTable!(K, V)* ht, K key) +{ + u64 hash = Hash(&key); + u64 index = hash % ht.list_count; + return ht.lists.ptr + index; +} + +pragma(inline): Result!(V) +Delete(K, V)(HashTable!(K, V)* ht, K key) +{ + Result!(V) result = { ok: false }; + + auto list = GetList(ht, key); + auto prev = ht.nil; + for(auto node = list.first; node != ht.nil; node = node.next) + { + if (node.value.key == key) + { + Remove(list, node, prev, ht.nil); + + result.ok = true; + result.value = node.value.value; + + memset(&node.value, 0, node.value.sizeof); + + Push(&ht.free_lists, node, ht.nil); + + break; + } + } + + return result; +} + +const u64 HASH_SEED = 5995; + +pragma(inline): u64 +Hash(T)(T* value) +{ + return xxh3_64bits_withSeed(value, T.sizeof / u8.sizeof, HASH_SEED); +} + +pragma(inline): u64 +Hash(string str) +{ + return xxh3_64bits_withSeed(str.ptr, str.length, HASH_SEED); +} + +pragma(inline): u64 +RDTSC() +{ + union u64_split + { + u64 full; + struct + { + u32 lower; + u32 upper; + }; + }; + + u64_split val; + u64_split* valp = &val; + asm + { + cpuid; + rdtsc; + mov R8, valp; + mov valp.upper.offsetof[R8], EDX; + mov valp.lower.offsetof[R8], EAX; + } + + return val.full; +} + +pragma(inline): u64 
+OSTimeFreq() +{ + version (linux) + { + u64 freq = 1000000; + } + + return freq; +} + +pragma(inline): u64 +OSTime() +{ + version(linux) + { + import core.sys.linux.sys.time; + timeval value; + gettimeofday(&value, null); + + u64 time = OSTimeFreq() * cast(u64)(value.tv_sec) + cast(u64)(value.tv_usec); + } + + return time; +} + +// TODO: probably needs improvement/testing +struct IntervalTimer +{ + u64 cpu_freq; + u64 interval; + u64 prev; +} + +IntervalTimer +CreateTimer(u64 fps) +{ + IntervalTimer timer; + + u64 ms_to_wait = 50; + + u64 os_freq = OSTimeFreq(); + u64 cpu_start = RDTSC(); + u64 os_start = OSTime(); + u64 os_end = 0; + u64 os_elapsed = 0; + + u64 os_wait_time = os_freq * ms_to_wait / 1000; + + while (os_elapsed < os_wait_time) + { + os_end = OSTime(); + os_elapsed = os_end - os_start; + } + + u64 cpu_end = RDTSC(); + u64 cpu_elapsed = cpu_end - cpu_start; + u64 cpu_freq = 0; + if (os_elapsed) + { + cpu_freq = os_freq * cpu_elapsed / os_elapsed; + } + + timer.cpu_freq = cpu_freq; + timer.interval = cpu_freq/(fps+1); + timer.prev = RDTSC(); + + return timer; +} + +pragma(inline): bool +CheckTimer(IntervalTimer* t) +{ + bool result = false; + u64 time = RDTSC(); + if (time - t.prev > t.interval) + { + result = true; + t.prev = time; + } + + return result; +} + +struct Timer +{ + u64 cpu_freq; + u64 prev; +} + +Timer +CreateTimer() +{ + u64 ms_to_wait = 50; + + u64 os_freq = OSTimeFreq(); + u64 cpu_start = RDTSC(); + u64 os_start = OSTime(); + u64 os_end = 0; + u64 os_elapsed = 0; + + u64 os_wait_time = os_freq * ms_to_wait / 1000; + + while (os_elapsed < os_wait_time) + { + os_end = OSTime(); + os_elapsed = os_end - os_start; + } + + u64 cpu_end = RDTSC(); + u64 cpu_elapsed = cpu_end - cpu_start; + u64 cpu_freq = 0; + if (os_elapsed) + { + cpu_freq = os_freq * cpu_elapsed / os_elapsed; + } + + Timer timer = { + cpu_freq: cpu_freq, + prev: RDTSC(), + }; + + return timer; +} + +pragma(inline): f32 +DeltaTime(Timer* t) +{ + u64 time = RDTSC(); + u64 step = time - t.prev; + t.prev = time; + return cast(f32)(step) / cast(f32)(t.cpu_freq); +} + +static string +IntToStr(int n) nothrow pure @safe +{ + string result; + + static immutable string[] table = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]; + if (n < table.length) + { + result = table[n]; + } + else + { + result = to!string(n); + } + + return result; +} + +static string +GenerateLoop(string format_string, int N)() nothrow pure @safe +{ + string result; + for (int i = 0; i < N; i++) + { + result ~= format_string.replace("@", IntToStr(i)); + } + return result; +} + +void +MemCpy(void* dst_p, void* src_p, u64 length) +{ + u8* dst = cast(u8*)dst_p; + u8* src = cast(u8*)src_p; + + u64 remaining = length; + if (remaining >= 64) + { + for(u64 i = 0; i < length; i += 64) + { + asm + { + mov R8, src; + mov R9, dst; + + add R8, i; + movdqu XMM0, [R8+00]; + movdqu XMM1, [R8+16]; + movdqu XMM2, [R8+32]; + movdqu XMM3, [R8+48]; + + add R9, i; + movdqu [R9+00], XMM0; + movups [R9+16], XMM1; + movups [R9+32], XMM2; + movups [R9+48], XMM3; + + sub remaining, 64; + } + } + } + + if (remaining >= 32) + { + for(u64 i = remaining; i < length; i += 32) + { + asm + { + mov R8, src; + mov R9, dst; + + add R8, i; + movdqu XMM0, [R8+00]; + movdqu XMM1, [R8+16]; + + add R9, i; + movdqu [R9+00], XMM0; + movdqu [R9+16], XMM1; + + sub remaining, 32; + } + } + } + + for(u64 i = remaining; i < length; i += 1) + { + dst[i] = src[i]; + } +} + +u8[] +Embed(string file_name) +{ + import std.file; + return cast(u8[])read(file_name); +}
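+
+// Small compile-time check of GenerateLoop: it unrolls a statement template
+// by substituting each "@" with the loop index; math.d mixes the generated
+// string into its vector and matrix operations.
+unittest
+{
+    enum code = GenerateLoop!("v[@] += 1;", 3)();
+    static assert(code == "v[0] += 1;v[1] += 1;v[2] += 1;");
+}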