diff --git a/3rdparty/meshoptimizer/src/clusterizer.cpp b/3rdparty/meshoptimizer/src/clusterizer.cpp index 8dd6fb54d..2b29e2e60 100644 --- a/3rdparty/meshoptimizer/src/clusterizer.cpp +++ b/3rdparty/meshoptimizer/src/clusterizer.cpp @@ -595,7 +595,7 @@ struct KDNode unsigned int children : 30; }; -static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, unsigned int axis, float pivot) +static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, int axis, float pivot) { size_t m = 0; @@ -666,7 +666,7 @@ static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const } // split axis is one where the variance is largest - unsigned int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2); + int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2); float split = mean[axis]; size_t middle = kdtreePartition(indices, count, points, stride, axis, split); @@ -768,8 +768,8 @@ static float boxMerge(BVHBoxT& box, const BVHBox& other) min = _mm_min_ps(min, _mm_loadu_ps(other.min)); max = _mm_max_ps(max, _mm_loadu_ps(other.max)); - _mm_store_ps(box.min, min); - _mm_store_ps(box.max, max); + _mm_storeu_ps(box.min, min); + _mm_storeu_ps(box.max, max); __m128 size = _mm_sub_ps(max, min); __m128 size_yzx = _mm_shuffle_ps(size, size, _MM_SHUFFLE(0, 0, 2, 1)); diff --git a/3rdparty/meshoptimizer/src/meshoptimizer.h b/3rdparty/meshoptimizer/src/meshoptimizer.h index 535853d80..46778feff 100644 --- a/3rdparty/meshoptimizer/src/meshoptimizer.h +++ b/3rdparty/meshoptimizer/src/meshoptimizer.h @@ -125,14 +125,14 @@ MESHOPTIMIZER_API void meshopt_generateShadowIndexBuffer(unsigned int* destinati MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count); /** - * Experimental: Generates a remap table that maps all vertices with the same position to the same (existing) index. + * Generates a remap table that maps all vertices with the same position to the same (existing) index. * Similarly to meshopt_generateShadowIndexBuffer, this can be helpful to pre-process meshes for position-only rendering. * This can also be used to implement algorithms that require positional-only connectivity, such as hierarchical simplification. * * destination must contain enough space for the resulting remap table (vertex_count elements) * vertex_positions should have float3 position in the first 12 bytes of each vertex */ -MESHOPTIMIZER_EXPERIMENTAL void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); +MESHOPTIMIZER_API void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); /** * Generate index buffer that can be used as a geometry shader input with triangle adjacency topology @@ -418,7 +418,7 @@ enum meshopt_SimplifyErrorAbsolute = 1 << 2, /* Remove disconnected parts of the mesh during simplification incrementally, regardless of the topological restrictions inside components. */ meshopt_SimplifyPrune = 1 << 3, - /* Experimental: Produce more regular triangle sizes and shapes during simplification, at some cost to geometric quality. 
*/ + /* Produce more regular triangle sizes and shapes during simplification, at some cost to geometric quality. */ meshopt_SimplifyRegularize = 1 << 4, /* Experimental: Allow collapses across attribute discontinuities, except for vertices that are tagged with meshopt_SimplifyVertex_Protect in vertex_lock. */ meshopt_SimplifyPermissive = 1 << 5, @@ -501,7 +501,7 @@ MESHOPTIMIZER_API size_t meshopt_simplifyWithAttributes(unsigned int* destinatio MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error); /** - * Experimental: Mesh simplifier (sloppy) + * Mesh simplifier (sloppy) * Reduces the number of triangles in the mesh, sacrificing mesh appearance for simplification performance * The algorithm doesn't preserve mesh topology but can stop short of the target goal based on target error. * Returns the number of indices after simplification, with destination containing new index data @@ -514,7 +514,7 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithUpdate(unsigned int* indic * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1] * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error); +MESHOPTIMIZER_API size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error); /** * Mesh simplifier (pruner) @@ -670,7 +670,7 @@ MESHOPTIMIZER_API size_t meshopt_buildMeshletsScan(struct meshopt_Meshlet* meshl MESHOPTIMIZER_API size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles); /** - * Experimental: Meshlet builder with flexible cluster sizes + * Meshlet builder with flexible cluster sizes * Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but allows to specify minimum and maximum number of triangles per meshlet. * Clusters between min and max triangle counts are split when the cluster size would have exceeded the expected cluster size by more than split_factor. 
* @@ -682,10 +682,10 @@ MESHOPTIMIZER_API size_t meshopt_buildMeshletsBound(size_t index_count, size_t m * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency * split_factor should be set to a non-negative value; when greater than 0, clusters that have large bounds may be split unless they are under the min_triangles threshold */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsFlex(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor); +MESHOPTIMIZER_API size_t meshopt_buildMeshletsFlex(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor); /** - * Experimental: Meshlet builder that produces clusters optimized for raytracing + * Meshlet builder that produces clusters optimized for raytracing * Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but optimizes cluster subdivision for raytracing and allows to specify minimum and maximum number of triangles per meshlet. * * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (*not* max!) @@ -695,7 +695,7 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsFlex(struct meshopt_Meshl * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles) * fill_weight allows to prioritize clusters that are closer to maximum size at some cost to SAH quality; 0.5 is a safe default */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight); +MESHOPTIMIZER_API size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight); /** * Meshlet optimizer @@ -761,13 +761,14 @@ MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeSphereBounds(const float* /** * Cluster partitioner * Partitions clusters into groups of similar size, prioritizing grouping clusters that share vertices or are close to each other. + * When vertex positions are not provided, only clusters that share vertices will be grouped together, which may result in small partitions for some inputs. 
* * destination must contain enough space for the resulting partition data (cluster_count elements) * destination[i] will contain the partition id for cluster i, with the total number of partitions returned by the function * cluster_indices should have the vertex indices referenced by each cluster, stored sequentially * cluster_index_counts should have the number of indices in each cluster; sum of all cluster_index_counts must be equal to total_index_count - * vertex_positions should have float3 position in the first 12 bytes of each vertex (or can be NULL if not used) - * target_partition_size is a target size for each partition, in clusters; the resulting partitions may be smaller or larger + * vertex_positions can be NULL; when it's not NULL, it should have float3 position in the first 12 bytes of each vertex + * target_partition_size is a target size for each partition, in clusters; the resulting partitions may be smaller or larger (up to target + target/3) */ MESHOPTIMIZER_API size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size); @@ -909,6 +910,8 @@ inline size_t meshopt_simplifyWithUpdate(T* indices, size_t index_count, float* template inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error = NULL); template +inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error = NULL); +template inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error); template inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index); @@ -1293,6 +1296,15 @@ inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t in return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, NULL, target_index_count, target_error, result_error); } +template +inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error) +{ + meshopt_IndexAdapter in(NULL, indices, index_count); + meshopt_IndexAdapter out(destination, NULL, index_count); + + return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_lock, target_index_count, target_error, result_error); +} + template inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error) { diff --git a/3rdparty/meshoptimizer/src/partition.cpp b/3rdparty/meshoptimizer/src/partition.cpp index 3edc86442..c7a05a564 100644 --- a/3rdparty/meshoptimizer/src/partition.cpp +++ 
b/3rdparty/meshoptimizer/src/partition.cpp @@ -52,49 +52,44 @@ static void filterClusterIndices(unsigned int* data, unsigned int* offsets, cons offsets[cluster_count] = unsigned(cluster_write); } -static void computeClusterBounds(float* cluster_bounds, const unsigned int* cluster_indices, const unsigned int* cluster_offsets, size_t cluster_count, const float* vertex_positions, size_t vertex_positions_stride) +static float computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, float* out_center) { size_t vertex_stride_float = vertex_positions_stride / sizeof(float); - for (size_t i = 0; i < cluster_count; ++i) + float center[3] = {0, 0, 0}; + + // approximate center of the cluster by averaging all vertex positions + for (size_t j = 0; j < index_count; ++j) { - float center[3] = {0, 0, 0}; + const float* p = vertex_positions + indices[j] * vertex_stride_float; - // approximate center of the cluster by averaging all vertex positions - for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) - { - const float* p = vertex_positions + cluster_indices[j] * vertex_stride_float; - - center[0] += p[0]; - center[1] += p[1]; - center[2] += p[2]; - } - - // note: technically clusters can't be empty per meshopt_partitionCluster but we check for a division by zero in case that changes - if (size_t cluster_size = cluster_offsets[i + 1] - cluster_offsets[i]) - { - center[0] /= float(cluster_size); - center[1] /= float(cluster_size); - center[2] /= float(cluster_size); - } - - // compute radius of the bounding sphere for each cluster - float radiussq = 0; - - for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) - { - const float* p = vertex_positions + cluster_indices[j] * vertex_stride_float; - - float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]); - - radiussq = radiussq < d2 ? d2 : radiussq; - } - - cluster_bounds[i * 4 + 0] = center[0]; - cluster_bounds[i * 4 + 1] = center[1]; - cluster_bounds[i * 4 + 2] = center[2]; - cluster_bounds[i * 4 + 3] = sqrtf(radiussq); + center[0] += p[0]; + center[1] += p[1]; + center[2] += p[2]; } + + // note: technically clusters can't be empty per meshopt_partitionCluster but we check for a division by zero in case that changes + if (index_count) + { + center[0] /= float(index_count); + center[1] /= float(index_count); + center[2] /= float(index_count); + } + + // compute radius of the bounding sphere for each cluster + float radiussq = 0; + + for (size_t j = 0; j < index_count; ++j) + { + const float* p = vertex_positions + indices[j] * vertex_stride_float; + + float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]); + + radiussq = radiussq < d2 ? 
d2 : radiussq; + } + + memcpy(out_center, center, sizeof(center)); + return sqrtf(radiussq); } static void buildClusterAdjacency(ClusterAdjacency& adjacency, const unsigned int* cluster_indices, const unsigned int* cluster_offsets, size_t cluster_count, size_t vertex_count, meshopt_Allocator& allocator) @@ -211,6 +206,9 @@ struct ClusterGroup int next; unsigned int size; // 0 unless root unsigned int vertices; + + float center[3]; + float radius; }; struct GroupOrder @@ -285,15 +283,18 @@ static unsigned int countShared(const ClusterGroup* groups, int group1, int grou return total; } -static void mergeBounds(float* target, const float* source) +static void mergeBounds(ClusterGroup& target, const ClusterGroup& source) { - float r1 = target[3], r2 = source[3]; - float dx = source[0] - target[0], dy = source[1] - target[1], dz = source[2] - target[2]; + float r1 = target.radius, r2 = source.radius; + float dx = source.center[0] - target.center[0], dy = source.center[1] - target.center[1], dz = source.center[2] - target.center[2]; float d = sqrtf(dx * dx + dy * dy + dz * dz); if (d + r1 < r2) { - memcpy(target, source, 4 * sizeof(float)); + target.center[0] = source.center[0]; + target.center[1] = source.center[1]; + target.center[2] = source.center[2]; + target.radius = source.radius; return; } @@ -301,17 +302,17 @@ static void mergeBounds(float* target, const float* source) { float k = d > 0 ? (d + r2 - r1) / (2 * d) : 0.f; - target[0] += dx * k; - target[1] += dy * k; - target[2] += dz * k; - target[3] = (d + r2 + r1) / 2; + target.center[0] += dx * k; + target.center[1] += dy * k; + target.center[2] += dz * k; + target.radius = (d + r2 + r1) / 2; } } -static float boundsScore(const float* target, const float* source) +static float boundsScore(const ClusterGroup& target, const ClusterGroup& source) { - float r1 = target[3], r2 = source[3]; - float dx = source[0] - target[0], dy = source[1] - target[1], dz = source[2] - target[2]; + float r1 = target.radius, r2 = source.radius; + float dx = source.center[0] - target.center[0], dy = source.center[1] - target.center[1], dz = source.center[2] - target.center[2]; float d = sqrtf(dx * dx + dy * dy + dz * dz); float mr = d + r1 < r2 ? r2 : (d + r2 < r1 ? r1 : (d + r2 + r1) / 2); @@ -319,7 +320,7 @@ static float boundsScore(const float* target, const float* source) return mr > 0 ? 
r1 / mr : 0.f; } -static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdjacency& adjacency, size_t max_partition_size, const float* cluster_bounds) +static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdjacency& adjacency, size_t max_partition_size, bool use_bounds) { assert(groups[id].size > 0); @@ -347,8 +348,8 @@ static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdj float score = float(int(shared)) * (group_rsqrt + other_rsqrt); // incorporate spatial score to favor merging nearby groups - if (cluster_bounds) - score *= 1.f + 0.4f * boundsScore(&cluster_bounds[id * 4], &cluster_bounds[other * 4]); + if (use_bounds) + score *= 1.f + 0.4f * boundsScore(groups[id], groups[other]); if (score > best_score) { @@ -361,6 +362,118 @@ static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdj return best_group; } +static void mergeLeaf(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size) +{ + for (size_t i = 0; i < count; ++i) + { + unsigned int id = order[i]; + if (groups[id].size == 0 || groups[id].size >= target_partition_size) + continue; + + float best_score = -1.f; + int best_group = -1; + + for (size_t j = 0; j < count; ++j) + { + unsigned int other = order[j]; + if (id == other || groups[other].size == 0) + continue; + + if (groups[id].size + groups[other].size > max_partition_size) + continue; + + // favor merging nearby groups + float score = boundsScore(groups[id], groups[other]); + + if (score > best_score) + { + best_score = score; + best_group = other; + } + } + + // merge id *into* best_group; that way, we may merge more groups into the same best_group, maximizing the chance of reaching target + if (best_group != -1) + { + // combine groups by linking them together + unsigned int tail = best_group; + while (groups[tail].next >= 0) + tail = groups[tail].next; + + groups[tail].next = id; + + // update group sizes; note, we omit vertices update for simplicity as it's not used for spatial merge + groups[best_group].size += groups[id].size; + groups[id].size = 0; + + // merge bounding spheres + mergeBounds(groups[best_group], groups[id]); + groups[id].radius = 0.f; + } + } +} + +static size_t mergePartition(unsigned int* order, size_t count, const ClusterGroup* groups, int axis, float pivot) +{ + size_t m = 0; + + // invariant: elements in range [0, m) are < pivot, elements in range [m, i) are >= pivot + for (size_t i = 0; i < count; ++i) + { + float v = groups[order[i]].center[axis]; + + // swap(m, i) unconditionally + unsigned int t = order[m]; + order[m] = order[i]; + order[i] = t; + + // when v >= pivot, we swap i with m without advancing it, preserving invariants + m += v < pivot; + } + + return m; +} + +static void mergeSpatial(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size, size_t leaf_size) +{ + size_t total = 0; + for (size_t i = 0; i < count; ++i) + total += groups[order[i]].size; + + if (total <= max_partition_size || count <= leaf_size) + return mergeLeaf(groups, order, count, target_partition_size, max_partition_size); + + float mean[3] = {}; + float vars[3] = {}; + float runc = 1, runs = 1; + + // gather statistics on the points in the subtree using Welford's algorithm + for (size_t i = 0; i < count; ++i, runc += 1.f, runs = 1.f / runc) + { + const float* point = groups[order[i]].center; + + for (int k = 0; k < 3; ++k) + { + float delta = point[k] - 
mean[k]; + mean[k] += delta * runs; + vars[k] += delta * (point[k] - mean[k]); + } + } + + // split axis is one where the variance is largest + int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2); + + float split = mean[axis]; + size_t middle = mergePartition(order, count, groups, axis, split); + + // enforce balance for degenerate partitions + if (middle <= leaf_size / 2 || count - middle <= leaf_size / 2) + middle = count / 2; + + mergeSpatial(groups, order, middle, target_partition_size, max_partition_size, leaf_size); + mergeSpatial(groups, order + middle, count - middle, target_partition_size, max_partition_size, leaf_size); +} + } // namespace meshopt size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size) @@ -371,7 +484,7 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* assert(vertex_positions_stride % sizeof(float) == 0); assert(target_partition_size > 0); - size_t max_partition_size = target_partition_size + target_partition_size * 3 / 8; + size_t max_partition_size = target_partition_size + target_partition_size / 3; meshopt_Allocator allocator; @@ -385,20 +498,12 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* filterClusterIndices(cluster_newindices, cluster_offsets, cluster_indices, cluster_index_counts, cluster_count, used, vertex_count, total_index_count); cluster_indices = cluster_newindices; - // compute bounding sphere for each cluster if positions are provided - float* cluster_bounds = NULL; - - if (vertex_positions) - { - cluster_bounds = allocator.allocate(cluster_count * 4); - computeClusterBounds(cluster_bounds, cluster_indices, cluster_offsets, cluster_count, vertex_positions, vertex_positions_stride); - } - // build cluster adjacency along with edge weights (shared vertex count) ClusterAdjacency adjacency = {}; buildClusterAdjacency(adjacency, cluster_indices, cluster_offsets, cluster_count, vertex_count, allocator); ClusterGroup* groups = allocator.allocate(cluster_count); + memset(groups, 0, sizeof(ClusterGroup) * cluster_count); GroupOrder* order = allocator.allocate(cluster_count); size_t pending = 0; @@ -412,6 +517,10 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* groups[i].vertices = cluster_offsets[i + 1] - cluster_offsets[i]; assert(groups[i].vertices > 0); + // compute bounding sphere for each cluster if positions are provided + if (vertex_positions) + groups[i].radius = computeClusterBounds(cluster_indices + cluster_offsets[i], cluster_offsets[i + 1] - cluster_offsets[i], vertex_positions, vertex_positions_stride, groups[i].center); + GroupOrder item = {}; item.id = unsigned(i); item.order = groups[i].vertices; @@ -439,7 +548,7 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* if (groups[top.id].size >= target_partition_size) continue; - int best_group = pickGroupToMerge(groups, top.id, adjacency, max_partition_size, cluster_bounds); + int best_group = pickGroupToMerge(groups, top.id, adjacency, max_partition_size, /* use_bounds= */ vertex_positions); // we can't grow the group any more, emit as is if (best_group == -1) @@ -449,14 +558,11 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* unsigned int shared = 
countShared(groups, top.id, best_group, adjacency); // combine groups by linking them together - assert(groups[best_group].size > 0); + unsigned int tail = top.id; + while (groups[tail].next >= 0) + tail = groups[tail].next; - for (int i = top.id; i >= 0; i = groups[i].next) - if (groups[i].next < 0) - { - groups[i].next = best_group; - break; - } + groups[tail].next = best_group; // update group sizes; note, the vertex update is a O(1) approximation which avoids recomputing the true size groups[top.id].size += groups[best_group].size; @@ -467,10 +573,10 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* groups[best_group].vertices = 0; // merge bounding spheres if bounds are available - if (cluster_bounds) + if (vertex_positions) { - mergeBounds(&cluster_bounds[top.id * 4], &cluster_bounds[best_group * 4]); - memset(&cluster_bounds[best_group * 4], 0, 4 * sizeof(float)); + mergeBounds(groups[top.id], groups[best_group]); + groups[best_group].radius = 0; } // re-associate all clusters back to the merged group @@ -481,6 +587,20 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* heapPush(order, pending++, top); } + // if vertex positions are provided, we do a final pass to see if we can merge small groups based on spatial locality alone + if (vertex_positions) + { + unsigned int* merge_order = reinterpret_cast(order); + size_t merge_offset = 0; + + for (size_t i = 0; i < cluster_count; ++i) + if (groups[i].size) + merge_order[merge_offset++] = unsigned(i); + + mergeSpatial(groups, merge_order, merge_offset, target_partition_size, max_partition_size, /* leaf_size= */ 8); + } + + // output each remaining group size_t next_group = 0; for (size_t i = 0; i < cluster_count; ++i) diff --git a/3rdparty/meshoptimizer/src/simplifier.cpp b/3rdparty/meshoptimizer/src/simplifier.cpp index f1effc38e..5dcb459ce 100644 --- a/3rdparty/meshoptimizer/src/simplifier.cpp +++ b/3rdparty/meshoptimizer/src/simplifier.cpp @@ -243,14 +243,18 @@ static unsigned int* buildSparseRemap(unsigned int* indices, size_t index_count, { // use a bit set to compute the precise number of unique vertices unsigned char* filter = allocator.allocate((vertex_count + 7) / 8); - memset(filter, 0, (vertex_count + 7) / 8); + + for (size_t i = 0; i < index_count; ++i) + { + unsigned int index = indices[i]; + assert(index < vertex_count); + filter[index / 8] = 0; + } size_t unique = 0; for (size_t i = 0; i < index_count; ++i) { unsigned int index = indices[i]; - assert(index < vertex_count); - unique += (filter[index / 8] & (1 << (index % 8))) == 0; filter[index / 8] |= 1 << (index % 8); } @@ -269,7 +273,6 @@ static unsigned int* buildSparseRemap(unsigned int* indices, size_t index_count, for (size_t i = 0; i < index_count; ++i) { unsigned int index = indices[i]; - unsigned int* entry = hashLookup2(revremap, revremap_size, hasher, index, ~0u); if (*entry == ~0u) @@ -2264,7 +2267,7 @@ static float interpolate(float y, float x0, float y0, float x1, float y1, float // three point interpolation from "revenge of interpolation search" paper float num = (y1 - y) * (x1 - x2) * (x1 - x0) * (y2 - y0); float den = (y2 - y) * (x1 - x2) * (y0 - y1) + (y0 - y) * (x1 - x0) * (y1 - y2); - return x1 + num / den; + return x1 + (den == 0.f ? 
0.f : num / den); } } // namespace meshopt diff --git a/3rdparty/meshoptimizer/src/vertexfilter.cpp b/3rdparty/meshoptimizer/src/vertexfilter.cpp index af15d59c6..b20d998ca 100644 --- a/3rdparty/meshoptimizer/src/vertexfilter.cpp +++ b/3rdparty/meshoptimizer/src/vertexfilter.cpp @@ -109,28 +109,33 @@ static void decodeFilterOct(T* data, size_t count) static void decodeFilterQuat(short* data, size_t count) { - const float scale = 1.f / sqrtf(2.f); + const float scale = 32767.f / sqrtf(2.f); for (size_t i = 0; i < count; ++i) { // recover scale from the high byte of the component int sf = data[i * 4 + 3] | 3; - float ss = scale / float(sf); + float s = float(sf); - // convert x/y/z to [-1..1] (scaled...) - float x = float(data[i * 4 + 0]) * ss; - float y = float(data[i * 4 + 1]) * ss; - float z = float(data[i * 4 + 2]) * ss; + // convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf) + float x = float(data[i * 4 + 0]); + float y = float(data[i * 4 + 1]); + float z = float(data[i * 4 + 2]); - // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors - float ww = 1.f - x * x - y * y - z * z; + // reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors + float ws = s * s; + float ww = ws * 2.f - x * x - y * y - z * z; float w = sqrtf(ww >= 0.f ? ww : 0.f); + // compute final scale; note that all computations above are unscaled + // we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range + float ss = scale / s; + // rounded signed float->int - int xf = int(x * 32767.f + (x >= 0.f ? 0.5f : -0.5f)); - int yf = int(y * 32767.f + (y >= 0.f ? 0.5f : -0.5f)); - int zf = int(z * 32767.f + (z >= 0.f ? 0.5f : -0.5f)); - int wf = int(w * 32767.f + 0.5f); + int xf = int(x * ss + (x >= 0.f ? 0.5f : -0.5f)); + int yf = int(y * ss + (y >= 0.f ? 0.5f : -0.5f)); + int zf = int(z * ss + (z >= 0.f ? 0.5f : -0.5f)); + int wf = int(w * ss + 0.5f); int qc = data[i * 4 + 3] & 3; @@ -347,7 +352,7 @@ static void decodeFilterOctSimd16(short* data, size_t count) static void decodeFilterQuatSimd(short* data, size_t count) { - const float scale = 1.f / sqrtf(2.f); + const float scale = 32767.f / sqrtf(2.f); for (size_t i = 0; i < count; i += 4) { @@ -366,24 +371,27 @@ static void decodeFilterQuatSimd(short* data, size_t count) // get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f) __m128i sf = _mm_or_si128(cf, _mm_set1_epi32(3)); - __m128 ss = _mm_div_ps(_mm_set1_ps(scale), _mm_cvtepi32_ps(sf)); + __m128 s = _mm_cvtepi32_ps(sf); - // convert x/y/z to [-1..1] (scaled...) - __m128 x = _mm_mul_ps(_mm_cvtepi32_ps(xf), ss); - __m128 y = _mm_mul_ps(_mm_cvtepi32_ps(yf), ss); - __m128 z = _mm_mul_ps(_mm_cvtepi32_ps(zf), ss); + // convert x/y/z to floating point (unscaled! 
implied scale of 1/sqrt(2.f) * 1/sf) + __m128 x = _mm_cvtepi32_ps(xf); + __m128 y = _mm_cvtepi32_ps(yf); + __m128 z = _mm_cvtepi32_ps(zf); - // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors - __m128 ww = _mm_sub_ps(_mm_set1_ps(1.f), _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)))); + // reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors + __m128 ws = _mm_mul_ps(s, _mm_add_ps(s, s)); // s*2s instead of 2*(s*s) to work around clang bug with integer multiplication + __m128 ww = _mm_sub_ps(ws, _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)))); __m128 w = _mm_sqrt_ps(_mm_max_ps(ww, _mm_setzero_ps())); - __m128 s = _mm_set1_ps(32767.f); + // compute final scale; note that all computations above are unscaled + // we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range + __m128 ss = _mm_div_ps(_mm_set1_ps(scale), s); // rounded signed float->int - __m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s)); - __m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s)); - __m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s)); - __m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, s)); + __m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, ss)); + __m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, ss)); + __m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, ss)); + __m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, ss)); // mix x/z and w/y to make 16-bit unpack easier __m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16)); @@ -658,7 +666,7 @@ static void decodeFilterOctSimd16(short* data, size_t count) static void decodeFilterQuatSimd(short* data, size_t count) { - const float scale = 1.f / sqrtf(2.f); + const float scale = 32767.f / sqrtf(2.f); for (size_t i = 0; i < count; i += 4) { @@ -677,27 +685,30 @@ static void decodeFilterQuatSimd(short* data, size_t count) // get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f) int32x4_t sf = vorrq_s32(cf, vdupq_n_s32(3)); - float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), vcvtq_f32_s32(sf)); + float32x4_t s = vcvtq_f32_s32(sf); - // convert x/y/z to [-1..1] (scaled...) - float32x4_t x = vmulq_f32(vcvtq_f32_s32(xf), ss); - float32x4_t y = vmulq_f32(vcvtq_f32_s32(yf), ss); - float32x4_t z = vmulq_f32(vcvtq_f32_s32(zf), ss); + // convert x/y/z to floating point (unscaled! 
implied scale of 1/sqrt(2.f) * 1/sf) + float32x4_t x = vcvtq_f32_s32(xf); + float32x4_t y = vcvtq_f32_s32(yf); + float32x4_t z = vcvtq_f32_s32(zf); - // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors - float32x4_t ww = vsubq_f32(vdupq_n_f32(1.f), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)))); + // reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors + float32x4_t ws = vmulq_f32(s, s); + float32x4_t ww = vsubq_f32(vaddq_f32(ws, ws), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)))); float32x4_t w = vsqrtq_f32(vmaxq_f32(ww, vdupq_n_f32(0.f))); - float32x4_t s = vdupq_n_f32(32767.f); + // compute final scale; note that all computations above are unscaled + // we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range + float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), s); // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction const float32x4_t fsnap = vdupq_n_f32(3 << 22); - int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap)); - int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap)); - int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap)); - int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, s), fsnap)); + int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, ss), fsnap)); + int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, ss), fsnap)); + int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, ss), fsnap)); + int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, ss), fsnap)); // mix x/z and w/y to make 16-bit unpack easier int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16)); @@ -958,7 +969,7 @@ static void decodeFilterOctSimd16(short* data, size_t count) static void decodeFilterQuatSimd(short* data, size_t count) { - const float scale = 1.f / sqrtf(2.f); + const float scale = 32767.f / sqrtf(2.f); for (size_t i = 0; i < count; i += 4) { @@ -977,28 +988,31 @@ static void decodeFilterQuatSimd(short* data, size_t count) // get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f) v128_t sf = wasm_v128_or(cf, wasm_i32x4_splat(3)); - v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), wasm_f32x4_convert_i32x4(sf)); + v128_t s = wasm_f32x4_convert_i32x4(sf); - // convert x/y/z to [-1..1] (scaled...) - v128_t x = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(xf), ss); - v128_t y = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(yf), ss); - v128_t z = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(zf), ss); + // convert x/y/z to floating point (unscaled! 
implied scale of 1/sqrt(2.f) * 1/sf) + v128_t x = wasm_f32x4_convert_i32x4(xf); + v128_t y = wasm_f32x4_convert_i32x4(yf); + v128_t z = wasm_f32x4_convert_i32x4(zf); - // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors + // reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors // note: i32x4_max with 0 is equivalent to f32x4_max - v128_t ww = wasm_f32x4_sub(wasm_f32x4_splat(1.f), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)))); + v128_t ws = wasm_f32x4_mul(s, s); + v128_t ww = wasm_f32x4_sub(wasm_f32x4_add(ws, ws), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)))); v128_t w = wasm_f32x4_sqrt(wasm_i32x4_max(ww, wasm_i32x4_splat(0))); - v128_t s = wasm_f32x4_splat(32767.f); + // compute final scale; note that all computations above are unscaled + // we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range + v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), s); // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction const v128_t fsnap = wasm_f32x4_splat(3 << 22); - v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap); - v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap); - v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap); - v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, s), fsnap); + v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, ss), fsnap); + v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, ss), fsnap); + v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, ss), fsnap); + v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, ss), fsnap); // mix x/z and w/y to make 16-bit unpack easier v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16));