diff --git a/3rdparty/meshoptimizer/src/clusterizer.cpp b/3rdparty/meshoptimizer/src/clusterizer.cpp index 8dd6fb54d..2b29e2e60 100644 --- a/3rdparty/meshoptimizer/src/clusterizer.cpp +++ b/3rdparty/meshoptimizer/src/clusterizer.cpp @@ -595,7 +595,7 @@ struct KDNode unsigned int children : 30; }; -static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, unsigned int axis, float pivot) +static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, int axis, float pivot) { size_t m = 0; @@ -666,7 +666,7 @@ static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const } // split axis is one where the variance is largest - unsigned int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2); + int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2); float split = mean[axis]; size_t middle = kdtreePartition(indices, count, points, stride, axis, split); @@ -768,8 +768,8 @@ static float boxMerge(BVHBoxT& box, const BVHBox& other) min = _mm_min_ps(min, _mm_loadu_ps(other.min)); max = _mm_max_ps(max, _mm_loadu_ps(other.max)); - _mm_store_ps(box.min, min); - _mm_store_ps(box.max, max); + _mm_storeu_ps(box.min, min); + _mm_storeu_ps(box.max, max); __m128 size = _mm_sub_ps(max, min); __m128 size_yzx = _mm_shuffle_ps(size, size, _MM_SHUFFLE(0, 0, 2, 1)); diff --git a/3rdparty/meshoptimizer/src/meshoptimizer.h b/3rdparty/meshoptimizer/src/meshoptimizer.h index 535853d80..46778feff 100644 --- a/3rdparty/meshoptimizer/src/meshoptimizer.h +++ b/3rdparty/meshoptimizer/src/meshoptimizer.h @@ -125,14 +125,14 @@ MESHOPTIMIZER_API void meshopt_generateShadowIndexBuffer(unsigned int* destinati MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count); /** - * Experimental: Generates a remap table that maps all vertices with the same position to the same (existing) index. + * Generates a remap table that maps all vertices with the same position to the same (existing) index. * Similarly to meshopt_generateShadowIndexBuffer, this can be helpful to pre-process meshes for position-only rendering. * This can also be used to implement algorithms that require positional-only connectivity, such as hierarchical simplification. * * destination must contain enough space for the resulting remap table (vertex_count elements) * vertex_positions should have float3 position in the first 12 bytes of each vertex */ -MESHOPTIMIZER_EXPERIMENTAL void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); +MESHOPTIMIZER_API void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); /** * Generate index buffer that can be used as a geometry shader input with triangle adjacency topology @@ -418,7 +418,7 @@ enum meshopt_SimplifyErrorAbsolute = 1 << 2, /* Remove disconnected parts of the mesh during simplification incrementally, regardless of the topological restrictions inside components. */ meshopt_SimplifyPrune = 1 << 3, - /* Experimental: Produce more regular triangle sizes and shapes during simplification, at some cost to geometric quality. 
*/ + /* Produce more regular triangle sizes and shapes during simplification, at some cost to geometric quality. */ meshopt_SimplifyRegularize = 1 << 4, /* Experimental: Allow collapses across attribute discontinuities, except for vertices that are tagged with meshopt_SimplifyVertex_Protect in vertex_lock. */ meshopt_SimplifyPermissive = 1 << 5, @@ -501,7 +501,7 @@ MESHOPTIMIZER_API size_t meshopt_simplifyWithAttributes(unsigned int* destinatio MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error); /** - * Experimental: Mesh simplifier (sloppy) + * Mesh simplifier (sloppy) * Reduces the number of triangles in the mesh, sacrificing mesh appearance for simplification performance * The algorithm doesn't preserve mesh topology but can stop short of the target goal based on target error. * Returns the number of indices after simplification, with destination containing new index data @@ -514,7 +514,7 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithUpdate(unsigned int* indic * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1] * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error); +MESHOPTIMIZER_API size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error); /** * Mesh simplifier (pruner) @@ -670,7 +670,7 @@ MESHOPTIMIZER_API size_t meshopt_buildMeshletsScan(struct meshopt_Meshlet* meshl MESHOPTIMIZER_API size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles); /** - * Experimental: Meshlet builder with flexible cluster sizes + * Meshlet builder with flexible cluster sizes * Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but allows to specify minimum and maximum number of triangles per meshlet. * Clusters between min and max triangle counts are split when the cluster size would have exceeded the expected cluster size by more than split_factor. 
* @@ -682,10 +682,10 @@ MESHOPTIMIZER_API size_t meshopt_buildMeshletsBound(size_t index_count, size_t m * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency * split_factor should be set to a non-negative value; when greater than 0, clusters that have large bounds may be split unless they are under the min_triangles threshold */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsFlex(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor); +MESHOPTIMIZER_API size_t meshopt_buildMeshletsFlex(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor); /** - * Experimental: Meshlet builder that produces clusters optimized for raytracing + * Meshlet builder that produces clusters optimized for raytracing * Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but optimizes cluster subdivision for raytracing and allows to specify minimum and maximum number of triangles per meshlet. * * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (*not* max!) @@ -695,7 +695,7 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsFlex(struct meshopt_Meshl * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles) * fill_weight allows to prioritize clusters that are closer to maximum size at some cost to SAH quality; 0.5 is a safe default */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight); +MESHOPTIMIZER_API size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight); /** * Meshlet optimizer @@ -761,13 +761,14 @@ MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeSphereBounds(const float* /** * Cluster partitioner * Partitions clusters into groups of similar size, prioritizing grouping clusters that share vertices or are close to each other. + * When vertex positions are not provided, only clusters that share vertices will be grouped together, which may result in small partitions for some inputs. 
* * destination must contain enough space for the resulting partition data (cluster_count elements) * destination[i] will contain the partition id for cluster i, with the total number of partitions returned by the function * cluster_indices should have the vertex indices referenced by each cluster, stored sequentially * cluster_index_counts should have the number of indices in each cluster; sum of all cluster_index_counts must be equal to total_index_count - * vertex_positions should have float3 position in the first 12 bytes of each vertex (or can be NULL if not used) - * target_partition_size is a target size for each partition, in clusters; the resulting partitions may be smaller or larger + * vertex_positions can be NULL; when it's not NULL, it should have float3 position in the first 12 bytes of each vertex + * target_partition_size is a target size for each partition, in clusters; the resulting partitions may be smaller or larger (up to target + target/3) */ MESHOPTIMIZER_API size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size); @@ -909,6 +910,8 @@ inline size_t meshopt_simplifyWithUpdate(T* indices, size_t index_count, float* template inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error = NULL); template +inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error = NULL); +template inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error); template inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index); @@ -1293,6 +1296,15 @@ inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t in return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, NULL, target_index_count, target_error, result_error); } +template +inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error) +{ + meshopt_IndexAdapter in(NULL, indices, index_count); + meshopt_IndexAdapter out(destination, NULL, index_count); + + return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_lock, target_index_count, target_error, result_error); +} + template inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error) { diff --git a/3rdparty/meshoptimizer/src/partition.cpp b/3rdparty/meshoptimizer/src/partition.cpp index 3edc86442..c7a05a564 100644 --- a/3rdparty/meshoptimizer/src/partition.cpp +++ 
b/3rdparty/meshoptimizer/src/partition.cpp @@ -52,49 +52,44 @@ static void filterClusterIndices(unsigned int* data, unsigned int* offsets, cons offsets[cluster_count] = unsigned(cluster_write); } -static void computeClusterBounds(float* cluster_bounds, const unsigned int* cluster_indices, const unsigned int* cluster_offsets, size_t cluster_count, const float* vertex_positions, size_t vertex_positions_stride) +static float computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, float* out_center) { size_t vertex_stride_float = vertex_positions_stride / sizeof(float); - for (size_t i = 0; i < cluster_count; ++i) + float center[3] = {0, 0, 0}; + + // approximate center of the cluster by averaging all vertex positions + for (size_t j = 0; j < index_count; ++j) { - float center[3] = {0, 0, 0}; + const float* p = vertex_positions + indices[j] * vertex_stride_float; - // approximate center of the cluster by averaging all vertex positions - for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) - { - const float* p = vertex_positions + cluster_indices[j] * vertex_stride_float; - - center[0] += p[0]; - center[1] += p[1]; - center[2] += p[2]; - } - - // note: technically clusters can't be empty per meshopt_partitionCluster but we check for a division by zero in case that changes - if (size_t cluster_size = cluster_offsets[i + 1] - cluster_offsets[i]) - { - center[0] /= float(cluster_size); - center[1] /= float(cluster_size); - center[2] /= float(cluster_size); - } - - // compute radius of the bounding sphere for each cluster - float radiussq = 0; - - for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) - { - const float* p = vertex_positions + cluster_indices[j] * vertex_stride_float; - - float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]); - - radiussq = radiussq < d2 ? d2 : radiussq; - } - - cluster_bounds[i * 4 + 0] = center[0]; - cluster_bounds[i * 4 + 1] = center[1]; - cluster_bounds[i * 4 + 2] = center[2]; - cluster_bounds[i * 4 + 3] = sqrtf(radiussq); + center[0] += p[0]; + center[1] += p[1]; + center[2] += p[2]; } + + // note: technically clusters can't be empty per meshopt_partitionCluster but we check for a division by zero in case that changes + if (index_count) + { + center[0] /= float(index_count); + center[1] /= float(index_count); + center[2] /= float(index_count); + } + + // compute radius of the bounding sphere for each cluster + float radiussq = 0; + + for (size_t j = 0; j < index_count; ++j) + { + const float* p = vertex_positions + indices[j] * vertex_stride_float; + + float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]); + + radiussq = radiussq < d2 ? 
d2 : radiussq; + } + + memcpy(out_center, center, sizeof(center)); + return sqrtf(radiussq); } static void buildClusterAdjacency(ClusterAdjacency& adjacency, const unsigned int* cluster_indices, const unsigned int* cluster_offsets, size_t cluster_count, size_t vertex_count, meshopt_Allocator& allocator) @@ -211,6 +206,9 @@ struct ClusterGroup int next; unsigned int size; // 0 unless root unsigned int vertices; + + float center[3]; + float radius; }; struct GroupOrder @@ -285,15 +283,18 @@ static unsigned int countShared(const ClusterGroup* groups, int group1, int grou return total; } -static void mergeBounds(float* target, const float* source) +static void mergeBounds(ClusterGroup& target, const ClusterGroup& source) { - float r1 = target[3], r2 = source[3]; - float dx = source[0] - target[0], dy = source[1] - target[1], dz = source[2] - target[2]; + float r1 = target.radius, r2 = source.radius; + float dx = source.center[0] - target.center[0], dy = source.center[1] - target.center[1], dz = source.center[2] - target.center[2]; float d = sqrtf(dx * dx + dy * dy + dz * dz); if (d + r1 < r2) { - memcpy(target, source, 4 * sizeof(float)); + target.center[0] = source.center[0]; + target.center[1] = source.center[1]; + target.center[2] = source.center[2]; + target.radius = source.radius; return; } @@ -301,17 +302,17 @@ static void mergeBounds(float* target, const float* source) { float k = d > 0 ? (d + r2 - r1) / (2 * d) : 0.f; - target[0] += dx * k; - target[1] += dy * k; - target[2] += dz * k; - target[3] = (d + r2 + r1) / 2; + target.center[0] += dx * k; + target.center[1] += dy * k; + target.center[2] += dz * k; + target.radius = (d + r2 + r1) / 2; } } -static float boundsScore(const float* target, const float* source) +static float boundsScore(const ClusterGroup& target, const ClusterGroup& source) { - float r1 = target[3], r2 = source[3]; - float dx = source[0] - target[0], dy = source[1] - target[1], dz = source[2] - target[2]; + float r1 = target.radius, r2 = source.radius; + float dx = source.center[0] - target.center[0], dy = source.center[1] - target.center[1], dz = source.center[2] - target.center[2]; float d = sqrtf(dx * dx + dy * dy + dz * dz); float mr = d + r1 < r2 ? r2 : (d + r2 < r1 ? r1 : (d + r2 + r1) / 2); @@ -319,7 +320,7 @@ static float boundsScore(const float* target, const float* source) return mr > 0 ? 
r1 / mr : 0.f; } -static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdjacency& adjacency, size_t max_partition_size, const float* cluster_bounds) +static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdjacency& adjacency, size_t max_partition_size, bool use_bounds) { assert(groups[id].size > 0); @@ -347,8 +348,8 @@ static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdj float score = float(int(shared)) * (group_rsqrt + other_rsqrt); // incorporate spatial score to favor merging nearby groups - if (cluster_bounds) - score *= 1.f + 0.4f * boundsScore(&cluster_bounds[id * 4], &cluster_bounds[other * 4]); + if (use_bounds) + score *= 1.f + 0.4f * boundsScore(groups[id], groups[other]); if (score > best_score) { @@ -361,6 +362,118 @@ static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdj return best_group; } +static void mergeLeaf(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size) +{ + for (size_t i = 0; i < count; ++i) + { + unsigned int id = order[i]; + if (groups[id].size == 0 || groups[id].size >= target_partition_size) + continue; + + float best_score = -1.f; + int best_group = -1; + + for (size_t j = 0; j < count; ++j) + { + unsigned int other = order[j]; + if (id == other || groups[other].size == 0) + continue; + + if (groups[id].size + groups[other].size > max_partition_size) + continue; + + // favor merging nearby groups + float score = boundsScore(groups[id], groups[other]); + + if (score > best_score) + { + best_score = score; + best_group = other; + } + } + + // merge id *into* best_group; that way, we may merge more groups into the same best_group, maximizing the chance of reaching target + if (best_group != -1) + { + // combine groups by linking them together + unsigned int tail = best_group; + while (groups[tail].next >= 0) + tail = groups[tail].next; + + groups[tail].next = id; + + // update group sizes; note, we omit vertices update for simplicity as it's not used for spatial merge + groups[best_group].size += groups[id].size; + groups[id].size = 0; + + // merge bounding spheres + mergeBounds(groups[best_group], groups[id]); + groups[id].radius = 0.f; + } + } +} + +static size_t mergePartition(unsigned int* order, size_t count, const ClusterGroup* groups, int axis, float pivot) +{ + size_t m = 0; + + // invariant: elements in range [0, m) are < pivot, elements in range [m, i) are >= pivot + for (size_t i = 0; i < count; ++i) + { + float v = groups[order[i]].center[axis]; + + // swap(m, i) unconditionally + unsigned int t = order[m]; + order[m] = order[i]; + order[i] = t; + + // when v >= pivot, we swap i with m without advancing it, preserving invariants + m += v < pivot; + } + + return m; +} + +static void mergeSpatial(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size, size_t leaf_size) +{ + size_t total = 0; + for (size_t i = 0; i < count; ++i) + total += groups[order[i]].size; + + if (total <= max_partition_size || count <= leaf_size) + return mergeLeaf(groups, order, count, target_partition_size, max_partition_size); + + float mean[3] = {}; + float vars[3] = {}; + float runc = 1, runs = 1; + + // gather statistics on the points in the subtree using Welford's algorithm + for (size_t i = 0; i < count; ++i, runc += 1.f, runs = 1.f / runc) + { + const float* point = groups[order[i]].center; + + for (int k = 0; k < 3; ++k) + { + float delta = point[k] - 
mean[k]; + mean[k] += delta * runs; + vars[k] += delta * (point[k] - mean[k]); + } + } + + // split axis is one where the variance is largest + int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2); + + float split = mean[axis]; + size_t middle = mergePartition(order, count, groups, axis, split); + + // enforce balance for degenerate partitions + if (middle <= leaf_size / 2 || count - middle <= leaf_size / 2) + middle = count / 2; + + mergeSpatial(groups, order, middle, target_partition_size, max_partition_size, leaf_size); + mergeSpatial(groups, order + middle, count - middle, target_partition_size, max_partition_size, leaf_size); +} + } // namespace meshopt size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size) @@ -371,7 +484,7 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* assert(vertex_positions_stride % sizeof(float) == 0); assert(target_partition_size > 0); - size_t max_partition_size = target_partition_size + target_partition_size * 3 / 8; + size_t max_partition_size = target_partition_size + target_partition_size / 3; meshopt_Allocator allocator; @@ -385,20 +498,12 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* filterClusterIndices(cluster_newindices, cluster_offsets, cluster_indices, cluster_index_counts, cluster_count, used, vertex_count, total_index_count); cluster_indices = cluster_newindices; - // compute bounding sphere for each cluster if positions are provided - float* cluster_bounds = NULL; - - if (vertex_positions) - { - cluster_bounds = allocator.allocate(cluster_count * 4); - computeClusterBounds(cluster_bounds, cluster_indices, cluster_offsets, cluster_count, vertex_positions, vertex_positions_stride); - } - // build cluster adjacency along with edge weights (shared vertex count) ClusterAdjacency adjacency = {}; buildClusterAdjacency(adjacency, cluster_indices, cluster_offsets, cluster_count, vertex_count, allocator); ClusterGroup* groups = allocator.allocate(cluster_count); + memset(groups, 0, sizeof(ClusterGroup) * cluster_count); GroupOrder* order = allocator.allocate(cluster_count); size_t pending = 0; @@ -412,6 +517,10 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* groups[i].vertices = cluster_offsets[i + 1] - cluster_offsets[i]; assert(groups[i].vertices > 0); + // compute bounding sphere for each cluster if positions are provided + if (vertex_positions) + groups[i].radius = computeClusterBounds(cluster_indices + cluster_offsets[i], cluster_offsets[i + 1] - cluster_offsets[i], vertex_positions, vertex_positions_stride, groups[i].center); + GroupOrder item = {}; item.id = unsigned(i); item.order = groups[i].vertices; @@ -439,7 +548,7 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* if (groups[top.id].size >= target_partition_size) continue; - int best_group = pickGroupToMerge(groups, top.id, adjacency, max_partition_size, cluster_bounds); + int best_group = pickGroupToMerge(groups, top.id, adjacency, max_partition_size, /* use_bounds= */ vertex_positions); // we can't grow the group any more, emit as is if (best_group == -1) @@ -449,14 +558,11 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* unsigned int shared = 
countShared(groups, top.id, best_group, adjacency); // combine groups by linking them together - assert(groups[best_group].size > 0); + unsigned int tail = top.id; + while (groups[tail].next >= 0) + tail = groups[tail].next; - for (int i = top.id; i >= 0; i = groups[i].next) - if (groups[i].next < 0) - { - groups[i].next = best_group; - break; - } + groups[tail].next = best_group; // update group sizes; note, the vertex update is a O(1) approximation which avoids recomputing the true size groups[top.id].size += groups[best_group].size; @@ -467,10 +573,10 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* groups[best_group].vertices = 0; // merge bounding spheres if bounds are available - if (cluster_bounds) + if (vertex_positions) { - mergeBounds(&cluster_bounds[top.id * 4], &cluster_bounds[best_group * 4]); - memset(&cluster_bounds[best_group * 4], 0, 4 * sizeof(float)); + mergeBounds(groups[top.id], groups[best_group]); + groups[best_group].radius = 0; } // re-associate all clusters back to the merged group @@ -481,6 +587,20 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* heapPush(order, pending++, top); } + // if vertex positions are provided, we do a final pass to see if we can merge small groups based on spatial locality alone + if (vertex_positions) + { + unsigned int* merge_order = reinterpret_cast(order); + size_t merge_offset = 0; + + for (size_t i = 0; i < cluster_count; ++i) + if (groups[i].size) + merge_order[merge_offset++] = unsigned(i); + + mergeSpatial(groups, merge_order, merge_offset, target_partition_size, max_partition_size, /* leaf_size= */ 8); + } + + // output each remaining group size_t next_group = 0; for (size_t i = 0; i < cluster_count; ++i) diff --git a/3rdparty/meshoptimizer/src/simplifier.cpp b/3rdparty/meshoptimizer/src/simplifier.cpp index f1effc38e..5dcb459ce 100644 --- a/3rdparty/meshoptimizer/src/simplifier.cpp +++ b/3rdparty/meshoptimizer/src/simplifier.cpp @@ -243,14 +243,18 @@ static unsigned int* buildSparseRemap(unsigned int* indices, size_t index_count, { // use a bit set to compute the precise number of unique vertices unsigned char* filter = allocator.allocate((vertex_count + 7) / 8); - memset(filter, 0, (vertex_count + 7) / 8); + + for (size_t i = 0; i < index_count; ++i) + { + unsigned int index = indices[i]; + assert(index < vertex_count); + filter[index / 8] = 0; + } size_t unique = 0; for (size_t i = 0; i < index_count; ++i) { unsigned int index = indices[i]; - assert(index < vertex_count); - unique += (filter[index / 8] & (1 << (index % 8))) == 0; filter[index / 8] |= 1 << (index % 8); } @@ -269,7 +273,6 @@ static unsigned int* buildSparseRemap(unsigned int* indices, size_t index_count, for (size_t i = 0; i < index_count; ++i) { unsigned int index = indices[i]; - unsigned int* entry = hashLookup2(revremap, revremap_size, hasher, index, ~0u); if (*entry == ~0u) @@ -2264,7 +2267,7 @@ static float interpolate(float y, float x0, float y0, float x1, float y1, float // three point interpolation from "revenge of interpolation search" paper float num = (y1 - y) * (x1 - x2) * (x1 - x0) * (y2 - y0); float den = (y2 - y) * (x1 - x2) * (y0 - y1) + (y0 - y) * (x1 - x0) * (y1 - y2); - return x1 + num / den; + return x1 + (den == 0.f ? 
0.f : num / den); } } // namespace meshopt diff --git a/3rdparty/meshoptimizer/src/vertexfilter.cpp b/3rdparty/meshoptimizer/src/vertexfilter.cpp index af15d59c6..b20d998ca 100644 --- a/3rdparty/meshoptimizer/src/vertexfilter.cpp +++ b/3rdparty/meshoptimizer/src/vertexfilter.cpp @@ -109,28 +109,33 @@ static void decodeFilterOct(T* data, size_t count) static void decodeFilterQuat(short* data, size_t count) { - const float scale = 1.f / sqrtf(2.f); + const float scale = 32767.f / sqrtf(2.f); for (size_t i = 0; i < count; ++i) { // recover scale from the high byte of the component int sf = data[i * 4 + 3] | 3; - float ss = scale / float(sf); + float s = float(sf); - // convert x/y/z to [-1..1] (scaled...) - float x = float(data[i * 4 + 0]) * ss; - float y = float(data[i * 4 + 1]) * ss; - float z = float(data[i * 4 + 2]) * ss; + // convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf) + float x = float(data[i * 4 + 0]); + float y = float(data[i * 4 + 1]); + float z = float(data[i * 4 + 2]); - // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors - float ww = 1.f - x * x - y * y - z * z; + // reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors + float ws = s * s; + float ww = ws * 2.f - x * x - y * y - z * z; float w = sqrtf(ww >= 0.f ? ww : 0.f); + // compute final scale; note that all computations above are unscaled + // we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range + float ss = scale / s; + // rounded signed float->int - int xf = int(x * 32767.f + (x >= 0.f ? 0.5f : -0.5f)); - int yf = int(y * 32767.f + (y >= 0.f ? 0.5f : -0.5f)); - int zf = int(z * 32767.f + (z >= 0.f ? 0.5f : -0.5f)); - int wf = int(w * 32767.f + 0.5f); + int xf = int(x * ss + (x >= 0.f ? 0.5f : -0.5f)); + int yf = int(y * ss + (y >= 0.f ? 0.5f : -0.5f)); + int zf = int(z * ss + (z >= 0.f ? 0.5f : -0.5f)); + int wf = int(w * ss + 0.5f); int qc = data[i * 4 + 3] & 3; @@ -347,7 +352,7 @@ static void decodeFilterOctSimd16(short* data, size_t count) static void decodeFilterQuatSimd(short* data, size_t count) { - const float scale = 1.f / sqrtf(2.f); + const float scale = 32767.f / sqrtf(2.f); for (size_t i = 0; i < count; i += 4) { @@ -366,24 +371,27 @@ static void decodeFilterQuatSimd(short* data, size_t count) // get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f) __m128i sf = _mm_or_si128(cf, _mm_set1_epi32(3)); - __m128 ss = _mm_div_ps(_mm_set1_ps(scale), _mm_cvtepi32_ps(sf)); + __m128 s = _mm_cvtepi32_ps(sf); - // convert x/y/z to [-1..1] (scaled...) - __m128 x = _mm_mul_ps(_mm_cvtepi32_ps(xf), ss); - __m128 y = _mm_mul_ps(_mm_cvtepi32_ps(yf), ss); - __m128 z = _mm_mul_ps(_mm_cvtepi32_ps(zf), ss); + // convert x/y/z to floating point (unscaled! 
implied scale of 1/sqrt(2.f) * 1/sf) + __m128 x = _mm_cvtepi32_ps(xf); + __m128 y = _mm_cvtepi32_ps(yf); + __m128 z = _mm_cvtepi32_ps(zf); - // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors - __m128 ww = _mm_sub_ps(_mm_set1_ps(1.f), _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)))); + // reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors + __m128 ws = _mm_mul_ps(s, _mm_add_ps(s, s)); // s*2s instead of 2*(s*s) to work around clang bug with integer multiplication + __m128 ww = _mm_sub_ps(ws, _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)))); __m128 w = _mm_sqrt_ps(_mm_max_ps(ww, _mm_setzero_ps())); - __m128 s = _mm_set1_ps(32767.f); + // compute final scale; note that all computations above are unscaled + // we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range + __m128 ss = _mm_div_ps(_mm_set1_ps(scale), s); // rounded signed float->int - __m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s)); - __m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s)); - __m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s)); - __m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, s)); + __m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, ss)); + __m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, ss)); + __m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, ss)); + __m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, ss)); // mix x/z and w/y to make 16-bit unpack easier __m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16)); @@ -658,7 +666,7 @@ static void decodeFilterOctSimd16(short* data, size_t count) static void decodeFilterQuatSimd(short* data, size_t count) { - const float scale = 1.f / sqrtf(2.f); + const float scale = 32767.f / sqrtf(2.f); for (size_t i = 0; i < count; i += 4) { @@ -677,27 +685,30 @@ static void decodeFilterQuatSimd(short* data, size_t count) // get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f) int32x4_t sf = vorrq_s32(cf, vdupq_n_s32(3)); - float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), vcvtq_f32_s32(sf)); + float32x4_t s = vcvtq_f32_s32(sf); - // convert x/y/z to [-1..1] (scaled...) - float32x4_t x = vmulq_f32(vcvtq_f32_s32(xf), ss); - float32x4_t y = vmulq_f32(vcvtq_f32_s32(yf), ss); - float32x4_t z = vmulq_f32(vcvtq_f32_s32(zf), ss); + // convert x/y/z to floating point (unscaled! 
implied scale of 1/sqrt(2.f) * 1/sf) + float32x4_t x = vcvtq_f32_s32(xf); + float32x4_t y = vcvtq_f32_s32(yf); + float32x4_t z = vcvtq_f32_s32(zf); - // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors - float32x4_t ww = vsubq_f32(vdupq_n_f32(1.f), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)))); + // reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors + float32x4_t ws = vmulq_f32(s, s); + float32x4_t ww = vsubq_f32(vaddq_f32(ws, ws), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)))); float32x4_t w = vsqrtq_f32(vmaxq_f32(ww, vdupq_n_f32(0.f))); - float32x4_t s = vdupq_n_f32(32767.f); + // compute final scale; note that all computations above are unscaled + // we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range + float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), s); // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction const float32x4_t fsnap = vdupq_n_f32(3 << 22); - int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap)); - int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap)); - int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap)); - int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, s), fsnap)); + int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, ss), fsnap)); + int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, ss), fsnap)); + int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, ss), fsnap)); + int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, ss), fsnap)); // mix x/z and w/y to make 16-bit unpack easier int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16)); @@ -958,7 +969,7 @@ static void decodeFilterOctSimd16(short* data, size_t count) static void decodeFilterQuatSimd(short* data, size_t count) { - const float scale = 1.f / sqrtf(2.f); + const float scale = 32767.f / sqrtf(2.f); for (size_t i = 0; i < count; i += 4) { @@ -977,28 +988,31 @@ static void decodeFilterQuatSimd(short* data, size_t count) // get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f) v128_t sf = wasm_v128_or(cf, wasm_i32x4_splat(3)); - v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), wasm_f32x4_convert_i32x4(sf)); + v128_t s = wasm_f32x4_convert_i32x4(sf); - // convert x/y/z to [-1..1] (scaled...) - v128_t x = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(xf), ss); - v128_t y = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(yf), ss); - v128_t z = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(zf), ss); + // convert x/y/z to floating point (unscaled! 
implied scale of 1/sqrt(2.f) * 1/sf) + v128_t x = wasm_f32x4_convert_i32x4(xf); + v128_t y = wasm_f32x4_convert_i32x4(yf); + v128_t z = wasm_f32x4_convert_i32x4(zf); - // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors + // reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors // note: i32x4_max with 0 is equivalent to f32x4_max - v128_t ww = wasm_f32x4_sub(wasm_f32x4_splat(1.f), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)))); + v128_t ws = wasm_f32x4_mul(s, s); + v128_t ww = wasm_f32x4_sub(wasm_f32x4_add(ws, ws), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)))); v128_t w = wasm_f32x4_sqrt(wasm_i32x4_max(ww, wasm_i32x4_splat(0))); - v128_t s = wasm_f32x4_splat(32767.f); + // compute final scale; note that all computations above are unscaled + // we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range + v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), s); // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction const v128_t fsnap = wasm_f32x4_splat(3 << 22); - v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap); - v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap); - v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap); - v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, s), fsnap); + v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, ss), fsnap); + v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, ss), fsnap); + v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, ss), fsnap); + v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, ss), fsnap); // mix x/z and w/y to make 16-bit unpack easier v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16));