Updated meshoptimizer.

Бранимир Караџић
2025-10-21 19:17:49 -07:00
parent a9b8b0b0d5
commit f1a138b847
5 changed files with 292 additions and 143 deletions

View File

@@ -595,7 +595,7 @@ struct KDNode
unsigned int children : 30;
};
static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, unsigned int axis, float pivot)
static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, int axis, float pivot)
{
size_t m = 0;
@@ -666,7 +666,7 @@ static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const
}
// split axis is one where the variance is largest
unsigned int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2);
int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2);
float split = mean[axis];
size_t middle = kdtreePartition(indices, count, points, stride, axis, split);
@@ -768,8 +768,8 @@ static float boxMerge(BVHBoxT& box, const BVHBox& other)
min = _mm_min_ps(min, _mm_loadu_ps(other.min));
max = _mm_max_ps(max, _mm_loadu_ps(other.max));
_mm_store_ps(box.min, min);
_mm_store_ps(box.max, max);
_mm_storeu_ps(box.min, min);
_mm_storeu_ps(box.max, max);
__m128 size = _mm_sub_ps(max, min);
__m128 size_yzx = _mm_shuffle_ps(size, size, _MM_SHUFFLE(0, 0, 2, 1));

View File

@@ -125,14 +125,14 @@ MESHOPTIMIZER_API void meshopt_generateShadowIndexBuffer(unsigned int* destinati
MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
/**
* Experimental: Generates a remap table that maps all vertices with the same position to the same (existing) index.
* Generates a remap table that maps all vertices with the same position to the same (existing) index.
* Similarly to meshopt_generateShadowIndexBuffer, this can be helpful to pre-process meshes for position-only rendering.
* This can also be used to implement algorithms that require positional-only connectivity, such as hierarchical simplification.
*
* destination must contain enough space for the resulting remap table (vertex_count elements)
* vertex_positions should have float3 position in the first 12 bytes of each vertex
*/
MESHOPTIMIZER_EXPERIMENTAL void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
MESHOPTIMIZER_API void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
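With this entry point promoted out of the experimental set, a minimal usage sketch follows; the interleaved Vertex layout and the buildPositionRemap wrapper are illustrative assumptions, not part of this commit.

#include <vector>

#include "meshoptimizer.h"

// illustrative interleaved layout; only the first 12 bytes (position) matter here
struct Vertex { float px, py, pz, nx, ny, nz, tu, tv; };

// remap[i] becomes the index of the first vertex sharing vertices[i]'s position,
// which is enough to walk position-only connectivity without welding attributes
static std::vector<unsigned int> buildPositionRemap(const std::vector<Vertex>& vertices)
{
    std::vector<unsigned int> remap(vertices.size());
    meshopt_generatePositionRemap(remap.data(), &vertices[0].px, vertices.size(), sizeof(Vertex));
    return remap;
}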
/**
* Generate index buffer that can be used as a geometry shader input with triangle adjacency topology
@@ -418,7 +418,7 @@ enum
meshopt_SimplifyErrorAbsolute = 1 << 2,
/* Remove disconnected parts of the mesh during simplification incrementally, regardless of the topological restrictions inside components. */
meshopt_SimplifyPrune = 1 << 3,
/* Experimental: Produce more regular triangle sizes and shapes during simplification, at some cost to geometric quality. */
/* Produce more regular triangle sizes and shapes during simplification, at some cost to geometric quality. */
meshopt_SimplifyRegularize = 1 << 4,
/* Experimental: Allow collapses across attribute discontinuities, except for vertices that are tagged with meshopt_SimplifyVertex_Protect in vertex_lock. */
meshopt_SimplifyPermissive = 1 << 5,
@@ -501,7 +501,7 @@ MESHOPTIMIZER_API size_t meshopt_simplifyWithAttributes(unsigned int* destinatio
MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
/**
* Experimental: Mesh simplifier (sloppy)
* Mesh simplifier (sloppy)
* Reduces the number of triangles in the mesh, sacrificing mesh appearance for simplification performance
* The algorithm doesn't preserve mesh topology but can stop short of the target goal based on target error.
* Returns the number of indices after simplification, with destination containing new index data
@@ -514,7 +514,7 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithUpdate(unsigned int* indic
* target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
* result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
*/
MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error);
MESHOPTIMIZER_API size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error);
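Since the sloppy simplifier is now stable and takes a vertex_lock parameter, here is a hedged call sketch; the 50% target, 10% error budget, NULL lock, and Vertex layout are illustrative choices.

#include <vector>

#include "meshoptimizer.h"

struct Vertex { float px, py, pz, nx, ny, nz, tu, tv; }; // illustrative layout, position first

static std::vector<unsigned int> simplifySloppyToHalf(const std::vector<unsigned int>& indices, const std::vector<Vertex>& vertices)
{
    // aim for half the triangles; keep the target a multiple of 3
    size_t target_index_count = indices.size() / 3 / 2 * 3;
    float lod_error = 0.f;

    std::vector<unsigned int> lod(indices.size());
    size_t lod_count = meshopt_simplifySloppy(lod.data(), indices.data(), indices.size(),
        &vertices[0].px, vertices.size(), sizeof(Vertex),
        /* vertex_lock= */ NULL, target_index_count, /* target_error= */ 0.1f, &lod_error);

    lod.resize(lod_count);
    return lod;
}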
/**
* Mesh simplifier (pruner)
@@ -670,7 +670,7 @@ MESHOPTIMIZER_API size_t meshopt_buildMeshletsScan(struct meshopt_Meshlet* meshl
MESHOPTIMIZER_API size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles);
/**
* Experimental: Meshlet builder with flexible cluster sizes
* Meshlet builder with flexible cluster sizes
* Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but allows to specify minimum and maximum number of triangles per meshlet.
* Clusters between min and max triangle counts are split when the cluster size would have exceeded the expected cluster size by more than split_factor.
*
@@ -682,10 +682,10 @@ MESHOPTIMIZER_API size_t meshopt_buildMeshletsBound(size_t index_count, size_t m
* cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency
* split_factor should be set to a non-negative value; when greater than 0, clusters that have large bounds may be split unless they are under the min_triangles threshold
*/
MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsFlex(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor);
MESHOPTIMIZER_API size_t meshopt_buildMeshletsFlex(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor);
/**
* Experimental: Meshlet builder that produces clusters optimized for raytracing
* Meshlet builder that produces clusters optimized for raytracing
* Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but optimizes cluster subdivision for raytracing and allows to specify minimum and maximum number of triangles per meshlet.
*
* meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (*not* max!)
@@ -695,7 +695,7 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsFlex(struct meshopt_Meshl
* max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles)
* fill_weight allows to prioritize clusters that are closer to maximum size at some cost to SAH quality; 0.5 is a safe default
*/
MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight);
MESHOPTIMIZER_API size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight);
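With both flexible builders now part of the stable API, a hedged allocation-and-call sketch for the spatial variant; meshopt_buildMeshletsFlex takes the same buffers with cone_weight and split_factor in place of fill_weight. The 64/16/64 limits and the Vertex layout are illustrative.

#include <vector>

#include "meshoptimizer.h"

struct Vertex { float px, py, pz, nx, ny, nz, tu, tv; }; // illustrative layout, position first

static size_t buildSpatialMeshlets(const std::vector<unsigned int>& indices, const std::vector<Vertex>& vertices,
    std::vector<meshopt_Meshlet>& meshlets, std::vector<unsigned int>& meshlet_vertices, std::vector<unsigned char>& meshlet_triangles)
{
    const size_t max_vertices = 64, min_triangles = 16, max_triangles = 64;

    // worst-case meshlet count uses min_triangles (not max), per the comment above
    size_t max_meshlets = meshopt_buildMeshletsBound(indices.size(), max_vertices, min_triangles);

    meshlets.resize(max_meshlets);
    meshlet_vertices.resize(max_meshlets * max_vertices);
    meshlet_triangles.resize(max_meshlets * max_triangles * 3);

    return meshopt_buildMeshletsSpatial(meshlets.data(), meshlet_vertices.data(), meshlet_triangles.data(),
        indices.data(), indices.size(), &vertices[0].px, vertices.size(), sizeof(Vertex),
        max_vertices, min_triangles, max_triangles, /* fill_weight= */ 0.5f);
}

The returned count is the number of meshlets actually produced; the three buffers can be trimmed afterwards, same as with meshopt_buildMeshlets.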
/**
* Meshlet optimizer
@@ -761,13 +761,14 @@ MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeSphereBounds(const float*
/**
* Cluster partitioner
* Partitions clusters into groups of similar size, prioritizing grouping clusters that share vertices or are close to each other.
* When vertex positions are not provided, only clusters that share vertices will be grouped together, which may result in small partitions for some inputs.
*
* destination must contain enough space for the resulting partition data (cluster_count elements)
* destination[i] will contain the partition id for cluster i, with the total number of partitions returned by the function
* cluster_indices should have the vertex indices referenced by each cluster, stored sequentially
* cluster_index_counts should have the number of indices in each cluster; sum of all cluster_index_counts must be equal to total_index_count
* vertex_positions should have float3 position in the first 12 bytes of each vertex (or can be NULL if not used)
* target_partition_size is a target size for each partition, in clusters; the resulting partitions may be smaller or larger
* vertex_positions can be NULL; when it's not NULL, it should have float3 position in the first 12 bytes of each vertex
* target_partition_size is a target size for each partition, in clusters; the resulting partitions may be smaller or larger (up to target + target/3)
*/
MESHOPTIMIZER_API size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size);
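A hedged sketch of feeding meshlet output into the partitioner, passing vertex positions so the spatial grouping described above kicks in; treating each meshlet's unique vertex list as its cluster index list, and the target size of 8, are illustrative choices.

#include <vector>

#include "meshoptimizer.h"

// groups meshlets into partitions of roughly target_partition_size clusters;
// writes one partition id per meshlet into cluster_partition and returns the partition count
static size_t partitionMeshlets(std::vector<unsigned int>& cluster_partition,
    const std::vector<meshopt_Meshlet>& meshlets, const std::vector<unsigned int>& meshlet_vertices,
    const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
    std::vector<unsigned int> cluster_indices;
    std::vector<unsigned int> cluster_index_counts;

    for (size_t i = 0; i < meshlets.size(); ++i)
    {
        const meshopt_Meshlet& m = meshlets[i];

        // each cluster lists the global vertex indices it references, stored sequentially
        for (unsigned int j = 0; j < m.vertex_count; ++j)
            cluster_indices.push_back(meshlet_vertices[m.vertex_offset + j]);

        cluster_index_counts.push_back(m.vertex_count);
    }

    cluster_partition.resize(meshlets.size());

    return meshopt_partitionClusters(cluster_partition.data(),
        cluster_indices.data(), cluster_indices.size(), cluster_index_counts.data(), meshlets.size(),
        vertex_positions, vertex_count, vertex_positions_stride, /* target_partition_size= */ 8);
}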
@@ -909,6 +910,8 @@ inline size_t meshopt_simplifyWithUpdate(T* indices, size_t index_count, float*
template <typename T>
inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error = NULL);
template <typename T>
inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error = NULL);
template <typename T>
inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error);
template <typename T>
inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index);
@@ -1293,6 +1296,15 @@ inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t in
return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, NULL, target_index_count, target_error, result_error);
}
template <typename T>
inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error)
{
meshopt_IndexAdapter<T> in(NULL, indices, index_count);
meshopt_IndexAdapter<T> out(destination, NULL, index_count);
return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_lock, target_index_count, target_error, result_error);
}
template <typename T>
inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error)
{

View File

@@ -52,18 +52,16 @@ static void filterClusterIndices(unsigned int* data, unsigned int* offsets, cons
offsets[cluster_count] = unsigned(cluster_write);
}
static void computeClusterBounds(float* cluster_bounds, const unsigned int* cluster_indices, const unsigned int* cluster_offsets, size_t cluster_count, const float* vertex_positions, size_t vertex_positions_stride)
static float computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, float* out_center)
{
size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
for (size_t i = 0; i < cluster_count; ++i)
{
float center[3] = {0, 0, 0};
// approximate center of the cluster by averaging all vertex positions
for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
for (size_t j = 0; j < index_count; ++j)
{
const float* p = vertex_positions + cluster_indices[j] * vertex_stride_float;
const float* p = vertex_positions + indices[j] * vertex_stride_float;
center[0] += p[0];
center[1] += p[1];
@@ -71,30 +69,27 @@ static void computeClusterBounds(float* cluster_bounds, const unsigned int* clus
}
// note: technically clusters can't be empty per meshopt_partitionCluster but we check for a division by zero in case that changes
if (size_t cluster_size = cluster_offsets[i + 1] - cluster_offsets[i])
if (index_count)
{
center[0] /= float(cluster_size);
center[1] /= float(cluster_size);
center[2] /= float(cluster_size);
center[0] /= float(index_count);
center[1] /= float(index_count);
center[2] /= float(index_count);
}
// compute radius of the bounding sphere for each cluster
float radiussq = 0;
for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
for (size_t j = 0; j < index_count; ++j)
{
const float* p = vertex_positions + cluster_indices[j] * vertex_stride_float;
const float* p = vertex_positions + indices[j] * vertex_stride_float;
float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]);
radiussq = radiussq < d2 ? d2 : radiussq;
}
cluster_bounds[i * 4 + 0] = center[0];
cluster_bounds[i * 4 + 1] = center[1];
cluster_bounds[i * 4 + 2] = center[2];
cluster_bounds[i * 4 + 3] = sqrtf(radiussq);
}
memcpy(out_center, center, sizeof(center));
return sqrtf(radiussq);
}
static void buildClusterAdjacency(ClusterAdjacency& adjacency, const unsigned int* cluster_indices, const unsigned int* cluster_offsets, size_t cluster_count, size_t vertex_count, meshopt_Allocator& allocator)
@@ -211,6 +206,9 @@ struct ClusterGroup
int next;
unsigned int size; // 0 unless root
unsigned int vertices;
float center[3];
float radius;
};
struct GroupOrder
@@ -285,15 +283,18 @@ static unsigned int countShared(const ClusterGroup* groups, int group1, int grou
return total;
}
static void mergeBounds(float* target, const float* source)
static void mergeBounds(ClusterGroup& target, const ClusterGroup& source)
{
float r1 = target[3], r2 = source[3];
float dx = source[0] - target[0], dy = source[1] - target[1], dz = source[2] - target[2];
float r1 = target.radius, r2 = source.radius;
float dx = source.center[0] - target.center[0], dy = source.center[1] - target.center[1], dz = source.center[2] - target.center[2];
float d = sqrtf(dx * dx + dy * dy + dz * dz);
if (d + r1 < r2)
{
memcpy(target, source, 4 * sizeof(float));
target.center[0] = source.center[0];
target.center[1] = source.center[1];
target.center[2] = source.center[2];
target.radius = source.radius;
return;
}
@@ -301,17 +302,17 @@ static void mergeBounds(float* target, const float* source)
{
float k = d > 0 ? (d + r2 - r1) / (2 * d) : 0.f;
target[0] += dx * k;
target[1] += dy * k;
target[2] += dz * k;
target[3] = (d + r2 + r1) / 2;
target.center[0] += dx * k;
target.center[1] += dy * k;
target.center[2] += dz * k;
target.radius = (d + r2 + r1) / 2;
}
}
static float boundsScore(const float* target, const float* source)
static float boundsScore(const ClusterGroup& target, const ClusterGroup& source)
{
float r1 = target[3], r2 = source[3];
float dx = source[0] - target[0], dy = source[1] - target[1], dz = source[2] - target[2];
float r1 = target.radius, r2 = source.radius;
float dx = source.center[0] - target.center[0], dy = source.center[1] - target.center[1], dz = source.center[2] - target.center[2];
float d = sqrtf(dx * dx + dy * dy + dz * dz);
float mr = d + r1 < r2 ? r2 : (d + r2 < r1 ? r1 : (d + r2 + r1) / 2);
@@ -319,7 +320,7 @@ static float boundsScore(const float* target, const float* source)
return mr > 0 ? r1 / mr : 0.f;
}
static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdjacency& adjacency, size_t max_partition_size, const float* cluster_bounds)
static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdjacency& adjacency, size_t max_partition_size, bool use_bounds)
{
assert(groups[id].size > 0);
@@ -347,8 +348,8 @@ static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdj
float score = float(int(shared)) * (group_rsqrt + other_rsqrt);
// incorporate spatial score to favor merging nearby groups
if (cluster_bounds)
score *= 1.f + 0.4f * boundsScore(&cluster_bounds[id * 4], &cluster_bounds[other * 4]);
if (use_bounds)
score *= 1.f + 0.4f * boundsScore(groups[id], groups[other]);
if (score > best_score)
{
@@ -361,6 +362,118 @@ static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdj
return best_group;
}
static void mergeLeaf(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size)
{
for (size_t i = 0; i < count; ++i)
{
unsigned int id = order[i];
if (groups[id].size == 0 || groups[id].size >= target_partition_size)
continue;
float best_score = -1.f;
int best_group = -1;
for (size_t j = 0; j < count; ++j)
{
unsigned int other = order[j];
if (id == other || groups[other].size == 0)
continue;
if (groups[id].size + groups[other].size > max_partition_size)
continue;
// favor merging nearby groups
float score = boundsScore(groups[id], groups[other]);
if (score > best_score)
{
best_score = score;
best_group = other;
}
}
// merge id *into* best_group; that way, we may merge more groups into the same best_group, maximizing the chance of reaching target
if (best_group != -1)
{
// combine groups by linking them together
unsigned int tail = best_group;
while (groups[tail].next >= 0)
tail = groups[tail].next;
groups[tail].next = id;
// update group sizes; note, we omit vertices update for simplicity as it's not used for spatial merge
groups[best_group].size += groups[id].size;
groups[id].size = 0;
// merge bounding spheres
mergeBounds(groups[best_group], groups[id]);
groups[id].radius = 0.f;
}
}
}
static size_t mergePartition(unsigned int* order, size_t count, const ClusterGroup* groups, int axis, float pivot)
{
size_t m = 0;
// invariant: elements in range [0, m) are < pivot, elements in range [m, i) are >= pivot
for (size_t i = 0; i < count; ++i)
{
float v = groups[order[i]].center[axis];
// swap(m, i) unconditionally
unsigned int t = order[m];
order[m] = order[i];
order[i] = t;
// when v >= pivot, we swap i with m without advancing it, preserving invariants
m += v < pivot;
}
return m;
}
static void mergeSpatial(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size, size_t leaf_size)
{
size_t total = 0;
for (size_t i = 0; i < count; ++i)
total += groups[order[i]].size;
if (total <= max_partition_size || count <= leaf_size)
return mergeLeaf(groups, order, count, target_partition_size, max_partition_size);
float mean[3] = {};
float vars[3] = {};
float runc = 1, runs = 1;
// gather statistics on the points in the subtree using Welford's algorithm
for (size_t i = 0; i < count; ++i, runc += 1.f, runs = 1.f / runc)
{
const float* point = groups[order[i]].center;
for (int k = 0; k < 3; ++k)
{
float delta = point[k] - mean[k];
mean[k] += delta * runs;
vars[k] += delta * (point[k] - mean[k]);
}
}
// split axis is one where the variance is largest
int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2);
float split = mean[axis];
size_t middle = mergePartition(order, count, groups, axis, split);
// enforce balance for degenerate partitions
if (middle <= leaf_size / 2 || count - middle <= leaf_size / 2)
middle = count / 2;
mergeSpatial(groups, order, middle, target_partition_size, max_partition_size, leaf_size);
mergeSpatial(groups, order + middle, count - middle, target_partition_size, max_partition_size, leaf_size);
}
} // namespace meshopt
size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size)
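The statistics pass in mergeSpatial above uses Welford's single-pass accumulation with a running reciprocal count; a small standalone version of the same recurrence (illustrative, not part of the library) for reference:

// running mean and unnormalized variance (sum of squared deviations) of a float sequence, one pass
static void welford(const float* values, size_t count, float& mean, float& var)
{
    mean = 0.f;
    var = 0.f;

    for (size_t i = 0; i < count; ++i)
    {
        float delta = values[i] - mean;
        mean += delta / float(i + 1);      // mergeSpatial keeps 1/(i+1) in a running variable instead
        var += delta * (values[i] - mean); // uses the updated mean, as in the loop above
    }
}

Since only the argmax over axes is needed for the split, leaving the variance unnormalized is sufficient.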
@@ -371,7 +484,7 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int*
assert(vertex_positions_stride % sizeof(float) == 0);
assert(target_partition_size > 0);
size_t max_partition_size = target_partition_size + target_partition_size * 3 / 8;
size_t max_partition_size = target_partition_size + target_partition_size / 3;
meshopt_Allocator allocator;
@@ -385,20 +498,12 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int*
filterClusterIndices(cluster_newindices, cluster_offsets, cluster_indices, cluster_index_counts, cluster_count, used, vertex_count, total_index_count);
cluster_indices = cluster_newindices;
// compute bounding sphere for each cluster if positions are provided
float* cluster_bounds = NULL;
if (vertex_positions)
{
cluster_bounds = allocator.allocate<float>(cluster_count * 4);
computeClusterBounds(cluster_bounds, cluster_indices, cluster_offsets, cluster_count, vertex_positions, vertex_positions_stride);
}
// build cluster adjacency along with edge weights (shared vertex count)
ClusterAdjacency adjacency = {};
buildClusterAdjacency(adjacency, cluster_indices, cluster_offsets, cluster_count, vertex_count, allocator);
ClusterGroup* groups = allocator.allocate<ClusterGroup>(cluster_count);
memset(groups, 0, sizeof(ClusterGroup) * cluster_count);
GroupOrder* order = allocator.allocate<GroupOrder>(cluster_count);
size_t pending = 0;
@@ -412,6 +517,10 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int*
groups[i].vertices = cluster_offsets[i + 1] - cluster_offsets[i];
assert(groups[i].vertices > 0);
// compute bounding sphere for each cluster if positions are provided
if (vertex_positions)
groups[i].radius = computeClusterBounds(cluster_indices + cluster_offsets[i], cluster_offsets[i + 1] - cluster_offsets[i], vertex_positions, vertex_positions_stride, groups[i].center);
GroupOrder item = {};
item.id = unsigned(i);
item.order = groups[i].vertices;
@@ -439,7 +548,7 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int*
if (groups[top.id].size >= target_partition_size)
continue;
int best_group = pickGroupToMerge(groups, top.id, adjacency, max_partition_size, cluster_bounds);
int best_group = pickGroupToMerge(groups, top.id, adjacency, max_partition_size, /* use_bounds= */ vertex_positions);
// we can't grow the group any more, emit as is
if (best_group == -1)
@@ -449,14 +558,11 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int*
unsigned int shared = countShared(groups, top.id, best_group, adjacency);
// combine groups by linking them together
assert(groups[best_group].size > 0);
unsigned int tail = top.id;
while (groups[tail].next >= 0)
tail = groups[tail].next;
for (int i = top.id; i >= 0; i = groups[i].next)
if (groups[i].next < 0)
{
groups[i].next = best_group;
break;
}
groups[tail].next = best_group;
// update group sizes; note, the vertex update is a O(1) approximation which avoids recomputing the true size
groups[top.id].size += groups[best_group].size;
@@ -467,10 +573,10 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int*
groups[best_group].vertices = 0;
// merge bounding spheres if bounds are available
if (cluster_bounds)
if (vertex_positions)
{
mergeBounds(&cluster_bounds[top.id * 4], &cluster_bounds[best_group * 4]);
memset(&cluster_bounds[best_group * 4], 0, 4 * sizeof(float));
mergeBounds(groups[top.id], groups[best_group]);
groups[best_group].radius = 0;
}
// re-associate all clusters back to the merged group
@@ -481,6 +587,20 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int*
heapPush(order, pending++, top);
}
// if vertex positions are provided, we do a final pass to see if we can merge small groups based on spatial locality alone
if (vertex_positions)
{
unsigned int* merge_order = reinterpret_cast<unsigned int*>(order);
size_t merge_offset = 0;
for (size_t i = 0; i < cluster_count; ++i)
if (groups[i].size)
merge_order[merge_offset++] = unsigned(i);
mergeSpatial(groups, merge_order, merge_offset, target_partition_size, max_partition_size, /* leaf_size= */ 8);
}
// output each remaining group
size_t next_group = 0;
for (size_t i = 0; i < cluster_count; ++i)

View File

@@ -243,14 +243,18 @@ static unsigned int* buildSparseRemap(unsigned int* indices, size_t index_count,
{
// use a bit set to compute the precise number of unique vertices
unsigned char* filter = allocator.allocate<unsigned char>((vertex_count + 7) / 8);
memset(filter, 0, (vertex_count + 7) / 8);
for (size_t i = 0; i < index_count; ++i)
{
unsigned int index = indices[i];
assert(index < vertex_count);
filter[index / 8] = 0;
}
size_t unique = 0;
for (size_t i = 0; i < index_count; ++i)
{
unsigned int index = indices[i];
assert(index < vertex_count);
unique += (filter[index / 8] & (1 << (index % 8))) == 0;
filter[index / 8] |= 1 << (index % 8);
}
@@ -269,7 +273,6 @@ static unsigned int* buildSparseRemap(unsigned int* indices, size_t index_count,
for (size_t i = 0; i < index_count; ++i)
{
unsigned int index = indices[i];
unsigned int* entry = hashLookup2(revremap, revremap_size, hasher, index, ~0u);
if (*entry == ~0u)
@@ -2264,7 +2267,7 @@ static float interpolate(float y, float x0, float y0, float x1, float y1, float
// three point interpolation from "revenge of interpolation search" paper
float num = (y1 - y) * (x1 - x2) * (x1 - x0) * (y2 - y0);
float den = (y2 - y) * (x1 - x2) * (y0 - y1) + (y0 - y) * (x1 - x0) * (y1 - y2);
return x1 + num / den;
return x1 + (den == 0.f ? 0.f : num / den);
}
} // namespace meshopt

View File

@@ -109,28 +109,33 @@ static void decodeFilterOct(T* data, size_t count)
static void decodeFilterQuat(short* data, size_t count)
{
const float scale = 1.f / sqrtf(2.f);
const float scale = 32767.f / sqrtf(2.f);
for (size_t i = 0; i < count; ++i)
{
// recover scale from the high byte of the component
int sf = data[i * 4 + 3] | 3;
float ss = scale / float(sf);
float s = float(sf);
// convert x/y/z to [-1..1] (scaled...)
float x = float(data[i * 4 + 0]) * ss;
float y = float(data[i * 4 + 1]) * ss;
float z = float(data[i * 4 + 2]) * ss;
// convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf)
float x = float(data[i * 4 + 0]);
float y = float(data[i * 4 + 1]);
float z = float(data[i * 4 + 2]);
// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
float ww = 1.f - x * x - y * y - z * z;
// reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
float ws = s * s;
float ww = ws * 2.f - x * x - y * y - z * z;
float w = sqrtf(ww >= 0.f ? ww : 0.f);
// compute final scale; note that all computations above are unscaled
// we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range
float ss = scale / s;
// rounded signed float->int
int xf = int(x * 32767.f + (x >= 0.f ? 0.5f : -0.5f));
int yf = int(y * 32767.f + (y >= 0.f ? 0.5f : -0.5f));
int zf = int(z * 32767.f + (z >= 0.f ? 0.5f : -0.5f));
int wf = int(w * 32767.f + 0.5f);
int xf = int(x * ss + (x >= 0.f ? 0.5f : -0.5f));
int yf = int(y * ss + (y >= 0.f ? 0.5f : -0.5f));
int zf = int(z * ss + (z >= 0.f ? 0.5f : -0.5f));
int wf = int(w * ss + 0.5f);
int qc = data[i * 4 + 3] & 3;
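The scalar rewrite above defers all scaling to a single multiply at the end; the two formulations agree up to rounding because 1 - (x/(sqrt(2)*sf))^2 - ... equals (2*sf^2 - x^2 - y^2 - z^2) / (2*sf^2). A small standalone check of the w component (the sample values are arbitrary, not taken from the library or its tests):

#include <cmath>
#include <cstdio>

int main()
{
    // one encoded quaternion: x/y/z in fixed point, last component carries the scale (bottom 2 bits store the max component index)
    short data[4] = {9000, -12000, 3000, 16383};

    int sf = data[3] | 3;

    // old path: bring x/y/z into [-1..1] first, reconstruct w against 1, then scale to int16
    float ss_old = (1.f / sqrtf(2.f)) / float(sf);
    float xo = data[0] * ss_old, yo = data[1] * ss_old, zo = data[2] * ss_old;
    float wwo = 1.f - xo * xo - yo * yo - zo * zo;
    int wf_old = int(sqrtf(wwo >= 0.f ? wwo : 0.f) * 32767.f + 0.5f);

    // new path: keep x/y/z in fixed point, reconstruct w against 2*sf^2, apply one combined scale at the end
    float s = float(sf);
    float xn = float(data[0]), yn = float(data[1]), zn = float(data[2]);
    float wwn = s * s * 2.f - xn * xn - yn * yn - zn * zn;
    float ss_new = (32767.f / sqrtf(2.f)) / s;
    int wf_new = int(sqrtf(wwn >= 0.f ? wwn : 0.f) * ss_new + 0.5f);

    printf("w: old=%d new=%d\n", wf_old, wf_new); // expected to agree up to last-bit rounding
    return 0;
}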
@@ -347,7 +352,7 @@ static void decodeFilterOctSimd16(short* data, size_t count)
static void decodeFilterQuatSimd(short* data, size_t count)
{
const float scale = 1.f / sqrtf(2.f);
const float scale = 32767.f / sqrtf(2.f);
for (size_t i = 0; i < count; i += 4)
{
@@ -366,24 +371,27 @@ static void decodeFilterQuatSimd(short* data, size_t count)
// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
__m128i sf = _mm_or_si128(cf, _mm_set1_epi32(3));
__m128 ss = _mm_div_ps(_mm_set1_ps(scale), _mm_cvtepi32_ps(sf));
__m128 s = _mm_cvtepi32_ps(sf);
// convert x/y/z to [-1..1] (scaled...)
__m128 x = _mm_mul_ps(_mm_cvtepi32_ps(xf), ss);
__m128 y = _mm_mul_ps(_mm_cvtepi32_ps(yf), ss);
__m128 z = _mm_mul_ps(_mm_cvtepi32_ps(zf), ss);
// convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf)
__m128 x = _mm_cvtepi32_ps(xf);
__m128 y = _mm_cvtepi32_ps(yf);
__m128 z = _mm_cvtepi32_ps(zf);
// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
__m128 ww = _mm_sub_ps(_mm_set1_ps(1.f), _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z))));
// reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
__m128 ws = _mm_mul_ps(s, _mm_add_ps(s, s)); // s*2s instead of 2*(s*s) to work around clang bug with integer multiplication
__m128 ww = _mm_sub_ps(ws, _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z))));
__m128 w = _mm_sqrt_ps(_mm_max_ps(ww, _mm_setzero_ps()));
__m128 s = _mm_set1_ps(32767.f);
// compute final scale; note that all computations above are unscaled
// we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range
__m128 ss = _mm_div_ps(_mm_set1_ps(scale), s);
// rounded signed float->int
__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
__m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, s));
__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, ss));
__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, ss));
__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, ss));
__m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, ss));
// mix x/z and w/y to make 16-bit unpack easier
__m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16));
@@ -658,7 +666,7 @@ static void decodeFilterOctSimd16(short* data, size_t count)
static void decodeFilterQuatSimd(short* data, size_t count)
{
const float scale = 1.f / sqrtf(2.f);
const float scale = 32767.f / sqrtf(2.f);
for (size_t i = 0; i < count; i += 4)
{
@@ -677,27 +685,30 @@ static void decodeFilterQuatSimd(short* data, size_t count)
// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
int32x4_t sf = vorrq_s32(cf, vdupq_n_s32(3));
float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), vcvtq_f32_s32(sf));
float32x4_t s = vcvtq_f32_s32(sf);
// convert x/y/z to [-1..1] (scaled...)
float32x4_t x = vmulq_f32(vcvtq_f32_s32(xf), ss);
float32x4_t y = vmulq_f32(vcvtq_f32_s32(yf), ss);
float32x4_t z = vmulq_f32(vcvtq_f32_s32(zf), ss);
// convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf)
float32x4_t x = vcvtq_f32_s32(xf);
float32x4_t y = vcvtq_f32_s32(yf);
float32x4_t z = vcvtq_f32_s32(zf);
// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
float32x4_t ww = vsubq_f32(vdupq_n_f32(1.f), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z))));
// reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
float32x4_t ws = vmulq_f32(s, s);
float32x4_t ww = vsubq_f32(vaddq_f32(ws, ws), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z))));
float32x4_t w = vsqrtq_f32(vmaxq_f32(ww, vdupq_n_f32(0.f)));
float32x4_t s = vdupq_n_f32(32767.f);
// compute final scale; note that all computations above are unscaled
// we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range
float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), s);
// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
const float32x4_t fsnap = vdupq_n_f32(3 << 22);
int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, s), fsnap));
int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, ss), fsnap));
int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, ss), fsnap));
int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, ss), fsnap));
int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, ss), fsnap));
// mix x/z and w/y to make 16-bit unpack easier
int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
@@ -958,7 +969,7 @@ static void decodeFilterOctSimd16(short* data, size_t count)
static void decodeFilterQuatSimd(short* data, size_t count)
{
const float scale = 1.f / sqrtf(2.f);
const float scale = 32767.f / sqrtf(2.f);
for (size_t i = 0; i < count; i += 4)
{
@@ -977,28 +988,31 @@ static void decodeFilterQuatSimd(short* data, size_t count)
// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
v128_t sf = wasm_v128_or(cf, wasm_i32x4_splat(3));
v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), wasm_f32x4_convert_i32x4(sf));
v128_t s = wasm_f32x4_convert_i32x4(sf);
// convert x/y/z to [-1..1] (scaled...)
v128_t x = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(xf), ss);
v128_t y = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(yf), ss);
v128_t z = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(zf), ss);
// convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf)
v128_t x = wasm_f32x4_convert_i32x4(xf);
v128_t y = wasm_f32x4_convert_i32x4(yf);
v128_t z = wasm_f32x4_convert_i32x4(zf);
// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
// reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
// note: i32x4_max with 0 is equivalent to f32x4_max
v128_t ww = wasm_f32x4_sub(wasm_f32x4_splat(1.f), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z))));
v128_t ws = wasm_f32x4_mul(s, s);
v128_t ww = wasm_f32x4_sub(wasm_f32x4_add(ws, ws), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z))));
v128_t w = wasm_f32x4_sqrt(wasm_i32x4_max(ww, wasm_i32x4_splat(0)));
v128_t s = wasm_f32x4_splat(32767.f);
// compute final scale; note that all computations above are unscaled
// we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range
v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), s);
// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
const v128_t fsnap = wasm_f32x4_splat(3 << 22);
v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, s), fsnap);
v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, ss), fsnap);
v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, ss), fsnap);
v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, ss), fsnap);
v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, ss), fsnap);
// mix x/z and w/y to make 16-bit unpack easier
v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16));