From c35f437910ffd75fa2f2c94e2824b2448a34535d Mon Sep 17 00:00:00 2001
From: Бранимир Караџић
Date: Thu, 20 Nov 2025 09:30:44 -0800
Subject: [PATCH] Updated meshoptimizer.

---
 3rdparty/meshoptimizer/src/clusterizer.cpp  |  22 +++--
 3rdparty/meshoptimizer/src/meshoptimizer.h  |  17 ++--
 3rdparty/meshoptimizer/src/partition.cpp    |  15 ++-
 3rdparty/meshoptimizer/src/simplifier.cpp   |  90 +++++++++++++-----
 3rdparty/meshoptimizer/src/spatialorder.cpp |   1 +
 3rdparty/meshoptimizer/src/vertexfilter.cpp | 100 +++++++++++---------
 6 files changed, 151 insertions(+), 94 deletions(-)

diff --git a/3rdparty/meshoptimizer/src/clusterizer.cpp b/3rdparty/meshoptimizer/src/clusterizer.cpp
index 2b29e2e60..73cc0ab53 100644
--- a/3rdparty/meshoptimizer/src/clusterizer.cpp
+++ b/3rdparty/meshoptimizer/src/clusterizer.cpp
@@ -640,7 +640,7 @@ static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, u
 	return offset + count;
 }
 
-static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size)
+static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size, int depth)
 {
 	assert(count > 0);
 	assert(offset < node_count);
@@ -672,7 +672,8 @@ static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const
 	size_t middle = kdtreePartition(indices, count, points, stride, axis, split);
 
 	// when the partition is degenerate simply consolidate the points into a single node
-	if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2)
+	// this also ensures recursion depth is bounded on pathological inputs
+	if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2 || depth >= kMeshletMaxTreeDepth)
 		return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);
 
 	KDNode& result = nodes[offset];
@@ -681,13 +682,13 @@ static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const
 	result.axis = axis;
 
 	// left subtree is right after our node
-	size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size);
+	size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size, depth + 1);
 
 	// distance to the right subtree is represented explicitly
 	assert(next_offset - offset > 1);
 	result.children = unsigned(next_offset - offset - 1);
 
-	return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size);
+	return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size, depth + 1);
 }
 
 static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, unsigned int& result, float& limit)
@@ -739,6 +740,7 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points,
 	if ((nodes[root + 1 + first].children | nodes[root + 1 + second].children) == 0)
 		nodes[root].children = 0;
 
+	// recursion depth is bounded by tree depth (which is limited by construction)
 	kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, result, limit);
 
 	// only process the other node if it can have a match based on closest distance so far
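[Editor's note] The hunks above thread an explicit depth parameter through the kd-tree build so that degenerate partitions on pathological inputs cannot overflow the stack. A minimal standalone sketch of the same pattern, assuming std::nth_element as the partition step and a hypothetical kMaxDepth cap standing in for kMeshletMaxTreeDepth:

#include <algorithm>
#include <cstdio>
#include <vector>

const int kMaxDepth = 32; // hypothetical cap; the patch uses kMeshletMaxTreeDepth

static void build(std::vector<float>& points, size_t begin, size_t end, size_t leaf_size, int depth)
{
	size_t count = end - begin;

	// consolidate into a leaf when the range is small or the depth cap is hit;
	// the cap bounds recursion even when splits are degenerate
	if (count <= leaf_size || depth >= kMaxDepth)
	{
		printf("leaf: [%zu, %zu)\n", begin, end);
		return;
	}

	// median split; the real code splits on a spatial plane, which may be lopsided
	size_t middle = begin + count / 2;
	std::nth_element(points.begin() + begin, points.begin() + middle, points.begin() + end);

	build(points, begin, middle, leaf_size, depth + 1);
	build(points, middle, end, leaf_size, depth + 1);
}

In the patch itself the fallback on hitting the cap is kdtreeBuildLeaf, i.e. the remaining points are consolidated into a single node rather than split further.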
@@ -765,6 +767,7 @@ static float boxMerge(BVHBoxT& box, const BVHBox& other)
 	__m128 min = _mm_loadu_ps(box.min);
 	__m128 max = _mm_loadu_ps(box.max);
 
+	// note: over-read is safe because BVHBox array is allocated with padding
 	min = _mm_min_ps(min, _mm_loadu_ps(other.min));
 	max = _mm_max_ps(max, _mm_loadu_ps(other.max));
@@ -785,6 +788,7 @@ static float boxMerge(BVHBoxT& box, const BVHBox& other)
 	float32x4_t min = vld1q_f32(box.min);
 	float32x4_t max = vld1q_f32(box.max);
 
+	// note: over-read is safe because BVHBox array is allocated with padding
 	min = vminq_f32(min, vld1q_f32(other.min));
 	max = vmaxq_f32(max, vld1q_f32(other.max));
@@ -1046,9 +1050,6 @@ static void bvhPartition(unsigned int* target, const unsigned int* order, const
 
 static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, unsigned char* boundary, size_t count, int depth, void* scratch, short* used, const unsigned int* indices, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
 {
-	if (depth >= kMeshletMaxTreeDepth)
-		return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles);
-
 	if (count <= max_triangles && bvhCountVertices(orderx, count, used, indices) <= max_vertices)
 		return bvhPackLeaf(boundary, count);
@@ -1091,8 +1092,8 @@
 		}
 	}
 
-	// this may happen if SAH costs along the admissible splits are NaN
-	if (bestk < 0)
+	// this may happen if SAH costs along the admissible splits are NaN, or due to imbalanced splits on pathological inputs
+	if (bestk < 0 || depth >= kMeshletMaxTreeDepth)
 		return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles);
 
 	// mark sides of split for partitioning
@@ -1117,6 +1118,7 @@
 		bvhPartition(axis, temp, sides, bestsplit, count);
 	}
 
+	// recursion depth is bounded due to max depth check above
 	bvhSplit(boxes, orderx, ordery, orderz, boundary, bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
 	bvhSplit(boxes, orderx + bestsplit, ordery + bestsplit, orderz + bestsplit, boundary + bestsplit, count - bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
 }
@@ -1191,7 +1193,7 @@ size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshle
 		kdindices[i] = unsigned(i);
 
 	KDNode* nodes = allocator.allocate<KDNode>(face_count * 2);
-	kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8);
+	kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8, 0);
 
 	// find a specific corner of the mesh to use as a starting point for meshlet flow
 	float cornerx = FLT_MAX, cornery = FLT_MAX, cornerz = FLT_MAX;
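[Editor's note] The "over-read is safe" comments above document that boxMerge loads a 3-float min/max field with a 4-wide SIMD load, which reads one float past the field. A sketch of why this requires tail padding in the allocation (allocBoxesWithPadding and the Box layout here are illustrative, not the library's types):

#include <cfloat>
#include <cstddef>
#include <xmmintrin.h>

struct Box { float min[3]; float max[3]; };

// hypothetical helper: one extra element of tail padding keeps the final
// 4-wide load of max[] inside the allocation
Box* allocBoxesWithPadding(size_t count)
{
	return new Box[count + 1];
}

float mergedMaxX(const Box* boxes, size_t count)
{
	__m128 m = _mm_set1_ps(-FLT_MAX);
	for (size_t i = 0; i < count; ++i)
		m = _mm_max_ps(m, _mm_loadu_ps(boxes[i].max)); // 4th lane reads into boxes[i + 1]
	return _mm_cvtss_f32(m); // lane 0 holds the max of max[0]
}

The extra lane's value is garbage but harmless: it never feeds into the lanes that are actually consumed.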
diff --git a/3rdparty/meshoptimizer/src/meshoptimizer.h b/3rdparty/meshoptimizer/src/meshoptimizer.h
index 46778feff..d3faf869d 100644
--- a/3rdparty/meshoptimizer/src/meshoptimizer.h
+++ b/3rdparty/meshoptimizer/src/meshoptimizer.h
@@ -360,13 +360,13 @@ MESHOPTIMIZER_API int meshopt_decodeVertexVersion(const unsigned char* buffer, s
  * meshopt_decodeFilterExp decodes exponential encoding of floating-point data with 8-bit exponent and 24-bit integer mantissa as 2^E*M.
  * Each 32-bit component is decoded in isolation; stride must be divisible by 4.
  *
- * Experimental: meshopt_decodeFilterColor decodes YCoCg (+A) color encoding where RGB is converted to YCoCg space with variable bit quantization.
+ * meshopt_decodeFilterColor decodes YCoCg (+A) color encoding where RGB is converted to YCoCg space with variable bit quantization.
  * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8.
  */
 MESHOPTIMIZER_API void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride);
 MESHOPTIMIZER_API void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride);
 MESHOPTIMIZER_API void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride);
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride);
+MESHOPTIMIZER_API void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride);
 
 /**
  * Vertex buffer filter encoders
@@ -384,7 +384,7 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterColor(void* buffer, size_t c
  * Exponent can be shared between all components of a given vector as defined by stride or all values of a given component; stride must be divisible by 4.
  * Input data must contain stride/4 floats for every vector (count*stride/4 total).
  *
- * Experimental: meshopt_encodeFilterColor encodes RGBA color data by converting RGB to YCoCg color space with variable bit quantization.
+ * meshopt_encodeFilterColor encodes RGBA color data by converting RGB to YCoCg color space with variable bit quantization.
  * Each component is stored as an 8-bit or 16-bit integer; stride must be equal to 4 or 8.
  * Input data must contain 4 floats for every color (count*4 total).
  */
@@ -403,7 +403,7 @@ enum meshopt_EncodeExpMode
 MESHOPTIMIZER_API void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data);
 MESHOPTIMIZER_API void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data);
 MESHOPTIMIZER_API void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode);
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterColor(void* destination, size_t count, size_t stride, int bits, const float* data);
+MESHOPTIMIZER_API void meshopt_encodeFilterColor(void* destination, size_t count, size_t stride, int bits, const float* data);
 
 /**
  * Simplification options
@@ -478,7 +478,7 @@ MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsig
 MESHOPTIMIZER_API size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
 
 /**
- * Experimental: Mesh simplifier with position/attribute update
+ * Mesh simplifier with position/attribute update
  * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible.
  * Similar to meshopt_simplifyWithAttributes, but destructively updates positions and attribute values for optimal appearance.
  * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
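[Editor's note] The hunks above promote the color filter pair from MESHOPTIMIZER_EXPERIMENTAL to the stable API. A hedged usage sketch based only on the declarations and comments in this header (signatures are as declared; buffer sizing and the in-place decode convention are assumptions to verify against the header docs):

#include <vector>
// #include "meshoptimizer.h" // assumed available

void roundtripColors(const float* colors_rgba, size_t count)
{
	// 8-bit path: stride 4, so each encoded color occupies 4 bytes;
	// input must contain 4 floats per color (count*4 total)
	std::vector<unsigned char> packed(count * 4);
	meshopt_encodeFilterColor(packed.data(), count, /* stride= */ 4, /* bits= */ 8, colors_rgba);

	// decode transforms the filtered buffer in place into normalized 8-bit RGBA
	meshopt_decodeFilterColor(packed.data(), count, /* stride= */ 4);
}

For 16-bit components the same calls would use stride 8 with bits up to 16, per the "stride must be equal to 4 or 8" constraint above.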
@@ -498,7 +498,7 @@ MESHOPTIMIZER_API size_t meshopt_simplifyWithAttributes(unsigned int* destinatio
  * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default
  * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
  */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
+MESHOPTIMIZER_API size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
 
 /**
  * Mesh simplifier (sloppy)
@@ -699,10 +699,9 @@ MESHOPTIMIZER_API size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* me
 
 /**
  * Meshlet optimizer
- * Reorders meshlet vertices and triangles to maximize locality to improve rasterizer throughput
+ * Reorders meshlet vertices and triangles to maximize locality which can improve rasterizer throughput or ray tracing performance when using fast-build modes.
  *
- * meshlet_triangles and meshlet_vertices must refer to meshlet triangle and vertex index data; when buildMeshlets* is used, these
- * need to be computed from meshlet's vertex_offset and triangle_offset
+ * meshlet_triangles and meshlet_vertices must refer to meshlet data; when buildMeshlets* is used, these need to be computed from meshlet's vertex_offset and triangle_offset
  * triangle_count and vertex_count must not exceed implementation limits (vertex_count <= 256, triangle_count <= 512)
  */
 MESHOPTIMIZER_API void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count);
diff --git a/3rdparty/meshoptimizer/src/partition.cpp b/3rdparty/meshoptimizer/src/partition.cpp
index c7a05a564..4119a53ed 100644
--- a/3rdparty/meshoptimizer/src/partition.cpp
+++ b/3rdparty/meshoptimizer/src/partition.cpp
@@ -10,6 +10,9 @@
 namespace meshopt
 {
 
+// To avoid excessive recursion for malformed inputs, we switch to bisection after some depth
+const int kMergeDepthCutoff = 40;
+
 struct ClusterAdjacency
 {
 	unsigned int* offsets;
@@ -434,7 +437,7 @@ static size_t mergePartition(unsigned int* order, size_t count, const ClusterGro
 	return m;
 }
 
-static void mergeSpatial(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size, size_t leaf_size)
+static void mergeSpatial(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size, size_t leaf_size, int depth)
 {
 	size_t total = 0;
 	for (size_t i = 0; i < count; ++i)
@@ -467,11 +470,13 @@
 	size_t middle = mergePartition(order, count, groups, axis, split);
 
 	// enforce balance for degenerate partitions
-	if (middle <= leaf_size / 2 || count - middle <= leaf_size / 2)
+	// this also ensures recursion depth is bounded on pathological inputs
+	if (middle <= leaf_size / 2 || count - middle <= leaf_size / 2 || depth >= kMergeDepthCutoff)
 		middle = count / 2;
 
-	mergeSpatial(groups, order, middle, target_partition_size, max_partition_size, leaf_size);
-	mergeSpatial(groups, order + middle, count - middle, target_partition_size, max_partition_size, leaf_size);
+	// recursion depth is logarithmic and bounded due to max depth check above
+	mergeSpatial(groups, order, middle, target_partition_size, max_partition_size, leaf_size, depth + 1);
+	mergeSpatial(groups, order + middle, count - middle, target_partition_size, max_partition_size, leaf_size, depth + 1);
 }
 
 } // namespace meshopt
@@ -597,7 +602,7 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int*
 		if (groups[i].size)
 			merge_order[merge_offset++] = unsigned(i);
 
-	mergeSpatial(groups, merge_order, merge_offset, target_partition_size, max_partition_size, /* leaf_size= */ 8);
+	mergeSpatial(groups, merge_order, merge_offset, target_partition_size, max_partition_size, /* leaf_size= */ 8, 0);
 	}
 
 	// output each remaining group
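[Editor's note] The mergeSpatial change above caps recursion differently from the kd-tree: past kMergeDepthCutoff it forces the split point to count / 2 instead of stopping, so each further level halves the range and the remaining depth is at most log2(count). A standalone sketch of that bisection fallback (names and the pickSplit stand-in are illustrative):

#include <cstddef>
#include <cstdio>

const int kDepthCutoff = 40; // mirrors kMergeDepthCutoff above

// stand-in for a spatial median split that can be arbitrarily lopsided
static size_t pickSplit(size_t count)
{
	return count / 3;
}

static void splitRange(size_t count, int depth)
{
	if (count <= 8) // leaf_size stand-in
	{
		printf("leaf of %zu\n", count);
		return;
	}

	size_t middle = pickSplit(count);

	// degenerate or too-deep splits degrade to exact bisection, keeping
	// total recursion depth O(log n) from this point on
	if (middle == 0 || middle == count || depth >= kDepthCutoff)
		middle = count / 2;

	splitRange(middle, depth + 1);
	splitRange(count - middle, depth + 1);
}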
diff --git a/3rdparty/meshoptimizer/src/simplifier.cpp b/3rdparty/meshoptimizer/src/simplifier.cpp
index 5dcb459ce..efbdf91a6 100644
--- a/3rdparty/meshoptimizer/src/simplifier.cpp
+++ b/3rdparty/meshoptimizer/src/simplifier.cpp
@@ -620,7 +620,7 @@ static void rescaleAttributes(float* result, const float* vertex_attributes_data
 	}
 }
 
-static void finalizeVertices(float* vertex_positions_data, size_t vertex_positions_stride, float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t vertex_count, const Vector3* vertex_positions, const float* vertex_attributes, const unsigned int* sparse_remap, const unsigned int* attribute_remap, float vertex_scale, const float* vertex_offset, const unsigned char* vertex_update)
+static void finalizeVertices(float* vertex_positions_data, size_t vertex_positions_stride, float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t vertex_count, const Vector3* vertex_positions, const float* vertex_attributes, const unsigned int* sparse_remap, const unsigned int* attribute_remap, float vertex_scale, const float* vertex_offset, const unsigned char* vertex_kind, const unsigned char* vertex_update, const unsigned char* vertex_lock)
 {
 	size_t vertex_positions_stride_float = vertex_positions_stride / sizeof(float);
 	size_t vertex_attributes_stride_float = vertex_attributes_stride / sizeof(float);
@@ -632,12 +632,20 @@
 		unsigned int ri = sparse_remap ? sparse_remap[i] : unsigned(i);
 
-		const Vector3& p = vertex_positions[i];
-		float* v = vertex_positions_data + ri * vertex_positions_stride_float;
+		// updating externally locked vertices is not allowed
+		if (vertex_lock && (vertex_lock[ri] & meshopt_SimplifyVertex_Lock) != 0)
+			continue;
 
-		v[0] = p.x * vertex_scale + vertex_offset[0];
-		v[1] = p.y * vertex_scale + vertex_offset[1];
-		v[2] = p.z * vertex_scale + vertex_offset[2];
+		// moving locked vertices may result in floating point drift
+		if (vertex_kind[i] != Kind_Locked)
+		{
+			const Vector3& p = vertex_positions[i];
+			float* v = vertex_positions_data + ri * vertex_positions_stride_float;
+
+			v[0] = p.x * vertex_scale + vertex_offset[0];
+			v[1] = p.y * vertex_scale + vertex_offset[1];
+			v[2] = p.z * vertex_scale + vertex_offset[2];
+		}
 
 		if (attribute_count)
 		{
@@ -1637,10 +1645,10 @@ static void updateQuadrics(const unsigned int* collapse_remap, size_t vertex_cou
 	}
 }
 
-static void solveQuadrics(Vector3* vertex_positions, float* vertex_attributes, size_t vertex_count, const Quadric* vertex_quadrics, const QuadricGrad* volume_gradients, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const EdgeAdjacency& adjacency, const unsigned char* vertex_kind, const unsigned char* vertex_update)
+static void solvePositions(Vector3* vertex_positions, size_t vertex_count, const Quadric* vertex_quadrics, const QuadricGrad* volume_gradients, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const EdgeAdjacency& adjacency, const unsigned char* vertex_kind, const unsigned char* vertex_update)
 {
 #if TRACE
-	size_t stats[5] = {};
+	size_t stats[6] = {};
 #endif
 
 	for (size_t i = 0; i < vertex_count; ++i)
@@ -1648,7 +1656,6 @@
 		if (!vertex_update[i])
 			continue;
 
-		// moving externally locked vertices is prohibited
 		// moving vertices on an attribute discontinuity may result in extrapolating UV outside of the chart bounds
 		// moving vertices on a border requires a stronger edge quadric to preserve the border geometry
 		if (vertex_kind[i] == Kind_Locked || vertex_kind[i] == Kind_Seam || vertex_kind[i] == Kind_Border)
@@ -1712,36 +1719,64 @@
 			continue;
 		}
 
+		// reject updates that increase positional error too much; allow some tolerance to improve attribute quality
+		if (quadricError(vertex_quadrics[i], p) > quadricError(vertex_quadrics[i], vp) * 1.5f + 1e-6f)
+		{
+			TRACESTATS(5);
+			continue;
+		}
+
 		TRACESTATS(1);
 		vertex_positions[i] = p;
 	}
 
 #if TRACE
-	printf("updated %d/%d positions; failed solve %d bounds %d flip %d\n", int(stats[1]), int(stats[0]), int(stats[2]), int(stats[3]), int(stats[4]));
+	printf("updated %d/%d positions; failed solve %d bounds %d flip %d error %d\n", int(stats[1]), int(stats[0]), int(stats[2]), int(stats[3]), int(stats[4]), int(stats[5]));
 #endif
+}
 
-	if (attribute_count == 0)
-		return;
-
+static void solveAttributes(Vector3* vertex_positions, float* vertex_attributes, size_t vertex_count, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const unsigned char* vertex_update)
+{
 	for (size_t i = 0; i < vertex_count; ++i)
 	{
 		if (!vertex_update[i])
 			continue;
 
-		// updating externally locked vertices is prohibited
-		if (vertex_kind[i] == Kind_Locked)
+		if (remap[i] != i)
 			continue;
 
-		const Vector3& p = vertex_positions[remap[i]];
-		const Quadric& A = attribute_quadrics[i];
-
-		float iw = A.w == 0 ? 0.f : 1.f / A.w;
-
 		for (size_t k = 0; k < attribute_count; ++k)
 		{
-			const QuadricGrad& G = attribute_gradients[i * attribute_count + k];
+			unsigned int shared = ~0u;
 
-			vertex_attributes[i * attribute_count + k] = (G.gx * p.x + G.gy * p.y + G.gz * p.z + G.gw) * iw;
+			// for complex vertices, preserve attribute continuity and use highest weight wedge if values were shared
+			if (vertex_kind[i] == Kind_Complex)
+			{
+				shared = unsigned(i);
+
+				for (unsigned int v = wedge[i]; v != i; v = wedge[v])
+					if (vertex_attributes[v * attribute_count + k] != vertex_attributes[i * attribute_count + k])
+						shared = ~0u;
+					else if (shared != ~0u && attribute_quadrics[v].w > attribute_quadrics[shared].w)
+						shared = v;
+			}
+
+			// update attributes for all wedges
+			unsigned int v = unsigned(i);
+			do
+			{
+				unsigned int r = (shared == ~0u) ? v : shared;
+
+				const Vector3& p = vertex_positions[i]; // same for all wedges
+				const Quadric& A = attribute_quadrics[r];
+				const QuadricGrad& G = attribute_gradients[r * attribute_count + k];
+
+				float iw = A.w == 0 ? 0.f : 1.f / A.w;
+				float av = (G.gx * p.x + G.gy * p.y + G.gz * p.z + G.gw) * iw;
+
+				vertex_attributes[v * attribute_count + k] = av;
+				v = wedge[v];
+			} while (v != i);
 		}
 	}
 }
@@ -2522,16 +2557,19 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
 		{
 			unsigned int v = result[i];
 
-			// recomputing externally locked vertices may result in floating point drift
-			vertex_update[v] = vertex_kind[v] != Kind_Locked;
+			// mark the vertex for finalizeVertices and root vertex for solve*
+			vertex_update[remap[v]] = vertex_update[v] = 1;
 		}
 
 		// edge adjacency may be stale as we haven't updated it after last series of edge collapses
 		updateEdgeAdjacency(adjacency, result, result_count, vertex_count, remap);
 
-		solveQuadrics(vertex_positions, vertex_attributes, vertex_count, vertex_quadrics, volume_gradients, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, adjacency, vertex_kind, vertex_update);
+		solvePositions(vertex_positions, vertex_count, vertex_quadrics, volume_gradients, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, adjacency, vertex_kind, vertex_update);
 
-		finalizeVertices(const_cast<float*>(vertex_positions_data), vertex_positions_stride, const_cast<float*>(vertex_attributes_data), vertex_attributes_stride, attribute_weights, attribute_count, vertex_count, vertex_positions, vertex_attributes, sparse_remap, attribute_remap, vertex_scale, vertex_offset, vertex_update);
+		if (attribute_count)
+			solveAttributes(vertex_positions, vertex_attributes, vertex_count, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, vertex_kind, vertex_update);
+
+		finalizeVertices(const_cast<float*>(vertex_positions_data), vertex_positions_stride, const_cast<float*>(vertex_attributes_data), vertex_attributes_stride, attribute_weights, attribute_count, vertex_count, vertex_positions, vertex_attributes, sparse_remap, attribute_remap, vertex_scale, vertex_offset, vertex_kind, vertex_update, vertex_lock);
 	}
 
 	// if debug visualization data is requested, fill it instead of index data; for simplicity, this doesn't work with sparsity
diff --git a/3rdparty/meshoptimizer/src/spatialorder.cpp b/3rdparty/meshoptimizer/src/spatialorder.cpp
index b65627900..8a785fcd5 100644
--- a/3rdparty/meshoptimizer/src/spatialorder.cpp
+++ b/3rdparty/meshoptimizer/src/spatialorder.cpp
@@ -208,6 +208,7 @@ static void splitPoints(unsigned int* destination, unsigned int* orderx, unsigne
 		partitionPoints(axis, temp, sides, split, count);
 	}
 
+	// recursion depth is logarithmic and bounded as we always split in approximately half
 	splitPoints(destination, orderx, ordery, orderz, keys, split, scratch, cluster_size);
 	splitPoints(destination + split, orderx + split, ordery + split, orderz + split, keys, count - split, scratch, cluster_size);
 }
diff --git a/3rdparty/meshoptimizer/src/vertexfilter.cpp b/3rdparty/meshoptimizer/src/vertexfilter.cpp
index b20d998ca..7a7e67a39 100644
--- a/3rdparty/meshoptimizer/src/vertexfilter.cpp
+++ b/3rdparty/meshoptimizer/src/vertexfilter.cpp
@@ -550,6 +550,13 @@ inline float32x4_t vdivq_f32(float32x4_t x, float32x4_t y)
 	r = vmulq_f32(r, vrecpsq_f32(y, r)); // refine rcp estimate
 	return vmulq_f32(x, r);
 }
+
+#ifndef __ARM_FEATURE_FMA
+inline float32x4_t vfmaq_f32(float32x4_t x, float32x4_t y, float32x4_t z)
+{
+	return vaddq_f32(x, vmulq_f32(y, z));
+}
+#endif
 #endif
 
 #ifdef SIMD_NEON
@@ -580,23 +587,21 @@ static void decodeFilterOctSimd8(signed char* data, size_t count)
 		y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
 
 		// compute normal length & scale
-		float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+		float32x4_t ll = vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z);
 		float32x4_t rl = vrsqrteq_f32(ll);
 		float32x4_t s = vmulq_f32(vdupq_n_f32(127.f), rl);
 
 		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
-		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
 		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
 
-		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
-		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
-		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+		int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, s));
+		int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, s));
+		int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, s));
 
 		// combine xr/yr/zr into final value
-		int32x4_t res = vandq_s32(n4, vdupq_n_s32(0xff000000));
-		res = vorrq_s32(res, vandq_s32(xr, vdupq_n_s32(0xff)));
-		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(yr, vdupq_n_s32(0xff)), 8));
-		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(zr, vdupq_n_s32(0xff)), 16));
+		int32x4_t res = vsliq_n_s32(xr, vsliq_n_s32(yr, zr, 8), 8);
+		res = vbslq_s32(vdupq_n_u32(0xff000000), n4, res);
 
 		vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
 	}
@@ -634,21 +639,25 @@ static void decodeFilterOctSimd16(short* data, size_t count)
 		y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
 
 		// compute normal length & scale
-		float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+		float32x4_t ll = vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z);
+#if !defined(__aarch64__) && !defined(_M_ARM64)
 		float32x4_t rl = vrsqrteq_f32(ll);
 		rl = vmulq_f32(rl, vrsqrtsq_f32(vmulq_f32(rl, ll), rl)); // refine rsqrt estimate
 		float32x4_t s = vmulq_f32(vdupq_n_f32(32767.f), rl);
+#else
+		float32x4_t s = vdivq_f32(vdupq_n_f32(32767.f), vsqrtq_f32(ll));
+#endif
 
 		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
 		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
 		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
 
-		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
-		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
-		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+		int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, s));
+		int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, s));
+		int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, s));
 
 		// mix x/z and y/0 to make 16-bit unpack easier
-		int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
+		int32x4_t xzr = vsliq_n_s32(xr, zr, 16);
 		int32x4_t y0r = vandq_s32(yr, vdupq_n_s32(0xffff));
 
 		// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
@@ -694,7 +703,7 @@ static void decodeFilterQuatSimd(short* data, size_t count)
 
 		// reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
 		float32x4_t ws = vmulq_f32(s, s);
-		float32x4_t ww = vsubq_f32(vaddq_f32(ws, ws), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z))));
+		float32x4_t ww = vsubq_f32(vaddq_f32(ws, ws), vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z));
 		float32x4_t w = vsqrtq_f32(vmaxq_f32(ww, vdupq_n_f32(0.f)));
 
 		// compute final scale; note that all computations above are unscaled
@@ -705,26 +714,32 @@
 		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
 		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
 
-		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, ss), fsnap));
-		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, ss), fsnap));
-		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, ss), fsnap));
-		int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, ss), fsnap));
+		int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, ss));
+		int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, ss));
+		int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, ss));
+		int32x4_t wr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, w, ss));
 
 		// mix x/z and w/y to make 16-bit unpack easier
-		int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
-		int32x4_t wyr = vorrq_s32(vandq_s32(wr, vdupq_n_s32(0xffff)), vshlq_n_s32(yr, 16));
+		int32x4_t xzr = vsliq_n_s32(xr, zr, 16);
+		int32x4_t wyr = vsliq_n_s32(wr, yr, 16);
 
 		// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
-		int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
-		int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
+		uint64x2_t res_0 = vreinterpretq_u64_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
+		uint64x2_t res_1 = vreinterpretq_u64_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
+
+		// store results to stack so that we can rotate using scalar instructions
+		// TODO: volatile works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/166808
+		volatile uint64_t res[4];
+		vst1q_u64(const_cast<uint64_t*>(&res[0]), res_0);
+		vst1q_u64(const_cast<uint64_t*>(&res[2]), res_1);
 
 		// rotate and store
-		uint64_t* out = (uint64_t*)&data[i * 4];
+		uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
 
-		out[0] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 0), vgetq_lane_s32(cf, 0) << 4);
-		out[1] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 1), vgetq_lane_s32(cf, 1) << 4);
-		out[2] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 0), vgetq_lane_s32(cf, 2) << 4);
-		out[3] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 1), vgetq_lane_s32(cf, 3) << 4);
+		out[0] = rotateleft64(res[0], data[(i + 0) * 4 + 3] << 4);
+		out[1] = rotateleft64(res[1], data[(i + 1) * 4 + 3] << 4);
+		out[2] = rotateleft64(res[2], data[(i + 2) * 4 + 3] << 4);
+		out[3] = rotateleft64(res[3], data[(i + 3) * 4 + 3] << 4);
 	}
 }
@@ -778,19 +793,16 @@ static void decodeFilterColorSimd8(unsigned char* data, size_t count)
 		int32x4_t bf = vsubq_s32(yf, vaddq_s32(cof, cgf));
 
 		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
-		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
 		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
 
-		int32x4_t rr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(rf), ss), fsnap));
-		int32x4_t gr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(gf), ss), fsnap));
-		int32x4_t br = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(bf), ss), fsnap));
-		int32x4_t ar = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(af), ss), fsnap));
+		int32x4_t rr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(rf), ss));
+		int32x4_t gr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(gf), ss));
+		int32x4_t br = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(bf), ss));
+		int32x4_t ar = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(af), ss));
 
 		// repack rgba into final value
-		int32x4_t res = vandq_s32(rr, vdupq_n_s32(0xff));
-		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(gr, vdupq_n_s32(0xff)), 8));
-		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(br, vdupq_n_s32(0xff)), 16));
-		res = vorrq_s32(res, vshlq_n_s32(ar, 24));
+		int32x4_t res = vsliq_n_s32(rr, vsliq_n_s32(gr, vsliq_n_s32(br, ar, 8), 8), 8);
 
 		vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
 	}
@@ -835,14 +847,14 @@ static void decodeFilterColorSimd16(unsigned short* data, size_t count)
 		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
 		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
 
-		int32x4_t rr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(rf), ss), fsnap));
-		int32x4_t gr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(gf), ss), fsnap));
-		int32x4_t br = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(bf), ss), fsnap));
-		int32x4_t ar = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(af), ss), fsnap));
+		int32x4_t rr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(rf), ss));
+		int32x4_t gr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(gf), ss));
+		int32x4_t br = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(bf), ss));
+		int32x4_t ar = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(af), ss));
 
 		// mix r/b and g/a to make 16-bit unpack easier
-		int32x4_t rbr = vorrq_s32(vandq_s32(rr, vdupq_n_s32(0xffff)), vshlq_n_s32(br, 16));
-		int32x4_t gar = vorrq_s32(vandq_s32(gr, vdupq_n_s32(0xffff)), vshlq_n_s32(ar, 16));
+		int32x4_t rbr = vsliq_n_s32(rr, br, 16);
+		int32x4_t gar = vsliq_n_s32(gr, ar, 16);
 
 		// pack r/g/b/a using 16-bit unpacks
 		int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(rbr), vreinterpretq_s16_s32(gar)).val[0]);
@@ -1145,7 +1157,7 @@ static void decodeFilterColorSimd16(unsigned short* data, size_t count)
 		v128_t bf = wasm_i32x4_sub(yf, wasm_i32x4_add(cof, cgf));
 
 		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
-		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
 		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
 
 		v128_t rr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(rf), ss), fsnap);
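[Editor's note] The "fsnap" comments that these hunks keep correcting (8 vs 16 low bits) all refer to the same rounding trick: adding 3 * 2^22 to a float in range forces renormalization so the mantissa's low bits hold the rounded integer, with a constant 0x4B40_0000 offset that can be ignored when only the low 8/16 bits are kept. A scalar sketch of the trick (function name is illustrative):

#include <cstdint>
#include <cstdio>
#include <cstring>

static int16_t snapToInt16(float v)
{
	float biased = v + 12582912.f; // 3 << 22, i.e. 1.5 * 2^23; ULP is 1 in this range
	uint32_t bits;
	memcpy(&bits, &biased, sizeof(bits)); // bit pattern of the renormalized float
	return (int16_t)(bits & 0xffff);      // the 0x4B40_0000 offset lives above the low 16 bits
}

int main()
{
	printf("%d %d %d\n", snapToInt16(1.4f), snapToInt16(-1.6f), snapToInt16(32766.6f));
	// prints: 1 -2 32767 (round-to-nearest-even comes from the float addition)
	return 0;
}

The SIMD code performs the same bias with vfmaq_f32 (scale and snap in one fused operation), then extracts the low bits of each lane with masked packing, or, after this patch, with vsliq_n_s32 shift-insert instructions.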