Mirror of https://github.com/bkaradzic/bgfx.git (synced 2026-02-17 20:52:36 +01:00)

Updated meshoptimizer.

Committed by Branimir Karadžić
Parent: 0292095363
Commit: c35f437910
3rdparty/meshoptimizer/src/clusterizer.cpp (vendored, 22 changes)

@@ -640,7 +640,7 @@ static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, u
return offset + count;
}

- static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size)
+ static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size, int depth)
{
assert(count > 0);
assert(offset < node_count);
@@ -672,7 +672,8 @@ static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const
size_t middle = kdtreePartition(indices, count, points, stride, axis, split);

// when the partition is degenerate simply consolidate the points into a single node
- if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2)
+ // this also ensures recursion depth is bounded on pathological inputs
+ if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2 || depth >= kMeshletMaxTreeDepth)
return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);

KDNode& result = nodes[offset];
@@ -681,13 +682,13 @@ static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const
result.axis = axis;

// left subtree is right after our node
- size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size);
+ size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size, depth + 1);

// distance to the right subtree is represented explicitly
assert(next_offset - offset > 1);
result.children = unsigned(next_offset - offset - 1);

- return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size);
+ return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size, depth + 1);
}

static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, unsigned int& result, float& limit)
@@ -739,6 +740,7 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points,
if ((nodes[root + 1 + first].children | nodes[root + 1 + second].children) == 0)
nodes[root].children = 0;

+ // recursion depth is bounded by tree depth (which is limited by construction)
kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, result, limit);

// only process the other node if it can have a match based on closest distance so far
@@ -765,6 +767,7 @@ static float boxMerge(BVHBoxT& box, const BVHBox& other)
__m128 min = _mm_loadu_ps(box.min);
__m128 max = _mm_loadu_ps(box.max);

+ // note: over-read is safe because BVHBox array is allocated with padding
min = _mm_min_ps(min, _mm_loadu_ps(other.min));
max = _mm_max_ps(max, _mm_loadu_ps(other.max));

@@ -785,6 +788,7 @@ static float boxMerge(BVHBoxT& box, const BVHBox& other)
float32x4_t min = vld1q_f32(box.min);
float32x4_t max = vld1q_f32(box.max);

+ // note: over-read is safe because BVHBox array is allocated with padding
min = vminq_f32(min, vld1q_f32(other.min));
max = vmaxq_f32(max, vld1q_f32(other.max));

@@ -1046,9 +1050,6 @@ static void bvhPartition(unsigned int* target, const unsigned int* order, const

static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, unsigned char* boundary, size_t count, int depth, void* scratch, short* used, const unsigned int* indices, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
{
- if (depth >= kMeshletMaxTreeDepth)
- return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles);
-
if (count <= max_triangles && bvhCountVertices(orderx, count, used, indices) <= max_vertices)
return bvhPackLeaf(boundary, count);

@@ -1091,8 +1092,8 @@ static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* or
}
}

- // this may happen if SAH costs along the admissible splits are NaN
- if (bestk < 0)
+ // this may happen if SAH costs along the admissible splits are NaN, or due to imbalanced splits on pathological inputs
+ if (bestk < 0 || depth >= kMeshletMaxTreeDepth)
return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles);

// mark sides of split for partitioning
@@ -1117,6 +1118,7 @@ static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* or
bvhPartition(axis, temp, sides, bestsplit, count);
}

+ // recursion depth is bounded due to max depth check above
bvhSplit(boxes, orderx, ordery, orderz, boundary, bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
bvhSplit(boxes, orderx + bestsplit, ordery + bestsplit, orderz + bestsplit, boundary + bestsplit, count - bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
}
@@ -1191,7 +1193,7 @@ size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshle
kdindices[i] = unsigned(i);

KDNode* nodes = allocator.allocate<KDNode>(face_count * 2);
- kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8);
+ kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8, 0);

// find a specific corner of the mesh to use as a starting point for meshlet flow
float cornerx = FLT_MAX, cornery = FLT_MAX, cornerz = FLT_MAX;

3rdparty/meshoptimizer/src/meshoptimizer.h (vendored, 17 changes)

@@ -360,13 +360,13 @@ MESHOPTIMIZER_API int meshopt_decodeVertexVersion(const unsigned char* buffer, s
* meshopt_decodeFilterExp decodes exponential encoding of floating-point data with 8-bit exponent and 24-bit integer mantissa as 2^E*M.
* Each 32-bit component is decoded in isolation; stride must be divisible by 4.
*
- * Experimental: meshopt_decodeFilterColor decodes YCoCg (+A) color encoding where RGB is converted to YCoCg space with variable bit quantization.
+ * meshopt_decodeFilterColor decodes YCoCg (+A) color encoding where RGB is converted to YCoCg space with variable bit quantization.
* Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8.
*/
MESHOPTIMIZER_API void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride);
MESHOPTIMIZER_API void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride);
MESHOPTIMIZER_API void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride);
- MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride);
+ MESHOPTIMIZER_API void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride);

/**
* Vertex buffer filter encoders
@@ -384,7 +384,7 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterColor(void* buffer, size_t c
* Exponent can be shared between all components of a given vector as defined by stride or all values of a given component; stride must be divisible by 4.
* Input data must contain stride/4 floats for every vector (count*stride/4 total).
*
- * Experimental: meshopt_encodeFilterColor encodes RGBA color data by converting RGB to YCoCg color space with variable bit quantization.
+ * meshopt_encodeFilterColor encodes RGBA color data by converting RGB to YCoCg color space with variable bit quantization.
* Each component is stored as an 8-bit or 16-bit integer; stride must be equal to 4 or 8.
* Input data must contain 4 floats for every color (count*4 total).
*/
@@ -403,7 +403,7 @@ enum meshopt_EncodeExpMode
MESHOPTIMIZER_API void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data);
MESHOPTIMIZER_API void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data);
MESHOPTIMIZER_API void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode);
- MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterColor(void* destination, size_t count, size_t stride, int bits, const float* data);
+ MESHOPTIMIZER_API void meshopt_encodeFilterColor(void* destination, size_t count, size_t stride, int bits, const float* data);

/**
* Simplification options
@@ -478,7 +478,7 @@ MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsig
MESHOPTIMIZER_API size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);

/**
- * Experimental: Mesh simplifier with position/attribute update
+ * Mesh simplifier with position/attribute update
* Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible.
* Similar to meshopt_simplifyWithAttributes, but destructively updates positions and attribute values for optimal appearance.
* The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
@@ -498,7 +498,7 @@ MESHOPTIMIZER_API size_t meshopt_simplifyWithAttributes(unsigned int* destinatio
* options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default
* result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
*/
- MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
+ MESHOPTIMIZER_API size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);

/**
* Mesh simplifier (sloppy)
@@ -699,10 +699,9 @@ MESHOPTIMIZER_API size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* me

/**
* Meshlet optimizer
- * Reorders meshlet vertices and triangles to maximize locality to improve rasterizer throughput
+ * Reorders meshlet vertices and triangles to maximize locality which can improve rasterizer throughput or ray tracing performance when using fast-build modes.
*
- * meshlet_triangles and meshlet_vertices must refer to meshlet triangle and vertex index data; when buildMeshlets* is used, these
- * need to be computed from meshlet's vertex_offset and triangle_offset
+ * meshlet_triangles and meshlet_vertices must refer to meshlet data; when buildMeshlets* is used, these need to be computed from meshlet's vertex_offset and triangle_offset
* triangle_count and vertex_count must not exceed implementation limits (vertex_count <= 256, triangle_count <= 512)
*/
MESHOPTIMIZER_API void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count);

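The color filter functions above move from MESHOPTIMIZER_EXPERIMENTAL to MESHOPTIMIZER_API, making them part of the stable interface. A minimal usage sketch based only on the declarations and comments in this header; the sample wrapper function and its data handling are assumptions, not part of the commit:

#include <vector>

#include "meshoptimizer.h"

// Round-trip `count` RGBA colors through the YCoCg filter; each color is 4 floats on input
// and 4 bytes (stride 4) after encoding, per the header comments above.
void roundtripColors(const float* colors, size_t count)
{
    std::vector<unsigned char> encoded(count * 4);

    // quantize to 8-bit components; stride must be 4 or 8
    meshopt_encodeFilterColor(encoded.data(), count, /* stride= */ 4, /* bits= */ 8, colors);

    // decode in place; after this, encoded holds RGBA as 8-bit normalized integers
    meshopt_decodeFilterColor(encoded.data(), count, /* stride= */ 4);
}
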
3rdparty/meshoptimizer/src/partition.cpp (vendored, 15 changes)

@@ -10,6 +10,9 @@
namespace meshopt
{

+ // To avoid excessive recursion for malformed inputs, we switch to bisection after some depth
+ const int kMergeDepthCutoff = 40;
+
struct ClusterAdjacency
{
unsigned int* offsets;
@@ -434,7 +437,7 @@ static size_t mergePartition(unsigned int* order, size_t count, const ClusterGro
return m;
}

- static void mergeSpatial(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size, size_t leaf_size)
+ static void mergeSpatial(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size, size_t leaf_size, int depth)
{
size_t total = 0;
for (size_t i = 0; i < count; ++i)
@@ -467,11 +470,13 @@ static void mergeSpatial(ClusterGroup* groups, unsigned int* order, size_t count
size_t middle = mergePartition(order, count, groups, axis, split);

// enforce balance for degenerate partitions
- if (middle <= leaf_size / 2 || count - middle <= leaf_size / 2)
+ // this also ensures recursion depth is bounded on pathological inputs
+ if (middle <= leaf_size / 2 || count - middle <= leaf_size / 2 || depth >= kMergeDepthCutoff)
middle = count / 2;

- mergeSpatial(groups, order, middle, target_partition_size, max_partition_size, leaf_size);
- mergeSpatial(groups, order + middle, count - middle, target_partition_size, max_partition_size, leaf_size);
+ // recursion depth is logarithmic and bounded due to max depth check above
+ mergeSpatial(groups, order, middle, target_partition_size, max_partition_size, leaf_size, depth + 1);
+ mergeSpatial(groups, order + middle, count - middle, target_partition_size, max_partition_size, leaf_size, depth + 1);
}

} // namespace meshopt
@@ -597,7 +602,7 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int*
if (groups[i].size)
merge_order[merge_offset++] = unsigned(i);

- mergeSpatial(groups, merge_order, merge_offset, target_partition_size, max_partition_size, /* leaf_size= */ 8);
+ mergeSpatial(groups, merge_order, merge_offset, target_partition_size, max_partition_size, /* leaf_size= */ 8, 0);
}

// output each remaining group

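The kdtreeBuild, bvhSplit and mergeSpatial changes above all apply the same guard: a depth counter is threaded through the recursive spatial split, and once it reaches a fixed cutoff the code falls back to a leaf or a plain bisection, so pathological inputs (for example, many identical points) cannot drive the recursion arbitrarily deep. A standalone sketch of that pattern follows; it is illustrative only, not meshoptimizer code, and kMaxDepth, kLeafSize and splitRange are made-up names:

#include <algorithm>
#include <cstdio>
#include <vector>

const int kMaxDepth = 40;    // hypothetical cutoff, analogous to kMeshletMaxTreeDepth / kMergeDepthCutoff
const size_t kLeafSize = 8;

static void splitRange(float* values, size_t count, int depth, std::vector<size_t>& leaves)
{
    if (count <= kLeafSize)
    {
        leaves.push_back(count);
        return;
    }

    // split around the midpoint of the value range
    float lo = *std::min_element(values, values + count);
    float hi = *std::max_element(values, values + count);
    float split = (lo + hi) / 2;

    size_t middle = std::partition(values, values + count, [=](float v) { return v < split; }) - values;

    // degenerate split, or recursion already too deep: fall back to bisection, which bounds the depth
    if (middle == 0 || middle == count || depth >= kMaxDepth)
        middle = count / 2;

    splitRange(values, middle, depth + 1, leaves);
    splitRange(values + middle, count - middle, depth + 1, leaves);
}

int main()
{
    std::vector<float> values(1000, 1.f); // all-identical values: worst case for midpoint splits
    std::vector<size_t> leaves;
    splitRange(values.data(), values.size(), /* depth= */ 0, leaves);
    printf("%d leaves\n", int(leaves.size()));
    return 0;
}
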
3rdparty/meshoptimizer/src/simplifier.cpp (vendored, 90 changes)

@@ -620,7 +620,7 @@ static void rescaleAttributes(float* result, const float* vertex_attributes_data
}
}

- static void finalizeVertices(float* vertex_positions_data, size_t vertex_positions_stride, float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t vertex_count, const Vector3* vertex_positions, const float* vertex_attributes, const unsigned int* sparse_remap, const unsigned int* attribute_remap, float vertex_scale, const float* vertex_offset, const unsigned char* vertex_update)
+ static void finalizeVertices(float* vertex_positions_data, size_t vertex_positions_stride, float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t vertex_count, const Vector3* vertex_positions, const float* vertex_attributes, const unsigned int* sparse_remap, const unsigned int* attribute_remap, float vertex_scale, const float* vertex_offset, const unsigned char* vertex_kind, const unsigned char* vertex_update, const unsigned char* vertex_lock)
{
size_t vertex_positions_stride_float = vertex_positions_stride / sizeof(float);
size_t vertex_attributes_stride_float = vertex_attributes_stride / sizeof(float);
@@ -632,12 +632,20 @@ static void finalizeVertices(float* vertex_positio

unsigned int ri = sparse_remap ? sparse_remap[i] : unsigned(i);

- const Vector3& p = vertex_positions[i];
- float* v = vertex_positions_data + ri * vertex_positions_stride_float;
+ // updating externally locked vertices is not allowed
+ if (vertex_lock && (vertex_lock[ri] & meshopt_SimplifyVertex_Lock) != 0)
+ continue;

- v[0] = p.x * vertex_scale + vertex_offset[0];
- v[1] = p.y * vertex_scale + vertex_offset[1];
- v[2] = p.z * vertex_scale + vertex_offset[2];
+ // moving locked vertices may result in floating point drift
+ if (vertex_kind[i] != Kind_Locked)
+ {
+ const Vector3& p = vertex_positions[i];
+ float* v = vertex_positions_data + ri * vertex_positions_stride_float;
+
+ v[0] = p.x * vertex_scale + vertex_offset[0];
+ v[1] = p.y * vertex_scale + vertex_offset[1];
+ v[2] = p.z * vertex_scale + vertex_offset[2];
+ }

if (attribute_count)
{
@@ -1637,10 +1645,10 @@ static void updateQuadrics(const unsigned int* collapse_remap, size_t vertex_cou
}
}

- static void solveQuadrics(Vector3* vertex_positions, float* vertex_attributes, size_t vertex_count, const Quadric* vertex_quadrics, const QuadricGrad* volume_gradients, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const EdgeAdjacency& adjacency, const unsigned char* vertex_kind, const unsigned char* vertex_update)
+ static void solvePositions(Vector3* vertex_positions, size_t vertex_count, const Quadric* vertex_quadrics, const QuadricGrad* volume_gradients, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const EdgeAdjacency& adjacency, const unsigned char* vertex_kind, const unsigned char* vertex_update)
{
#if TRACE
- size_t stats[5] = {};
+ size_t stats[6] = {};
#endif

for (size_t i = 0; i < vertex_count; ++i)
@@ -1648,7 +1656,6 @@ static void solveQuadrics(Vector3* vertex_positions, float* vertex_attributes, s
if (!vertex_update[i])
continue;

- // moving externally locked vertices is prohibited
// moving vertices on an attribute discontinuity may result in extrapolating UV outside of the chart bounds
// moving vertices on a border requires a stronger edge quadric to preserve the border geometry
if (vertex_kind[i] == Kind_Locked || vertex_kind[i] == Kind_Seam || vertex_kind[i] == Kind_Border)
@@ -1712,36 +1719,64 @@ static void solveQuadrics(Vector3* vertex_positions, float* vertex_attributes, s
continue;
}

+ // reject updates that increase positional error too much; allow some tolerance to improve attribute quality
+ if (quadricError(vertex_quadrics[i], p) > quadricError(vertex_quadrics[i], vp) * 1.5f + 1e-6f)
+ {
+ TRACESTATS(5);
+ continue;
+ }
+
TRACESTATS(1);
vertex_positions[i] = p;
}

#if TRACE
- printf("updated %d/%d positions; failed solve %d bounds %d flip %d\n", int(stats[1]), int(stats[0]), int(stats[2]), int(stats[3]), int(stats[4]));
+ printf("updated %d/%d positions; failed solve %d bounds %d flip %d error %d\n", int(stats[1]), int(stats[0]), int(stats[2]), int(stats[3]), int(stats[4]), int(stats[5]));
#endif
+ }

- if (attribute_count == 0)
- return;
-
+ static void solveAttributes(Vector3* vertex_positions, float* vertex_attributes, size_t vertex_count, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const unsigned char* vertex_update)
+ {
for (size_t i = 0; i < vertex_count; ++i)
{
if (!vertex_update[i])
continue;

- // updating externally locked vertices is prohibited
- if (vertex_kind[i] == Kind_Locked)
+ if (remap[i] != i)
continue;

- const Vector3& p = vertex_positions[remap[i]];
- const Quadric& A = attribute_quadrics[i];
-
- float iw = A.w == 0 ? 0.f : 1.f / A.w;
-
for (size_t k = 0; k < attribute_count; ++k)
{
- const QuadricGrad& G = attribute_gradients[i * attribute_count + k];
+ unsigned int shared = ~0u;

- vertex_attributes[i * attribute_count + k] = (G.gx * p.x + G.gy * p.y + G.gz * p.z + G.gw) * iw;
+ // for complex vertices, preserve attribute continuity and use highest weight wedge if values were shared
+ if (vertex_kind[i] == Kind_Complex)
+ {
+ shared = unsigned(i);
+
+ for (unsigned int v = wedge[i]; v != i; v = wedge[v])
+ if (vertex_attributes[v * attribute_count + k] != vertex_attributes[i * attribute_count + k])
+ shared = ~0u;
+ else if (shared != ~0u && attribute_quadrics[v].w > attribute_quadrics[shared].w)
+ shared = v;
+ }
+
+ // update attributes for all wedges
+ unsigned int v = unsigned(i);
+ do
+ {
+ unsigned int r = (shared == ~0u) ? v : shared;
+
+ const Vector3& p = vertex_positions[i]; // same for all wedges
+ const Quadric& A = attribute_quadrics[r];
+ const QuadricGrad& G = attribute_gradients[r * attribute_count + k];
+
+ float iw = A.w == 0 ? 0.f : 1.f / A.w;
+ float av = (G.gx * p.x + G.gy * p.y + G.gz * p.z + G.gw) * iw;
+
+ vertex_attributes[v * attribute_count + k] = av;
+ v = wedge[v];
+ } while (v != i);
}
}
}
@@ -2522,16 +2557,19 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
{
unsigned int v = result[i];

- // recomputing externally locked vertices may result in floating point drift
- vertex_update[v] = vertex_kind[v] != Kind_Locked;
+ // mark the vertex for finalizeVertices and root vertex for solve*
+ vertex_update[remap[v]] = vertex_update[v] = 1;
}

// edge adjacency may be stale as we haven't updated it after last series of edge collapses
updateEdgeAdjacency(adjacency, result, result_count, vertex_count, remap);

- solveQuadrics(vertex_positions, vertex_attributes, vertex_count, vertex_quadrics, volume_gradients, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, adjacency, vertex_kind, vertex_update);
+ solvePositions(vertex_positions, vertex_count, vertex_quadrics, volume_gradients, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, adjacency, vertex_kind, vertex_update);

- finalizeVertices(const_cast<float*>(vertex_positions_data), vertex_positions_stride, const_cast<float*>(vertex_attributes_data), vertex_attributes_stride, attribute_weights, attribute_count, vertex_count, vertex_positions, vertex_attributes, sparse_remap, attribute_remap, vertex_scale, vertex_offset, vertex_update);
+ if (attribute_count)
+ solveAttributes(vertex_positions, vertex_attributes, vertex_count, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, vertex_kind, vertex_update);
+
+ finalizeVertices(const_cast<float*>(vertex_positions_data), vertex_positions_stride, const_cast<float*>(vertex_attributes_data), vertex_attributes_stride, attribute_weights, attribute_count, vertex_count, vertex_positions, vertex_attributes, sparse_remap, attribute_remap, vertex_scale, vertex_offset, vertex_kind, vertex_update, vertex_lock);
}

// if debug visualization data is requested, fill it instead of index data; for simplicity, this doesn't work with sparsity

3rdparty/meshoptimizer/src/spatialorder.cpp (vendored, 1 change)

@@ -208,6 +208,7 @@ static void splitPoints(unsigned int* destination, unsigned int* orderx, unsigne
partitionPoints(axis, temp, sides, split, count);
}

+ // recursion depth is logarithmic and bounded as we always split in approximately half
splitPoints(destination, orderx, ordery, orderz, keys, split, scratch, cluster_size);
splitPoints(destination + split, orderx + split, ordery + split, orderz + split, keys, count - split, scratch, cluster_size);
}

3rdparty/meshoptimizer/src/vertexfilter.cpp (vendored, 100 changes)

@@ -550,6 +550,13 @@ inline float32x4_t vdivq_f32(float32x4_t x, float32x4_t y)
r = vmulq_f32(r, vrecpsq_f32(y, r)); // refine rcp estimate
return vmulq_f32(x, r);
}
+
+ #ifndef __ARM_FEATURE_FMA
+ inline float32x4_t vfmaq_f32(float32x4_t x, float32x4_t y, float32x4_t z)
+ {
+ return vaddq_f32(x, vmulq_f32(y, z));
+ }
+ #endif
#endif

#ifdef SIMD_NEON
@@ -580,23 +587,21 @@ static void decodeFilterOctSimd8(signed char* data, size_t count)
y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));

// compute normal length & scale
- float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+ float32x4_t ll = vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z);
float32x4_t rl = vrsqrteq_f32(ll);
float32x4_t s = vmulq_f32(vdupq_n_f32(127.f), rl);

// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
- // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+ // note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
const float32x4_t fsnap = vdupq_n_f32(3 << 22);

- int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
- int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
- int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+ int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, s));
+ int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, s));
+ int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, s));

// combine xr/yr/zr into final value
- int32x4_t res = vandq_s32(n4, vdupq_n_s32(0xff000000));
- res = vorrq_s32(res, vandq_s32(xr, vdupq_n_s32(0xff)));
- res = vorrq_s32(res, vshlq_n_s32(vandq_s32(yr, vdupq_n_s32(0xff)), 8));
- res = vorrq_s32(res, vshlq_n_s32(vandq_s32(zr, vdupq_n_s32(0xff)), 16));
+ int32x4_t res = vsliq_n_s32(xr, vsliq_n_s32(yr, zr, 8), 8);
+ res = vbslq_s32(vdupq_n_u32(0xff000000), n4, res);

vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
}
@@ -634,21 +639,25 @@ static void decodeFilterOctSimd16(short* data, size_t count)
y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));

// compute normal length & scale
- float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+ float32x4_t ll = vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z);
+ #if !defined(__aarch64__) && !defined(_M_ARM64)
float32x4_t rl = vrsqrteq_f32(ll);
rl = vmulq_f32(rl, vrsqrtsq_f32(vmulq_f32(rl, ll), rl)); // refine rsqrt estimate
float32x4_t s = vmulq_f32(vdupq_n_f32(32767.f), rl);
+ #else
+ float32x4_t s = vdivq_f32(vdupq_n_f32(32767.f), vsqrtq_f32(ll));
+ #endif

// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
const float32x4_t fsnap = vdupq_n_f32(3 << 22);

- int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
- int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
- int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+ int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, s));
+ int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, s));
+ int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, s));

// mix x/z and y/0 to make 16-bit unpack easier
- int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
+ int32x4_t xzr = vsliq_n_s32(xr, zr, 16);
int32x4_t y0r = vandq_s32(yr, vdupq_n_s32(0xffff));

// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
@@ -694,7 +703,7 @@ static void decodeFilterQuatSimd(short* data, size_t count)

// reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
float32x4_t ws = vmulq_f32(s, s);
- float32x4_t ww = vsubq_f32(vaddq_f32(ws, ws), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z))));
+ float32x4_t ww = vsubq_f32(vaddq_f32(ws, ws), vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z));
float32x4_t w = vsqrtq_f32(vmaxq_f32(ww, vdupq_n_f32(0.f)));

// compute final scale; note that all computations above are unscaled
@@ -705,26 +714,32 @@ static void decodeFilterQuatSimd(short* data, size_t count)
// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
const float32x4_t fsnap = vdupq_n_f32(3 << 22);

- int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, ss), fsnap));
- int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, ss), fsnap));
- int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, ss), fsnap));
- int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, ss), fsnap));
+ int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, ss));
+ int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, ss));
+ int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, ss));
+ int32x4_t wr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, w, ss));

// mix x/z and w/y to make 16-bit unpack easier
- int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
- int32x4_t wyr = vorrq_s32(vandq_s32(wr, vdupq_n_s32(0xffff)), vshlq_n_s32(yr, 16));
+ int32x4_t xzr = vsliq_n_s32(xr, zr, 16);
+ int32x4_t wyr = vsliq_n_s32(wr, yr, 16);

// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
- int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
- int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
+ uint64x2_t res_0 = vreinterpretq_u64_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
+ uint64x2_t res_1 = vreinterpretq_u64_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);

+ // store results to stack so that we can rotate using scalar instructions
+ // TODO: volatile works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/166808
+ volatile uint64_t res[4];
+ vst1q_u64(const_cast<uint64_t*>(&res[0]), res_0);
+ vst1q_u64(const_cast<uint64_t*>(&res[2]), res_1);
+
// rotate and store
- uint64_t* out = (uint64_t*)&data[i * 4];
+ uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);

- out[0] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 0), vgetq_lane_s32(cf, 0) << 4);
- out[1] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 1), vgetq_lane_s32(cf, 1) << 4);
- out[2] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 0), vgetq_lane_s32(cf, 2) << 4);
- out[3] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 1), vgetq_lane_s32(cf, 3) << 4);
+ out[0] = rotateleft64(res[0], data[(i + 0) * 4 + 3] << 4);
+ out[1] = rotateleft64(res[1], data[(i + 1) * 4 + 3] << 4);
+ out[2] = rotateleft64(res[2], data[(i + 2) * 4 + 3] << 4);
+ out[3] = rotateleft64(res[3], data[(i + 3) * 4 + 3] << 4);
}
}

@@ -778,19 +793,16 @@ static void decodeFilterColorSimd8(unsigned char* data, size_t count)
int32x4_t bf = vsubq_s32(yf, vaddq_s32(cof, cgf));

// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
- // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+ // note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
const float32x4_t fsnap = vdupq_n_f32(3 << 22);

- int32x4_t rr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(rf), ss), fsnap));
- int32x4_t gr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(gf), ss), fsnap));
- int32x4_t br = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(bf), ss), fsnap));
- int32x4_t ar = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(af), ss), fsnap));
+ int32x4_t rr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(rf), ss));
+ int32x4_t gr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(gf), ss));
+ int32x4_t br = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(bf), ss));
+ int32x4_t ar = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(af), ss));

// repack rgba into final value
- int32x4_t res = vandq_s32(rr, vdupq_n_s32(0xff));
- res = vorrq_s32(res, vshlq_n_s32(vandq_s32(gr, vdupq_n_s32(0xff)), 8));
- res = vorrq_s32(res, vshlq_n_s32(vandq_s32(br, vdupq_n_s32(0xff)), 16));
- res = vorrq_s32(res, vshlq_n_s32(ar, 24));
+ int32x4_t res = vsliq_n_s32(rr, vsliq_n_s32(gr, vsliq_n_s32(br, ar, 8), 8), 8);

vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
}
@@ -835,14 +847,14 @@ static void decodeFilterColorSimd16(unsigned short* data, size_t count)
// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
const float32x4_t fsnap = vdupq_n_f32(3 << 22);

- int32x4_t rr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(rf), ss), fsnap));
- int32x4_t gr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(gf), ss), fsnap));
- int32x4_t br = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(bf), ss), fsnap));
- int32x4_t ar = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(af), ss), fsnap));
+ int32x4_t rr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(rf), ss));
+ int32x4_t gr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(gf), ss));
+ int32x4_t br = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(bf), ss));
+ int32x4_t ar = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(af), ss));

// mix r/b and g/a to make 16-bit unpack easier
- int32x4_t rbr = vorrq_s32(vandq_s32(rr, vdupq_n_s32(0xffff)), vshlq_n_s32(br, 16));
- int32x4_t gar = vorrq_s32(vandq_s32(gr, vdupq_n_s32(0xffff)), vshlq_n_s32(ar, 16));
+ int32x4_t rbr = vsliq_n_s32(rr, br, 16);
+ int32x4_t gar = vsliq_n_s32(gr, ar, 16);

// pack r/g/b/a using 16-bit unpacks
int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(rbr), vreinterpretq_s16_s32(gar)).val[0]);
@@ -1145,7 +1157,7 @@ static void decodeFilterColorSimd16(unsigned short* data, size_t count)
v128_t bf = wasm_i32x4_sub(yf, wasm_i32x4_add(cof, cgf));

// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
- // note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
+ // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
const v128_t fsnap = wasm_f32x4_splat(3 << 22);

v128_t rr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(rf), ss), fsnap);

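Several of the vertexfilter.cpp hunks above rely on the rounding trick their comments describe: adding 3 << 22 to a float renormalizes it so the mantissa holds the rounded integer, offset by 0x4B40_0000, and the low 8/16 bits can then be used directly. A scalar sketch of the trick (illustrative only, not meshoptimizer code):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Valid for |v| < 2^22: after adding 3 << 22 (= 12582912.f) the float's bit pattern equals
// 0x4B400000 + round(v), so subtracting the offset (or just masking the low bits, as the
// SIMD code does) yields a round-to-nearest conversion without a float->int instruction.
static int32_t snapRound(float v)
{
    float biased = v + 12582912.f; // 3 << 22
    uint32_t bits;
    memcpy(&bits, &biased, sizeof(bits));
    return int32_t(bits) - 0x4B400000;
}

int main()
{
    const float samples[] = { 0.4f, 0.6f, -1.5f, 126.7f, -32767.8f };
    for (float v : samples)
        printf("%9.2f -> %d\n", v, snapRound(v));
    return 0;
}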