mirror of https://github.com/bkaradzic/bgfx.git
synced 2026-02-17 12:42:34 +01:00

Updated meshoptimizer.

committed by Branimir Karadžić
parent 0292095363
commit c35f437910

22	3rdparty/meshoptimizer/src/clusterizer.cpp (vendored)
@@ -640,7 +640,7 @@ static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, u
 	return offset + count;
 }
 
-static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size)
+static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size, int depth)
 {
 	assert(count > 0);
 	assert(offset < node_count);
@@ -672,7 +672,8 @@ static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const
 	size_t middle = kdtreePartition(indices, count, points, stride, axis, split);
 
 	// when the partition is degenerate simply consolidate the points into a single node
-	if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2)
+	// this also ensures recursion depth is bounded on pathological inputs
+	if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2 || depth >= kMeshletMaxTreeDepth)
 		return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);
 
 	KDNode& result = nodes[offset];
@@ -681,13 +682,13 @@ static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const
 	result.axis = axis;
 
 	// left subtree is right after our node
-	size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size);
+	size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size, depth + 1);
 
 	// distance to the right subtree is represented explicitly
 	assert(next_offset - offset > 1);
 	result.children = unsigned(next_offset - offset - 1);
 
-	return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size);
+	return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size, depth + 1);
 }
 
 static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, unsigned int& result, float& limit)
@@ -739,6 +740,7 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points,
 		if ((nodes[root + 1 + first].children | nodes[root + 1 + second].children) == 0)
 			nodes[root].children = 0;
 
+		// recursion depth is bounded by tree depth (which is limited by construction)
 		kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, result, limit);
 
 		// only process the other node if it can have a match based on closest distance so far
@@ -765,6 +767,7 @@ static float boxMerge(BVHBoxT& box, const BVHBox& other)
 	__m128 min = _mm_loadu_ps(box.min);
 	__m128 max = _mm_loadu_ps(box.max);
 
+	// note: over-read is safe because BVHBox array is allocated with padding
 	min = _mm_min_ps(min, _mm_loadu_ps(other.min));
 	max = _mm_max_ps(max, _mm_loadu_ps(other.max));
 
@@ -785,6 +788,7 @@ static float boxMerge(BVHBoxT& box, const BVHBox& other)
 	float32x4_t min = vld1q_f32(box.min);
 	float32x4_t max = vld1q_f32(box.max);
 
+	// note: over-read is safe because BVHBox array is allocated with padding
 	min = vminq_f32(min, vld1q_f32(other.min));
 	max = vmaxq_f32(max, vld1q_f32(other.max));
 
@@ -1046,9 +1050,6 @@ static void bvhPartition(unsigned int* target, const unsigned int* order, const
 
 static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, unsigned char* boundary, size_t count, int depth, void* scratch, short* used, const unsigned int* indices, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
 {
-	if (depth >= kMeshletMaxTreeDepth)
-		return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles);
-
 	if (count <= max_triangles && bvhCountVertices(orderx, count, used, indices) <= max_vertices)
 		return bvhPackLeaf(boundary, count);
 
@@ -1091,8 +1092,8 @@ static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* or
 		}
 	}
 
-	// this may happen if SAH costs along the admissible splits are NaN
-	if (bestk < 0)
+	// this may happen if SAH costs along the admissible splits are NaN, or due to imbalanced splits on pathological inputs
+	if (bestk < 0 || depth >= kMeshletMaxTreeDepth)
 		return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles);
 
 	// mark sides of split for partitioning
@@ -1117,6 +1118,7 @@ static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* or
 		bvhPartition(axis, temp, sides, bestsplit, count);
 	}
 
+	// recursion depth is bounded due to max depth check above
 	bvhSplit(boxes, orderx, ordery, orderz, boundary, bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
 	bvhSplit(boxes, orderx + bestsplit, ordery + bestsplit, orderz + bestsplit, boundary + bestsplit, count - bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
 }
@@ -1191,7 +1193,7 @@ size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshle
 		kdindices[i] = unsigned(i);
 
 	KDNode* nodes = allocator.allocate<KDNode>(face_count * 2);
-	kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8);
+	kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8, 0);
 
 	// find a specific corner of the mesh to use as a starting point for meshlet flow
 	float cornerx = FLT_MAX, cornery = FLT_MAX, cornerz = FLT_MAX;
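
The depth parameter threaded through kdtreeBuild above bounds recursion on pathological inputs: once the split is degenerate or the depth cap is reached, the remaining points are packed into a single leaf. A minimal, self-contained sketch of the same pattern (kMaxDepth, splitRange and the median split below are illustrative names, not the library's own):

#include <algorithm>
#include <cstddef>
#include <vector>

static const int kMaxDepth = 32; // assumed cap, analogous to kMeshletMaxTreeDepth in clusterizer.cpp

// recursively split [begin, end); stop at small ranges or at the depth cap so that
// stack depth stays bounded even for degenerate inputs (e.g. all points identical)
static void splitRange(std::vector<float>& points, size_t begin, size_t end, size_t leaf_size, int depth)
{
	size_t count = end - begin;
	if (count <= leaf_size || depth >= kMaxDepth)
		return; // this range becomes a leaf

	size_t middle = begin + count / 2;
	std::nth_element(points.begin() + begin, points.begin() + middle, points.begin() + end);

	splitRange(points, begin, middle, leaf_size, depth + 1);
	splitRange(points, middle, end, leaf_size, depth + 1);
}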

17	3rdparty/meshoptimizer/src/meshoptimizer.h (vendored)
@@ -360,13 +360,13 @@ MESHOPTIMIZER_API int meshopt_decodeVertexVersion(const unsigned char* buffer, s
  * meshopt_decodeFilterExp decodes exponential encoding of floating-point data with 8-bit exponent and 24-bit integer mantissa as 2^E*M.
  * Each 32-bit component is decoded in isolation; stride must be divisible by 4.
  *
- * Experimental: meshopt_decodeFilterColor decodes YCoCg (+A) color encoding where RGB is converted to YCoCg space with variable bit quantization.
+ * meshopt_decodeFilterColor decodes YCoCg (+A) color encoding where RGB is converted to YCoCg space with variable bit quantization.
  * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8.
  */
 MESHOPTIMIZER_API void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride);
 MESHOPTIMIZER_API void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride);
 MESHOPTIMIZER_API void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride);
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride);
+MESHOPTIMIZER_API void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride);
 
 /**
  * Vertex buffer filter encoders
@@ -384,7 +384,7 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterColor(void* buffer, size_t c
  * Exponent can be shared between all components of a given vector as defined by stride or all values of a given component; stride must be divisible by 4.
  * Input data must contain stride/4 floats for every vector (count*stride/4 total).
  *
- * Experimental: meshopt_encodeFilterColor encodes RGBA color data by converting RGB to YCoCg color space with variable bit quantization.
+ * meshopt_encodeFilterColor encodes RGBA color data by converting RGB to YCoCg color space with variable bit quantization.
  * Each component is stored as an 8-bit or 16-bit integer; stride must be equal to 4 or 8.
  * Input data must contain 4 floats for every color (count*4 total).
  */
@@ -403,7 +403,7 @@ enum meshopt_EncodeExpMode
 MESHOPTIMIZER_API void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data);
 MESHOPTIMIZER_API void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data);
 MESHOPTIMIZER_API void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode);
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterColor(void* destination, size_t count, size_t stride, int bits, const float* data);
+MESHOPTIMIZER_API void meshopt_encodeFilterColor(void* destination, size_t count, size_t stride, int bits, const float* data);
 
 /**
  * Simplification options
@@ -478,7 +478,7 @@ MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsig
 MESHOPTIMIZER_API size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
 
 /**
- * Experimental: Mesh simplifier with position/attribute update
+ * Mesh simplifier with position/attribute update
  * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible.
  * Similar to meshopt_simplifyWithAttributes, but destructively updates positions and attribute values for optimal appearance.
  * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
@@ -498,7 +498,7 @@ MESHOPTIMIZER_API size_t meshopt_simplifyWithAttributes(unsigned int* destinatio
  * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default
  * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
  */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
+MESHOPTIMIZER_API size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
 
 /**
  * Mesh simplifier (sloppy)
@@ -699,10 +699,9 @@ MESHOPTIMIZER_API size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* me
 
 /**
  * Meshlet optimizer
- * Reorders meshlet vertices and triangles to maximize locality to improve rasterizer throughput
+ * Reorders meshlet vertices and triangles to maximize locality which can improve rasterizer throughput or ray tracing performance when using fast-build modes.
  *
- * meshlet_triangles and meshlet_vertices must refer to meshlet triangle and vertex index data; when buildMeshlets* is used, these
- * need to be computed from meshlet's vertex_offset and triangle_offset
+ * meshlet_triangles and meshlet_vertices must refer to meshlet data; when buildMeshlets* is used, these need to be computed from meshlet's vertex_offset and triangle_offset
  * triangle_count and vertex_count must not exceed implementation limits (vertex_count <= 256, triangle_count <= 512)
  */
 MESHOPTIMIZER_API void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count);
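
With the color filters promoted from MESHOPTIMIZER_EXPERIMENTAL to MESHOPTIMIZER_API above, a round trip follows the constraints documented in this header (4 input floats per color, stride 4 or 8). The snippet below is a usage sketch based only on those declarations; the helper name and parameter choices are illustrative:

#include "meshoptimizer.h"
#include <vector>

// encode RGBA floats into 8-bit YCoCg-filtered components, then decode in place
static std::vector<unsigned char> roundTripColors(const std::vector<float>& colors_rgba)
{
	size_t count = colors_rgba.size() / 4; // 4 floats per color, per the header docs

	std::vector<unsigned char> packed(count * 4); // stride 4 selects 8-bit components
	meshopt_encodeFilterColor(packed.data(), count, /* stride= */ 4, /* bits= */ 8, colors_rgba.data());

	// decoding reverses the YCoCg transform; packed now holds normalized 8-bit RGBA
	meshopt_decodeFilterColor(packed.data(), count, /* stride= */ 4);
	return packed;
}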

15	3rdparty/meshoptimizer/src/partition.cpp (vendored)
@@ -10,6 +10,9 @@
 namespace meshopt
 {
 
+// To avoid excessive recursion for malformed inputs, we switch to bisection after some depth
+const int kMergeDepthCutoff = 40;
+
 struct ClusterAdjacency
 {
 	unsigned int* offsets;
@@ -434,7 +437,7 @@ static size_t mergePartition(unsigned int* order, size_t count, const ClusterGro
 	return m;
 }
 
-static void mergeSpatial(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size, size_t leaf_size)
+static void mergeSpatial(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size, size_t leaf_size, int depth)
 {
 	size_t total = 0;
 	for (size_t i = 0; i < count; ++i)
@@ -467,11 +470,13 @@ static void mergeSpatial(ClusterGroup* groups, unsigned int* order, size_t count
 	size_t middle = mergePartition(order, count, groups, axis, split);
 
 	// enforce balance for degenerate partitions
-	if (middle <= leaf_size / 2 || count - middle <= leaf_size / 2)
+	// this also ensures recursion depth is bounded on pathological inputs
+	if (middle <= leaf_size / 2 || count - middle <= leaf_size / 2 || depth >= kMergeDepthCutoff)
 		middle = count / 2;
 
-	mergeSpatial(groups, order, middle, target_partition_size, max_partition_size, leaf_size);
-	mergeSpatial(groups, order + middle, count - middle, target_partition_size, max_partition_size, leaf_size);
+	// recursion depth is logarithmic and bounded due to max depth check above
+	mergeSpatial(groups, order, middle, target_partition_size, max_partition_size, leaf_size, depth + 1);
+	mergeSpatial(groups, order + middle, count - middle, target_partition_size, max_partition_size, leaf_size, depth + 1);
 }
 
 } // namespace meshopt
@@ -597,7 +602,7 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int*
 		if (groups[i].size)
 			merge_order[merge_offset++] = unsigned(i);
 
-		mergeSpatial(groups, merge_order, merge_offset, target_partition_size, max_partition_size, /* leaf_size= */ 8);
+		mergeSpatial(groups, merge_order, merge_offset, target_partition_size, max_partition_size, /* leaf_size= */ 8, 0);
 	}
 
 	// output each remaining group

90	3rdparty/meshoptimizer/src/simplifier.cpp (vendored)
@@ -620,7 +620,7 @@ static void rescaleAttributes(float* result, const float* vertex_attributes_data
 	}
 }
 
-static void finalizeVertices(float* vertex_positions_data, size_t vertex_positions_stride, float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t vertex_count, const Vector3* vertex_positions, const float* vertex_attributes, const unsigned int* sparse_remap, const unsigned int* attribute_remap, float vertex_scale, const float* vertex_offset, const unsigned char* vertex_update)
+static void finalizeVertices(float* vertex_positions_data, size_t vertex_positions_stride, float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t vertex_count, const Vector3* vertex_positions, const float* vertex_attributes, const unsigned int* sparse_remap, const unsigned int* attribute_remap, float vertex_scale, const float* vertex_offset, const unsigned char* vertex_kind, const unsigned char* vertex_update, const unsigned char* vertex_lock)
 {
 	size_t vertex_positions_stride_float = vertex_positions_stride / sizeof(float);
 	size_t vertex_attributes_stride_float = vertex_attributes_stride / sizeof(float);
@@ -632,12 +632,20 @@ static void finalizeVertices(float* vertex_positions_data, size_t vertex_positio
 
 		unsigned int ri = sparse_remap ? sparse_remap[i] : unsigned(i);
 
-		const Vector3& p = vertex_positions[i];
-		float* v = vertex_positions_data + ri * vertex_positions_stride_float;
+		// updating externally locked vertices is not allowed
+		if (vertex_lock && (vertex_lock[ri] & meshopt_SimplifyVertex_Lock) != 0)
+			continue;
 
-		v[0] = p.x * vertex_scale + vertex_offset[0];
-		v[1] = p.y * vertex_scale + vertex_offset[1];
-		v[2] = p.z * vertex_scale + vertex_offset[2];
+		// moving locked vertices may result in floating point drift
+		if (vertex_kind[i] != Kind_Locked)
+		{
+			const Vector3& p = vertex_positions[i];
+			float* v = vertex_positions_data + ri * vertex_positions_stride_float;
+
+			v[0] = p.x * vertex_scale + vertex_offset[0];
+			v[1] = p.y * vertex_scale + vertex_offset[1];
+			v[2] = p.z * vertex_scale + vertex_offset[2];
+		}
 
 		if (attribute_count)
 		{
@@ -1637,10 +1645,10 @@ static void updateQuadrics(const unsigned int* collapse_remap, size_t vertex_cou
 	}
 }
 
-static void solveQuadrics(Vector3* vertex_positions, float* vertex_attributes, size_t vertex_count, const Quadric* vertex_quadrics, const QuadricGrad* volume_gradients, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const EdgeAdjacency& adjacency, const unsigned char* vertex_kind, const unsigned char* vertex_update)
+static void solvePositions(Vector3* vertex_positions, size_t vertex_count, const Quadric* vertex_quadrics, const QuadricGrad* volume_gradients, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const EdgeAdjacency& adjacency, const unsigned char* vertex_kind, const unsigned char* vertex_update)
 {
 #if TRACE
-	size_t stats[5] = {};
+	size_t stats[6] = {};
 #endif
 
 	for (size_t i = 0; i < vertex_count; ++i)
@@ -1648,7 +1656,6 @@ static void solveQuadrics(Vector3* vertex_positions, float* vertex_attributes, s
 		if (!vertex_update[i])
 			continue;
 
-		// moving externally locked vertices is prohibited
 		// moving vertices on an attribute discontinuity may result in extrapolating UV outside of the chart bounds
 		// moving vertices on a border requires a stronger edge quadric to preserve the border geometry
 		if (vertex_kind[i] == Kind_Locked || vertex_kind[i] == Kind_Seam || vertex_kind[i] == Kind_Border)
@@ -1712,36 +1719,64 @@ static void solveQuadrics(Vector3* vertex_positions, float* vertex_attributes, s
 			continue;
 		}
 
+		// reject updates that increase positional error too much; allow some tolerance to improve attribute quality
+		if (quadricError(vertex_quadrics[i], p) > quadricError(vertex_quadrics[i], vp) * 1.5f + 1e-6f)
+		{
+			TRACESTATS(5);
+			continue;
+		}
+
 		TRACESTATS(1);
 		vertex_positions[i] = p;
 	}
 
 #if TRACE
-	printf("updated %d/%d positions; failed solve %d bounds %d flip %d\n", int(stats[1]), int(stats[0]), int(stats[2]), int(stats[3]), int(stats[4]));
+	printf("updated %d/%d positions; failed solve %d bounds %d flip %d error %d\n", int(stats[1]), int(stats[0]), int(stats[2]), int(stats[3]), int(stats[4]), int(stats[5]));
 #endif
+}
 
-	if (attribute_count == 0)
-		return;
-
+static void solveAttributes(Vector3* vertex_positions, float* vertex_attributes, size_t vertex_count, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const unsigned char* vertex_update)
+{
 	for (size_t i = 0; i < vertex_count; ++i)
 	{
 		if (!vertex_update[i])
 			continue;
 
-		// updating externally locked vertices is prohibited
-		if (vertex_kind[i] == Kind_Locked)
+		if (remap[i] != i)
 			continue;
 
-		const Vector3& p = vertex_positions[remap[i]];
-		const Quadric& A = attribute_quadrics[i];
-
-		float iw = A.w == 0 ? 0.f : 1.f / A.w;
-
 		for (size_t k = 0; k < attribute_count; ++k)
 		{
-			const QuadricGrad& G = attribute_gradients[i * attribute_count + k];
+			unsigned int shared = ~0u;
 
-			vertex_attributes[i * attribute_count + k] = (G.gx * p.x + G.gy * p.y + G.gz * p.z + G.gw) * iw;
+			// for complex vertices, preserve attribute continuity and use highest weight wedge if values were shared
+			if (vertex_kind[i] == Kind_Complex)
+			{
+				shared = unsigned(i);
+
+				for (unsigned int v = wedge[i]; v != i; v = wedge[v])
+					if (vertex_attributes[v * attribute_count + k] != vertex_attributes[i * attribute_count + k])
+						shared = ~0u;
+					else if (shared != ~0u && attribute_quadrics[v].w > attribute_quadrics[shared].w)
+						shared = v;
+			}
+
+			// update attributes for all wedges
+			unsigned int v = unsigned(i);
+			do
+			{
+				unsigned int r = (shared == ~0u) ? v : shared;
+
+				const Vector3& p = vertex_positions[i]; // same for all wedges
+				const Quadric& A = attribute_quadrics[r];
+				const QuadricGrad& G = attribute_gradients[r * attribute_count + k];
+
+				float iw = A.w == 0 ? 0.f : 1.f / A.w;
+				float av = (G.gx * p.x + G.gy * p.y + G.gz * p.z + G.gw) * iw;
+
+				vertex_attributes[v * attribute_count + k] = av;
+				v = wedge[v];
+			} while (v != i);
 		}
 	}
 }
@@ -2522,16 +2557,19 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
 		{
 			unsigned int v = result[i];
 
-			// recomputing externally locked vertices may result in floating point drift
-			vertex_update[v] = vertex_kind[v] != Kind_Locked;
+			// mark the vertex for finalizeVertices and root vertex for solve*
+			vertex_update[remap[v]] = vertex_update[v] = 1;
 		}
 
 		// edge adjacency may be stale as we haven't updated it after last series of edge collapses
 		updateEdgeAdjacency(adjacency, result, result_count, vertex_count, remap);
 
-		solveQuadrics(vertex_positions, vertex_attributes, vertex_count, vertex_quadrics, volume_gradients, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, adjacency, vertex_kind, vertex_update);
+		solvePositions(vertex_positions, vertex_count, vertex_quadrics, volume_gradients, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, adjacency, vertex_kind, vertex_update);
 
-		finalizeVertices(const_cast<float*>(vertex_positions_data), vertex_positions_stride, const_cast<float*>(vertex_attributes_data), vertex_attributes_stride, attribute_weights, attribute_count, vertex_count, vertex_positions, vertex_attributes, sparse_remap, attribute_remap, vertex_scale, vertex_offset, vertex_update);
+		if (attribute_count)
+			solveAttributes(vertex_positions, vertex_attributes, vertex_count, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, vertex_kind, vertex_update);
+
+		finalizeVertices(const_cast<float*>(vertex_positions_data), vertex_positions_stride, const_cast<float*>(vertex_attributes_data), vertex_attributes_stride, attribute_weights, attribute_count, vertex_count, vertex_positions, vertex_attributes, sparse_remap, attribute_remap, vertex_scale, vertex_offset, vertex_kind, vertex_update, vertex_lock);
 	}
 
 	// if debug visualization data is requested, fill it instead of index data; for simplicity, this doesn't work with sparsity
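
meshopt_simplifyWithUpdate, which the solvePositions/solveAttributes/finalizeVertices changes above feed, is promoted to MESHOPTIMIZER_API in this update. Below is a usage sketch based only on the signature shown in the meshoptimizer.h hunk; the wrapper name and the attribute-free parameter choices are illustrative, so consult the header comments for the exact requirements:

#include "meshoptimizer.h"
#include <vector>

// simplify in place to roughly half the triangles, letting the simplifier also move vertices
static size_t simplifyInPlace(std::vector<unsigned int>& indices, std::vector<float>& positions_xyz)
{
	size_t vertex_count = positions_xyz.size() / 3;
	float result_error = 0.f;

	size_t new_index_count = meshopt_simplifyWithUpdate(
	    indices.data(), indices.size(),
	    positions_xyz.data(), vertex_count, 3 * sizeof(float),
	    /* vertex_attributes= */ nullptr, /* vertex_attributes_stride= */ 0,
	    /* attribute_weights= */ nullptr, /* attribute_count= */ 0,
	    /* vertex_lock= */ nullptr,
	    /* target_index_count= */ indices.size() / 2, /* target_error= */ 1e-2f,
	    /* options= */ 0, &result_error);

	indices.resize(new_index_count);
	return new_index_count;
}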

1	3rdparty/meshoptimizer/src/spatialorder.cpp (vendored)
@@ -208,6 +208,7 @@ static void splitPoints(unsigned int* destination, unsigned int* orderx, unsigne
 		partitionPoints(axis, temp, sides, split, count);
 	}
 
+	// recursion depth is logarithmic and bounded as we always split in approximately half
 	splitPoints(destination, orderx, ordery, orderz, keys, split, scratch, cluster_size);
 	splitPoints(destination + split, orderx + split, ordery + split, orderz + split, keys, count - split, scratch, cluster_size);
 }

100	3rdparty/meshoptimizer/src/vertexfilter.cpp (vendored)
@@ -550,6 +550,13 @@ inline float32x4_t vdivq_f32(float32x4_t x, float32x4_t y)
 	r = vmulq_f32(r, vrecpsq_f32(y, r)); // refine rcp estimate
 	return vmulq_f32(x, r);
 }
+
+#ifndef __ARM_FEATURE_FMA
+inline float32x4_t vfmaq_f32(float32x4_t x, float32x4_t y, float32x4_t z)
+{
+	return vaddq_f32(x, vmulq_f32(y, z));
+}
+#endif
 #endif
 
 #ifdef SIMD_NEON
@@ -580,23 +587,21 @@ static void decodeFilterOctSimd8(signed char* data, size_t count)
 		y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
 
 		// compute normal length & scale
-		float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+		float32x4_t ll = vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z);
 		float32x4_t rl = vrsqrteq_f32(ll);
 		float32x4_t s = vmulq_f32(vdupq_n_f32(127.f), rl);
 
 		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
-		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
 		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
 
-		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
-		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
-		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+		int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, s));
+		int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, s));
+		int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, s));
 
 		// combine xr/yr/zr into final value
-		int32x4_t res = vandq_s32(n4, vdupq_n_s32(0xff000000));
-		res = vorrq_s32(res, vandq_s32(xr, vdupq_n_s32(0xff)));
-		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(yr, vdupq_n_s32(0xff)), 8));
-		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(zr, vdupq_n_s32(0xff)), 16));
+		int32x4_t res = vsliq_n_s32(xr, vsliq_n_s32(yr, zr, 8), 8);
+		res = vbslq_s32(vdupq_n_u32(0xff000000), n4, res);
 
 		vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
 	}
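
The repacking above leans on VSLI (shift left and insert): per 32-bit lane, vsliq_n_s32(a, b, n) yields b << n with the low n bits taken from a, so the nested form reproduces the x | y<<8 | z<<16 byte layout of the removed vorrq/vandq/vshlq chain, and the top byte is then restored from the original input n4 by the vbslq_s32 bit-select. A scalar model of one lane, with an illustrative helper name:

#include <cstdint>

// scalar model of one lane of vsliq_n_s32(a, b, n): shift b left by n, keep the low n bits of a
static inline uint32_t sli32(uint32_t a, uint32_t b, int n)
{
	return (b << n) | (a & ((1u << n) - 1u));
}

// sli32(xr, sli32(yr, zr, 8), 8) == (xr & 0xff) | ((yr & 0xff) << 8) | (zr << 16),
// which matches the removed sequence in the low three bytes; bits 24..31 are
// overwritten afterwards with the byte preserved from n4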
@@ -634,21 +639,25 @@ static void decodeFilterOctSimd16(short* data, size_t count)
 		y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
 
 		// compute normal length & scale
-		float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+		float32x4_t ll = vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z);
+#if !defined(__aarch64__) && !defined(_M_ARM64)
 		float32x4_t rl = vrsqrteq_f32(ll);
 		rl = vmulq_f32(rl, vrsqrtsq_f32(vmulq_f32(rl, ll), rl)); // refine rsqrt estimate
 		float32x4_t s = vmulq_f32(vdupq_n_f32(32767.f), rl);
+#else
+		float32x4_t s = vdivq_f32(vdupq_n_f32(32767.f), vsqrtq_f32(ll));
+#endif
 
 		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
 		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
 		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
 
-		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
-		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
-		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+		int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, s));
+		int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, s));
+		int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, s));
 
 		// mix x/z and y/0 to make 16-bit unpack easier
-		int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
+		int32x4_t xzr = vsliq_n_s32(xr, zr, 16);
 		int32x4_t y0r = vandq_s32(yr, vdupq_n_s32(0xffff));
 
 		// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
@@ -694,7 +703,7 @@ static void decodeFilterQuatSimd(short* data, size_t count)
 
 		// reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
 		float32x4_t ws = vmulq_f32(s, s);
-		float32x4_t ww = vsubq_f32(vaddq_f32(ws, ws), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z))));
+		float32x4_t ww = vsubq_f32(vaddq_f32(ws, ws), vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z));
 		float32x4_t w = vsqrtq_f32(vmaxq_f32(ww, vdupq_n_f32(0.f)));
 
 		// compute final scale; note that all computations above are unscaled
@@ -705,26 +714,32 @@ static void decodeFilterQuatSimd(short* data, size_t count)
 		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
 		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
 
-		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, ss), fsnap));
-		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, ss), fsnap));
-		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, ss), fsnap));
-		int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, ss), fsnap));
+		int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, ss));
+		int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, ss));
+		int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, ss));
+		int32x4_t wr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, w, ss));
 
 		// mix x/z and w/y to make 16-bit unpack easier
-		int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
-		int32x4_t wyr = vorrq_s32(vandq_s32(wr, vdupq_n_s32(0xffff)), vshlq_n_s32(yr, 16));
+		int32x4_t xzr = vsliq_n_s32(xr, zr, 16);
+		int32x4_t wyr = vsliq_n_s32(wr, yr, 16);
 
 		// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
-		int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
-		int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
+		uint64x2_t res_0 = vreinterpretq_u64_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
+		uint64x2_t res_1 = vreinterpretq_u64_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
 
+		// store results to stack so that we can rotate using scalar instructions
+		// TODO: volatile works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/166808
+		volatile uint64_t res[4];
+		vst1q_u64(const_cast<uint64_t*>(&res[0]), res_0);
+		vst1q_u64(const_cast<uint64_t*>(&res[2]), res_1);
+
 		// rotate and store
-		uint64_t* out = (uint64_t*)&data[i * 4];
+		uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
 
-		out[0] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 0), vgetq_lane_s32(cf, 0) << 4);
-		out[1] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 1), vgetq_lane_s32(cf, 1) << 4);
-		out[2] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 0), vgetq_lane_s32(cf, 2) << 4);
-		out[3] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 1), vgetq_lane_s32(cf, 3) << 4);
+		out[0] = rotateleft64(res[0], data[(i + 0) * 4 + 3] << 4);
+		out[1] = rotateleft64(res[1], data[(i + 1) * 4 + 3] << 4);
+		out[2] = rotateleft64(res[2], data[(i + 2) * 4 + 3] << 4);
+		out[3] = rotateleft64(res[3], data[(i + 3) * 4 + 3] << 4);
 	}
 }
 
@@ -778,19 +793,16 @@ static void decodeFilterColorSimd8(unsigned char* data, size_t count)
 		int32x4_t bf = vsubq_s32(yf, vaddq_s32(cof, cgf));
 
 		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
-		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
 		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
 
-		int32x4_t rr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(rf), ss), fsnap));
-		int32x4_t gr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(gf), ss), fsnap));
-		int32x4_t br = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(bf), ss), fsnap));
-		int32x4_t ar = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(af), ss), fsnap));
+		int32x4_t rr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(rf), ss));
+		int32x4_t gr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(gf), ss));
+		int32x4_t br = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(bf), ss));
+		int32x4_t ar = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(af), ss));
 
 		// repack rgba into final value
-		int32x4_t res = vandq_s32(rr, vdupq_n_s32(0xff));
-		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(gr, vdupq_n_s32(0xff)), 8));
-		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(br, vdupq_n_s32(0xff)), 16));
-		res = vorrq_s32(res, vshlq_n_s32(ar, 24));
+		int32x4_t res = vsliq_n_s32(rr, vsliq_n_s32(gr, vsliq_n_s32(br, ar, 8), 8), 8);
 
 		vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
 	}
@@ -835,14 +847,14 @@ static void decodeFilterColorSimd16(unsigned short* data, size_t count)
 		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
 		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
 
-		int32x4_t rr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(rf), ss), fsnap));
-		int32x4_t gr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(gf), ss), fsnap));
-		int32x4_t br = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(bf), ss), fsnap));
-		int32x4_t ar = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(af), ss), fsnap));
+		int32x4_t rr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(rf), ss));
+		int32x4_t gr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(gf), ss));
+		int32x4_t br = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(bf), ss));
+		int32x4_t ar = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(af), ss));
 
 		// mix r/b and g/a to make 16-bit unpack easier
-		int32x4_t rbr = vorrq_s32(vandq_s32(rr, vdupq_n_s32(0xffff)), vshlq_n_s32(br, 16));
-		int32x4_t gar = vorrq_s32(vandq_s32(gr, vdupq_n_s32(0xffff)), vshlq_n_s32(ar, 16));
+		int32x4_t rbr = vsliq_n_s32(rr, br, 16);
+		int32x4_t gar = vsliq_n_s32(gr, ar, 16);
 
 		// pack r/g/b/a using 16-bit unpacks
 		int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(rbr), vreinterpretq_s16_s32(gar)).val[0]);
@@ -1145,7 +1157,7 @@ static void decodeFilterColorSimd16(unsigned short* data, size_t count)
 		v128_t bf = wasm_i32x4_sub(yf, wasm_i32x4_add(cof, cgf));
 
 		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
-		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
 		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
 
 		v128_t rr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(rf), ss), fsnap);
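
Several of these hunks rely on the "fast rounded signed float->int" idiom the comments describe: adding 3 << 22 (12582912.f, bit pattern 0x4B400000) forces rounding at integer granularity, after which the float's bit pattern equals 0x4B400000 plus the rounded value, so only the low 8 or 16 bits need to be kept and the subtraction can be skipped. A scalar sketch of the idea (the helper name is illustrative, and the input is assumed to fit in 16 bits):

#include <cstdint>
#include <cstring>

// round a float in roughly [-32768, 32767] to the nearest integer by exploiting
// float renormalization: after adding 3 << 22 the mantissa holds the rounded value
static inline int16_t fastRound16(float x)
{
	float biased = x + 12582912.f; // 3 << 22, bit pattern 0x4B400000
	uint32_t bits;
	std::memcpy(&bits, &biased, sizeof(bits));
	return (int16_t)(uint16_t)bits; // low 16 bits; the 0x4B40_0000 offset has zero low bits
}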