diff --git a/3rdparty/meshoptimizer/LICENSE.md b/3rdparty/meshoptimizer/LICENSE.md
index ef9f5919f..a5c3b1ccc 100644
--- a/3rdparty/meshoptimizer/LICENSE.md
+++ b/3rdparty/meshoptimizer/LICENSE.md
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2016-2024 Arseny Kapoulkine
+Copyright (c) 2016-2025 Arseny Kapoulkine
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/3rdparty/meshoptimizer/src/clusterizer.cpp b/3rdparty/meshoptimizer/src/clusterizer.cpp
index 738add5f2..b01342781 100644
--- a/3rdparty/meshoptimizer/src/clusterizer.cpp
+++ b/3rdparty/meshoptimizer/src/clusterizer.cpp
@@ -70,11 +70,77 @@ static void buildTriangleAdjacency(TriangleAdjacency2& adjacency, const unsigned
 	for (size_t i = 0; i < vertex_count; ++i)
 	{
 		assert(adjacency.offsets[i] >= adjacency.counts[i]);
-
 		adjacency.offsets[i] -= adjacency.counts[i];
 	}
 }
 
+static void buildTriangleAdjacencySparse(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
+{
+	size_t face_count = index_count / 3;
+
+	// sparse mode can build adjacency more quickly by ignoring unused vertices, using a bit to mark visited vertices
+	const unsigned int sparse_seen = 1u << 31;
+	assert(index_count < sparse_seen);
+
+	// allocate arrays
+	adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.data = allocator.allocate<unsigned int>(index_count);
+
+	// fill triangle counts
+	for (size_t i = 0; i < index_count; ++i)
+		assert(indices[i] < vertex_count);
+
+	for (size_t i = 0; i < index_count; ++i)
+		adjacency.counts[indices[i]] = 0;
+
+	for (size_t i = 0; i < index_count; ++i)
+		adjacency.counts[indices[i]]++;
+
+	// fill offset table
+	unsigned int offset = 0;
+
+	// when using sparse mode this pass uses sparse_seen bit to tag visited vertices
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int v = indices[i];
+
+		if ((adjacency.counts[v] & sparse_seen) == 0)
+		{
+			adjacency.offsets[v] = offset;
+			offset += adjacency.counts[v];
+			adjacency.counts[v] |= sparse_seen;
+		}
+	}
+
+	assert(offset == index_count);
+
+	// fill triangle data
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+
+		adjacency.data[adjacency.offsets[a]++] = unsigned(i);
+		adjacency.data[adjacency.offsets[b]++] = unsigned(i);
+		adjacency.data[adjacency.offsets[c]++] = unsigned(i);
+	}
+
+	// fix offsets that have been disturbed by the previous pass
+	// when using sparse mode this pass also fixes counts (that were marked with sparse_seen)
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int v = indices[i];
+
+		if (adjacency.counts[v] & sparse_seen)
+		{
+			adjacency.counts[v] &= ~sparse_seen;
+
+			assert(adjacency.offsets[v] >= adjacency.counts[v]);
+			adjacency.offsets[v] -= adjacency.counts[v];
+		}
+	}
+}
+
 static void computeBoundingSphere(float result[4], const float points[][3], size_t count)
 {
 	assert(count > 0);
@@ -552,10 +618,13 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
 	meshopt_Allocator allocator;
 
 	TriangleAdjacency2 adjacency = {};
-	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
+	if (vertex_count > index_count && index_count < (1u << 31))
+		buildTriangleAdjacencySparse(adjacency, indices, index_count, vertex_count, allocator);
+	else
+		buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
 
-	unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
-	memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));
+	// live triangle counts; note, we alias adjacency.counts as we remove triangles after emitting them so the counts always match
+	unsigned int* live_triangles = adjacency.counts;
 
 	size_t face_count = index_count / 3;
 
@@ -625,12 +694,9 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
 			memset(&meshlet_cone_acc, 0, sizeof(meshlet_cone_acc));
 		}
 
-		live_triangles[a]--;
-		live_triangles[b]--;
-		live_triangles[c]--;
-
 		// remove emitted triangle from adjacency data
 		// this makes sure that we spend less time traversing these lists on subsequent iterations
+		// live triangle counts are updated as a byproduct of these adjustments
 		for (size_t k = 0; k < 3; ++k)
 		{
 			unsigned int index = indices[best_triangle * 3 + k];
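Note (not part of the patch): the sparse path added above is selected by meshopt_buildMeshlets whenever vertex_count > index_count, which typically happens when a draw call only indexes a small window of a large shared vertex buffer. A minimal usage sketch follows; the 64/124 meshlet limits, buffer sizing and the three-float position layout are assumptions for illustration, while meshopt_buildMeshlets/meshopt_buildMeshletsBound are the library's public API.

```cpp
#include <vector>
#include "meshoptimizer.h"

// Builds meshlets for indices that reference a subset of a much larger vertex buffer;
// with this change, vertex_count > index_count routes adjacency construction through
// buildTriangleAdjacencySparse internally.
static std::vector<meshopt_Meshlet> buildMeshletsForRange(const unsigned int* indices, size_t index_count,
    const float* positions, size_t vertex_count)
{
	const size_t max_vertices = 64;   // assumed limits, tune per target GPU
	const size_t max_triangles = 124;

	size_t max_meshlets = meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles);

	std::vector<meshopt_Meshlet> meshlets(max_meshlets);
	std::vector<unsigned int> meshlet_vertices(max_meshlets * max_vertices);
	std::vector<unsigned char> meshlet_triangles(max_meshlets * max_triangles * 3);

	size_t count = meshopt_buildMeshlets(meshlets.data(), meshlet_vertices.data(), meshlet_triangles.data(),
	    indices, index_count, positions, vertex_count, sizeof(float) * 3,
	    max_vertices, max_triangles, 0.0f);

	meshlets.resize(count);
	return meshlets;
}
```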
diff --git a/3rdparty/meshoptimizer/src/meshoptimizer.h b/3rdparty/meshoptimizer/src/meshoptimizer.h
index 6243947cf..baeba1522 100644
--- a/3rdparty/meshoptimizer/src/meshoptimizer.h
+++ b/3rdparty/meshoptimizer/src/meshoptimizer.h
@@ -1,7 +1,7 @@
 /**
  * meshoptimizer - version 0.22
  *
- * Copyright (C) 2016-2024, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Copyright (C) 2016-2025, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
  * Report bugs and download new versions at https://github.com/zeux/meshoptimizer
  *
  * This library is distributed under the MIT License. See notice at the end of this file.
@@ -289,7 +289,7 @@ MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer,
 
 /**
  * Set vertex encoder format version
- * version must specify the data format version to encode; valid values are 0 (decodable by all library versions)
+ * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.23+)
  */
 MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version);
 
@@ -608,34 +608,6 @@ MESHOPTIMIZER_API void meshopt_spatialSortRemap(unsigned int* destination, const
  */
 MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 
-/**
- * Set allocation callbacks
- * These callbacks will be used instead of the default operator new/operator delete for all temporary allocations in the library.
- * Note that all algorithms only allocate memory for temporary use.
- * allocate/deallocate are always called in a stack-like order - last pointer to be allocated is deallocated first.
- */
-MESHOPTIMIZER_API void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*));
-
-#ifdef __cplusplus
-} /* extern "C" */
-#endif
-
-/* Quantization into commonly supported data formats */
-#ifdef __cplusplus
-/**
- * Quantize a float in [0..1] range into an N-bit fixed point unorm value
- * Assumes reconstruction function (q / (2^N-1)), which is the case for fixed-function normalized fixed point conversion
- * Maximum reconstruction error: 1/2^(N+1)
- */
-inline int meshopt_quantizeUnorm(float v, int N);
-
-/**
- * Quantize a float in [-1..1] range into an N-bit fixed point snorm value
- * Assumes reconstruction function (q / (2^(N-1)-1)), which is the case for fixed-function normalized fixed point conversion (except early OpenGL versions)
- * Maximum reconstruction error: 1/2^N
- */
-inline int meshopt_quantizeSnorm(float v, int N);
-
 /**
  * Quantize a float into half-precision (as defined by IEEE-754 fp16) floating point value
  * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
@@ -656,6 +628,34 @@ MESHOPTIMIZER_API float meshopt_quantizeFloat(float v, int N);
  * Preserves Inf/NaN, flushes denormals to zero
  */
 MESHOPTIMIZER_API float meshopt_dequantizeHalf(unsigned short h);
+
+/**
+ * Set allocation callbacks
+ * These callbacks will be used instead of the default operator new/operator delete for all temporary allocations in the library.
+ * Note that all algorithms only allocate memory for temporary use.
+ * allocate/deallocate are always called in a stack-like order - last pointer to be allocated is deallocated first.
+ */
+MESHOPTIMIZER_API void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*));
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+/* Quantization into fixed point normalized formats; these are only available as inline C++ functions */
+#ifdef __cplusplus
+/**
+ * Quantize a float in [0..1] range into an N-bit fixed point unorm value
+ * Assumes reconstruction function (q / (2^N-1)), which is the case for fixed-function normalized fixed point conversion
+ * Maximum reconstruction error: 1/2^(N+1)
+ */
+inline int meshopt_quantizeUnorm(float v, int N);
+
+/**
+ * Quantize a float in [-1..1] range into an N-bit fixed point snorm value
+ * Assumes reconstruction function (q / (2^(N-1)-1)), which is the case for fixed-function normalized fixed point conversion (except early OpenGL versions)
+ * Maximum reconstruction error: 1/2^N
+ */
+inline int meshopt_quantizeSnorm(float v, int N);
 #endif
 
 /**
@@ -1123,7 +1123,7 @@ inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_
 #endif
 
 /**
- * Copyright (c) 2016-2024 Arseny Kapoulkine
+ * Copyright (c) 2016-2025 Arseny Kapoulkine
  *
  * Permission is hereby granted, free of charge, to any person
  * obtaining a copy of this software and associated documentation
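Note (not part of the patch): with the header change above, meshopt_encodeVertexVersion accepts 0 and 1, where version 0 remains decodable by every library version and version 1 requires a 0.23+ decoder. A hedged round-trip sketch, assuming a 16-byte interleaved vertex (the layout is an assumption; the encode/decode/bound functions are the library's public API):

```cpp
#include <assert.h>
#include <vector>
#include "meshoptimizer.h"

struct Vertex { float px, py, pz; unsigned short nx, ny; }; // 16 bytes, assumed layout

static std::vector<unsigned char> encodeVertices(const std::vector<Vertex>& vertices)
{
	// stay on version 0 when older decoders must still read the data; opt into 1 otherwise
	meshopt_encodeVertexVersion(0);

	std::vector<unsigned char> buffer(meshopt_encodeVertexBufferBound(vertices.size(), sizeof(Vertex)));
	buffer.resize(meshopt_encodeVertexBuffer(buffer.data(), buffer.size(), vertices.data(), vertices.size(), sizeof(Vertex)));
	return buffer;
}

static void decodeVertices(std::vector<Vertex>& out, const std::vector<unsigned char>& buffer)
{
	// out must already hold the expected vertex count; decode returns non-zero on malformed
	// input or an unsupported format version
	int rc = meshopt_decodeVertexBuffer(out.data(), out.size(), sizeof(Vertex), buffer.data(), buffer.size());
	assert(rc == 0);
	(void)rc;
}
```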
diff --git a/3rdparty/meshoptimizer/src/vertexcodec.cpp b/3rdparty/meshoptimizer/src/vertexcodec.cpp
index d3fc7bb16..b0ea61f5b 100644
--- a/3rdparty/meshoptimizer/src/vertexcodec.cpp
+++ b/3rdparty/meshoptimizer/src/vertexcodec.cpp
@@ -138,12 +138,9 @@ const int kEncodeDefaultLevel = 2;
 
 static size_t getVertexBlockSize(size_t vertex_size)
 {
-	// make sure the entire block fits into the scratch buffer
-	size_t result = kVertexBlockSizeBytes / vertex_size;
-
-	// align to byte group size; we encode each byte as a byte group
-	// if vertex block is misaligned, it results in wasted bytes, so just truncate the block size
-	result &= ~(kByteGroupSize - 1);
+	// make sure the entire block fits into the scratch buffer and is aligned to byte group size
+	// note: the block size is implicitly part of the format, so we can't change it without breaking compatibility
+	size_t result = (kVertexBlockSizeBytes / vertex_size) & ~(kByteGroupSize - 1);
 
 	return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
 }
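Note (not part of the patch): the folded expression above computes the same value as before: divide the scratch buffer by the vertex size, round down to a whole number of byte groups, then clamp to the per-block vertex limit. A standalone sketch with the constants restated for illustration (they mirror the library's 8192-byte scratch buffer, 16-byte groups and 256-vertex cap, but are assumptions of this example):

```cpp
#include <stdio.h>
#include <stddef.h>

static size_t vertexBlockSize(size_t vertex_size)
{
	const size_t kVertexBlockSizeBytes = 8192; // assumed to match the library's constants
	const size_t kVertexBlockMaxSize = 256;
	const size_t kByteGroupSize = 16;

	// fit the block into the scratch buffer, then truncate to a multiple of the byte group size
	size_t result = (kVertexBlockSizeBytes / vertex_size) & ~(kByteGroupSize - 1);

	return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
}

int main()
{
	printf("%zu\n", vertexBlockSize(16)); // 8192/16 = 512 -> clamped to 256
	printf("%zu\n", vertexBlockSize(48)); // 8192/48 = 170 -> truncated to 160
	return 0;
}
```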
@@ -179,13 +176,14 @@ static Stats* bytestats = NULL;
 static Stats vertexstats[256];
 #endif
 
-static bool canEncodeZero(const unsigned char* buffer, size_t buffer_size)
+static bool encodeBytesGroupZero(const unsigned char* buffer)
 {
-	for (size_t i = 0; i < buffer_size; ++i)
-		if (buffer[i])
-			return false;
+	assert(kByteGroupSize == sizeof(unsigned long long) * 2);
 
-	return true;
+	unsigned long long v[2];
+	memcpy(v, buffer, sizeof(v));
+
+	return (v[0] | v[1]) == 0;
 }
 
 static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
@@ -193,7 +191,7 @@ static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
 	assert(bits >= 0 && bits <= 8);
 
 	if (bits == 0)
-		return canEncodeZero(buffer, kByteGroupSize) ? 0 : size_t(-1);
+		return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);
 
 	if (bits == 8)
 		return kByteGroupSize;
@@ -389,6 +387,11 @@ static int estimateRotate(const unsigned char* vertex_data, size_t vertex_count,
 			vertex += vertex_size;
 		}
 
+#if TRACE
+		for (int j = 0; j < 32; ++j)
+			vertexstats[k + (j / 8)].bitc[j % 8] += (i + group_size < vertex_count ? group_size : vertex_count - i) * (1 - ((bitg >> j) & 1));
+#endif
+
 		for (int j = 0; j < 8; ++j)
 		{
 			unsigned int bitr = rotate(bitg, j);
@@ -455,9 +458,18 @@ static int estimateChannel(const unsigned char* vertex_data, size_t vertex_count
 	return best_channel == 2 ? best_channel | (xor_rot << 4) : best_channel;
 }
 
+static bool estimateControlZero(const unsigned char* buffer, size_t vertex_count_aligned)
+{
+	for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize)
+		if (!encodeBytesGroupZero(buffer + i))
+			return false;
+
+	return true;
+}
+
 static int estimateControl(const unsigned char* buffer, size_t vertex_count, size_t vertex_count_aligned, int level)
 {
-	if (canEncodeZero(buffer, vertex_count))
+	if (estimateControlZero(buffer, vertex_count_aligned))
 		return 2; // zero encoding
 
 	if (level == 0)
@@ -522,18 +534,6 @@ static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data
 #if TRACE
 		const unsigned char* olddata = data;
 		bytestats = &vertexstats[k];
-
-		for (size_t ig = 0; ig < vertex_count; ig += kByteGroupSize)
-		{
-			unsigned char last = (ig == 0) ? last_vertex[k] : vertex_data[vertex_size * (ig - 1) + k];
-			unsigned char delta = 0xff;
-
-			for (size_t i = ig; i < ig + kByteGroupSize && i < vertex_count; ++i)
-				delta &= ~(vertex_data[vertex_size * i + k] ^ last);
-
-			for (int j = 0; j < 8; ++j)
-				bytestats->bitc[j] += (vertex_count - ig < kByteGroupSize ? vertex_count - ig : kByteGroupSize) * ((delta >> j) & 1);
-		}
 #endif
 
 		int ctrl = 0;
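Note (not part of the patch): encodeBytesGroupZero replaces the byte-by-byte canEncodeZero scan with two 64-bit loads per 16-byte group, and estimateControlZero walks whole groups over the group-aligned (zero-padded) length instead of the raw vertex count. A minimal standalone sketch of the same test, assuming a 16-byte group size and an input padded to a multiple of 16:

```cpp
#include <string.h>
#include <stddef.h>

static bool groupIsZero(const unsigned char* group) // reads exactly 16 bytes
{
	unsigned long long v[2];
	memcpy(v, group, sizeof(v)); // two word-sized loads instead of sixteen byte compares
	return (v[0] | v[1]) == 0;
}

static bool bufferIsZero(const unsigned char* buffer, size_t size_aligned) // size_aligned % 16 == 0
{
	for (size_t i = 0; i < size_aligned; i += 16)
		if (!groupIsZero(buffer + i))
			return false;

	return true;
}
```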
@@ -1349,6 +1349,34 @@ inline uint8x16_t rotate32(uint8x16_t v, int r)
 	uint32x4_t v32 = vreinterpretq_u32_u8(v);
 	return vreinterpretq_u8_u32(vorrq_u32(vshlq_u32(v32, vdupq_n_s32(r)), vshlq_u32(v32, vdupq_n_s32(r - 32))));
 }
+
+template <int Channel>
+SIMD_TARGET inline uint8x8_t rebase(uint8x8_t npi, uint8x16_t r0, uint8x16_t r1, uint8x16_t r2, uint8x16_t r3)
+{
+	switch (Channel)
+	{
+	case 0:
+	{
+		uint8x16_t rsum = vaddq_u8(vaddq_u8(r0, r1), vaddq_u8(r2, r3));
+		uint8x8_t rsumx = vadd_u8(vget_low_u8(rsum), vget_high_u8(rsum));
+		return vadd_u8(vadd_u8(npi, rsumx), vext_u8(rsumx, rsumx, 4));
+	}
+	case 1:
+	{
+		uint16x8_t rsum = vaddq_u16(vaddq_u16(vreinterpretq_u16_u8(r0), vreinterpretq_u16_u8(r1)), vaddq_u16(vreinterpretq_u16_u8(r2), vreinterpretq_u16_u8(r3)));
+		uint16x4_t rsumx = vadd_u16(vget_low_u16(rsum), vget_high_u16(rsum));
+		return vreinterpret_u8_u16(vadd_u16(vadd_u16(vreinterpret_u16_u8(npi), rsumx), vext_u16(rsumx, rsumx, 2)));
+	}
+	case 2:
+	{
+		uint8x16_t rsum = veorq_u8(veorq_u8(r0, r1), veorq_u8(r2, r3));
+		uint8x8_t rsumx = veor_u8(vget_low_u8(rsum), vget_high_u8(rsum));
+		return veor_u8(veor_u8(npi, rsumx), vext_u8(rsumx, rsumx, 4));
+	}
+	default:
+		return npi;
+	}
+}
 #endif
 
 #ifdef SIMD_WASM
@@ -1443,7 +1471,7 @@ decodeDeltas4Simd(const unsigned char* buffer, unsigned char* transposed, size_t
 #define TEMP __m128i
 #define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex))
 #define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
-#define GRP4(i) t0 = _mm_shuffle_epi32(r##i, 0), t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
+#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
 #define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
 #define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
 #endif
@@ -1461,9 +1489,9 @@ decodeDeltas4Simd(const unsigned char* buffer, unsigned char* transposed, size_t
 #define TEMP v128_t
 #define PREP() v128_t pi = wasm_v128_load(last_vertex)
 #define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
-#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
+#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
 #define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i))
-#define SAVE(i) *reinterpret_cast<int*>(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size
+#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size
 #endif
 
 #define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
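Note (not part of the patch): rebase<Channel> recomputes the next group's starting value directly from the freshly decoded delta registers instead of waiting for the serial prefix accumulation through pi, which shortens the dependency chain between loop iterations. A scalar analogy (not the NEON path) under the assumption of 16 deltas per group:

```cpp
// For additive byte deltas, the value after a group equals the previous value plus the
// sum of the group's deltas (mod 256); for xor deltas, it is the previous value xor the
// xor-reduction of the deltas. Both reductions can run independently of the prefix walk.
static unsigned char nextBaseAdd(unsigned char prev, const unsigned char deltas[16])
{
	unsigned char sum = 0;
	for (int i = 0; i < 16; ++i)
		sum = (unsigned char)(sum + deltas[i]);

	return (unsigned char)(prev + sum);
}

static unsigned char nextBaseXor(unsigned char prev, const unsigned char deltas[16])
{
	unsigned char acc = 0;
	for (int i = 0; i < 16; ++i)
		acc ^= deltas[i];

	return (unsigned char)(prev ^ acc);
}
```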
@@ -1482,6 +1510,7 @@ decodeDeltas4Simd(const unsigned char* buffer, unsigned char* transposed, size_t
 		transpose8(r0, r1, r2, r3);
 
 		TEMP t0, t1, t2, t3;
+		TEMP npi = pi;
 
 		UNZR(0);
 		GRP4(0);
@@ -1503,6 +1532,13 @@ decodeDeltas4Simd(const unsigned char* buffer, unsigned char* transposed, size_t
 		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
 		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
 
+#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32))
+		// instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens dependency between loop iterations
+		pi = rebase<Channel>(npi, r0, r1, r2, r3);
+#else
+		(void)npi;
+#endif
+
 #undef UNZR
 #undef TEMP
 #undef PREP
@@ -1724,13 +1760,12 @@ size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size
 			    double(vsk.ctrl[2]) / double(total_ctrl) * 100, double(vsk.ctrl[3]) / double(total_ctrl) * 100);
 		}
 
-#if TRACE > 1
-		printf(" | bitc [%3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%%]",
-		    double(vsk.bitc[0]) / double(vertex_count) * 100, double(vsk.bitc[1]) / double(vertex_count) * 100,
-		    double(vsk.bitc[2]) / double(vertex_count) * 100, double(vsk.bitc[3]) / double(vertex_count) * 100,
-		    double(vsk.bitc[4]) / double(vertex_count) * 100, double(vsk.bitc[5]) / double(vertex_count) * 100,
-		    double(vsk.bitc[6]) / double(vertex_count) * 100, double(vsk.bitc[7]) / double(vertex_count) * 100);
-#endif
+		if (level >= 3)
+			printf(" | bitc [%3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%%]",
+			    double(vsk.bitc[0]) / double(vertex_count) * 100, double(vsk.bitc[1]) / double(vertex_count) * 100,
+			    double(vsk.bitc[2]) / double(vertex_count) * 100, double(vsk.bitc[3]) / double(vertex_count) * 100,
+			    double(vsk.bitc[4]) / double(vertex_count) * 100, double(vsk.bitc[5]) / double(vertex_count) * 100,
+			    double(vsk.bitc[6]) / double(vertex_count) * 100, double(vsk.bitc[7]) / double(vertex_count) * 100);
 
 		printf("\n");
 	}
@@ -1768,8 +1803,7 @@ size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
 
 void meshopt_encodeVertexVersion(int version)
 {
-	// note: this version is experimental and the binary format is not finalized; this should not be used in production!
-	assert(unsigned(version) <= 0 || version == 0xe);
+	assert(unsigned(version) <= 1);
 
 	meshopt::gEncodeVertexVersion = version;
 }
@@ -1810,7 +1844,7 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 		return -1;
 
 	int version = data_header & 0x0f;
-	if (version > 0 && version != 0xe)
+	if (version > 1)
 		return -1;
 
 	size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
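Note (not part of the patch): with the decoder change above, format versions 0 and 1 are accepted and anything newer is rejected, so callers can detect data produced by a future encoder and fall back. A minimal sketch of the same gate; it only interprets the low nibble of the first header byte, mirroring the hunk in meshopt_decodeVertexBuffer:

```cpp
#include <stddef.h>

// Returns the format version (0 or 1) on success, -1 for an unsupported version,
// -2 for a buffer too small to carry a header byte.
static int checkVertexFormatVersion(const unsigned char* buffer, size_t buffer_size)
{
	if (buffer_size == 0)
		return -2;

	int version = buffer[0] & 0x0f;

	return (version > 1) ? -1 : version;
}
```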