From fc70df09414fd27686712c6ca3de4beec9621a03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=91=D1=80=D0=B0=D0=BD=D0=B8=D0=BC=D0=B8=D1=80=20=D0=9A?= =?UTF-8?q?=D0=B0=D1=80=D0=B0=D1=9F=D0=B8=D1=9B?= Date: Sat, 28 Dec 2024 22:37:47 -0800 Subject: [PATCH] Updated meshoptimizer. --- 3rdparty/meshoptimizer/src/clusterizer.cpp | 45 +- 3rdparty/meshoptimizer/src/meshoptimizer.h | 28 +- 3rdparty/meshoptimizer/src/simplifier.cpp | 28 +- 3rdparty/meshoptimizer/src/vertexcodec.cpp | 978 ++++++++++++++++----- 4 files changed, 818 insertions(+), 261 deletions(-) diff --git a/3rdparty/meshoptimizer/src/clusterizer.cpp b/3rdparty/meshoptimizer/src/clusterizer.cpp index 52fe5a362..738add5f2 100644 --- a/3rdparty/meshoptimizer/src/clusterizer.cpp +++ b/3rdparty/meshoptimizer/src/clusterizer.cpp @@ -238,7 +238,7 @@ static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int bool result = false; - unsigned int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff); + int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff); if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles) { @@ -283,10 +283,10 @@ static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int return result; } -static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone* meshlet_cone, unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const unsigned char* used, float meshlet_expected_radius, float cone_weight, unsigned int* out_extra) +static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone* meshlet_cone, unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const unsigned char* used, float meshlet_expected_radius, float cone_weight) { unsigned int best_triangle = ~0u; - unsigned int best_extra = 5; + int best_priority = 5; float best_score = FLT_MAX; for (size_t i = 0; i < meshlet.vertex_count; ++i) @@ -301,20 +301,26 @@ static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Co unsigned int triangle = neighbors[j]; unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2]; - unsigned int extra = (used[a] == 0xff) + (used[b] == 0xff) + (used[c] == 0xff); + int extra = (used[a] == 0xff) + (used[b] == 0xff) + (used[c] == 0xff); + assert(extra <= 2); + + int priority = -1; // triangles that don't add new vertices to meshlets are max. 
priority - if (extra != 0) - { - // artificially increase the priority of dangling triangles as they're expensive to add to new meshlets - if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1) - extra = 0; - - extra++; - } + if (extra == 0) + priority = 0; + // artificially increase the priority of dangling triangles as they're expensive to add to new meshlets + else if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1) + priority = 1; + // if two vertices have live count of 2, removing this triangle will make another triangle dangling which is good for overall flow + else if ((live_triangles[a] == 2) + (live_triangles[b] == 2) + (live_triangles[c] == 2) >= 2) + priority = 1 + extra; + // otherwise adjust priority to be after the above cases, 3 or 4 based on used[] count + else + priority = 2 + extra; // since topology-based priority is always more important than the score, we can skip scoring in some cases - if (extra > best_extra) + if (priority > best_priority) continue; float score = 0; @@ -341,18 +347,15 @@ static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Co // note that topology-based priority is always more important than the score // this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost - if (extra < best_extra || score < best_score) + if (priority < best_priority || score < best_score) { best_triangle = triangle; - best_extra = extra; + best_priority = priority; best_score = score; } } } - if (out_extra) - *out_extra = best_extra; - return best_triangle; } @@ -588,13 +591,13 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve { Cone meshlet_cone = getMeshletCone(meshlet_cone_acc, meshlet.triangle_count); - unsigned int best_extra = 0; - unsigned int best_triangle = getNeighborTriangle(meshlet, &meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight, &best_extra); + unsigned int best_triangle = getNeighborTriangle(meshlet, &meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight); + int best_extra = best_triangle == ~0u ? 
-1 : (used[indices[best_triangle * 3 + 0]] == 0xff) + (used[indices[best_triangle * 3 + 1]] == 0xff) + (used[indices[best_triangle * 3 + 2]] == 0xff);

 		// if the best triangle doesn't fit into current meshlet, the spatial scoring we've used is not very meaningful, so we re-select using topological scoring
 		if (best_triangle != ~0u && (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles))
 		{
-			best_triangle = getNeighborTriangle(meshlet, NULL, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, 0.f, NULL);
+			best_triangle = getNeighborTriangle(meshlet, NULL, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, 0.f);
 		}

 		// when we run out of neighboring triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity
diff --git a/3rdparty/meshoptimizer/src/meshoptimizer.h b/3rdparty/meshoptimizer/src/meshoptimizer.h
index abf398931..6243947cf 100644
--- a/3rdparty/meshoptimizer/src/meshoptimizer.h
+++ b/3rdparty/meshoptimizer/src/meshoptimizer.h
@@ -1,5 +1,5 @@
 /**
- * meshoptimizer - version 0.21
+ * meshoptimizer - version 0.22
  *
  * Copyright (C) 2016-2024, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
  * Report bugs and download new versions at https://github.com/zeux/meshoptimizer
@@ -12,7 +12,7 @@
 #include <stddef.h>

 /* Version macro; major * 1000 + minor * 10 + patch */
-#define MESHOPTIMIZER_VERSION 210 /* 0.21 */
+#define MESHOPTIMIZER_VERSION 220 /* 0.22 */

 /* If no API is defined, assume default */
 #ifndef MESHOPTIMIZER_API
@@ -277,6 +277,16 @@ MESHOPTIMIZER_API int meshopt_decodeIndexSequence(void* destination, size_t inde
 MESHOPTIMIZER_API size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size);
 MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size);

+/**
+ * Experimental: Vertex buffer encoder
+ * Encodes vertex data just like meshopt_encodeVertexBuffer, but allows overriding the compression level.
+ * For the compression level to take effect, the vertex encoding version must be set to 1 via meshopt_encodeVertexVersion.
+ * The default compression level implied by meshopt_encodeVertexBuffer is 2.
+ *
+ * level should be in the range [0, 3] with 0 being the fastest and 3 being the slowest and producing the best compression ratio.
+ */
+MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level);
+
 /**
  * Set vertex encoder format version
  * version must specify the data format version to encode; valid values are 0 (decodable by all library versions)
@@ -306,9 +316,9 @@ MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t verte
  * meshopt_decodeFilterExp decodes exponential encoding of floating-point data with 8-bit exponent and 24-bit integer mantissa as 2^E*M.
  * Each 32-bit component is decoded in isolation; stride must be divisible by 4.
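 *
 * Usage sketch (illustrative names, not part of this header; assumes the stream holds
 * octahedral-filtered normals produced with meshopt_encodeFilterOct at 4 bytes per vertex):
 * decode the vertex stream first, then apply the matching filter in place:
 *
 *   meshopt_decodeVertexBuffer(normals, normal_count, 4, encoded, encoded_size);
 *   meshopt_decodeFilterOct(normals, normal_count, 4);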
*/ -MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride); -MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride); -MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride); +MESHOPTIMIZER_API void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride); +MESHOPTIMIZER_API void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride); +MESHOPTIMIZER_API void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride); /** * Vertex buffer filter encoders @@ -334,13 +344,13 @@ enum meshopt_EncodeExpMode meshopt_EncodeExpSharedVector, /* When encoding exponents, use shared value for each component of all vectors (best compression) */ meshopt_EncodeExpSharedComponent, - /* When encoding exponents, use separate values for each component, but clamp to 0 (good quality if very small values are not important) */ + /* Experimental: When encoding exponents, use separate values for each component, but clamp to 0 (good quality if very small values are not important) */ meshopt_EncodeExpClamped, }; -MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data); -MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data); -MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode); +MESHOPTIMIZER_API void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data); +MESHOPTIMIZER_API void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data); +MESHOPTIMIZER_API void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode); /** * Simplification options diff --git a/3rdparty/meshoptimizer/src/simplifier.cpp b/3rdparty/meshoptimizer/src/simplifier.cpp index af64cbda4..d464fc607 100644 --- a/3rdparty/meshoptimizer/src/simplifier.cpp +++ b/3rdparty/meshoptimizer/src/simplifier.cpp @@ -1026,7 +1026,7 @@ static size_t pickEdgeCollapses(Collapse* collapses, size_t collapse_capacity, c return collapse_count; } -static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const Vector3* vertex_positions, const float* vertex_attributes, const Quadric* vertex_quadrics, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap) +static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const Vector3* vertex_positions, const float* vertex_attributes, const Quadric* vertex_quadrics, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback) { for (size_t i = 0; i < collapse_count; ++i) { @@ -1041,7 +1041,7 @@ static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const unsigned int j1 = c.bidi ? i0 : i1; float ei = quadricError(vertex_quadrics[remap[i0]], vertex_positions[i1]); - float ej = quadricError(vertex_quadrics[remap[j0]], vertex_positions[j1]); + float ej = c.bidi ? 
quadricError(vertex_quadrics[remap[j0]], vertex_positions[j1]) : FLT_MAX;

 #if TRACE >= 3
 		float di = ei, dj = ej;
 #endif

 		if (attribute_count)
 		{
-			// note: ideally we would evaluate max/avg of attribute errors for seam edges, but it's not clear if it's worth the extra cost
 			ei += quadricError(attribute_quadrics[i0], &attribute_gradients[i0 * attribute_count], attribute_count, vertex_positions[i1], &vertex_attributes[i1 * attribute_count]);
-			ej += quadricError(attribute_quadrics[j0], &attribute_gradients[j0 * attribute_count], attribute_count, vertex_positions[j1], &vertex_attributes[j1 * attribute_count]);
+			ej += c.bidi ? quadricError(attribute_quadrics[j0], &attribute_gradients[j0 * attribute_count], attribute_count, vertex_positions[j1], &vertex_attributes[j1 * attribute_count]) : 0;
+
+			// note: seam edges need to aggregate attribute errors between primary and secondary edges, as attribute quadrics are separate
+			if (vertex_kind[i0] == Kind_Seam)
+			{
+				// for seam collapses we need to find the seam pair; this is a bit tricky since we need to rely on edge loops as target vertex may be locked (and thus have more than two wedges)
+				unsigned int s0 = wedge[i0];
+				unsigned int s1 = loop[i0] == i1 ? loopback[s0] : loop[s0];
+
+				assert(s0 != i0 && wedge[s0] == i0);
+				assert(s1 != ~0u && remap[s1] == remap[i1]);
+
+				// note: this should never happen due to the assertion above, but if assertions are disabled and we ever hit this case we'll get a memory safety issue; for now play it safe
+				s1 = (s1 != ~0u) ? s1 : wedge[i1];
+
+				ei += quadricError(attribute_quadrics[s0], &attribute_gradients[s0 * attribute_count], attribute_count, vertex_positions[s1], &vertex_attributes[s1 * attribute_count]);
+				ej += c.bidi ? quadricError(attribute_quadrics[s1], &attribute_gradients[s1 * attribute_count], attribute_count, vertex_positions[s0], &vertex_attributes[s0 * attribute_count]) : 0;
+			}
 		}

 		// pick edge direction with minimal error
@@ -1206,7 +1222,7 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
 		}
 		else if (kind == Kind_Seam)
 		{
-			// for seam collapses we need to move the seam pair together; this is a bit tricky to compute since we need to rely on edge loops as target vertex may be locked (and thus have more than two wedges)
+			// for seam collapses we need to move the seam pair together; this is a bit tricky since we need to rely on edge loops as target vertex may be locked (and thus have more than two wedges)
 			unsigned int s0 = wedge[i0];
 			unsigned int s1 = loop[i0] == i1 ? loopback[s0] : loop[s0];
 			assert(s0 != i0 && wedge[s0] == i0);
@@ -1964,7 +1980,7 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
 	printf("pass %d:%c", int(pass_count++), TRACE >= 2 ? '\n' : ' ');
 #endif

-	rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_attributes, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, remap);
+	rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_attributes, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, vertex_kind, loop, loopback);

 	sortEdgeCollapses(collapse_order, edge_collapses, edge_collapse_count);
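The vertexcodec.cpp changes below replace the byte-only zigzag8/unzigzag8 helpers with width-templated zigzag/unzigzag plus a 32-bit rotate, so per-channel deltas can be folded at 8-, 16-, or 32-bit width. As a standalone sketch of the mapping these helpers implement (illustrative code, not part of the patch): zigzag folds signed deltas into small unsigned values so near-zero deltas encode into few bits.

#include <assert.h>

/* 8-bit instance of the same construction as the patch's templated zigzag/unzigzag */
static unsigned char zigzag8(unsigned char v)
{
	return (unsigned char)((0 - (v >> 7)) ^ (v << 1));
}

static unsigned char unzigzag8(unsigned char v)
{
	return (unsigned char)((0 - (v & 1)) ^ (v >> 1));
}

int main(void)
{
	/* deltas -1, +1, -2, +2 map to 1, 2, 3, 4; the round-trip is exact */
	assert(zigzag8((unsigned char)-1) == 1 && zigzag8(1) == 2);
	assert(zigzag8((unsigned char)-2) == 3 && zigzag8(2) == 4);

	for (int i = 0; i < 256; ++i)
		assert(unzigzag8(zigzag8((unsigned char)i)) == (unsigned char)i);

	return 0;
}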
diff --git a/3rdparty/meshoptimizer/src/vertexcodec.cpp b/3rdparty/meshoptimizer/src/vertexcodec.cpp
index 1dbd2e35f..d3fc7bb16 100644
--- a/3rdparty/meshoptimizer/src/vertexcodec.cpp
+++ b/3rdparty/meshoptimizer/src/vertexcodec.cpp
@@ -60,6 +60,15 @@
 #define SIMD_LATENCYOPT
 #endif

+// In switch dispatch, marking the default case as unreachable allows removing redundant bounds checks
+#if defined(__GNUC__)
+#define SIMD_UNREACHABLE() __builtin_unreachable()
+#elif defined(_MSC_VER)
+#define SIMD_UNREACHABLE() __assume(false)
+#else
+#define SIMD_UNREACHABLE() assert(!"Unreachable")
+#endif
+
 #endif // !MESHOPTIMIZER_NO_SIMD

 #ifdef SIMD_SSE
@@ -119,7 +128,13 @@
 const size_t kVertexBlockSizeBytes = 8192;
 const size_t kVertexBlockMaxSize = 256;
 const size_t kByteGroupSize = 16;
 const size_t kByteGroupDecodeLimit = 24;
-const size_t kTailMaxSize = 32;
+const size_t kTailMinSizeV0 = 32;
+const size_t kTailMinSizeV1 = 24;
+
+static const int kBitsV0[4] = {0, 2, 4, 8};
+static const int kBitsV1[5] = {0, 1, 2, 4, 8};
+
+const int kEncodeDefaultLevel = 2;

 static size_t getVertexBlockSize(size_t vertex_size)
 {
@@ -133,14 +148,21 @@ static size_t getVertexBlockSize(size_t vertex_size)
 	return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
 }

-inline unsigned char zigzag8(unsigned char v)
+inline unsigned int rotate(unsigned int v, int r)
 {
-	return ((signed char)(v) >> 7) ^ (v << 1);
+	return (v << r) | (v >> ((32 - r) & 31));
 }

-inline unsigned char unzigzag8(unsigned char v)
+template <typename T>
+inline T zigzag(T v)
 {
-	return -(v & 1) ^ (v >> 1);
+	return (0 - (v >> (sizeof(T) * 8 - 1))) ^ (v << 1);
+}
+
+template <typename T>
+inline T unzigzag(T v)
+{
+	return (0 - (v & 1)) ^ (v >> 1);
 }

 #if TRACE
@@ -148,17 +170,18 @@ struct Stats
 {
 	size_t size;
 	size_t header;  // bytes for header
-	size_t bitg[4]; // bytes for bit groups
+	size_t bitg[9]; // bytes for bit groups
 	size_t bitc[8]; // bit consistency: how many bits are shared between all bytes in a group
+	size_t ctrl[4]; // number of control groups
 };

 static Stats* bytestats = NULL;
 static Stats vertexstats[256];
 #endif

-static bool encodeBytesGroupZero(const unsigned char* buffer)
+static bool canEncodeZero(const unsigned char* buffer, size_t buffer_size)
 {
-	for (size_t i = 0; i < kByteGroupSize; ++i)
+	for (size_t i = 0; i < buffer_size; ++i)
 		if (buffer[i])
 			return false;

@@ -167,10 +190,10 @@
 	return true;
 }

 static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
 {
-	assert(bits >= 1 && bits <= 8);
+	assert(bits >= 0 && bits <= 8);

-	if (bits == 1)
-		return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);
+	if (bits == 0)
+		return canEncodeZero(buffer, kByteGroupSize) ?
0 : size_t(-1); if (bits == 8) return kByteGroupSize; @@ -187,9 +210,10 @@ static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits) static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits) { - assert(bits >= 1 && bits <= 8); + assert(bits >= 0 && bits <= 8); + assert(kByteGroupSize % 8 == 0); - if (bits == 1) + if (bits == 0) return data; if (bits == 8) @@ -217,21 +241,27 @@ static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* byte |= enc; } + // encode 1-bit groups in reverse bit order + // this makes them faster to decode alongside other groups + if (bits == 1) + byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32); + *data++ = byte; } for (size_t i = 0; i < kByteGroupSize; ++i) { - if (buffer[i] >= sentinel) - { - *data++ = buffer[i]; - } + unsigned char v = buffer[i]; + + // branchless append of out-of-range values + *data = v; + data += v >= sentinel; } return data; } -static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size) +static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size, const int bits[4]) { assert(buffer_size % kByteGroupSize == 0); @@ -247,39 +277,40 @@ static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, memset(header, 0, header_size); + int last_bits = -1; + for (size_t i = 0; i < buffer_size; i += kByteGroupSize) { if (size_t(data_end - data) < kByteGroupDecodeLimit) return NULL; - int best_bits = 8; - size_t best_size = encodeBytesGroupMeasure(buffer + i, 8); + int best_bitk = 3; + size_t best_size = encodeBytesGroupMeasure(buffer + i, bits[best_bitk]); - for (int bits = 1; bits < 8; bits *= 2) + for (int bitk = 0; bitk < 3; ++bitk) { - size_t size = encodeBytesGroupMeasure(buffer + i, bits); + size_t size = encodeBytesGroupMeasure(buffer + i, bits[bitk]); - if (size < best_size) + // favor consistent bit selection across groups, but never replace literals + if (size < best_size || (size == best_size && bits[bitk] == last_bits && bits[best_bitk] != 8)) { - best_bits = bits; + best_bitk = bitk; best_size = size; } } - int bitslog2 = (best_bits == 1) ? 0 : (best_bits == 2 ? 1 : (best_bits == 4 ? 
2 : 3));
-		assert((1 << bitslog2) == best_bits);
-
 		size_t header_offset = i / kByteGroupSize;
+		header[header_offset / 4] |= best_bitk << ((header_offset % 4) * 2);

-		header[header_offset / 4] |= bitslog2 << ((header_offset % 4) * 2);
-
+		int best_bits = bits[best_bitk];
 		unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits);
 		assert(data + best_size == next);
 		data = next;
+		last_bits = best_bits;

 #if TRACE
-		bytestats->bitg[bitslog2] += best_size;
+		bytestats->bitg[best_bits] += best_size;
 #endif
 	}
@@ -290,30 +321,203 @@ static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end,
 	return data;
 }

-static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+template <typename T, bool Xor>
+static void encodeDeltas1(unsigned char* buffer, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, const unsigned char last_vertex[256], size_t k, int rot)
+{
+	size_t k0 = k & ~(sizeof(T) - 1);
+	int ks = (k & (sizeof(T) - 1)) * 8;
+
+	T p = last_vertex[k0];
+	for (size_t j = 1; j < sizeof(T); ++j)
+		p |= T(last_vertex[k0 + j]) << (j * 8);
+
+	const unsigned char* vertex = vertex_data + k0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		T v = vertex[0];
+		for (size_t j = 1; j < sizeof(T); ++j)
+			v |= vertex[j] << (j * 8);
+
+		T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p));
+
+		buffer[i] = (unsigned char)(d >> ks);
+		p = v;
+		vertex += vertex_size;
+	}
+}
+
+static void encodeDeltas(unsigned char* buffer, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, const unsigned char last_vertex[256], size_t k, int channel)
+{
+	switch (channel & 3)
+	{
+	case 0:
+		return encodeDeltas1<unsigned char, false>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0);
+	case 1:
+		return encodeDeltas1<unsigned short, false>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0);
+	case 2:
+		return encodeDeltas1<unsigned int, true>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, channel >> 4);
+	default:
+		assert(!"Unsupported channel encoding"); // unreachable
+	}
+}
+
+static int estimateBits(unsigned char v)
+{
+	return v <= 15 ? (v <= 3 ? (v == 0 ? 0 : 2) : 4) : 8;
+}
+
+static int estimateRotate(const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, size_t k, size_t group_size)
+{
+	size_t sizes[8] = {};
+
+	const unsigned char* vertex = vertex_data + k;
+	unsigned int last = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24);
+
+	for (size_t i = 0; i < vertex_count; i += group_size)
+	{
+		unsigned int bitg = 0;
+
+		// calculate bit consistency mask for the group
+		for (size_t j = 0; j < group_size && i + j < vertex_count; ++j)
+		{
+			unsigned int v = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24);
+			unsigned int d = v ^ last;
+
+			bitg |= d;
+			last = v;
+			vertex += vertex_size;
+		}
+
+		for (int j = 0; j < 8; ++j)
+		{
+			unsigned int bitr = rotate(bitg, j);
+
+			sizes[j] += estimateBits((unsigned char)(bitr >> 0)) + estimateBits((unsigned char)(bitr >> 8));
+			sizes[j] += estimateBits((unsigned char)(bitr >> 16)) + estimateBits((unsigned char)(bitr >> 24));
+		}
+	}
+
+	int best_rot = 0;
+	for (int rot = 1; rot < 8; ++rot)
+		best_rot = (sizes[rot] < sizes[best_rot]) ?
rot : best_rot; + + return best_rot; +} + +static int estimateChannel(const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, size_t k, size_t vertex_block_size, size_t block_skip, int max_channel, int xor_rot) +{ + unsigned char block[kVertexBlockMaxSize]; + assert(vertex_block_size <= kVertexBlockMaxSize); + + unsigned char last_vertex[256] = {}; + + size_t sizes[3] = {}; + assert(max_channel <= 3); + + for (size_t i = 0; i < vertex_count; i += vertex_block_size * block_skip) + { + size_t block_size = i + vertex_block_size < vertex_count ? vertex_block_size : vertex_count - i; + size_t block_size_aligned = (block_size + kByteGroupSize - 1) & ~(kByteGroupSize - 1); + + memcpy(last_vertex, vertex_data + (i == 0 ? 0 : i - 1) * vertex_size, vertex_size); + + // we sometimes encode elements we didn't fill when rounding to kByteGroupSize + if (block_size < block_size_aligned) + memset(block + block_size, 0, block_size_aligned - block_size); + + for (int channel = 0; channel < max_channel; ++channel) + for (size_t j = 0; j < 4; ++j) + { + encodeDeltas(block, vertex_data + i * vertex_size, block_size, vertex_size, last_vertex, k + j, channel | (xor_rot << 4)); + + for (size_t ig = 0; ig < block_size; ig += kByteGroupSize) + { + // to maximize encoding performance we only evaluate 1/2/4/8 bit groups + size_t size1 = encodeBytesGroupMeasure(block + ig, 1); + size_t size2 = encodeBytesGroupMeasure(block + ig, 2); + size_t size4 = encodeBytesGroupMeasure(block + ig, 4); + size_t size8 = encodeBytesGroupMeasure(block + ig, 8); + + size_t best_size = size1 < size2 ? size1 : size2; + best_size = best_size < size4 ? best_size : size4; + best_size = best_size < size8 ? best_size : size8; + + sizes[channel] += best_size; + } + } + } + + int best_channel = 0; + for (int channel = 1; channel < max_channel; ++channel) + best_channel = (sizes[channel] < sizes[best_channel]) ? channel : best_channel; + + return best_channel == 2 ? best_channel | (xor_rot << 4) : best_channel; +} + +static int estimateControl(const unsigned char* buffer, size_t vertex_count, size_t vertex_count_aligned, int level) +{ + if (canEncodeZero(buffer, vertex_count)) + return 2; // zero encoding + + if (level == 0) + return 1; // 1248 encoding in level 0 for encoding speed + + // round number of groups to 4 to get number of header bytes + size_t header_size = (vertex_count_aligned / kByteGroupSize + 3) / 4; + + size_t est_bytes0 = header_size, est_bytes1 = header_size; + + for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize) + { + // assumes kBitsV1[] = {0, 1, 2, 4, 8} for performance + size_t size0 = encodeBytesGroupMeasure(buffer + i, 0); + size_t size1 = encodeBytesGroupMeasure(buffer + i, 1); + size_t size2 = encodeBytesGroupMeasure(buffer + i, 2); + size_t size4 = encodeBytesGroupMeasure(buffer + i, 4); + size_t size8 = encodeBytesGroupMeasure(buffer + i, 8); + + // both control modes have access to 1/2/4 bit encoding + size_t size12 = size1 < size2 ? size1 : size2; + size_t size124 = size12 < size4 ? size12 : size4; + + // each control mode has access to 0/8 bit encoding respectively + est_bytes0 += size124 < size0 ? size124 : size0; + est_bytes1 += size124 < size8 ? size124 : size8; + } + + // pick shortest control entry but prefer literal encoding + if (est_bytes0 < vertex_count || est_bytes1 < vertex_count) + return est_bytes0 < est_bytes1 ? 
0 : 1; + else + return 3; // literal encoding +} + +static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version, int level) { assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize); + assert(vertex_size % 4 == 0); unsigned char buffer[kVertexBlockMaxSize]; assert(sizeof(buffer) % kByteGroupSize == 0); + size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1); + // we sometimes encode elements we didn't fill when rounding to kByteGroupSize memset(buffer, 0, sizeof(buffer)); + size_t control_size = version == 0 ? 0 : vertex_size / 4; + if (size_t(data_end - data) < control_size) + return NULL; + + unsigned char* control = data; + data += control_size; + + memset(control, 0, control_size); + for (size_t k = 0; k < vertex_size; ++k) { - size_t vertex_offset = k; - - unsigned char p = last_vertex[k]; - - for (size_t i = 0; i < vertex_count; ++i) - { - buffer[i] = zigzag8(vertex_data[vertex_offset] - p); - - p = vertex_data[vertex_offset]; - - vertex_offset += vertex_size; - } + encodeDeltas(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, version == 0 ? 0 : channels[k / 4]); #if TRACE const unsigned char* olddata = data; @@ -332,9 +536,35 @@ static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data } #endif - data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1)); - if (!data) - return NULL; + int ctrl = 0; + + if (version != 0) + { + ctrl = estimateControl(buffer, vertex_count, vertex_count_aligned, level); + + assert(unsigned(ctrl) < 4); + control[k / 4] |= ctrl << ((k % 4) * 2); + +#if TRACE + vertexstats[k].ctrl[ctrl]++; +#endif + } + + if (ctrl == 3) + { + // literal encoding + if (size_t(data_end - data) < vertex_count) + return NULL; + + memcpy(data, buffer, vertex_count); + data += vertex_count; + } + else if (ctrl != 2) // non-zero encoding + { + data = encodeBytes(data, data_end, buffer, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl); + if (!data) + return NULL; + } #if TRACE bytestats = NULL; @@ -348,7 +578,7 @@ static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data } #if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX) && !defined(SIMD_WASM)) -static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2) +static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bits) { #define READ() byte = *data++ #define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = (enc == (1 << bits) - 1) ? 
encv : enc, data_var += (enc == (1 << bits) - 1) @@ -356,12 +586,24 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned unsigned char byte, enc, encv; const unsigned char* data_var; - switch (bitslog2) + switch (bits) { case 0: memset(buffer, 0, kByteGroupSize); return data; case 1: + data_var = data + 2; + + // 2 groups with 8 1-bit values in each byte (reversed from the order in other groups) + READ(); + byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32); + NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1); + READ(); + byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32); + NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1); + + return data_var; + case 2: data_var = data + 4; // 4 groups with 4 2-bit values in each byte @@ -371,7 +613,7 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2); return data_var; - case 2: + case 4: data_var = data + 8; // 8 groups with 2 4-bit values in each byte @@ -385,11 +627,11 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned READ(), NEXT(4), NEXT(4); return data_var; - case 3: + case 8: memcpy(buffer, data, kByteGroupSize); return data + kByteGroupSize; default: - assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value + assert(!"Unexpected bit length"); // unreachable return data; } @@ -397,18 +639,16 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned #undef NEXT } -static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size) +static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size, const int* bits) { assert(buffer_size % kByteGroupSize == 0); - const unsigned char* header = data; - // round number of groups to 4 to get number of header bytes size_t header_size = (buffer_size / kByteGroupSize + 3) / 4; - if (size_t(data_end - data) < header_size) return NULL; + const unsigned char* header = data; data += header_size; for (size_t i = 0; i < buffer_size; i += kByteGroupSize) @@ -417,43 +657,108 @@ static const unsigned char* decodeBytes(const unsigned char* data, const unsigne return NULL; size_t header_offset = i / kByteGroupSize; + int bitsk = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3; - int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3; - - data = decodeBytesGroup(data, buffer + i, bitslog2); + data = decodeBytesGroup(data, buffer + i, bits[bitsk]); } return data; } -static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256]) +template +static void decodeDeltas1(const unsigned char* buffer, unsigned char* transposed, size_t vertex_count, size_t vertex_size, const unsigned char* last_vertex, int rot) +{ + for (size_t k = 0; k < 4; k += sizeof(T)) + { + size_t vertex_offset = k; + + T p = last_vertex[0]; + for (size_t j = 1; j < sizeof(T); ++j) + p |= last_vertex[j] << (8 * j); + + for (size_t i = 0; i < vertex_count; ++i) + { + T v = buffer[i]; + for (size_t j = 1; j < sizeof(T); ++j) + v |= buffer[i + vertex_count * j] << (8 * j); + + v = Xor ? 
T(rotate(v, rot)) ^ p : unzigzag(v) + p; + + for (size_t j = 0; j < sizeof(T); ++j) + transposed[vertex_offset + j] = (unsigned char)(v >> (j * 8)); + + p = v; + + vertex_offset += vertex_size; + } + + buffer += vertex_count * sizeof(T); + last_vertex += sizeof(T); + } +} + +static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version) { assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize); - unsigned char buffer[kVertexBlockMaxSize]; + unsigned char buffer[kVertexBlockMaxSize * 4]; unsigned char transposed[kVertexBlockSizeBytes]; size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1); assert(vertex_count <= vertex_count_aligned); - for (size_t k = 0; k < vertex_size; ++k) + size_t control_size = version == 0 ? 0 : vertex_size / 4; + if (size_t(data_end - data) < control_size) + return NULL; + + const unsigned char* control = data; + data += control_size; + + for (size_t k = 0; k < vertex_size; k += 4) { - data = decodeBytes(data, data_end, buffer, vertex_count_aligned); - if (!data) - return NULL; + unsigned char ctrl_byte = version == 0 ? 0 : control[k / 4]; - size_t vertex_offset = k; - - unsigned char p = last_vertex[k]; - - for (size_t i = 0; i < vertex_count; ++i) + for (size_t j = 0; j < 4; ++j) { - unsigned char v = unzigzag8(buffer[i]) + p; + int ctrl = (ctrl_byte >> (j * 2)) & 3; - transposed[vertex_offset] = v; - p = v; + if (ctrl == 3) + { + // literal encoding + if (size_t(data_end - data) < vertex_count) + return NULL; - vertex_offset += vertex_size; + memcpy(buffer + j * vertex_count, data, vertex_count); + data += vertex_count; + } + else if (ctrl == 2) + { + // zero encoding + memset(buffer + j * vertex_count, 0, vertex_count); + } + else + { + data = decodeBytes(data, data_end, buffer + j * vertex_count, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl); + if (!data) + return NULL; + } + } + + int channel = version == 0 ? 
0 : channels[k / 4]; + + switch (channel & 3) + { + case 0: + decodeDeltas1(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, 0); + break; + case 1: + decodeDeltas1(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, 0); + break; + case 2: + decodeDeltas1(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, (32 - (channel >> 4)) & 31); + break; + default: + return NULL; // invalid channel type } } @@ -499,7 +804,7 @@ static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables(); #ifdef SIMD_SSE SIMD_TARGET -static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1) +inline __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1) { __m128i sm0 = _mm_loadl_epi64(reinterpret_cast(&kDecodeBytesGroupShuffle[mask0])); __m128i sm1 = _mm_loadl_epi64(reinterpret_cast(&kDecodeBytesGroupShuffle[mask1])); @@ -511,11 +816,12 @@ static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1) } SIMD_TARGET -static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) +inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits) { - switch (bitslog2) + switch (hbits) { case 0: + case 4: { __m128i result = _mm_setzero_si128(); @@ -525,6 +831,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 1: + case 6: { #ifdef __GNUC__ typedef int __attribute__((aligned(1))) unaligned_int; @@ -557,7 +864,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi unsigned char mask1 = (unsigned char)(mask16 >> 8); __m128i shuf = decodeShuffleMask(mask0, mask1); - __m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel)); _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); @@ -570,6 +876,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 2: + case 7: { #ifdef SIMD_LATENCYOPT unsigned long long data64; @@ -593,7 +900,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi unsigned char mask1 = (unsigned char)(mask16 >> 8); __m128i shuf = decodeShuffleMask(mask0, mask1); - __m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel)); _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); @@ -606,6 +912,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 3: + case 8: { __m128i result = _mm_loadu_si128(reinterpret_cast(data)); @@ -614,26 +921,46 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi return data + 16; } + case 5: + { + __m128i rest = _mm_loadu_si128(reinterpret_cast(data + 2)); + + unsigned char mask0 = data[0]; + unsigned char mask1 = data[1]; + + __m128i shuf = decodeShuffleMask(mask0, mask1); + __m128i result = _mm_shuffle_epi8(rest, shuf); + + _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); + + return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; + } + default: - assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value - return data; + SIMD_UNREACHABLE(); // unreachable } } #endif #ifdef SIMD_AVX -static const __m128i decodeBytesGroupConfig[] = { - _mm_set1_epi8(3), - _mm_set1_epi8(15), - _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24), - _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56), +static const __m128i 
kDecodeBytesGroupConfig[8][2] = { + {_mm_setzero_si128(), _mm_setzero_si128()}, + {_mm_set1_epi8(3), _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24)}, + {_mm_set1_epi8(15), _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56)}, + {_mm_setzero_si128(), _mm_setzero_si128()}, + {_mm_setzero_si128(), _mm_setzero_si128()}, + {_mm_set1_epi8(1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)}, + {_mm_set1_epi8(3), _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24)}, + {_mm_set1_epi8(15), _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56)}, }; -static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) +SIMD_TARGET +inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits) { - switch (bitslog2) + switch (hbits) { case 0: + case 4: { __m128i result = _mm_setzero_si128(); @@ -642,16 +969,19 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi return data; } - case 1: - case 2: + case 5: // 1-bit + case 1: // 2-bit + case 6: + case 2: // 4-bit + case 7: { - const unsigned char* skip = data + (bitslog2 << 2); + const unsigned char* skip = data + (2 << (hbits < 3 ? hbits : hbits - 5)); __m128i selb = _mm_loadl_epi64(reinterpret_cast(data)); __m128i rest = _mm_loadu_si128(reinterpret_cast(skip)); - __m128i sent = decodeBytesGroupConfig[bitslog2 - 1]; - __m128i ctrl = decodeBytesGroupConfig[bitslog2 + 1]; + __m128i sent = kDecodeBytesGroupConfig[hbits][0]; + __m128i ctrl = kDecodeBytesGroupConfig[hbits][1]; __m128i selw = _mm_shuffle_epi32(selb, 0x44); __m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw)); @@ -665,6 +995,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 3: + case 8: { __m128i result = _mm_loadu_si128(reinterpret_cast(data)); @@ -674,14 +1005,14 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } default: - assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value - return data; + SIMD_UNREACHABLE(); // unreachable } } #endif #ifdef SIMD_NEON -static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1) +SIMD_TARGET +inline uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1) { uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]); uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]); @@ -692,7 +1023,8 @@ static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8 return vcombine_u8(r0, r1); } -static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1) +SIMD_TARGET +inline void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1) { // magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00 const uint64_t magic = 0x000103070f1f3f80ull; @@ -703,11 +1035,13 @@ static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& m mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56); } -static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) +SIMD_TARGET +inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits) { - switch (bitslog2) + switch (hbits) { case 0: + case 4: { uint8x16_t result = vdupq_n_u8(0); @@ -717,6 
+1051,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 1: + case 6: { #ifdef SIMD_LATENCYOPT unsigned int data32; @@ -754,6 +1089,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 2: + case 7: { #ifdef SIMD_LATENCYOPT unsigned long long data64; @@ -788,6 +1124,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 3: + case 8: { uint8x16_t result = vld1q_u8(data); @@ -796,30 +1133,42 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi return data + 16; } + case 5: + { + unsigned char mask0 = data[0]; + unsigned char mask1 = data[1]; + + uint8x8_t rest0 = vld1_u8(data + 2); + uint8x8_t rest1 = vld1_u8(data + 2 + kDecodeBytesGroupCount[mask0]); + + uint8x16_t result = shuffleBytes(mask0, mask1, rest0, rest1); + + vst1q_u8(buffer, result); + + return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; + } + default: - assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value - return data; + SIMD_UNREACHABLE(); // unreachable } } #endif #ifdef SIMD_WASM SIMD_TARGET -static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1) +inline v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1) { v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]); v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]); - v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]); - sm1off = wasm_i8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - + v128_t sm1off = wasm_v128_load8_splat(&kDecodeBytesGroupCount[mask0]); v128_t sm1r = wasm_i8x16_add(sm1, sm1off); return wasmx_unpacklo_v64x2(sm0, sm1r); } SIMD_TARGET -static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1) +inline void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1) { // magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00 const uint64_t magic = 0x000103070f1f3f80ull; @@ -829,11 +1178,12 @@ static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1 } SIMD_TARGET -static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) +inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits) { - switch (bitslog2) + switch (hbits) { case 0: + case 4: { v128_t result = wasm_i8x16_splat(0); @@ -843,6 +1193,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 1: + case 6: { v128_t sel2 = wasm_v128_load(data); v128_t rest = wasm_v128_load(data + 4); @@ -857,7 +1208,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi wasmMoveMask(mask, mask0, mask1); v128_t shuf = decodeShuffleMask(mask0, mask1); - v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask); wasm_v128_store(buffer, result); @@ -866,6 +1216,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 2: + case 7: { v128_t sel4 = wasm_v128_load(data); v128_t rest = wasm_v128_load(data + 8); @@ -879,7 +1230,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi wasmMoveMask(mask, mask0, mask1); v128_t shuf = decodeShuffleMask(mask0, mask1); - v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask); wasm_v128_store(buffer, result); @@ -888,6 +1238,7 @@ static const 
unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 3: + case 8: { v128_t result = wasm_v128_load(data); @@ -896,16 +1247,30 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi return data + 16; } + case 5: + { + v128_t rest = wasm_v128_load(data + 2); + + unsigned char mask0 = data[0]; + unsigned char mask1 = data[1]; + + v128_t shuf = decodeShuffleMask(mask0, mask1); + v128_t result = wasm_i8x16_swizzle(rest, shuf); + + wasm_v128_store(buffer, result); + + return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; + } + default: - assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value - return data; + SIMD_UNREACHABLE(); // unreachable } } #endif #if defined(SIMD_SSE) || defined(SIMD_AVX) SIMD_TARGET -static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3) +inline void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3) { __m128i t0 = _mm_unpacklo_epi8(x0, x1); __m128i t1 = _mm_unpackhi_epi8(x0, x1); @@ -919,17 +1284,33 @@ static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3) } SIMD_TARGET -static __m128i unzigzag8(__m128i v) +inline __m128i unzigzag8(__m128i v) { __m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1))); __m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127)); return _mm_xor_si128(xl, xr); } + +SIMD_TARGET +inline __m128i unzigzag16(__m128i v) +{ + __m128i xl = _mm_sub_epi16(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi16(1))); + __m128i xr = _mm_srli_epi16(v, 1); + + return _mm_xor_si128(xl, xr); +} + +SIMD_TARGET +inline __m128i rotate32(__m128i v, int r) +{ + return _mm_or_si128(_mm_slli_epi32(v, r), _mm_srli_epi32(v, 32 - r)); +} #endif #ifdef SIMD_NEON -static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3) +SIMD_TARGET +inline void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3) { uint8x16x2_t t01 = vzipq_u8(x0, x1); uint8x16x2_t t23 = vzipq_u8(x2, x3); @@ -943,18 +1324,36 @@ static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_ x3 = vreinterpretq_u8_u16(x23.val[1]); } -static uint8x16_t unzigzag8(uint8x16_t v) +SIMD_TARGET +inline uint8x16_t unzigzag8(uint8x16_t v) { uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1))))); uint8x16_t xr = vshrq_n_u8(v, 1); return veorq_u8(xl, xr); } + +SIMD_TARGET +inline uint8x16_t unzigzag16(uint8x16_t v) +{ + uint16x8_t vv = vreinterpretq_u16_u8(v); + uint8x16_t xl = vreinterpretq_u8_s16(vnegq_s16(vreinterpretq_s16_u16(vandq_u16(vv, vdupq_n_u16(1))))); + uint8x16_t xr = vreinterpretq_u8_u16(vshrq_n_u16(vv, 1)); + + return veorq_u8(xl, xr); +} + +SIMD_TARGET +inline uint8x16_t rotate32(uint8x16_t v, int r) +{ + uint32x4_t v32 = vreinterpretq_u32_u8(v); + return vreinterpretq_u8_u32(vorrq_u32(vshlq_u32(v32, vdupq_n_s32(r)), vshlq_u32(v32, vdupq_n_s32(r - 32)))); +} #endif #ifdef SIMD_WASM SIMD_TARGET -static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3) +inline void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3) { v128_t t0 = wasmx_unpacklo_v8x16(x0, x1); v128_t t1 = wasmx_unpackhi_v8x16(x0, x1); @@ -968,44 +1367,57 @@ static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3) } SIMD_TARGET -static v128_t unzigzag8(v128_t v) +inline v128_t unzigzag8(v128_t v) { v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1))); v128_t xr = wasm_u8x16_shr(v, 1); return 
wasm_v128_xor(xl, xr); } + +SIMD_TARGET +inline v128_t unzigzag16(v128_t v) +{ + v128_t xl = wasm_i16x8_neg(wasm_v128_and(v, wasm_i16x8_splat(1))); + v128_t xr = wasm_u16x8_shr(v, 1); + + return wasm_v128_xor(xl, xr); +} + +SIMD_TARGET +inline v128_t rotate32(v128_t v, int r) +{ + return wasm_v128_or(wasm_i32x4_shl(v, r), wasm_i32x4_shr(v, 32 - r)); +} #endif #if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM) SIMD_TARGET -static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size) +static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size, int hshift) { assert(buffer_size % kByteGroupSize == 0); assert(kByteGroupSize == 16); - const unsigned char* header = data; - // round number of groups to 4 to get number of header bytes size_t header_size = (buffer_size / kByteGroupSize + 3) / 4; - if (size_t(data_end - data) < header_size) return NULL; + const unsigned char* header = data; data += header_size; size_t i = 0; - // fast-path: process 4 groups at a time, do a shared bounds check - each group reads <=24b + // fast-path: process 4 groups at a time, do a shared bounds check for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4) { size_t header_offset = i / kByteGroupSize; unsigned char header_byte = header[header_offset / 4]; - data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, (header_byte >> 0) & 3); - data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, (header_byte >> 2) & 3); - data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, (header_byte >> 4) & 3); - data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, (header_byte >> 6) & 3); + data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, hshift + ((header_byte >> 0) & 3)); + data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, hshift + ((header_byte >> 2) & 3)); + data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, hshift + ((header_byte >> 4) & 3)); + data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, hshift + ((header_byte >> 6) & 3)); } // slow-path: process remaining groups @@ -1015,17 +1427,94 @@ static const unsigned char* decodeBytesSimd(const unsigned char* data, const uns return NULL; size_t header_offset = i / kByteGroupSize; + unsigned char header_byte = header[header_offset / 4]; - int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3; - - data = decodeBytesGroupSimd(data, buffer + i, bitslog2); + data = decodeBytesGroupSimd(data, buffer + i, hshift + ((header_byte >> ((header_offset % 4) * 2)) & 3)); } return data; } +template +SIMD_TARGET static void +decodeDeltas4Simd(const unsigned char* buffer, unsigned char* transposed, size_t vertex_count_aligned, size_t vertex_size, unsigned char last_vertex[4], int rot) +{ +#if defined(SIMD_SSE) || defined(SIMD_AVX) +#define TEMP __m128i +#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast(last_vertex)) +#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) +#define GRP4(i) t0 = _mm_shuffle_epi32(r##i, 0), t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) +#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? 
_mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) +#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size +#endif + +#ifdef SIMD_NEON +#define TEMP uint8x8_t +#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast(last_vertex), vdup_n_u32(0), 0)) +#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned) +#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1)) +#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i)) +#define SAVE(i) vst1_lane_u32(reinterpret_cast(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size +#endif + +#ifdef SIMD_WASM +#define TEMP v128_t +#define PREP() v128_t pi = wasm_v128_load(last_vertex) +#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned) +#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3) +#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i)) +#define SAVE(i) *reinterpret_cast(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size +#endif + +#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) + + PREP(); + + unsigned char* savep = transposed; + + for (size_t j = 0; j < vertex_count_aligned; j += 16) + { + LOAD(0); + LOAD(1); + LOAD(2); + LOAD(3); + + transpose8(r0, r1, r2, r3); + + TEMP t0, t1, t2, t3; + + UNZR(0); + GRP4(0); + FIXD(0), FIXD(1), FIXD(2), FIXD(3); + SAVE(0), SAVE(1), SAVE(2), SAVE(3); + + UNZR(1); + GRP4(1); + FIXD(0), FIXD(1), FIXD(2), FIXD(3); + SAVE(0), SAVE(1), SAVE(2), SAVE(3); + + UNZR(2); + GRP4(2); + FIXD(0), FIXD(1), FIXD(2), FIXD(3); + SAVE(0), SAVE(1), SAVE(2), SAVE(3); + + UNZR(3); + GRP4(3); + FIXD(0), FIXD(1), FIXD(2), FIXD(3); + SAVE(0), SAVE(1), SAVE(2), SAVE(3); + +#undef UNZR +#undef TEMP +#undef PREP +#undef LOAD +#undef GRP4 +#undef FIXD +#undef SAVE + } +} + SIMD_TARGET -static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256]) +static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version) { assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize); @@ -1034,84 +1523,61 @@ static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, con size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1); + size_t control_size = version == 0 ? 0 : vertex_size / 4; + if (size_t(data_end - data) < control_size) + return NULL; + + const unsigned char* control = data; + data += control_size; + for (size_t k = 0; k < vertex_size; k += 4) { + unsigned char ctrl_byte = version == 0 ? 
0 : control[k / 4]; + for (size_t j = 0; j < 4; ++j) { - data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned); - if (!data) - return NULL; + int ctrl = (ctrl_byte >> (j * 2)) & 3; + + if (ctrl == 3) + { + // literal encoding; safe to over-copy due to tail + if (size_t(data_end - data) < vertex_count_aligned) + return NULL; + + memcpy(buffer + j * vertex_count_aligned, data, vertex_count_aligned); + data += vertex_count; + } + else if (ctrl == 2) + { + // zero encoding + memset(buffer + j * vertex_count_aligned, 0, vertex_count_aligned); + } + else + { + // for v0, headers are mapped to 0..3; for v1, headers are mapped to 4..8 + int hshift = version == 0 ? 0 : 4 + ctrl; + + data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned, hshift); + if (!data) + return NULL; + } } -#if defined(SIMD_SSE) || defined(SIMD_AVX) -#define TEMP __m128i -#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast(last_vertex + k)) -#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) -#define GRP4(i) t0 = _mm_shuffle_epi32(r##i, 0), t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) -#define FIXD(i) t##i = pi = _mm_add_epi8(pi, t##i) -#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size -#endif + int channel = version == 0 ? 0 : channels[k / 4]; -#ifdef SIMD_NEON -#define TEMP uint8x8_t -#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast(last_vertex + k), vdup_n_u32(0), 0)) -#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned) -#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1)) -#define FIXD(i) t##i = pi = vadd_u8(pi, t##i) -#define SAVE(i) vst1_lane_u32(reinterpret_cast(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size -#endif - -#ifdef SIMD_WASM -#define TEMP v128_t -#define PREP() v128_t pi = wasm_v128_load(last_vertex + k) -#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned) -#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3) -#define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i) -#define SAVE(i) *reinterpret_cast(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size -#endif - - PREP(); - - unsigned char* savep = transposed + k; - - for (size_t j = 0; j < vertex_count_aligned; j += 16) + switch (channel & 3) { - LOAD(0); - LOAD(1); - LOAD(2); - LOAD(3); - - r0 = unzigzag8(r0); - r1 = unzigzag8(r1); - r2 = unzigzag8(r2); - r3 = unzigzag8(r3); - - transpose8(r0, r1, r2, r3); - - TEMP t0, t1, t2, t3; - - GRP4(0); - FIXD(0), FIXD(1), FIXD(2), FIXD(3); - SAVE(0), SAVE(1), SAVE(2), SAVE(3); - - GRP4(1); - FIXD(0), FIXD(1), FIXD(2), FIXD(3); - SAVE(0), SAVE(1), SAVE(2), SAVE(3); - - GRP4(2); - FIXD(0), FIXD(1), FIXD(2), FIXD(3); - SAVE(0), SAVE(1), SAVE(2), SAVE(3); - - GRP4(3); - FIXD(0), FIXD(1), FIXD(2), FIXD(3); - SAVE(0), SAVE(1), SAVE(2), SAVE(3); - -#undef TEMP -#undef PREP -#undef LOAD -#undef GRP4 -#undef FIXD -#undef SAVE + case 0: + decodeDeltas4Simd<0>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, 0); + break; + case 1: + decodeDeltas4Simd<1>(buffer, transposed + k, vertex_count_aligned, vertex_size, 
@@ -1140,12 +1606,13 @@ static unsigned int cpuid = getCpuFeatures();
 
 } // namespace meshopt
 
-size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
+size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level)
 {
 	using namespace meshopt;
 
 	assert(vertex_size > 0 && vertex_size <= 256);
 	assert(vertex_size % 4 == 0);
+	assert(level >= 0 && level <= 9); // only a subset of this range is used right now
 
 #if TRACE
 	memset(vertexstats, 0, sizeof(vertexstats));
@@ -1156,7 +1623,7 @@ size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, con
 	unsigned char* data = buffer;
 	unsigned char* data_end = buffer + buffer_size;
 
-	if (size_t(data_end - data) < 1 + vertex_size)
+	if (size_t(data_end - data) < 1)
 		return 0;
 
 	int version = gEncodeVertexVersion;
@@ -1172,34 +1639,52 @@ size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, con
 
 	size_t vertex_block_size = getVertexBlockSize(vertex_size);
 
+	unsigned char channels[64] = {};
+
+	if (version != 0 && level > 1 && vertex_count > 1)
+		for (size_t k = 0; k < vertex_size; k += 4)
+		{
+			int rot = level >= 3 ? estimateRotate(vertex_data, vertex_count, vertex_size, k, /* group_size= */ 16) : 0;
+			int channel = estimateChannel(vertex_data, vertex_count, vertex_size, k, vertex_block_size, /* block_skip= */ 3, /* max_channels= */ level >= 3 ? 3 : 2, rot);
+
+			assert(unsigned(channel) < 2 || ((channel & 3) == 2 && unsigned(channel >> 4) < 8));
+			channels[k / 4] = (unsigned char)channel;
+		}
+
 	size_t vertex_offset = 0;
 
 	while (vertex_offset < vertex_count)
 	{
 		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
 
-		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
+		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version, level);
 		if (!data)
 			return 0;
 
 		vertex_offset += block_size;
 	}
 
-	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
+	size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
+	size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1;
+	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
 
-	if (size_t(data_end - data) < tail_size)
+	if (size_t(data_end - data) < tail_size_pad)
 		return 0;
 
-	// write first vertex to the end of the stream and pad it to 32 bytes; this is important to simplify bounds checks in decoder
-	if (vertex_size < kTailMaxSize)
+	if (tail_size < tail_size_pad)
 	{
-		memset(data, 0, kTailMaxSize - vertex_size);
-		data += kTailMaxSize - vertex_size;
+		memset(data, 0, tail_size_pad - tail_size);
+		data += tail_size_pad - tail_size;
 	}
 
 	memcpy(data, first_vertex, vertex_size);
 	data += vertex_size;
 
+	if (version != 0)
+	{
+		memcpy(data, channels, vertex_size / 4);
+		data += vertex_size / 4;
+	}
+
 	assert(data >= buffer + tail_size);
 	assert(data <= buffer + buffer_size);
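Editor's note: the tail math above is worth restating, since both encoder and decoder must agree on it. For v1 the tail stores the first vertex plus one channel byte per 4-byte group, padded up to a minimum so the decoder can safely over-read; a standalone restatement (illustration only, using the kTailMinSizeV0/kTailMinSizeV1 constants this update introduces):

static size_t tailSizePadded(size_t vertex_size, int version)
{
	size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4); // first vertex (+ channel bytes for v1)
	size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1; // decoder over-read guarantee

	return tail_size < tail_size_min ? tail_size_min : tail_size;
}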
@@ -1212,17 +1697,41 @@
 		printf("%2d: %7d bytes [%4.1f%%] %.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8);
 
-		size_t total_k = vsk.header + vsk.bitg[0] + vsk.bitg[1] + vsk.bitg[2] + vsk.bitg[3];
+		size_t total_k = vsk.header + vsk.bitg[1] + vsk.bitg[2] + vsk.bitg[4] + vsk.bitg[8];
+		double total_kr = total_k ? 1.0 / double(total_k) : 0;
 
-		printf(" |\thdr [%5.1f%%] bitg 1-3 [%4.1f%% %4.1f%% %4.1f%%]",
-		    double(vsk.header) / double(total_k) * 100, double(vsk.bitg[1]) / double(total_k) * 100,
-		    double(vsk.bitg[2]) / double(total_k) * 100, double(vsk.bitg[3]) / double(total_k) * 100);
+		if (version != 0)
+		{
+			int channel = channels[k / 4];
 
-		printf(" |\tbitc [%3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%%]",
+			if ((channel & 3) == 2 && k % 4 == 0)
+				printf(" | ^%d", channel >> 4);
+			else
+				printf(" | %2s", channel == 0 ? "1" : (channel == 1 && k % 2 == 0 ? "2" : "."));
+		}
+
+		printf(" | hdr [%5.1f%%] bitg [1 %4.1f%% 2 %4.1f%% 4 %4.1f%% 8 %4.1f%%]",
+		    double(vsk.header) * total_kr * 100,
+		    double(vsk.bitg[1]) * total_kr * 100, double(vsk.bitg[2]) * total_kr * 100,
+		    double(vsk.bitg[4]) * total_kr * 100, double(vsk.bitg[8]) * total_kr * 100);
+
+		size_t total_ctrl = vsk.ctrl[0] + vsk.ctrl[1] + vsk.ctrl[2] + vsk.ctrl[3];
+
+		if (total_ctrl)
+		{
+			printf(" | ctrl %3.0f%% %3.0f%% %3.0f%% %3.0f%%",
+			    double(vsk.ctrl[0]) / double(total_ctrl) * 100, double(vsk.ctrl[1]) / double(total_ctrl) * 100,
+			    double(vsk.ctrl[2]) / double(total_ctrl) * 100, double(vsk.ctrl[3]) / double(total_ctrl) * 100);
+		}
+
+#if TRACE > 1
+		printf(" | bitc [%3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%%]",
 		    double(vsk.bitc[0]) / double(vertex_count) * 100, double(vsk.bitc[1]) / double(vertex_count) * 100,
 		    double(vsk.bitc[2]) / double(vertex_count) * 100, double(vsk.bitc[3]) / double(vertex_count) * 100,
 		    double(vsk.bitc[4]) / double(vertex_count) * 100, double(vsk.bitc[5]) / double(vertex_count) * 100,
 		    double(vsk.bitc[6]) / double(vertex_count) * 100, double(vsk.bitc[7]) / double(vertex_count) * 100);
+#endif
+
 		printf("\n");
 	}
 #endif
@@ -1230,6 +1739,11 @@ size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, con
 	return data - buffer;
 }
 
+size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
+{
+	return meshopt_encodeVertexBufferLevel(buffer, buffer_size, vertices, vertex_count, vertex_size, meshopt::kEncodeDefaultLevel);
+}
+
 size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
 {
 	using namespace meshopt;
@@ -1240,17 +1754,22 @@ size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
 	size_t vertex_block_size = getVertexBlockSize(vertex_size);
 	size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size;
 
+	size_t vertex_block_control_size = vertex_size / 4;
 	size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4;
 	size_t vertex_block_data_size = vertex_block_size;
 
-	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
+	size_t tail_size = vertex_size + (vertex_size / 4);
+	size_t tail_size_min = kTailMinSizeV0 > kTailMinSizeV1 ? kTailMinSizeV0 : kTailMinSizeV1;
+	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
+	assert(tail_size_pad >= kByteGroupDecodeLimit);
 
-	return 1 + vertex_block_count * vertex_size * (vertex_block_header_size + vertex_block_data_size) + tail_size;
+	return 1 + vertex_block_count * vertex_size * (vertex_block_control_size + vertex_block_header_size + vertex_block_data_size) + tail_size_pad;
 }
 
 void meshopt_encodeVertexVersion(int version)
 {
-	assert(unsigned(version) <= 0);
+	// note: this version is experimental and the binary format is not finalized; this should not be used in production!
+	assert(unsigned(version) <= 0 || version == 0xe);
 
 	meshopt::gEncodeVertexVersion = version;
 }
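Editor's note: callers can use the new level-based entry point directly. A usage sketch (illustrative, not part of the patch; meshopt_encodeVertexBufferLevel and meshopt_encodeVertexBufferBound are the public APIs touched here, and level 2 is an assumption matching the estimation gate above, where level > 1 enables channel selection and level >= 3 additionally tries bit rotations):

#include <vector>
#include "meshoptimizer.h"

// vertices holds vertex_count elements of vertex_size bytes each (vertex_size % 4 == 0)
static std::vector<unsigned char> encodeVertices(const void* vertices, size_t vertex_count, size_t vertex_size)
{
	std::vector<unsigned char> encoded(meshopt_encodeVertexBufferBound(vertex_count, vertex_size));
	size_t size = meshopt_encodeVertexBufferLevel(encoded.data(), encoded.size(), vertices, vertex_count, vertex_size, 2);

	encoded.resize(size); // size == 0 would mean the output buffer was too small
	return encoded;
}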
@@ -1262,7 +1781,7 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 	assert(vertex_size > 0 && vertex_size <= 256);
 	assert(vertex_size % 4 == 0);
 
-	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = NULL;
+	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256], const unsigned char*, int) = NULL;
 
 #if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
 	decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
@@ -1282,7 +1801,7 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 	const unsigned char* data = buffer;
 	const unsigned char* data_end = buffer + buffer_size;
 
-	if (size_t(data_end - data) < 1 + vertex_size)
+	if (size_t(data_end - data) < 1)
 		return -2;
 
 	unsigned char data_header = *data++;
@@ -1291,11 +1810,22 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 		return -1;
 
 	int version = data_header & 0x0f;
-	if (version > 0)
+	if (version > 0 && version != 0xe)
 		return -1;
 
+	size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
+	size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1;
+	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
+
+	if (size_t(data_end - data) < tail_size_pad)
+		return -2;
+
+	const unsigned char* tail = data_end - tail_size;
+
 	unsigned char last_vertex[256];
-	memcpy(last_vertex, data_end - vertex_size, vertex_size);
+	memcpy(last_vertex, tail, vertex_size);
+
+	const unsigned char* channels = version == 0 ? NULL : tail + vertex_size;
 
 	size_t vertex_block_size = getVertexBlockSize(vertex_size);
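Editor's note: the decode-side tail bookkeeping above can be summarized in one helper (illustrative sketch, not part of the patch; it assumes the stream already passed the tail_size_pad bounds check):

#include <stddef.h>

// Locate the channel table at the end of a stream: the tail holds the first
// vertex followed, for v1 only, by one channel byte per 4-byte group.
static const unsigned char* findChannels(const unsigned char* data_end, size_t vertex_size, int version)
{
	size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
	const unsigned char* tail = data_end - tail_size;

	return version == 0 ? NULL : tail + vertex_size;
}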
@@ -1305,16 +1835,14 @@
 	{
 		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
 
-		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
+		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version);
 		if (!data)
 			return -2;
 
 		vertex_offset += block_size;
 	}
 
-	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
-
-	if (size_t(data_end - data) != tail_size)
+	if (size_t(data_end - data) != tail_size_pad)
 		return -3;
 
 	return 0;
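Editor's note: taken together, the encoder and decoder changes stay wire-compatible for v0 streams while gating the new channel/control machinery behind the experimental version 0xe. A round-trip sketch (illustrative, not part of the patch; error codes follow the decoder above: -1 for a malformed header, -2 for out-of-bounds reads, -3 for trailing bytes):

#include <string.h>
#include <vector>
#include "meshoptimizer.h"

static bool roundTrip(const void* vertices, size_t vertex_count, size_t vertex_size)
{
	std::vector<unsigned char> encoded(meshopt_encodeVertexBufferBound(vertex_count, vertex_size));
	encoded.resize(meshopt_encodeVertexBuffer(encoded.data(), encoded.size(), vertices, vertex_count, vertex_size));

	std::vector<unsigned char> decoded(vertex_count * vertex_size);
	int rc = meshopt_decodeVertexBuffer(decoded.data(), vertex_count, vertex_size, encoded.data(), encoded.size());

	return rc == 0 && memcmp(decoded.data(), vertices, decoded.size()) == 0;
}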