From fc70df09414fd27686712c6ca3de4beec9621a03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=91=D1=80=D0=B0=D0=BD=D0=B8=D0=BC=D0=B8=D1=80=20=D0=9A?= =?UTF-8?q?=D0=B0=D1=80=D0=B0=D1=9F=D0=B8=D1=9B?= Date: Sat, 28 Dec 2024 22:37:47 -0800 Subject: [PATCH] Updated meshoptimizer. --- 3rdparty/meshoptimizer/src/clusterizer.cpp | 45 +- 3rdparty/meshoptimizer/src/meshoptimizer.h | 28 +- 3rdparty/meshoptimizer/src/simplifier.cpp | 28 +- 3rdparty/meshoptimizer/src/vertexcodec.cpp | 978 ++++++++++++++++----- 4 files changed, 818 insertions(+), 261 deletions(-) diff --git a/3rdparty/meshoptimizer/src/clusterizer.cpp b/3rdparty/meshoptimizer/src/clusterizer.cpp index 52fe5a362..738add5f2 100644 --- a/3rdparty/meshoptimizer/src/clusterizer.cpp +++ b/3rdparty/meshoptimizer/src/clusterizer.cpp @@ -238,7 +238,7 @@ static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int bool result = false; - unsigned int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff); + int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff); if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles) { @@ -283,10 +283,10 @@ static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int return result; } -static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone* meshlet_cone, unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const unsigned char* used, float meshlet_expected_radius, float cone_weight, unsigned int* out_extra) +static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone* meshlet_cone, unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const unsigned char* used, float meshlet_expected_radius, float cone_weight) { unsigned int best_triangle = ~0u; - unsigned int best_extra = 5; + int best_priority = 5; float best_score = FLT_MAX; for (size_t i = 0; i < meshlet.vertex_count; ++i) @@ -301,20 +301,26 @@ static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Co unsigned int triangle = neighbors[j]; unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2]; - unsigned int extra = (used[a] == 0xff) + (used[b] == 0xff) + (used[c] == 0xff); + int extra = (used[a] == 0xff) + (used[b] == 0xff) + (used[c] == 0xff); + assert(extra <= 2); + + int priority = -1; // triangles that don't add new vertices to meshlets are max. 
priority - if (extra != 0) - { - // artificially increase the priority of dangling triangles as they're expensive to add to new meshlets - if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1) - extra = 0; - - extra++; - } + if (extra == 0) + priority = 0; + // artificially increase the priority of dangling triangles as they're expensive to add to new meshlets + else if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1) + priority = 1; + // if two vertices have live count of 2, removing this triangle will make another triangle dangling which is good for overall flow + else if ((live_triangles[a] == 2) + (live_triangles[b] == 2) + (live_triangles[c] == 2) >= 2) + priority = 1 + extra; + // otherwise adjust priority to be after the above cases, 3 or 4 based on used[] count + else + priority = 2 + extra; // since topology-based priority is always more important than the score, we can skip scoring in some cases - if (extra > best_extra) + if (priority > best_priority) continue; float score = 0; @@ -341,18 +347,15 @@ static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Co // note that topology-based priority is always more important than the score // this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost - if (extra < best_extra || score < best_score) + if (priority < best_priority || score < best_score) { best_triangle = triangle; - best_extra = extra; + best_priority = priority; best_score = score; } } } - if (out_extra) - *out_extra = best_extra; - return best_triangle; } @@ -588,13 +591,13 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve { Cone meshlet_cone = getMeshletCone(meshlet_cone_acc, meshlet.triangle_count); - unsigned int best_extra = 0; - unsigned int best_triangle = getNeighborTriangle(meshlet, &meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight, &best_extra); + unsigned int best_triangle = getNeighborTriangle(meshlet, &meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight); + int best_extra = best_triangle == ~0u ? 
-1 : (used[indices[best_triangle * 3 + 0]] == 0xff) + (used[indices[best_triangle * 3 + 1]] == 0xff) + (used[indices[best_triangle * 3 + 2]] == 0xff);

 		// if the best triangle doesn't fit into current meshlet, the spatial scoring we've used is not very meaningful, so we re-select using topological scoring
 		if (best_triangle != ~0u && (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles))
 		{
-			best_triangle = getNeighborTriangle(meshlet, NULL, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, 0.f, NULL);
+			best_triangle = getNeighborTriangle(meshlet, NULL, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, 0.f);
 		}

 		// when we run out of neighboring triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity
diff --git a/3rdparty/meshoptimizer/src/meshoptimizer.h b/3rdparty/meshoptimizer/src/meshoptimizer.h
index abf398931..6243947cf 100644
--- a/3rdparty/meshoptimizer/src/meshoptimizer.h
+++ b/3rdparty/meshoptimizer/src/meshoptimizer.h
@@ -1,5 +1,5 @@
 /**
- * meshoptimizer - version 0.21
+ * meshoptimizer - version 0.22
  *
  * Copyright (C) 2016-2024, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
  * Report bugs and download new versions at https://github.com/zeux/meshoptimizer
@@ -12,7 +12,7 @@
 #include <stddef.h>

 /* Version macro; major * 1000 + minor * 10 + patch */
-#define MESHOPTIMIZER_VERSION 210 /* 0.21 */
+#define MESHOPTIMIZER_VERSION 220 /* 0.22 */

 /* If no API is defined, assume default */
 #ifndef MESHOPTIMIZER_API
@@ -277,6 +277,16 @@ MESHOPTIMIZER_API int meshopt_decodeIndexSequence(void* destination, size_t inde
 MESHOPTIMIZER_API size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size);
 MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size);

+/**
+ * Experimental: Vertex buffer encoder
+ * Encodes vertex data just like meshopt_encodeVertexBuffer, but allows overriding the compression level.
+ * For the compression level to take effect, the vertex encoding version must be set to 1 via meshopt_encodeVertexVersion.
+ * The default compression level implied by meshopt_encodeVertexBuffer is 2.
+ *
+ * level should be in the range [0, 3] with 0 being the fastest and 3 being the slowest and producing the best compression ratio.
+ */
+MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level);
+
 /**
  * Set vertex encoder format version
  * version must specify the data format version to encode; valid values are 0 (decodable by all library versions)
@@ -306,9 +316,9 @@ MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t verte
  * meshopt_decodeFilterExp decodes exponential encoding of floating-point data with 8-bit exponent and 24-bit integer mantissa as 2^E*M.
  * Each 32-bit component is decoded in isolation; stride must be divisible by 4.
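 *
 * Usage sketch (illustrative names, not part of this header; assumes the stream holds
 * octahedral-filtered normals produced with meshopt_encodeFilterOct at 4 bytes per vertex):
 * decode the vertex stream first, then apply the matching filter in place:
 *
 *   meshopt_decodeVertexBuffer(normals, normal_count, 4, encoded, encoded_size);
 *   meshopt_decodeFilterOct(normals, normal_count, 4);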
*/ -MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride); -MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride); -MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride); +MESHOPTIMIZER_API void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride); +MESHOPTIMIZER_API void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride); +MESHOPTIMIZER_API void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride); /** * Vertex buffer filter encoders @@ -334,13 +344,13 @@ enum meshopt_EncodeExpMode meshopt_EncodeExpSharedVector, /* When encoding exponents, use shared value for each component of all vectors (best compression) */ meshopt_EncodeExpSharedComponent, - /* When encoding exponents, use separate values for each component, but clamp to 0 (good quality if very small values are not important) */ + /* Experimental: When encoding exponents, use separate values for each component, but clamp to 0 (good quality if very small values are not important) */ meshopt_EncodeExpClamped, }; -MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data); -MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data); -MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode); +MESHOPTIMIZER_API void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data); +MESHOPTIMIZER_API void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data); +MESHOPTIMIZER_API void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode); /** * Simplification options diff --git a/3rdparty/meshoptimizer/src/simplifier.cpp b/3rdparty/meshoptimizer/src/simplifier.cpp index af64cbda4..d464fc607 100644 --- a/3rdparty/meshoptimizer/src/simplifier.cpp +++ b/3rdparty/meshoptimizer/src/simplifier.cpp @@ -1026,7 +1026,7 @@ static size_t pickEdgeCollapses(Collapse* collapses, size_t collapse_capacity, c return collapse_count; } -static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const Vector3* vertex_positions, const float* vertex_attributes, const Quadric* vertex_quadrics, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap) +static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const Vector3* vertex_positions, const float* vertex_attributes, const Quadric* vertex_quadrics, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback) { for (size_t i = 0; i < collapse_count; ++i) { @@ -1041,7 +1041,7 @@ static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const unsigned int j1 = c.bidi ? i0 : i1; float ei = quadricError(vertex_quadrics[remap[i0]], vertex_positions[i1]); - float ej = quadricError(vertex_quadrics[remap[j0]], vertex_positions[j1]); + float ej = c.bidi ? 
quadricError(vertex_quadrics[remap[j0]], vertex_positions[j1]) : FLT_MAX;

 #if TRACE >= 3
 		float di = ei, dj = ej;
 #endif

 		if (attribute_count)
 		{
-			// note: ideally we would evaluate max/avg of attribute errors for seam edges, but it's not clear if it's worth the extra cost
 			ei += quadricError(attribute_quadrics[i0], &attribute_gradients[i0 * attribute_count], attribute_count, vertex_positions[i1], &vertex_attributes[i1 * attribute_count]);
-			ej += quadricError(attribute_quadrics[j0], &attribute_gradients[j0 * attribute_count], attribute_count, vertex_positions[j1], &vertex_attributes[j1 * attribute_count]);
+			ej += c.bidi ? quadricError(attribute_quadrics[j0], &attribute_gradients[j0 * attribute_count], attribute_count, vertex_positions[j1], &vertex_attributes[j1 * attribute_count]) : 0;
+
+			// note: seam edges need to aggregate attribute errors between primary and secondary edges, as attribute quadrics are separate
+			if (vertex_kind[i0] == Kind_Seam)
+			{
+				// for seam collapses we need to find the seam pair; this is a bit tricky since we need to rely on edge loops as target vertex may be locked (and thus have more than two wedges)
+				unsigned int s0 = wedge[i0];
+				unsigned int s1 = loop[i0] == i1 ? loopback[s0] : loop[s0];
+
+				assert(s0 != i0 && wedge[s0] == i0);
+				assert(s1 != ~0u && remap[s1] == remap[i1]);
+
+				// note: this should never happen due to the assertion above, but if assertions are disabled and we ever hit this case we'll get a memory safety issue; for now play it safe
+				s1 = (s1 != ~0u) ? s1 : wedge[i1];
+
+				ei += quadricError(attribute_quadrics[s0], &attribute_gradients[s0 * attribute_count], attribute_count, vertex_positions[s1], &vertex_attributes[s1 * attribute_count]);
+				ej += c.bidi ? quadricError(attribute_quadrics[s1], &attribute_gradients[s1 * attribute_count], attribute_count, vertex_positions[s0], &vertex_attributes[s0 * attribute_count]) : 0;
+			}
 		}

 		// pick edge direction with minimal error
@@ -1206,7 +1222,7 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
 		}
 		else if (kind == Kind_Seam)
 		{
-			// for seam collapses we need to move the seam pair together; this is a bit tricky to compute since we need to rely on edge loops as target vertex may be locked (and thus have more than two wedges)
+			// for seam collapses we need to move the seam pair together; this is a bit tricky since we need to rely on edge loops as target vertex may be locked (and thus have more than two wedges)
 			unsigned int s0 = wedge[i0];
 			unsigned int s1 = loop[i0] == i1 ? loopback[s0] : loop[s0];
 			assert(s0 != i0 && wedge[s0] == i0);
@@ -1964,7 +1980,7 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
 	printf("pass %d:%c", int(pass_count++), TRACE >= 2 ? '\n' : ' ');
 #endif

-	rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_attributes, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, remap);
+	rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_attributes, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, vertex_kind, loop, loopback);

 	sortEdgeCollapses(collapse_order, edge_collapses, edge_collapse_count);
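The vertexcodec.cpp changes below replace the byte-only zigzag8/unzigzag8 helpers with width-templated zigzag/unzigzag plus a 32-bit rotate, so per-channel deltas can be folded at 8-, 16-, or 32-bit width. As a standalone sketch of the mapping these helpers implement (illustrative code, not part of the patch): zigzag folds signed deltas into small unsigned values so near-zero deltas encode into few bits.

#include <assert.h>

/* 8-bit instance of the same construction as the patch's templated zigzag/unzigzag */
static unsigned char zigzag8(unsigned char v)
{
	return (unsigned char)((0 - (v >> 7)) ^ (v << 1));
}

static unsigned char unzigzag8(unsigned char v)
{
	return (unsigned char)((0 - (v & 1)) ^ (v >> 1));
}

int main(void)
{
	/* deltas -1, +1, -2, +2 map to 1, 2, 3, 4; the round-trip is exact */
	assert(zigzag8((unsigned char)-1) == 1 && zigzag8(1) == 2);
	assert(zigzag8((unsigned char)-2) == 3 && zigzag8(2) == 4);

	for (int i = 0; i < 256; ++i)
		assert(unzigzag8(zigzag8((unsigned char)i)) == (unsigned char)i);

	return 0;
}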
diff --git a/3rdparty/meshoptimizer/src/vertexcodec.cpp b/3rdparty/meshoptimizer/src/vertexcodec.cpp
index 1dbd2e35f..d3fc7bb16 100644
--- a/3rdparty/meshoptimizer/src/vertexcodec.cpp
+++ b/3rdparty/meshoptimizer/src/vertexcodec.cpp
@@ -60,6 +60,15 @@
 #define SIMD_LATENCYOPT
 #endif

+// In switch dispatch, marking the default case as unreachable allows removing redundant bounds checks
+#if defined(__GNUC__)
+#define SIMD_UNREACHABLE() __builtin_unreachable()
+#elif defined(_MSC_VER)
+#define SIMD_UNREACHABLE() __assume(false)
+#else
+#define SIMD_UNREACHABLE() assert(!"Unreachable")
+#endif
+
 #endif // !MESHOPTIMIZER_NO_SIMD

 #ifdef SIMD_SSE
@@ -119,7 +128,13 @@
 const size_t kVertexBlockSizeBytes = 8192;
 const size_t kVertexBlockMaxSize = 256;
 const size_t kByteGroupSize = 16;
 const size_t kByteGroupDecodeLimit = 24;
-const size_t kTailMaxSize = 32;
+const size_t kTailMinSizeV0 = 32;
+const size_t kTailMinSizeV1 = 24;
+
+static const int kBitsV0[4] = {0, 2, 4, 8};
+static const int kBitsV1[5] = {0, 1, 2, 4, 8};
+
+const int kEncodeDefaultLevel = 2;

 static size_t getVertexBlockSize(size_t vertex_size)
 {
@@ -133,14 +148,21 @@ static size_t getVertexBlockSize(size_t vertex_size)
 	return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
 }

-inline unsigned char zigzag8(unsigned char v)
+inline unsigned int rotate(unsigned int v, int r)
 {
-	return ((signed char)(v) >> 7) ^ (v << 1);
+	return (v << r) | (v >> ((32 - r) & 31));
 }

-inline unsigned char unzigzag8(unsigned char v)
+template <typename T>
+inline T zigzag(T v)
 {
-	return -(v & 1) ^ (v >> 1);
+	return (0 - (v >> (sizeof(T) * 8 - 1))) ^ (v << 1);
+}
+
+template <typename T>
+inline T unzigzag(T v)
+{
+	return (0 - (v & 1)) ^ (v >> 1);
 }

 #if TRACE
@@ -148,17 +170,18 @@ struct Stats
 {
 	size_t size;
 	size_t header;  // bytes for header
-	size_t bitg[4]; // bytes for bit groups
+	size_t bitg[9]; // bytes for bit groups
 	size_t bitc[8]; // bit consistency: how many bits are shared between all bytes in a group
+	size_t ctrl[4]; // number of control groups
 };

 static Stats* bytestats = NULL;
 static Stats vertexstats[256];
 #endif

-static bool encodeBytesGroupZero(const unsigned char* buffer)
+static bool canEncodeZero(const unsigned char* buffer, size_t buffer_size)
 {
-	for (size_t i = 0; i < kByteGroupSize; ++i)
+	for (size_t i = 0; i < buffer_size; ++i)
 		if (buffer[i])
 			return false;

@@ -167,10 +190,10 @@
 	return true;
 }

 static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
 {
-	assert(bits >= 1 && bits <= 8);
+	assert(bits >= 0 && bits <= 8);

-	if (bits == 1)
-		return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);
+	if (bits == 0)
+		return canEncodeZero(buffer, kByteGroupSize) ?
0 : size_t(-1); if (bits == 8) return kByteGroupSize; @@ -187,9 +210,10 @@ static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits) static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits) { - assert(bits >= 1 && bits <= 8); + assert(bits >= 0 && bits <= 8); + assert(kByteGroupSize % 8 == 0); - if (bits == 1) + if (bits == 0) return data; if (bits == 8) @@ -217,21 +241,27 @@ static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* byte |= enc; } + // encode 1-bit groups in reverse bit order + // this makes them faster to decode alongside other groups + if (bits == 1) + byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32); + *data++ = byte; } for (size_t i = 0; i < kByteGroupSize; ++i) { - if (buffer[i] >= sentinel) - { - *data++ = buffer[i]; - } + unsigned char v = buffer[i]; + + // branchless append of out-of-range values + *data = v; + data += v >= sentinel; } return data; } -static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size) +static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size, const int bits[4]) { assert(buffer_size % kByteGroupSize == 0); @@ -247,39 +277,40 @@ static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, memset(header, 0, header_size); + int last_bits = -1; + for (size_t i = 0; i < buffer_size; i += kByteGroupSize) { if (size_t(data_end - data) < kByteGroupDecodeLimit) return NULL; - int best_bits = 8; - size_t best_size = encodeBytesGroupMeasure(buffer + i, 8); + int best_bitk = 3; + size_t best_size = encodeBytesGroupMeasure(buffer + i, bits[best_bitk]); - for (int bits = 1; bits < 8; bits *= 2) + for (int bitk = 0; bitk < 3; ++bitk) { - size_t size = encodeBytesGroupMeasure(buffer + i, bits); + size_t size = encodeBytesGroupMeasure(buffer + i, bits[bitk]); - if (size < best_size) + // favor consistent bit selection across groups, but never replace literals + if (size < best_size || (size == best_size && bits[bitk] == last_bits && bits[best_bitk] != 8)) { - best_bits = bits; + best_bitk = bitk; best_size = size; } } - int bitslog2 = (best_bits == 1) ? 0 : (best_bits == 2 ? 1 : (best_bits == 4 ? 
2 : 3));
-		assert((1 << bitslog2) == best_bits);
-
 		size_t header_offset = i / kByteGroupSize;
+		header[header_offset / 4] |= best_bitk << ((header_offset % 4) * 2);

-		header[header_offset / 4] |= bitslog2 << ((header_offset % 4) * 2);
-
+		int best_bits = bits[best_bitk];
 		unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits);
 		assert(data + best_size == next);
 		data = next;
+		last_bits = best_bits;

 #if TRACE
-		bytestats->bitg[bitslog2] += best_size;
+		bytestats->bitg[best_bits] += best_size;
 #endif
 	}
@@ -290,30 +321,203 @@ static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end,
 	return data;
 }

-static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+template <typename T, bool Xor>
+static void encodeDeltas1(unsigned char* buffer, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, const unsigned char last_vertex[256], size_t k, int rot)
+{
+	size_t k0 = k & ~(sizeof(T) - 1);
+	int ks = (k & (sizeof(T) - 1)) * 8;
+
+	T p = last_vertex[k0];
+	for (size_t j = 1; j < sizeof(T); ++j)
+		p |= T(last_vertex[k0 + j]) << (j * 8);
+
+	const unsigned char* vertex = vertex_data + k0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		T v = vertex[0];
+		for (size_t j = 1; j < sizeof(T); ++j)
+			v |= vertex[j] << (j * 8);
+
+		T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p));
+
+		buffer[i] = (unsigned char)(d >> ks);
+		p = v;
+		vertex += vertex_size;
+	}
+}
+
+static void encodeDeltas(unsigned char* buffer, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, const unsigned char last_vertex[256], size_t k, int channel)
+{
+	switch (channel & 3)
+	{
+	case 0:
+		return encodeDeltas1<unsigned char, false>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0);
+	case 1:
+		return encodeDeltas1<unsigned short, false>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0);
+	case 2:
+		return encodeDeltas1<unsigned int, true>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, channel >> 4);
+	default:
+		assert(!"Unsupported channel encoding"); // unreachable
+	}
+}
+
+static int estimateBits(unsigned char v)
+{
+	return v <= 15 ? (v <= 3 ? (v == 0 ? 0 : 2) : 4) : 8;
+}
+
+static int estimateRotate(const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, size_t k, size_t group_size)
+{
+	size_t sizes[8] = {};
+
+	const unsigned char* vertex = vertex_data + k;
+	unsigned int last = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24);
+
+	for (size_t i = 0; i < vertex_count; i += group_size)
+	{
+		unsigned int bitg = 0;
+
+		// calculate bit consistency mask for the group
+		for (size_t j = 0; j < group_size && i + j < vertex_count; ++j)
+		{
+			unsigned int v = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24);
+			unsigned int d = v ^ last;
+
+			bitg |= d;
+			last = v;
+			vertex += vertex_size;
+		}
+
+		for (int j = 0; j < 8; ++j)
+		{
+			unsigned int bitr = rotate(bitg, j);
+
+			sizes[j] += estimateBits((unsigned char)(bitr >> 0)) + estimateBits((unsigned char)(bitr >> 8));
+			sizes[j] += estimateBits((unsigned char)(bitr >> 16)) + estimateBits((unsigned char)(bitr >> 24));
+		}
+	}
+
+	int best_rot = 0;
+	for (int rot = 1; rot < 8; ++rot)
+		best_rot = (sizes[rot] < sizes[best_rot]) ?
rot : best_rot; + + return best_rot; +} + +static int estimateChannel(const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, size_t k, size_t vertex_block_size, size_t block_skip, int max_channel, int xor_rot) +{ + unsigned char block[kVertexBlockMaxSize]; + assert(vertex_block_size <= kVertexBlockMaxSize); + + unsigned char last_vertex[256] = {}; + + size_t sizes[3] = {}; + assert(max_channel <= 3); + + for (size_t i = 0; i < vertex_count; i += vertex_block_size * block_skip) + { + size_t block_size = i + vertex_block_size < vertex_count ? vertex_block_size : vertex_count - i; + size_t block_size_aligned = (block_size + kByteGroupSize - 1) & ~(kByteGroupSize - 1); + + memcpy(last_vertex, vertex_data + (i == 0 ? 0 : i - 1) * vertex_size, vertex_size); + + // we sometimes encode elements we didn't fill when rounding to kByteGroupSize + if (block_size < block_size_aligned) + memset(block + block_size, 0, block_size_aligned - block_size); + + for (int channel = 0; channel < max_channel; ++channel) + for (size_t j = 0; j < 4; ++j) + { + encodeDeltas(block, vertex_data + i * vertex_size, block_size, vertex_size, last_vertex, k + j, channel | (xor_rot << 4)); + + for (size_t ig = 0; ig < block_size; ig += kByteGroupSize) + { + // to maximize encoding performance we only evaluate 1/2/4/8 bit groups + size_t size1 = encodeBytesGroupMeasure(block + ig, 1); + size_t size2 = encodeBytesGroupMeasure(block + ig, 2); + size_t size4 = encodeBytesGroupMeasure(block + ig, 4); + size_t size8 = encodeBytesGroupMeasure(block + ig, 8); + + size_t best_size = size1 < size2 ? size1 : size2; + best_size = best_size < size4 ? best_size : size4; + best_size = best_size < size8 ? best_size : size8; + + sizes[channel] += best_size; + } + } + } + + int best_channel = 0; + for (int channel = 1; channel < max_channel; ++channel) + best_channel = (sizes[channel] < sizes[best_channel]) ? channel : best_channel; + + return best_channel == 2 ? best_channel | (xor_rot << 4) : best_channel; +} + +static int estimateControl(const unsigned char* buffer, size_t vertex_count, size_t vertex_count_aligned, int level) +{ + if (canEncodeZero(buffer, vertex_count)) + return 2; // zero encoding + + if (level == 0) + return 1; // 1248 encoding in level 0 for encoding speed + + // round number of groups to 4 to get number of header bytes + size_t header_size = (vertex_count_aligned / kByteGroupSize + 3) / 4; + + size_t est_bytes0 = header_size, est_bytes1 = header_size; + + for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize) + { + // assumes kBitsV1[] = {0, 1, 2, 4, 8} for performance + size_t size0 = encodeBytesGroupMeasure(buffer + i, 0); + size_t size1 = encodeBytesGroupMeasure(buffer + i, 1); + size_t size2 = encodeBytesGroupMeasure(buffer + i, 2); + size_t size4 = encodeBytesGroupMeasure(buffer + i, 4); + size_t size8 = encodeBytesGroupMeasure(buffer + i, 8); + + // both control modes have access to 1/2/4 bit encoding + size_t size12 = size1 < size2 ? size1 : size2; + size_t size124 = size12 < size4 ? size12 : size4; + + // each control mode has access to 0/8 bit encoding respectively + est_bytes0 += size124 < size0 ? size124 : size0; + est_bytes1 += size124 < size8 ? size124 : size8; + } + + // pick shortest control entry but prefer literal encoding + if (est_bytes0 < vertex_count || est_bytes1 < vertex_count) + return est_bytes0 < est_bytes1 ? 
0 : 1; + else + return 3; // literal encoding +} + +static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version, int level) { assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize); + assert(vertex_size % 4 == 0); unsigned char buffer[kVertexBlockMaxSize]; assert(sizeof(buffer) % kByteGroupSize == 0); + size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1); + // we sometimes encode elements we didn't fill when rounding to kByteGroupSize memset(buffer, 0, sizeof(buffer)); + size_t control_size = version == 0 ? 0 : vertex_size / 4; + if (size_t(data_end - data) < control_size) + return NULL; + + unsigned char* control = data; + data += control_size; + + memset(control, 0, control_size); + for (size_t k = 0; k < vertex_size; ++k) { - size_t vertex_offset = k; - - unsigned char p = last_vertex[k]; - - for (size_t i = 0; i < vertex_count; ++i) - { - buffer[i] = zigzag8(vertex_data[vertex_offset] - p); - - p = vertex_data[vertex_offset]; - - vertex_offset += vertex_size; - } + encodeDeltas(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, version == 0 ? 0 : channels[k / 4]); #if TRACE const unsigned char* olddata = data; @@ -332,9 +536,35 @@ static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data } #endif - data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1)); - if (!data) - return NULL; + int ctrl = 0; + + if (version != 0) + { + ctrl = estimateControl(buffer, vertex_count, vertex_count_aligned, level); + + assert(unsigned(ctrl) < 4); + control[k / 4] |= ctrl << ((k % 4) * 2); + +#if TRACE + vertexstats[k].ctrl[ctrl]++; +#endif + } + + if (ctrl == 3) + { + // literal encoding + if (size_t(data_end - data) < vertex_count) + return NULL; + + memcpy(data, buffer, vertex_count); + data += vertex_count; + } + else if (ctrl != 2) // non-zero encoding + { + data = encodeBytes(data, data_end, buffer, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl); + if (!data) + return NULL; + } #if TRACE bytestats = NULL; @@ -348,7 +578,7 @@ static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data } #if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX) && !defined(SIMD_WASM)) -static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2) +static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bits) { #define READ() byte = *data++ #define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = (enc == (1 << bits) - 1) ? 
encv : enc, data_var += (enc == (1 << bits) - 1) @@ -356,12 +586,24 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned unsigned char byte, enc, encv; const unsigned char* data_var; - switch (bitslog2) + switch (bits) { case 0: memset(buffer, 0, kByteGroupSize); return data; case 1: + data_var = data + 2; + + // 2 groups with 8 1-bit values in each byte (reversed from the order in other groups) + READ(); + byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32); + NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1); + READ(); + byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32); + NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1); + + return data_var; + case 2: data_var = data + 4; // 4 groups with 4 2-bit values in each byte @@ -371,7 +613,7 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2); return data_var; - case 2: + case 4: data_var = data + 8; // 8 groups with 2 4-bit values in each byte @@ -385,11 +627,11 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned READ(), NEXT(4), NEXT(4); return data_var; - case 3: + case 8: memcpy(buffer, data, kByteGroupSize); return data + kByteGroupSize; default: - assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value + assert(!"Unexpected bit length"); // unreachable return data; } @@ -397,18 +639,16 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned #undef NEXT } -static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size) +static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size, const int* bits) { assert(buffer_size % kByteGroupSize == 0); - const unsigned char* header = data; - // round number of groups to 4 to get number of header bytes size_t header_size = (buffer_size / kByteGroupSize + 3) / 4; - if (size_t(data_end - data) < header_size) return NULL; + const unsigned char* header = data; data += header_size; for (size_t i = 0; i < buffer_size; i += kByteGroupSize) @@ -417,43 +657,108 @@ static const unsigned char* decodeBytes(const unsigned char* data, const unsigne return NULL; size_t header_offset = i / kByteGroupSize; + int bitsk = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3; - int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3; - - data = decodeBytesGroup(data, buffer + i, bitslog2); + data = decodeBytesGroup(data, buffer + i, bits[bitsk]); } return data; } -static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256]) +template +static void decodeDeltas1(const unsigned char* buffer, unsigned char* transposed, size_t vertex_count, size_t vertex_size, const unsigned char* last_vertex, int rot) +{ + for (size_t k = 0; k < 4; k += sizeof(T)) + { + size_t vertex_offset = k; + + T p = last_vertex[0]; + for (size_t j = 1; j < sizeof(T); ++j) + p |= last_vertex[j] << (8 * j); + + for (size_t i = 0; i < vertex_count; ++i) + { + T v = buffer[i]; + for (size_t j = 1; j < sizeof(T); ++j) + v |= buffer[i + vertex_count * j] << (8 * j); + + v = Xor ? 
T(rotate(v, rot)) ^ p : unzigzag(v) + p; + + for (size_t j = 0; j < sizeof(T); ++j) + transposed[vertex_offset + j] = (unsigned char)(v >> (j * 8)); + + p = v; + + vertex_offset += vertex_size; + } + + buffer += vertex_count * sizeof(T); + last_vertex += sizeof(T); + } +} + +static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version) { assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize); - unsigned char buffer[kVertexBlockMaxSize]; + unsigned char buffer[kVertexBlockMaxSize * 4]; unsigned char transposed[kVertexBlockSizeBytes]; size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1); assert(vertex_count <= vertex_count_aligned); - for (size_t k = 0; k < vertex_size; ++k) + size_t control_size = version == 0 ? 0 : vertex_size / 4; + if (size_t(data_end - data) < control_size) + return NULL; + + const unsigned char* control = data; + data += control_size; + + for (size_t k = 0; k < vertex_size; k += 4) { - data = decodeBytes(data, data_end, buffer, vertex_count_aligned); - if (!data) - return NULL; + unsigned char ctrl_byte = version == 0 ? 0 : control[k / 4]; - size_t vertex_offset = k; - - unsigned char p = last_vertex[k]; - - for (size_t i = 0; i < vertex_count; ++i) + for (size_t j = 0; j < 4; ++j) { - unsigned char v = unzigzag8(buffer[i]) + p; + int ctrl = (ctrl_byte >> (j * 2)) & 3; - transposed[vertex_offset] = v; - p = v; + if (ctrl == 3) + { + // literal encoding + if (size_t(data_end - data) < vertex_count) + return NULL; - vertex_offset += vertex_size; + memcpy(buffer + j * vertex_count, data, vertex_count); + data += vertex_count; + } + else if (ctrl == 2) + { + // zero encoding + memset(buffer + j * vertex_count, 0, vertex_count); + } + else + { + data = decodeBytes(data, data_end, buffer + j * vertex_count, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl); + if (!data) + return NULL; + } + } + + int channel = version == 0 ? 
0 : channels[k / 4]; + + switch (channel & 3) + { + case 0: + decodeDeltas1(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, 0); + break; + case 1: + decodeDeltas1(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, 0); + break; + case 2: + decodeDeltas1(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, (32 - (channel >> 4)) & 31); + break; + default: + return NULL; // invalid channel type } } @@ -499,7 +804,7 @@ static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables(); #ifdef SIMD_SSE SIMD_TARGET -static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1) +inline __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1) { __m128i sm0 = _mm_loadl_epi64(reinterpret_cast(&kDecodeBytesGroupShuffle[mask0])); __m128i sm1 = _mm_loadl_epi64(reinterpret_cast(&kDecodeBytesGroupShuffle[mask1])); @@ -511,11 +816,12 @@ static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1) } SIMD_TARGET -static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) +inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits) { - switch (bitslog2) + switch (hbits) { case 0: + case 4: { __m128i result = _mm_setzero_si128(); @@ -525,6 +831,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 1: + case 6: { #ifdef __GNUC__ typedef int __attribute__((aligned(1))) unaligned_int; @@ -557,7 +864,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi unsigned char mask1 = (unsigned char)(mask16 >> 8); __m128i shuf = decodeShuffleMask(mask0, mask1); - __m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel)); _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); @@ -570,6 +876,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 2: + case 7: { #ifdef SIMD_LATENCYOPT unsigned long long data64; @@ -593,7 +900,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi unsigned char mask1 = (unsigned char)(mask16 >> 8); __m128i shuf = decodeShuffleMask(mask0, mask1); - __m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel)); _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); @@ -606,6 +912,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 3: + case 8: { __m128i result = _mm_loadu_si128(reinterpret_cast(data)); @@ -614,26 +921,46 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi return data + 16; } + case 5: + { + __m128i rest = _mm_loadu_si128(reinterpret_cast(data + 2)); + + unsigned char mask0 = data[0]; + unsigned char mask1 = data[1]; + + __m128i shuf = decodeShuffleMask(mask0, mask1); + __m128i result = _mm_shuffle_epi8(rest, shuf); + + _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); + + return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; + } + default: - assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value - return data; + SIMD_UNREACHABLE(); // unreachable } } #endif #ifdef SIMD_AVX -static const __m128i decodeBytesGroupConfig[] = { - _mm_set1_epi8(3), - _mm_set1_epi8(15), - _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24), - _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56), +static const __m128i 
kDecodeBytesGroupConfig[8][2] = { + {_mm_setzero_si128(), _mm_setzero_si128()}, + {_mm_set1_epi8(3), _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24)}, + {_mm_set1_epi8(15), _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56)}, + {_mm_setzero_si128(), _mm_setzero_si128()}, + {_mm_setzero_si128(), _mm_setzero_si128()}, + {_mm_set1_epi8(1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)}, + {_mm_set1_epi8(3), _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24)}, + {_mm_set1_epi8(15), _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56)}, }; -static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) +SIMD_TARGET +inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits) { - switch (bitslog2) + switch (hbits) { case 0: + case 4: { __m128i result = _mm_setzero_si128(); @@ -642,16 +969,19 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi return data; } - case 1: - case 2: + case 5: // 1-bit + case 1: // 2-bit + case 6: + case 2: // 4-bit + case 7: { - const unsigned char* skip = data + (bitslog2 << 2); + const unsigned char* skip = data + (2 << (hbits < 3 ? hbits : hbits - 5)); __m128i selb = _mm_loadl_epi64(reinterpret_cast(data)); __m128i rest = _mm_loadu_si128(reinterpret_cast(skip)); - __m128i sent = decodeBytesGroupConfig[bitslog2 - 1]; - __m128i ctrl = decodeBytesGroupConfig[bitslog2 + 1]; + __m128i sent = kDecodeBytesGroupConfig[hbits][0]; + __m128i ctrl = kDecodeBytesGroupConfig[hbits][1]; __m128i selw = _mm_shuffle_epi32(selb, 0x44); __m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw)); @@ -665,6 +995,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 3: + case 8: { __m128i result = _mm_loadu_si128(reinterpret_cast(data)); @@ -674,14 +1005,14 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } default: - assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value - return data; + SIMD_UNREACHABLE(); // unreachable } } #endif #ifdef SIMD_NEON -static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1) +SIMD_TARGET +inline uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1) { uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]); uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]); @@ -692,7 +1023,8 @@ static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8 return vcombine_u8(r0, r1); } -static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1) +SIMD_TARGET +inline void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1) { // magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00 const uint64_t magic = 0x000103070f1f3f80ull; @@ -703,11 +1035,13 @@ static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& m mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56); } -static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) +SIMD_TARGET +inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits) { - switch (bitslog2) + switch (hbits) { case 0: + case 4: { uint8x16_t result = vdupq_n_u8(0); @@ -717,6 
+1051,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 1: + case 6: { #ifdef SIMD_LATENCYOPT unsigned int data32; @@ -754,6 +1089,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 2: + case 7: { #ifdef SIMD_LATENCYOPT unsigned long long data64; @@ -788,6 +1124,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 3: + case 8: { uint8x16_t result = vld1q_u8(data); @@ -796,30 +1133,42 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi return data + 16; } + case 5: + { + unsigned char mask0 = data[0]; + unsigned char mask1 = data[1]; + + uint8x8_t rest0 = vld1_u8(data + 2); + uint8x8_t rest1 = vld1_u8(data + 2 + kDecodeBytesGroupCount[mask0]); + + uint8x16_t result = shuffleBytes(mask0, mask1, rest0, rest1); + + vst1q_u8(buffer, result); + + return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; + } + default: - assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value - return data; + SIMD_UNREACHABLE(); // unreachable } } #endif #ifdef SIMD_WASM SIMD_TARGET -static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1) +inline v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1) { v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]); v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]); - v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]); - sm1off = wasm_i8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - + v128_t sm1off = wasm_v128_load8_splat(&kDecodeBytesGroupCount[mask0]); v128_t sm1r = wasm_i8x16_add(sm1, sm1off); return wasmx_unpacklo_v64x2(sm0, sm1r); } SIMD_TARGET -static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1) +inline void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1) { // magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00 const uint64_t magic = 0x000103070f1f3f80ull; @@ -829,11 +1178,12 @@ static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1 } SIMD_TARGET -static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) +inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits) { - switch (bitslog2) + switch (hbits) { case 0: + case 4: { v128_t result = wasm_i8x16_splat(0); @@ -843,6 +1193,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 1: + case 6: { v128_t sel2 = wasm_v128_load(data); v128_t rest = wasm_v128_load(data + 4); @@ -857,7 +1208,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi wasmMoveMask(mask, mask0, mask1); v128_t shuf = decodeShuffleMask(mask0, mask1); - v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask); wasm_v128_store(buffer, result); @@ -866,6 +1216,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 2: + case 7: { v128_t sel4 = wasm_v128_load(data); v128_t rest = wasm_v128_load(data + 8); @@ -879,7 +1230,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi wasmMoveMask(mask, mask0, mask1); v128_t shuf = decodeShuffleMask(mask0, mask1); - v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask); wasm_v128_store(buffer, result); @@ -888,6 +1238,7 @@ static const 
unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 3: + case 8: { v128_t result = wasm_v128_load(data); @@ -896,16 +1247,30 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi return data + 16; } + case 5: + { + v128_t rest = wasm_v128_load(data + 2); + + unsigned char mask0 = data[0]; + unsigned char mask1 = data[1]; + + v128_t shuf = decodeShuffleMask(mask0, mask1); + v128_t result = wasm_i8x16_swizzle(rest, shuf); + + wasm_v128_store(buffer, result); + + return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; + } + default: - assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value - return data; + SIMD_UNREACHABLE(); // unreachable } } #endif #if defined(SIMD_SSE) || defined(SIMD_AVX) SIMD_TARGET -static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3) +inline void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3) { __m128i t0 = _mm_unpacklo_epi8(x0, x1); __m128i t1 = _mm_unpackhi_epi8(x0, x1); @@ -919,17 +1284,33 @@ static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3) } SIMD_TARGET -static __m128i unzigzag8(__m128i v) +inline __m128i unzigzag8(__m128i v) { __m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1))); __m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127)); return _mm_xor_si128(xl, xr); } + +SIMD_TARGET +inline __m128i unzigzag16(__m128i v) +{ + __m128i xl = _mm_sub_epi16(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi16(1))); + __m128i xr = _mm_srli_epi16(v, 1); + + return _mm_xor_si128(xl, xr); +} + +SIMD_TARGET +inline __m128i rotate32(__m128i v, int r) +{ + return _mm_or_si128(_mm_slli_epi32(v, r), _mm_srli_epi32(v, 32 - r)); +} #endif #ifdef SIMD_NEON -static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3) +SIMD_TARGET +inline void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3) { uint8x16x2_t t01 = vzipq_u8(x0, x1); uint8x16x2_t t23 = vzipq_u8(x2, x3); @@ -943,18 +1324,36 @@ static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_ x3 = vreinterpretq_u8_u16(x23.val[1]); } -static uint8x16_t unzigzag8(uint8x16_t v) +SIMD_TARGET +inline uint8x16_t unzigzag8(uint8x16_t v) { uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1))))); uint8x16_t xr = vshrq_n_u8(v, 1); return veorq_u8(xl, xr); } + +SIMD_TARGET +inline uint8x16_t unzigzag16(uint8x16_t v) +{ + uint16x8_t vv = vreinterpretq_u16_u8(v); + uint8x16_t xl = vreinterpretq_u8_s16(vnegq_s16(vreinterpretq_s16_u16(vandq_u16(vv, vdupq_n_u16(1))))); + uint8x16_t xr = vreinterpretq_u8_u16(vshrq_n_u16(vv, 1)); + + return veorq_u8(xl, xr); +} + +SIMD_TARGET +inline uint8x16_t rotate32(uint8x16_t v, int r) +{ + uint32x4_t v32 = vreinterpretq_u32_u8(v); + return vreinterpretq_u8_u32(vorrq_u32(vshlq_u32(v32, vdupq_n_s32(r)), vshlq_u32(v32, vdupq_n_s32(r - 32)))); +} #endif #ifdef SIMD_WASM SIMD_TARGET -static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3) +inline void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3) { v128_t t0 = wasmx_unpacklo_v8x16(x0, x1); v128_t t1 = wasmx_unpackhi_v8x16(x0, x1); @@ -968,44 +1367,57 @@ static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3) } SIMD_TARGET -static v128_t unzigzag8(v128_t v) +inline v128_t unzigzag8(v128_t v) { v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1))); v128_t xr = wasm_u8x16_shr(v, 1); return 
wasm_v128_xor(xl, xr); } + +SIMD_TARGET +inline v128_t unzigzag16(v128_t v) +{ + v128_t xl = wasm_i16x8_neg(wasm_v128_and(v, wasm_i16x8_splat(1))); + v128_t xr = wasm_u16x8_shr(v, 1); + + return wasm_v128_xor(xl, xr); +} + +SIMD_TARGET +inline v128_t rotate32(v128_t v, int r) +{ + return wasm_v128_or(wasm_i32x4_shl(v, r), wasm_i32x4_shr(v, 32 - r)); +} #endif #if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM) SIMD_TARGET -static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size) +static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size, int hshift) { assert(buffer_size % kByteGroupSize == 0); assert(kByteGroupSize == 16); - const unsigned char* header = data; - // round number of groups to 4 to get number of header bytes size_t header_size = (buffer_size / kByteGroupSize + 3) / 4; - if (size_t(data_end - data) < header_size) return NULL; + const unsigned char* header = data; data += header_size; size_t i = 0; - // fast-path: process 4 groups at a time, do a shared bounds check - each group reads <=24b + // fast-path: process 4 groups at a time, do a shared bounds check for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4) { size_t header_offset = i / kByteGroupSize; unsigned char header_byte = header[header_offset / 4]; - data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, (header_byte >> 0) & 3); - data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, (header_byte >> 2) & 3); - data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, (header_byte >> 4) & 3); - data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, (header_byte >> 6) & 3); + data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, hshift + ((header_byte >> 0) & 3)); + data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, hshift + ((header_byte >> 2) & 3)); + data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, hshift + ((header_byte >> 4) & 3)); + data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, hshift + ((header_byte >> 6) & 3)); } // slow-path: process remaining groups @@ -1015,17 +1427,94 @@ static const unsigned char* decodeBytesSimd(const unsigned char* data, const uns return NULL; size_t header_offset = i / kByteGroupSize; + unsigned char header_byte = header[header_offset / 4]; - int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3; - - data = decodeBytesGroupSimd(data, buffer + i, bitslog2); + data = decodeBytesGroupSimd(data, buffer + i, hshift + ((header_byte >> ((header_offset % 4) * 2)) & 3)); } return data; } +template +SIMD_TARGET static void +decodeDeltas4Simd(const unsigned char* buffer, unsigned char* transposed, size_t vertex_count_aligned, size_t vertex_size, unsigned char last_vertex[4], int rot) +{ +#if defined(SIMD_SSE) || defined(SIMD_AVX) +#define TEMP __m128i +#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast(last_vertex)) +#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) +#define GRP4(i) t0 = _mm_shuffle_epi32(r##i, 0), t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) +#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? 
_mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) +#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size +#endif + +#ifdef SIMD_NEON +#define TEMP uint8x8_t +#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast(last_vertex), vdup_n_u32(0), 0)) +#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned) +#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1)) +#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i)) +#define SAVE(i) vst1_lane_u32(reinterpret_cast(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size +#endif + +#ifdef SIMD_WASM +#define TEMP v128_t +#define PREP() v128_t pi = wasm_v128_load(last_vertex) +#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned) +#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3) +#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i)) +#define SAVE(i) *reinterpret_cast(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size +#endif + +#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) + + PREP(); + + unsigned char* savep = transposed; + + for (size_t j = 0; j < vertex_count_aligned; j += 16) + { + LOAD(0); + LOAD(1); + LOAD(2); + LOAD(3); + + transpose8(r0, r1, r2, r3); + + TEMP t0, t1, t2, t3; + + UNZR(0); + GRP4(0); + FIXD(0), FIXD(1), FIXD(2), FIXD(3); + SAVE(0), SAVE(1), SAVE(2), SAVE(3); + + UNZR(1); + GRP4(1); + FIXD(0), FIXD(1), FIXD(2), FIXD(3); + SAVE(0), SAVE(1), SAVE(2), SAVE(3); + + UNZR(2); + GRP4(2); + FIXD(0), FIXD(1), FIXD(2), FIXD(3); + SAVE(0), SAVE(1), SAVE(2), SAVE(3); + + UNZR(3); + GRP4(3); + FIXD(0), FIXD(1), FIXD(2), FIXD(3); + SAVE(0), SAVE(1), SAVE(2), SAVE(3); + +#undef UNZR +#undef TEMP +#undef PREP +#undef LOAD +#undef GRP4 +#undef FIXD +#undef SAVE + } +} + SIMD_TARGET -static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256]) +static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version) { assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize); @@ -1034,84 +1523,61 @@ static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, con size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1); + size_t control_size = version == 0 ? 0 : vertex_size / 4; + if (size_t(data_end - data) < control_size) + return NULL; + + const unsigned char* control = data; + data += control_size; + for (size_t k = 0; k < vertex_size; k += 4) { + unsigned char ctrl_byte = version == 0 ? 
0 : control[k / 4]; + for (size_t j = 0; j < 4; ++j) { - data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned); - if (!data) - return NULL; + int ctrl = (ctrl_byte >> (j * 2)) & 3; + + if (ctrl == 3) + { + // literal encoding; safe to over-copy due to tail + if (size_t(data_end - data) < vertex_count_aligned) + return NULL; + + memcpy(buffer + j * vertex_count_aligned, data, vertex_count_aligned); + data += vertex_count; + } + else if (ctrl == 2) + { + // zero encoding + memset(buffer + j * vertex_count_aligned, 0, vertex_count_aligned); + } + else + { + // for v0, headers are mapped to 0..3; for v1, headers are mapped to 4..8 + int hshift = version == 0 ? 0 : 4 + ctrl; + + data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned, hshift); + if (!data) + return NULL; + } } -#if defined(SIMD_SSE) || defined(SIMD_AVX) -#define TEMP __m128i -#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast(last_vertex + k)) -#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) -#define GRP4(i) t0 = _mm_shuffle_epi32(r##i, 0), t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) -#define FIXD(i) t##i = pi = _mm_add_epi8(pi, t##i) -#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size -#endif + int channel = version == 0 ? 0 : channels[k / 4]; -#ifdef SIMD_NEON -#define TEMP uint8x8_t -#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast(last_vertex + k), vdup_n_u32(0), 0)) -#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned) -#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1)) -#define FIXD(i) t##i = pi = vadd_u8(pi, t##i) -#define SAVE(i) vst1_lane_u32(reinterpret_cast(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size -#endif - -#ifdef SIMD_WASM -#define TEMP v128_t -#define PREP() v128_t pi = wasm_v128_load(last_vertex + k) -#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned) -#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3) -#define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i) -#define SAVE(i) *reinterpret_cast(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size -#endif - - PREP(); - - unsigned char* savep = transposed + k; - - for (size_t j = 0; j < vertex_count_aligned; j += 16) + switch (channel & 3) { - LOAD(0); - LOAD(1); - LOAD(2); - LOAD(3); - - r0 = unzigzag8(r0); - r1 = unzigzag8(r1); - r2 = unzigzag8(r2); - r3 = unzigzag8(r3); - - transpose8(r0, r1, r2, r3); - - TEMP t0, t1, t2, t3; - - GRP4(0); - FIXD(0), FIXD(1), FIXD(2), FIXD(3); - SAVE(0), SAVE(1), SAVE(2), SAVE(3); - - GRP4(1); - FIXD(0), FIXD(1), FIXD(2), FIXD(3); - SAVE(0), SAVE(1), SAVE(2), SAVE(3); - - GRP4(2); - FIXD(0), FIXD(1), FIXD(2), FIXD(3); - SAVE(0), SAVE(1), SAVE(2), SAVE(3); - - GRP4(3); - FIXD(0), FIXD(1), FIXD(2), FIXD(3); - SAVE(0), SAVE(1), SAVE(2), SAVE(3); - -#undef TEMP -#undef PREP -#undef LOAD -#undef GRP4 -#undef FIXD -#undef SAVE + case 0: + decodeDeltas4Simd<0>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, 0); + break; + case 1: + decodeDeltas4Simd<1>(buffer, transposed + k, vertex_count_aligned, vertex_size, 
@@ -1140,12 +1606,13 @@ static unsigned int cpuid = getCpuFeatures();
 
 } // namespace meshopt
 
-size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
+size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level)
 {
 	using namespace meshopt;
 
 	assert(vertex_size > 0 && vertex_size <= 256);
 	assert(vertex_size % 4 == 0);
+	assert(level >= 0 && level <= 9); // only a subset of this range is used right now
 
 #if TRACE
 	memset(vertexstats, 0, sizeof(vertexstats));
@@ -1156,7 +1623,7 @@ size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, con
 	unsigned char* data = buffer;
 	unsigned char* data_end = buffer + buffer_size;
 
-	if (size_t(data_end - data) < 1 + vertex_size)
+	if (size_t(data_end - data) < 1)
 		return 0;
 
 	int version = gEncodeVertexVersion;
@@ -1172,34 +1639,52 @@ size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, con
 
 	size_t vertex_block_size = getVertexBlockSize(vertex_size);
 
+	unsigned char channels[64] = {};
+
+	if (version != 0 && level > 1 && vertex_count > 1)
+		for (size_t k = 0; k < vertex_size; k += 4)
+		{
+			int rot = level >= 3 ? estimateRotate(vertex_data, vertex_count, vertex_size, k, /* group_size= */ 16) : 0;
+			int channel = estimateChannel(vertex_data, vertex_count, vertex_size, k, vertex_block_size, /* block_skip= */ 3, /* max_channels= */ level >= 3 ? 3 : 2, rot);
+
+			assert(unsigned(channel) < 2 || ((channel & 3) == 2 && unsigned(channel >> 4) < 8));
+			channels[k / 4] = (unsigned char)channel;
+		}
+
 	size_t vertex_offset = 0;
 
 	while (vertex_offset < vertex_count)
 	{
 		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
 
-		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
+		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version, level);
 		if (!data)
 			return 0;
 
 		vertex_offset += block_size;
 	}
 
-	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
+	size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
+	size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1;
+	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
 
-	if (size_t(data_end - data) < tail_size)
+	if (size_t(data_end - data) < tail_size_pad)
 		return 0;
 
-	// write first vertex to the end of the stream and pad it to 32 bytes; this is important to simplify bounds checks in decoder
-	if (vertex_size < kTailMaxSize)
+	if (tail_size < tail_size_pad)
 	{
-		memset(data, 0, kTailMaxSize - vertex_size);
-		data += kTailMaxSize - vertex_size;
+		memset(data, 0, tail_size_pad - tail_size);
+		data += tail_size_pad - tail_size;
 	}
 
 	memcpy(data, first_vertex, vertex_size);
 	data += vertex_size;
 
+	if (version != 0)
+	{
+		memcpy(data, channels, vertex_size / 4);
+		data += vertex_size / 4;
+	}
+
 	assert(data >= buffer + tail_size);
 	assert(data <= buffer + buffer_size);
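Editor's note: the tail math above is worth restating, since both encoder and decoder must agree on it. For v1 the tail stores the first vertex plus one channel byte per 4-byte group, padded up to a minimum so the decoder can safely over-read; a standalone restatement (illustration only, using the kTailMinSizeV0/kTailMinSizeV1 constants this update introduces):

static size_t tailSizePadded(size_t vertex_size, int version)
{
	size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4); // first vertex (+ channel bytes for v1)
	size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1; // decoder over-read guarantee

	return tail_size < tail_size_min ? tail_size_min : tail_size;
}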
@@ -1212,17 +1697,41 @@
 		printf("%2d: %7d bytes [%4.1f%%] %.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8);
 
-		size_t total_k = vsk.header + vsk.bitg[0] + vsk.bitg[1] + vsk.bitg[2] + vsk.bitg[3];
+		size_t total_k = vsk.header + vsk.bitg[1] + vsk.bitg[2] + vsk.bitg[4] + vsk.bitg[8];
+		double total_kr = total_k ? 1.0 / double(total_k) : 0;
 
-		printf(" |\thdr [%5.1f%%] bitg 1-3 [%4.1f%% %4.1f%% %4.1f%%]",
-		    double(vsk.header) / double(total_k) * 100, double(vsk.bitg[1]) / double(total_k) * 100,
-		    double(vsk.bitg[2]) / double(total_k) * 100, double(vsk.bitg[3]) / double(total_k) * 100);
+		if (version != 0)
+		{
+			int channel = channels[k / 4];
 
-		printf(" |\tbitc [%3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%%]",
+			if ((channel & 3) == 2 && k % 4 == 0)
+				printf(" | ^%d", channel >> 4);
+			else
+				printf(" | %2s", channel == 0 ? "1" : (channel == 1 && k % 2 == 0 ? "2" : "."));
+		}
+
+		printf(" | hdr [%5.1f%%] bitg [1 %4.1f%% 2 %4.1f%% 4 %4.1f%% 8 %4.1f%%]",
+		    double(vsk.header) * total_kr * 100,
+		    double(vsk.bitg[1]) * total_kr * 100, double(vsk.bitg[2]) * total_kr * 100,
+		    double(vsk.bitg[4]) * total_kr * 100, double(vsk.bitg[8]) * total_kr * 100);
+
+		size_t total_ctrl = vsk.ctrl[0] + vsk.ctrl[1] + vsk.ctrl[2] + vsk.ctrl[3];
+
+		if (total_ctrl)
+		{
+			printf(" | ctrl %3.0f%% %3.0f%% %3.0f%% %3.0f%%",
+			    double(vsk.ctrl[0]) / double(total_ctrl) * 100, double(vsk.ctrl[1]) / double(total_ctrl) * 100,
+			    double(vsk.ctrl[2]) / double(total_ctrl) * 100, double(vsk.ctrl[3]) / double(total_ctrl) * 100);
+		}
+
+#if TRACE > 1
+		printf(" | bitc [%3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%%]",
 		    double(vsk.bitc[0]) / double(vertex_count) * 100, double(vsk.bitc[1]) / double(vertex_count) * 100,
 		    double(vsk.bitc[2]) / double(vertex_count) * 100, double(vsk.bitc[3]) / double(vertex_count) * 100,
 		    double(vsk.bitc[4]) / double(vertex_count) * 100, double(vsk.bitc[5]) / double(vertex_count) * 100,
 		    double(vsk.bitc[6]) / double(vertex_count) * 100, double(vsk.bitc[7]) / double(vertex_count) * 100);
+#endif
+
 		printf("\n");
 	}
 #endif
@@ -1230,6 +1739,11 @@ size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, con
 	return data - buffer;
 }
 
+size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
+{
+	return meshopt_encodeVertexBufferLevel(buffer, buffer_size, vertices, vertex_count, vertex_size, meshopt::kEncodeDefaultLevel);
+}
+
 size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
 {
 	using namespace meshopt;
@@ -1240,17 +1754,22 @@ size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
 	size_t vertex_block_size = getVertexBlockSize(vertex_size);
 	size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size;
 
+	size_t vertex_block_control_size = vertex_size / 4;
 	size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4;
 	size_t vertex_block_data_size = vertex_block_size;
 
-	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
+	size_t tail_size = vertex_size + (vertex_size / 4);
+	size_t tail_size_min = kTailMinSizeV0 > kTailMinSizeV1 ? kTailMinSizeV0 : kTailMinSizeV1;
+	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
+	assert(tail_size_pad >= kByteGroupDecodeLimit);
 
-	return 1 + vertex_block_count * vertex_size * (vertex_block_header_size + vertex_block_data_size) + tail_size;
+	return 1 + vertex_block_count * vertex_size * (vertex_block_control_size + vertex_block_header_size + vertex_block_data_size) + tail_size_pad;
 }
 
 void meshopt_encodeVertexVersion(int version)
 {
-	assert(unsigned(version) <= 0);
+	// note: this version is experimental and the binary format is not finalized; this should not be used in production!
+	assert(unsigned(version) <= 0 || version == 0xe);
 
 	meshopt::gEncodeVertexVersion = version;
 }
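Editor's note: callers can use the new level-based entry point directly. A usage sketch (illustrative, not part of the patch; meshopt_encodeVertexBufferLevel and meshopt_encodeVertexBufferBound are the public APIs touched here, and level 2 is an assumption matching the estimation gate above, where level > 1 enables channel selection and level >= 3 additionally tries bit rotations):

#include <vector>
#include "meshoptimizer.h"

// vertices holds vertex_count elements of vertex_size bytes each (vertex_size % 4 == 0)
static std::vector<unsigned char> encodeVertices(const void* vertices, size_t vertex_count, size_t vertex_size)
{
	std::vector<unsigned char> encoded(meshopt_encodeVertexBufferBound(vertex_count, vertex_size));
	size_t size = meshopt_encodeVertexBufferLevel(encoded.data(), encoded.size(), vertices, vertex_count, vertex_size, 2);

	encoded.resize(size); // size == 0 would mean the output buffer was too small
	return encoded;
}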
@@ -1262,7 +1781,7 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 	assert(vertex_size > 0 && vertex_size <= 256);
 	assert(vertex_size % 4 == 0);
 
-	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = NULL;
+	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256], const unsigned char*, int) = NULL;
 
 #if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
 	decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
@@ -1282,7 +1801,7 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 	const unsigned char* data = buffer;
 	const unsigned char* data_end = buffer + buffer_size;
 
-	if (size_t(data_end - data) < 1 + vertex_size)
+	if (size_t(data_end - data) < 1)
 		return -2;
 
 	unsigned char data_header = *data++;
@@ -1291,11 +1810,22 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 		return -1;
 
 	int version = data_header & 0x0f;
-	if (version > 0)
+	if (version > 0 && version != 0xe)
 		return -1;
 
+	size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
+	size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1;
+	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
+
+	if (size_t(data_end - data) < tail_size_pad)
+		return -2;
+
+	const unsigned char* tail = data_end - tail_size;
+
 	unsigned char last_vertex[256];
-	memcpy(last_vertex, data_end - vertex_size, vertex_size);
+	memcpy(last_vertex, tail, vertex_size);
+
+	const unsigned char* channels = version == 0 ? NULL : tail + vertex_size;
 
 	size_t vertex_block_size = getVertexBlockSize(vertex_size);
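Editor's note: the decode-side tail bookkeeping above can be summarized in one helper (illustrative sketch, not part of the patch; it assumes the stream already passed the tail_size_pad bounds check):

#include <stddef.h>

// Locate the channel table at the end of a stream: the tail holds the first
// vertex followed, for v1 only, by one channel byte per 4-byte group.
static const unsigned char* findChannels(const unsigned char* data_end, size_t vertex_size, int version)
{
	size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
	const unsigned char* tail = data_end - tail_size;

	return version == 0 ? NULL : tail + vertex_size;
}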
@@ -1305,16 +1835,14 @@
 	{
 		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
 
-		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
+		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version);
 		if (!data)
 			return -2;
 
 		vertex_offset += block_size;
 	}
 
-	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
-
-	if (size_t(data_end - data) != tail_size)
+	if (size_t(data_end - data) != tail_size_pad)
 		return -3;
 
 	return 0;
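Editor's note: taken together, the encoder and decoder changes stay wire-compatible for v0 streams while gating the new channel/control machinery behind the experimental version 0xe. A round-trip sketch (illustrative, not part of the patch; error codes follow the decoder above: -1 for a malformed header, -2 for out-of-bounds reads, -3 for trailing bytes):

#include <string.h>
#include <vector>
#include "meshoptimizer.h"

static bool roundTrip(const void* vertices, size_t vertex_count, size_t vertex_size)
{
	std::vector<unsigned char> encoded(meshopt_encodeVertexBufferBound(vertex_count, vertex_size));
	encoded.resize(meshopt_encodeVertexBuffer(encoded.data(), encoded.size(), vertices, vertex_count, vertex_size));

	std::vector<unsigned char> decoded(vertex_count * vertex_size);
	int rc = meshopt_decodeVertexBuffer(decoded.data(), vertex_count, vertex_size, encoded.data(), encoded.size());

	return rc == 0 && memcmp(decoded.data(), vertices, decoded.size()) == 0;
}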