From db5dbddf99e8931bd1711666c77578c54058bbf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=91=D1=80=D0=B0=D0=BD=D0=B8=D0=BC=D0=B8=D1=80=20=D0=9A?= =?UTF-8?q?=D0=B0=D1=80=D0=B0=D1=9F=D0=B8=D1=9B?= Date: Mon, 4 Nov 2019 19:49:29 -0800 Subject: [PATCH] Updated meshoptimizer. --- 3rdparty/meshoptimizer/.travis.yml | 37 +- 3rdparty/meshoptimizer/README.md | 2 +- 3rdparty/meshoptimizer/demo/main.cpp | 8 +- 3rdparty/meshoptimizer/js/meshopt_decoder.js | 3 + 3rdparty/meshoptimizer/src/vertexcodec.cpp | 362 ++++++++++++++++--- 5 files changed, 336 insertions(+), 76 deletions(-) diff --git a/3rdparty/meshoptimizer/.travis.yml b/3rdparty/meshoptimizer/.travis.yml index d6e5c2ade..3c46e945a 100644 --- a/3rdparty/meshoptimizer/.travis.yml +++ b/3rdparty/meshoptimizer/.travis.yml @@ -1,24 +1,23 @@ language: cpp -matrix: - include: - - os: linux - compiler: gcc - - os: linux - compiler: clang - - os: linux - compiler: gcc - arch: arm64 - - os: osx - compiler: clang - - os: windows - compiler: cl - env: - - TARGET="Visual Studio 15 2017" - - os: windows - compiler: cl - env: - - TARGET="Visual Studio 15 2017 Win64" +jobs: + - os: linux + compiler: gcc + - os: linux + compiler: clang + - os: linux + compiler: gcc + arch: arm64 + - os: osx + compiler: clang + - os: windows + compiler: cl + env: + - TARGET="Visual Studio 15 2017" + - os: windows + compiler: cl + env: + - TARGET="Visual Studio 15 2017 Win64" script: - if [[ "$TRAVIS_COMPILER" == "gcc" ]]; then make -j2 config=coverage test; fi diff --git a/3rdparty/meshoptimizer/README.md b/3rdparty/meshoptimizer/README.md index 45ed73657..b499147fb 100644 --- a/3rdparty/meshoptimizer/README.md +++ b/3rdparty/meshoptimizer/README.md @@ -1,4 +1,4 @@ -# meshoptimizer [![Actions Status](https://github.com/zeux/meshoptimizer/workflows/build/badge.svg)](https://github.com/zeux/meshoptimizer/actions) [![Build Status](https://travis-ci.org/zeux/meshoptimizer.svg?branch=master)](https://travis-ci.org/zeux/meshoptimizer) [![codecov.io](https://codecov.io/github/zeux/meshoptimizer/coverage.svg?branch=master)](https://codecov.io/github/zeux/meshoptimizer?branch=master) ![MIT](https://img.shields.io/badge/license-MIT-blue.svg) [![GitHub](https://img.shields.io/badge/repo-github-green.svg)](https://github.com/zeux/meshoptimizer) +# 🐇 meshoptimizer [![Actions Status](https://github.com/zeux/meshoptimizer/workflows/build/badge.svg)](https://github.com/zeux/meshoptimizer/actions) [![Build Status](https://travis-ci.org/zeux/meshoptimizer.svg?branch=master)](https://travis-ci.org/zeux/meshoptimizer) [![codecov.io](https://codecov.io/github/zeux/meshoptimizer/coverage.svg?branch=master)](https://codecov.io/github/zeux/meshoptimizer?branch=master) ![MIT](https://img.shields.io/badge/license-MIT-blue.svg) [![GitHub](https://img.shields.io/badge/repo-github-green.svg)](https://github.com/zeux/meshoptimizer) ## Purpose diff --git a/3rdparty/meshoptimizer/demo/main.cpp b/3rdparty/meshoptimizer/demo/main.cpp index 9b3bd0c5c..009bc98ff 100644 --- a/3rdparty/meshoptimizer/demo/main.cpp +++ b/3rdparty/meshoptimizer/demo/main.cpp @@ -1016,7 +1016,13 @@ void processDev(const char* path) if (!loadMesh(mesh, path)) return; - simplifyPoints(mesh); + Mesh copy = mesh; + meshopt_optimizeVertexCache(©.indices[0], ©.indices[0], copy.indices.size(), copy.vertices.size()); + meshopt_optimizeVertexFetch(©.vertices[0], ©.indices[0], copy.indices.size(), ©.vertices[0], copy.vertices.size(), sizeof(Vertex)); + + encodeIndex(copy); + encodeVertex(copy, ""); + encodeVertex(copy, "O"); } int main(int argc, char** argv) diff --git a/3rdparty/meshoptimizer/js/meshopt_decoder.js b/3rdparty/meshoptimizer/js/meshopt_decoder.js index fc7bb5720..aab8b4318 100644 --- a/3rdparty/meshoptimizer/js/meshopt_decoder.js +++ b/3rdparty/meshoptimizer/js/meshopt_decoder.js @@ -20,6 +20,9 @@ var MeshoptDecoder = (function() { .then(bytes => WebAssembly.instantiate(bytes, { env })) .then(function(result) { instance = result.instance; + if (instance.exports.__wasm_call_ctors) { + instance.exports.__wasm_call_ctors(); + } env.emscripten_notify_memory_growth(0); }); diff --git a/3rdparty/meshoptimizer/src/vertexcodec.cpp b/3rdparty/meshoptimizer/src/vertexcodec.cpp index 2e0e4e7b5..44c0c8787 100644 --- a/3rdparty/meshoptimizer/src/vertexcodec.cpp +++ b/3rdparty/meshoptimizer/src/vertexcodec.cpp @@ -12,7 +12,12 @@ #define SIMD_SSE #endif -#if !defined(SIMD_SSE) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64)) +#if defined(__AVX512VBMI2__) && defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__POPCNT__) +#undef SIMD_SSE +#define SIMD_AVX +#endif + +#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64)) #define SIMD_SSE #define SIMD_FALLBACK #include // __cpuid @@ -22,10 +27,19 @@ #define SIMD_NEON #endif +// WebAssembly SIMD implementation requires a few bleeding edge intrinsics that are only available in Chrome Canary +#if defined(__wasm_simd128__) && defined(__wasm_unimplemented_simd128__) +#define SIMD_WASM +#endif + #ifdef SIMD_SSE #include #endif +#ifdef SIMD_AVX +#include +#endif + #ifdef SIMD_NEON #if defined(_MSC_VER) && defined(_M_ARM64) #include @@ -34,6 +48,10 @@ #endif #endif +#ifdef SIMD_WASM +#include +#endif + #ifndef TRACE #define TRACE 0 #endif @@ -42,6 +60,14 @@ #include #endif +#ifdef SIMD_WASM +#define wasm_v32x4_splat(v, i) wasm_v8x16_shuffle(v, v, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3) +#define wasm_unpacklo_v8x16(a, b) wasm_v8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23) +#define wasm_unpackhi_v8x16(a, b) wasm_v8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31) +#define wasm_unpacklo_v16x8(a, b) wasm_v8x16_shuffle(a, b, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23) +#define wasm_unpackhi_v16x8(a, b) wasm_v8x16_shuffle(a, b, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31) +#endif + namespace meshopt { @@ -267,7 +293,7 @@ static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data return data; } -#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON)) +#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX)) static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2) { #define READ() byte = *data++ @@ -384,7 +410,7 @@ static const unsigned char* decodeVertexBlock(const unsigned char* data, const u } #endif -#if defined(SIMD_SSE) || defined(SIMD_NEON) +#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM) static unsigned char kDecodeBytesGroupShuffle[256][8]; static unsigned char kDecodeBytesGroupCount[256]; @@ -424,27 +450,6 @@ static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1) return _mm_unpacklo_epi64(sm0, sm1r); } -static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3) -{ - __m128i t0 = _mm_unpacklo_epi8(x0, x1); - __m128i t1 = _mm_unpackhi_epi8(x0, x1); - __m128i t2 = _mm_unpacklo_epi8(x2, x3); - __m128i t3 = _mm_unpackhi_epi8(x2, x3); - - x0 = _mm_unpacklo_epi16(t0, t2); - x1 = _mm_unpackhi_epi16(t0, t2); - x2 = _mm_unpacklo_epi16(t1, t3); - x3 = _mm_unpackhi_epi16(t1, t3); -} - -static __m128i unzigzag8(__m128i v) -{ - __m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1))); - __m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127)); - - return _mm_xor_si128(xl, xr); -} - static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) { switch (bitslog2) @@ -511,9 +516,66 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi case 3: { - __m128i rest = _mm_loadu_si128(reinterpret_cast(data)); + __m128i result = _mm_loadu_si128(reinterpret_cast(data)); - __m128i result = rest; + _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); + + return data + 16; + } + + default: + assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value + return data; + } +} +#endif + +#ifdef SIMD_AVX +static const __m128i decodeBytesGroupConfig[] = { + _mm_set1_epi8(3), + _mm_set1_epi8(15), + _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24), + _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56), +}; + +static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) +{ + switch (bitslog2) + { + case 0: + { + __m128i result = _mm_setzero_si128(); + + _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); + + return data; + } + + case 1: + case 2: + { + const unsigned char* skip = data + (bitslog2 << 2); + + __m128i selb = _mm_loadl_epi64(reinterpret_cast(data)); + __m128i rest = _mm_loadu_si128(reinterpret_cast(skip)); + + __m128i sent = decodeBytesGroupConfig[bitslog2 - 1]; + __m128i ctrl = decodeBytesGroupConfig[bitslog2 + 1]; + + __m128i selw = _mm_shuffle_epi32(selb, 0x44); + __m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw)); + __mmask16 mask16 = _mm_cmp_epi8_mask(sel, sent, _MM_CMPINT_EQ); + + __m128i result = _mm_mask_expand_epi8(sel, mask16, rest); + + _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); + + return skip + _mm_popcnt_u32(mask16); + } + + case 3: + { + __m128i result = _mm_loadu_si128(reinterpret_cast(data)); _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); @@ -561,28 +623,6 @@ static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& m #endif } -static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3) -{ - uint8x16x2_t t01 = vzipq_u8(x0, x1); - uint8x16x2_t t23 = vzipq_u8(x2, x3); - - uint16x8x2_t x01 = vzipq_u16(vreinterpretq_u16_u8(t01.val[0]), vreinterpretq_u16_u8(t23.val[0])); - uint16x8x2_t x23 = vzipq_u16(vreinterpretq_u16_u8(t01.val[1]), vreinterpretq_u16_u8(t23.val[1])); - - x0 = vreinterpretq_u8_u16(x01.val[0]); - x1 = vreinterpretq_u8_u16(x01.val[1]); - x2 = vreinterpretq_u8_u16(x23.val[0]); - x3 = vreinterpretq_u8_u16(x23.val[1]); -} - -static uint8x16_t unzigzag8(uint8x16_t v) -{ - uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1))))); - uint8x16_t xr = vshrq_n_u8(v, 1); - - return veorq_u8(xl, xr); -} - static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) { switch (bitslog2) @@ -639,9 +679,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi case 3: { - uint8x16_t rest = vld1q_u8(data); - - uint8x16_t result = rest; + uint8x16_t result = vld1q_u8(data); vst1q_u8(buffer, result); @@ -655,7 +693,205 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } #endif -#if defined(SIMD_SSE) || defined(SIMD_NEON) +#ifdef SIMD_WASM +static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1) +{ + // TODO: 8b buffer overrun - should we use splat or extend buffers? + v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]); + v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]); + + // TODO: we should use v8x16_load_splat + v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]); + sm1off = wasm_v8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + + v128_t sm1r = wasm_i8x16_add(sm1, sm1off); + + return wasm_v8x16_shuffle(sm0, sm1r, 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23); +} + +static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1) +{ + uint64_t mbits = 0x8040201008040201ull; + + uint64_t m0_8 = wasm_i64x2_extract_lane(mask, 0) & mbits; + uint64_t m1_8 = wasm_i64x2_extract_lane(mask, 1) & mbits; + + uint32_t m0_4 = m0_8 | (m0_8 >> 32); + uint32_t m1_4 = m1_8 | (m1_8 >> 32); + + uint16_t m0_2 = m0_4 | (m0_4 >> 16); + uint16_t m1_2 = m1_4 | (m1_4 >> 16); + + mask0 = m0_2 | (m0_2 >> 8); + mask1 = m1_2 | (m1_2 >> 8); +} + +static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) +{ + unsigned char byte, enc, encv; + const unsigned char* data_var; + + switch (bitslog2) + { + case 0: + { + v128_t result = wasm_i8x16_splat(0); + + wasm_v128_store(buffer, result); + + return data; + } + + case 1: + { + // TODO: test 4b load splat + v128_t sel2 = wasm_v128_load(data); + v128_t rest = wasm_v128_load(data + 4); + + v128_t sel22 = wasm_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2); + v128_t sel2222 = wasm_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22); + v128_t sel = wasm_v128_and(sel2222, wasm_i8x16_splat(3)); + + v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(3)); + + if (!wasm_i8x16_any_true(mask)) + { + wasm_v128_store(buffer, sel); + + return data + 4; + } + + unsigned char mask0, mask1; + wasmMoveMask(mask, mask0, mask1); + + v128_t shuf = decodeShuffleMask(mask0, mask1); + + // TODO: test or/andnot + v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask); + + wasm_v128_store(buffer, result); + + return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; + } + + case 2: + { + // TODO: test 8b load splat + v128_t sel4 = wasm_v128_load(data); + v128_t rest = wasm_v128_load(data + 8); + + v128_t sel44 = wasm_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4); + v128_t sel = wasm_v128_and(sel44, wasm_i8x16_splat(15)); + + v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(15)); + + if (!wasm_i8x16_any_true(mask)) + { + wasm_v128_store(buffer, sel); + + return data + 8; + } + + unsigned char mask0, mask1; + wasmMoveMask(mask, mask0, mask1); + + v128_t shuf = decodeShuffleMask(mask0, mask1); + + // TODO: test or/andnot + v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask); + + wasm_v128_store(buffer, result); + + return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; + } + + case 3: + { + v128_t result = wasm_v128_load(data); + + wasm_v128_store(buffer, result); + + return data + 16; + } + + default: + assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value + return data; + } +} +#endif + +#if defined(SIMD_SSE) || defined(SIMD_AVX) +static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3) +{ + __m128i t0 = _mm_unpacklo_epi8(x0, x1); + __m128i t1 = _mm_unpackhi_epi8(x0, x1); + __m128i t2 = _mm_unpacklo_epi8(x2, x3); + __m128i t3 = _mm_unpackhi_epi8(x2, x3); + + x0 = _mm_unpacklo_epi16(t0, t2); + x1 = _mm_unpackhi_epi16(t0, t2); + x2 = _mm_unpacklo_epi16(t1, t3); + x3 = _mm_unpackhi_epi16(t1, t3); +} + +static __m128i unzigzag8(__m128i v) +{ + __m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1))); + __m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127)); + + return _mm_xor_si128(xl, xr); +} +#endif + +#ifdef SIMD_NEON +static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3) +{ + uint8x16x2_t t01 = vzipq_u8(x0, x1); + uint8x16x2_t t23 = vzipq_u8(x2, x3); + + uint16x8x2_t x01 = vzipq_u16(vreinterpretq_u16_u8(t01.val[0]), vreinterpretq_u16_u8(t23.val[0])); + uint16x8x2_t x23 = vzipq_u16(vreinterpretq_u16_u8(t01.val[1]), vreinterpretq_u16_u8(t23.val[1])); + + x0 = vreinterpretq_u8_u16(x01.val[0]); + x1 = vreinterpretq_u8_u16(x01.val[1]); + x2 = vreinterpretq_u8_u16(x23.val[0]); + x3 = vreinterpretq_u8_u16(x23.val[1]); +} + +static uint8x16_t unzigzag8(uint8x16_t v) +{ + uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1))))); + uint8x16_t xr = vshrq_n_u8(v, 1); + + return veorq_u8(xl, xr); +} +#endif + +#ifdef SIMD_WASM +static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3) +{ + v128_t t0 = wasm_unpacklo_v8x16(x0, x1); + v128_t t1 = wasm_unpackhi_v8x16(x0, x1); + v128_t t2 = wasm_unpacklo_v8x16(x2, x3); + v128_t t3 = wasm_unpackhi_v8x16(x2, x3); + + x0 = wasm_unpacklo_v16x8(t0, t2); + x1 = wasm_unpackhi_v16x8(t0, t2); + x2 = wasm_unpacklo_v16x8(t1, t3); + x3 = wasm_unpackhi_v16x8(t1, t3); +} + +static v128_t unzigzag8(v128_t v) +{ + v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1))); + v128_t xr = wasm_u8x16_shr(v, 1); + + return wasm_v128_xor(xl, xr); +} +#endif + +#if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM) static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size) { assert(buffer_size % kByteGroupSize == 0); @@ -719,7 +955,7 @@ static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, con return 0; } -#ifdef SIMD_SSE +#if defined(SIMD_SSE) || defined(SIMD_AVX) #define TEMP __m128i #define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast(last_vertex + k)) #define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) @@ -735,6 +971,15 @@ static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, con #define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1)) #define FIXD(i) t##i = pi = vadd_u8(pi, t##i) #define SAVE(i) vst1_lane_u32(reinterpret_cast(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size +#endif + +#ifdef SIMD_WASM +#define TEMP v128_t +#define PREP() v128_t pi = wasm_v128_load(last_vertex + k) // TODO: use wasm_v32x4_load_splat to avoid buffer overrun +#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned) +#define GRP4(i) t0 = wasm_v32x4_splat(r##i, 0), t1 = wasm_v32x4_splat(r##i, 1), t2 = wasm_v32x4_splat(r##i, 2), t3 = wasm_v32x4_splat(r##i, 3) +#define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i) +#define SAVE(i) *reinterpret_cast(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size #endif PREP(); @@ -906,14 +1151,21 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve int cpuinfo[4] = {}; __cpuid(cpuinfo, 1); decode = (cpuinfo[2] & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock; -#elif defined(SIMD_SSE) || defined(SIMD_NEON) +#elif defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM) decode = decodeVertexBlockSimd; #else decode = decodeVertexBlock; #endif -#if defined(SIMD_SSE) || defined(SIMD_NEON) +#if defined(SIMD_WASM) + // TODO: workaround for https://github.com/emscripten-core/emscripten/issues/9767 + if (!gDecodeBytesGroupInitialized) + gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables(); +#endif + +#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM) assert(gDecodeBytesGroupInitialized); + (void)gDecodeBytesGroupInitialized; #endif unsigned char* vertex_data = static_cast(destination);