diff --git a/3rdparty/meshoptimizer/Makefile b/3rdparty/meshoptimizer/Makefile index 3e43811b0..82843a962 100644 --- a/3rdparty/meshoptimizer/Makefile +++ b/3rdparty/meshoptimizer/Makefile @@ -24,6 +24,10 @@ CFLAGS=-g -Wall -Wextra -Werror -std=c89 CXXFLAGS=-g -Wall -Wextra -Wshadow -Wno-missing-field-initializers -Werror -std=c++98 LDFLAGS= +WASM_SOURCES=src/vertexcodec.cpp src/indexcodec.cpp +WASM_EXPORTS=["_meshopt_decodeVertexBuffer","_meshopt_decodeIndexBuffer","_sbrk","__start"] +WASM_FLAGS=-O3 -DNDEBUG -s EXPORTED_FUNCTIONS='$(WASM_EXPORTS)' -s ALLOW_MEMORY_GROWTH=1 -s TOTAL_STACK=24576 -s TOTAL_MEMORY=65536 + ifeq ($(config),iphone) IPHONESDK=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk CFLAGS+=-arch armv7 -arch arm64 -isysroot $(IPHONESDK) @@ -70,10 +74,17 @@ format: gltfpack: $(GLTFPACK_OBJECTS) $(LIBRARY) $(CXX) $^ $(LDFLAGS) -o $@ -js/meshopt_decoder.js: src/vertexcodec.cpp src/indexcodec.cpp +build/decoder_base.wasm: $(WASM_SOURCES) @mkdir -p build - emcc $(filter %.cpp,$^) -O3 -DNDEBUG -s EXPORTED_FUNCTIONS='["_meshopt_decodeVertexBuffer", "_meshopt_decodeIndexBuffer", "_sbrk"]' -s ALLOW_MEMORY_GROWTH=1 -s TOTAL_STACK=24576 -s TOTAL_MEMORY=65536 -o build/meshopt_decoder.wasm - sed -i "s#\(var wasm = \)\".*\";#\\1\"$$(cat build/meshopt_decoder.wasm | base64 -w 0)\";#" $@ + emcc $^ $(WASM_FLAGS) -o $@ + +build/decoder_simd.wasm: $(WASM_SOURCES) + @mkdir -p build + emcc $^ $(WASM_FLAGS) -o $@ -munimplemented-simd128 -mbulk-memory + +js/meshopt_decoder.js: build/decoder_base.wasm build/decoder_simd.wasm + sed -i "s#\(var wasm_base = \)\".*\";#\\1\"$$(cat build/decoder_base.wasm | hexdump -v -e '1/1 "%02X"')\";#" $@ + sed -i "s#\(var wasm_simd = \)\".*\";#\\1\"$$(cat build/decoder_simd.wasm | hexdump -v -e '1/1 "%02X"')\";#" $@ $(EXECUTABLE): $(DEMO_OBJECTS) $(LIBRARY) $(CXX) $^ $(LDFLAGS) -o $@ diff --git a/3rdparty/meshoptimizer/js/meshopt_decoder.js b/3rdparty/meshoptimizer/js/meshopt_decoder.js 
index aab8b4318..8070dec8d 100644 --- a/3rdparty/meshoptimizer/js/meshopt_decoder.js +++ b/3rdparty/meshoptimizer/js/meshopt_decoder.js @@ -2,7 +2,16 @@ // Copyright (C) 2016-2019, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) var MeshoptDecoder = (function() { "use strict"; - var wasm = "AGFzbQEAAAABIwZgAX8AYAAAYAV/f39/fwF/YAN/f38Bf2ABfwF/YAN/f38AAicBA2Vudh9lbXNjcmlwdGVuX25vdGlmeV9tZW1vcnlfZ3Jvd3RoAAADCQgDAQQEBQACAgUDAQABBggBfwFBgMwBCwdeBQZtZW1vcnkCABFfX3dhc21fY2FsbF9jdG9ycwACGm1lc2hvcHRfZGVjb2RlVmVydGV4QnVmZmVyAAgZbWVzaG9wdF9kZWNvZGVJbmRleEJ1ZmZlcgAHBHNicmsAAwrCJAiCBAEDfyACQYDAAE8EQCAAIAEgAhAFIAAPCyAAIAJqIQMCQCAAIAFzQQNxRQRAAkAgAkEBSARAIAAhAgwBCyAAQQNxRQRAIAAhAgwBCyAAIQIDQCACIAEtAAA6AAAgAUEBaiEBIAJBAWoiAiADTw0BIAJBA3ENAAsLAkAgA0F8cSIEQcAASQ0AIAIgBEFAaiIFSw0AA0AgAiABKAIANgIAIAIgASgCBDYCBCACIAEoAgg2AgggAiABKAIMNgIMIAIgASgCEDYCECACIAEoAhQ2AhQgAiABKAIYNgIYIAIgASgCHDYCHCACIAEoAiA2AiAgAiABKAIkNgIkIAIgASgCKDYCKCACIAEoAiw2AiwgAiABKAIwNgIwIAIgASgCNDYCNCACIAEoAjg2AjggAiABKAI8NgI8IAFBQGshASACQUBrIgIgBU0NAAsLIAIgBE8NAQNAIAIgASgCADYCACABQQRqIQEgAkEEaiICIARJDQALDAELIANBBEkEQCAAIQIMAQsgA0F8aiIEIABJBEAgACECDAELIAAhAgNAIAIgAS0AADoAACACIAEtAAE6AAEgAiABLQACOgACIAIgAS0AAzoAAyABQQRqIQEgAkEEaiICIARNDQALCyACIANJBEADQCACIAEtAAA6AAAgAUEBaiEBIAJBAWoiAiADRw0ACwsgAAsDAAELOwECfz8AIQECQEGADCgCACICIABqIgAgAUEQdE0NACAAEAQNAEGACEEwNgIAQX8PC0GADCAANgIAIAILIwAgAD8AQRB0a0H//wNqQRB2QABBf0YEQEEADwtBABAAQQELOwEBfyACBEADQCAAIAEgAkGAICACQYAgSRsiAxABIQAgAUGAIGohASAAQYAgaiEAIAIgA2siAg0ACwsLxgIBAn8gAEGAAWoiAUF/akH/AToAACAAQf8BOgAAIAFBfmpB/wE6AAAgAEH/AToAASABQX1qQf8BOgAAIABB/wE6AAIgAUF8akH/AToAACAAQf8BOgADIABBACAAa0EDcSIBaiIAQX82AgAgAEGAASABa0F8cSICaiIBQXxqQX82AgACQCACQQlJDQAgAEF/NgIIIABBfzYCBCABQXhqQX82AgAgAUF0akF/NgIAIAJBGUkNACAAQX82AhggAEF/NgIUIABBfzYCECAAQX82AgwgAUFwakF/NgIAIAFBbGpBfzYCACABQWhqQX82AgAgAUFkakF/NgIAIAIgAEEEcUEYciICayIBQSBJDQAgACACaiEAA0AgAEJ/NwMYIABCfzcDECAAQn83AwggAEJ/NwMAIABBIGohACABQWBqIgFBH0sNAAsLC4YQAQ5/IwBBwAFrIgkkAAJ/QX4gAUEDbiIGQRFqIARLDQAaQX8gAy0AAEHgAUcNABogCUFAaxAGIAlCfzcDO
CAJQn83AzAgCUJ/NwMoIAlCfzcDICAJQn83AxggCUJ/NwMQIAlCfzcDCCAJQn83AwAgAyAEakFwaiERIANBAWoiEiAGaiELIAEEQCACQQJHIQ9BACEDQQAhAkEAIQQDQEF+IAsgEUsNAhoCfyASLQAAIgpB7wFNBEAgCUFAayAKQQR2QX9zIAxqQQ9xQQN0aiIGKAIEIQUgBigCACENIApBD3EiBkEPRwRAIAkgCkF/cyAEakEPcUECdGooAgAgAyAGGyEIIAZFIQoCQCAPRQRAIAAgAkEBdGoiBiANOwEAIAYgBTsBAiAGIAg7AQQMAQsgACACQQJ0aiIGIA02AgAgBiAINgIIIAYgBTYCBAsgAyAKaiEDIAlBQGsgDEEDdGoiBiAFNgIEIAYgCDYCACAJIARBAnRqIAg2AgAgCUFAayAMQQFqQQ9xIgVBA3RqIgYgDTYCACAGIAg2AgQgBCAKaiEEIAVBAWoMAgsgCywAACIGQf8BcSEHAn8gC0EBaiAGQX9KDQAaIAdB/wBxIAssAAEiBkH/AHFBB3RyIQcgC0ECaiAGQX9KDQAaIAssAAIiBkH/AHFBDnQgB3IhByALQQNqIAZBf0oNABogCywAAyIGQf8AcUEVdCAHciEHIAtBBGogBkF/Sg0AGiALLQAEQRx0IAdyIQcgC0EFagshC0EAIAdBAXFrIAdBAXZzIA5qIQ4CQCAPRQRAIAAgAkEBdGoiBiANOwEAIAYgBTsBAiAGIA47AQQMAQsgACACQQJ0aiIGIA02AgAgBiAONgIIIAYgBTYCBAsgCUFAayAMQQN0aiIGIAU2AgQgBiAONgIAIAkgBEECdGogDjYCACAJQUBrIAxBAWpBD3EiBUEDdGoiBiANNgIAIAYgDjYCBCAEQQFqIQQgBUEBagwBCyAKQf0BTQRAIAkgBCARIApBD3FqLQAAIghBBHYiBWtBD3FBAnRqKAIAIANBAWoiBiAFGyENIAkgBCAIa0EPcUECdGooAgAgBiAFRSIFaiIKIAhBD3EiBhshByAGRSEIAkAgD0UEQCAAIAJBAXRqIgYgAzsBACAGIA07AQIgBiAHOwEEDAELIAAgAkECdGoiBiADNgIAIAYgBzYCCCAGIA02AgQLIAkgBEECdGogAzYCACAJQUBrIAxBA3RqIgYgAzYCBCAGIA02AgAgCSAEQQFqIgZBD3FBAnRqIA02AgAgCUFAayAMQQFqQQ9xQQN0aiIEIAc2AgAgBCANNgIEIAkgBSAGakEPcSIFQQJ0aiAHNgIAIAlBQGsgDEECakEPcSIGQQN0aiIEIAM2AgAgBCAHNgIEIAUgCGohBCAIIApqIQMgBkEBagwBCyADIApB/gFGIgVqIQcgCy0AACIIQQ9xIRACQCAIQQR2Ig1FBEAgB0EBaiEKDAELIAchCiAJIAQgDWtBD3FBAnRqKAIAIQcLAkAgEEUEQCAKQQFqIQYMAQsgCiEGIAkgBCAIa0EPcUECdGooAgAhCgsCQCAFBEAgC0EBaiEIDAELIAssAAEiBUH/AXEhAwJ/IAtBAmogBUF/Sg0AGiADQf8AcSALLAACIgVB/wBxQQd0ciEDIAtBA2ogBUF/Sg0AGiALLAADIgVB/wBxQQ50IANyIQMgC0EEaiAFQX9KDQAaIAssAAQiBUH/AHFBFXQgA3IhAyALQQVqIAVBf0oNABogCy0ABUEcdCADciEDIAtBBmoLIQhBACADQQFxayADQQF2cyAOaiIOIQMLAkAgDUEPRwRAIAghBQwBCyAILAAAIgVB/wFxIQcCfyAIQQFqIAVBf0oNABogB0H/AHEgCCwAASIFQf8AcUEHdHIhByAIQQJqIAVBf0oNABogCCwAAiIFQf8AcUEOdCAHciEHIAhBA2ogBUF/Sg0AGiAILAADIgVB/wBxQRV0IAdyIQcgCEEEaiAFQX9KDQAaIAgtAARBHHQgB3IhByAIQQVqCyEFQQAgB0EBcWsgB0EBdnMgDmoiD
iEHCwJAIBBBD0cEQCAFIQsMAQsgBSwAACIIQf8BcSEKAn8gBUEBaiAIQX9KDQAaIApB/wBxIAUsAAEiCEH/AHFBB3RyIQogBUECaiAIQX9KDQAaIAUsAAIiCEH/AHFBDnQgCnIhCiAFQQNqIAhBf0oNABogBSwAAyIIQf8AcUEVdCAKciEKIAVBBGogCEF/Sg0AGiAFLQAEQRx0IApyIQogBUEFagshC0EAIApBAXFrIApBAXZzIA5qIg4hCgsCQCAPRQRAIAAgAkEBdGoiBSADOwEAIAUgBzsBAiAFIAo7AQQMAQsgACACQQJ0aiIFIAM2AgAgBSAKNgIIIAUgBzYCBAsgCUFAayAMQQN0aiIFIAM2AgQgBSAHNgIAIAkgBEECdGogAzYCACAJQUBrIAxBAWpBD3FBA3RqIgUgCjYCACAFIAc2AgQgCSAEQQFqIgVBD3FBAnRqIAc2AgAgCUFAayAMQQJqQQ9xQQN0aiIEIAM2AgAgBCAKNgIEIAkgBSANRSANQQ9GcmoiA0EPcUECdGogCjYCACADIBBFIBBBD0ZyaiEEIAYhAyAMQQNqCyEMIBJBAWohEiAMQQ9xIQwgBEEPcSEEIAJBA2oiAiABSQ0ACwtBAEF9IAsgEUYbCyEMIAlBwAFqJAAgDAvLDAEPfyMAQYDEAGsiECQAAn9BfiACQQFqIARLDQAaQX8gAy0AAEGgAUcNABogECADIARqIg8gAmsgAhABIQtBgMAAIAJuQfD/AHEiBEGAAiAEQYACSRshESADQQFqIQkCQANAIAwgAU8NASARIAEgDGsgDCARaiABSRshDQJAAkAgAkUEQCAJIQQMAQsgDUEPaiIDQXBxIRIgA0EEdkEDakECdiETQQAhDiAJIQoDQCAPIAprIBNJBEBBACEJDAMLIAogE2ohBEEAIQlBACEDIBIEQANAIA8gBGtBIEkNBCALQYDCAGogA2ohCAJAAkACQAJAAkAgCiADQQZ2ai0AACADQQN2QQZxdkEDcUEBaw4DAQIDAAsgCEIANwMAIAhCADcDCAwDCyAIIAQtAAQgBC0AACIGQQZ2IgUgBUEDRiIFGzoAACAIIARBBGogBWoiBS0AACAGQQR2QQNxIgcgB0EDRiIHGzoAASAIIAUgB2oiBS0AACAGQQJ2QQNxIgcgB0EDRiIHGzoAAiAIIAUgB2oiBS0AACAGQQNxIgYgBkEDRiIGGzoAAyAIIAUgBmoiBS0AACAELQABIgZBBnYiByAHQQNGIgcbOgAEIAggBSAHaiIFLQAAIAZBBHZBA3EiByAHQQNGIgcbOgAFIAggBSAHaiIFLQAAIAZBAnZBA3EiByAHQQNGIgcbOgAGIAggBSAHaiIFLQAAIAZBA3EiBiAGQQNGIgYbOgAHIAggBSAGaiIFLQAAIAQtAAIiBkEGdiIHIAdBA0YiBxs6AAggCCAFIAdqIgUtAAAgBkEEdkEDcSIHIAdBA0YiBxs6AAkgCCAFIAdqIgUtAAAgBkECdkEDcSIHIAdBA0YiBxs6AAogCCAFIAdqIgUtAAAgBkEDcSIGIAZBA0YiBhs6AAsgCCAFIAZqIgYtAAAgBC0AAyIEQQZ2IgUgBUEDRiIFGzoADCAIIAUgBmoiBi0AACAEQQR2QQNxIgUgBUEDRiIFGzoADSAIIAUgBmoiBi0AACAEQQJ2QQNxIgUgBUEDRiIFGzoADiAIIAUgBmoiCC0AACAEQQNxIgQgBEEDRiIEGzoADyAEIAhqIQQMAgsgCCAELQAIIAQtAAAiBkEEdiIFIAVBD0YiBRs6AAAgCCAEQQhqIAVqIgUtAAAgBkEPcSIGIAZBD0YiBhs6AAEgCCAFIAZqIgYtAAAgBC0AASIFQQR2IgcgB0EPRiIHGzoAAiAIIAYgB2oiBi0AACAFQQ9xIgUgBUEPRiIFGzoAAyAIIAUgBmoiBi0AACAELQACIgVBBHYiByAHQQ9GIgcbOgAEIAggBiAHaiIGLQAAIAVBD
3EiBSAFQQ9GIgUbOgAFIAggBSAGaiIGLQAAIAQtAAMiBUEEdiIHIAdBD0YiBxs6AAYgCCAGIAdqIgYtAAAgBUEPcSIFIAVBD0YiBRs6AAcgCCAFIAZqIgYtAAAgBC0ABCIFQQR2IgcgB0EPRiIHGzoACCAIIAYgB2oiBi0AACAFQQ9xIgUgBUEPRiIFGzoACSAIIAUgBmoiBi0AACAELQAFIgVBBHYiByAHQQ9GIgcbOgAKIAggBiAHaiIGLQAAIAVBD3EiBSAFQQ9GIgUbOgALIAggBSAGaiIGLQAAIAQtAAYiBUEEdiIHIAdBD0YiBxs6AAwgCCAGIAdqIgYtAAAgBUEPcSIFIAVBD0YiBRs6AA0gCCAFIAZqIgYtAAAgBC0AByIEQQR2IgUgBUEPRiIFGzoADiAIIAUgBmoiCC0AACAEQQ9xIgQgBEEPRiIEGzoADyAEIAhqIQQMAQsgCCAEKQAANwAAIAggBCkACDcACCAEQRBqIQQLIANBEGoiAyASSQ0ACwsgBEUNAiANBEAgCyAOai0AACEKIA4hAwNAIAtBgAJqIANqIAogC0GAwgBqIAlqLQAAIgpBAXZBACAKQQFxa3NqIgo6AAAgAiADaiEDIAlBAWoiCSANRw0ACwsgBCEKIA5BAWoiDiACRw0ACwsgACACIAxsaiALQYACaiACIA1sEAEaIAsgC0GAAmogDUF/aiACbGogAhABGiAEIQkLIA1BACAJGyAMaiEMIAkNAAtBfgwBC0EAQX0gDyAJayACQSAgAkEgSxtGGwshCSAQQYDEAGokACAJCwsJAQBBgAwLAsBm"; + var wasm_basevar wasm_simdvar detector = new Uint8Array([0,97,115,109,1,0,0,0,1,4,1,96,0,0,3,3,2,0,0,5,3,1,0,1,12,1,0,10,22,2,12,0,65,0,65,0,65,0,252,10,0,0,11,7,0,65,0,253,4,26,11]); + + var wasm = wasm_base; + + if (WebAssembly.validate(detector)) { + wasm = wasm_simd; + console.log("Warning: meshopt_decoder is using experimental SIMD support"); + } var instance, heap; @@ -13,19 +22,21 @@ var MeshoptDecoder = (function() { }; var promise = - (typeof fetch === 'function' ? 
- fetch('data:application/octet-stream;base64,' + wasm) - .then(response => response.arrayBuffer()) : - Promise.resolve(Buffer.from(wasm, 'base64').buffer)) - .then(bytes => WebAssembly.instantiate(bytes, { env })) + WebAssembly.instantiate(unhex(wasm), { env }) .then(function(result) { instance = result.instance; - if (instance.exports.__wasm_call_ctors) { - instance.exports.__wasm_call_ctors(); - } + instance.exports._start(); env.emscripten_notify_memory_growth(0); }); + function unhex(data) { + var bytes = new Uint8Array(data.length / 2); + for (var i = 0; i < data.length; i += 2) { + bytes[i / 2] = parseInt(data.substr(i, 2), 16); + } + return bytes.buffer; + } + function decode(fun, target, count, size, source) { var sbrk = instance.exports.sbrk; var tp = sbrk(count * size); diff --git a/3rdparty/meshoptimizer/src/vertexcodec.cpp b/3rdparty/meshoptimizer/src/vertexcodec.cpp index 44c0c8787..ed9e41fc9 100644 --- a/3rdparty/meshoptimizer/src/vertexcodec.cpp +++ b/3rdparty/meshoptimizer/src/vertexcodec.cpp @@ -61,11 +61,14 @@ #endif #ifdef SIMD_WASM -#define wasm_v32x4_splat(v, i) wasm_v8x16_shuffle(v, v, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3) -#define wasm_unpacklo_v8x16(a, b) wasm_v8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23) -#define wasm_unpackhi_v8x16(a, b) wasm_v8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31) -#define wasm_unpacklo_v16x8(a, b) wasm_v8x16_shuffle(a, b, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23) -#define wasm_unpackhi_v16x8(a, b) wasm_v8x16_shuffle(a, b, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31) +#define wasmx_shuffle_v32x4(v, i, j, k, l) wasm_v8x16_shuffle(v, v, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * j, 4 * j + 1, 4 * j + 2, 4 * j + 3, 4 * k, 4 * k + 1, 4 * k + 2, 4 * k + 3, 4 * l, 4 * l + 1, 4 * l + 2, 4 * l + 3) +#define 
wasmx_splat_v32x4(v, i) wasmx_shuffle_v32x4(v, i, i, i, i) +#define wasmx_unpacklo_v8x16(a, b) wasm_v8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23) +#define wasmx_unpackhi_v8x16(a, b) wasm_v8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31) +#define wasmx_unpacklo_v16x8(a, b) wasm_v8x16_shuffle(a, b, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23) +#define wasmx_unpackhi_v16x8(a, b) wasm_v8x16_shuffle(a, b, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31) +#define wasmx_unpacklo_v64x2(a, b) wasm_v8x16_shuffle(a, b, 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23) +#define wasmx_unpackhi_v64x2(a, b) wasm_v8x16_shuffle(a, b, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31) #endif namespace meshopt @@ -414,6 +417,9 @@ static const unsigned char* decodeVertexBlock(const unsigned char* data, const u static unsigned char kDecodeBytesGroupShuffle[256][8]; static unsigned char kDecodeBytesGroupCount[256]; +#ifdef EMSCRIPTEN +__attribute__((cold)) // this saves 500 bytes in the output binary - we don't need to vectorize this loop! +#endif static bool decodeBytesGroupBuildTables() { for (int mask = 0; mask < 256; ++mask) @@ -706,24 +712,23 @@ static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1) v128_t sm1r = wasm_i8x16_add(sm1, sm1off); - return wasm_v8x16_shuffle(sm0, sm1r, 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23); + return wasmx_unpacklo_v64x2(sm0, sm1r); } static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1) { - uint64_t mbits = 0x8040201008040201ull; + v128_t mask_0 = wasmx_shuffle_v32x4(mask, 0, 2, 1, 3); - uint64_t m0_8 = wasm_i64x2_extract_lane(mask, 0) & mbits; - uint64_t m1_8 = wasm_i64x2_extract_lane(mask, 1) & mbits; + // TODO: when Chrome supports v128.const we can try doing vectorized and? 
+ uint64_t mask_1a = wasm_i64x2_extract_lane(mask_0, 0) & 0x0804020108040201ull; + uint64_t mask_1b = wasm_i64x2_extract_lane(mask_0, 1) & 0x8040201080402010ull; - uint32_t m0_4 = m0_8 | (m0_8 >> 32); - uint32_t m1_4 = m1_8 | (m1_8 >> 32); + uint64_t mask_2 = mask_1a | mask_1b; + uint64_t mask_4 = mask_2 | (mask_2 >> 16); + uint64_t mask_8 = mask_4 | (mask_4 >> 8); - uint16_t m0_2 = m0_4 | (m0_4 >> 16); - uint16_t m1_2 = m1_4 | (m1_4 >> 16); - - mask0 = m0_2 | (m0_2 >> 8); - mask1 = m1_2 | (m1_2 >> 8); + mask0 = uint8_t(mask_8); + mask1 = uint8_t(mask_8 >> 32); } static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) @@ -748,19 +753,12 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi v128_t sel2 = wasm_v128_load(data); v128_t rest = wasm_v128_load(data + 4); - v128_t sel22 = wasm_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2); - v128_t sel2222 = wasm_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22); + v128_t sel22 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2); + v128_t sel2222 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22); v128_t sel = wasm_v128_and(sel2222, wasm_i8x16_splat(3)); v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(3)); - if (!wasm_i8x16_any_true(mask)) - { - wasm_v128_store(buffer, sel); - - return data + 4; - } - unsigned char mask0, mask1; wasmMoveMask(mask, mask0, mask1); @@ -780,18 +778,11 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi v128_t sel4 = wasm_v128_load(data); v128_t rest = wasm_v128_load(data + 8); - v128_t sel44 = wasm_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4); + v128_t sel44 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4); v128_t sel = wasm_v128_and(sel44, wasm_i8x16_splat(15)); v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(15)); - if (!wasm_i8x16_any_true(mask)) - { - wasm_v128_store(buffer, sel); - - return data + 8; - } - unsigned char mask0, mask1; wasmMoveMask(mask, 
mask0, mask1); @@ -871,15 +862,15 @@ static uint8x16_t unzigzag8(uint8x16_t v) #ifdef SIMD_WASM static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3) { - v128_t t0 = wasm_unpacklo_v8x16(x0, x1); - v128_t t1 = wasm_unpackhi_v8x16(x0, x1); - v128_t t2 = wasm_unpacklo_v8x16(x2, x3); - v128_t t3 = wasm_unpackhi_v8x16(x2, x3); + v128_t t0 = wasmx_unpacklo_v8x16(x0, x1); + v128_t t1 = wasmx_unpackhi_v8x16(x0, x1); + v128_t t2 = wasmx_unpacklo_v8x16(x2, x3); + v128_t t3 = wasmx_unpackhi_v8x16(x2, x3); - x0 = wasm_unpacklo_v16x8(t0, t2); - x1 = wasm_unpackhi_v16x8(t0, t2); - x2 = wasm_unpacklo_v16x8(t1, t3); - x3 = wasm_unpackhi_v16x8(t1, t3); + x0 = wasmx_unpacklo_v16x8(t0, t2); + x1 = wasmx_unpackhi_v16x8(t0, t2); + x2 = wasmx_unpacklo_v16x8(t1, t3); + x3 = wasmx_unpackhi_v16x8(t1, t3); } static v128_t unzigzag8(v128_t v) @@ -977,7 +968,7 @@ static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, con #define TEMP v128_t #define PREP() v128_t pi = wasm_v128_load(last_vertex + k) // TODO: use wasm_v32x4_load_splat to avoid buffer overrun #define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned) -#define GRP4(i) t0 = wasm_v32x4_splat(r##i, 0), t1 = wasm_v32x4_splat(r##i, 1), t2 = wasm_v32x4_splat(r##i, 2), t3 = wasm_v32x4_splat(r##i, 3) +#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3) #define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i) #define SAVE(i) *reinterpret_cast<int*>(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size #endif @@ -1157,12 +1148,6 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve decode = decodeVertexBlock; #endif -#if defined(SIMD_WASM) - // TODO: workaround for https://github.com/emscripten-core/emscripten/issues/9767 - if (!gDecodeBytesGroupInitialized) - gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables(); -#endif - #if defined(SIMD_SSE) 
|| defined(SIMD_NEON) || defined(SIMD_WASM) assert(gDecodeBytesGroupInitialized); (void)gDecodeBytesGroupInitialized;