From 686190d7ff5957f1999a6bf84801df5943f02f95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=91=D1=80=D0=B0=D0=BD=D0=B8=D0=BC=D0=B8=D1=80=20=D0=9A?= =?UTF-8?q?=D0=B0=D1=80=D0=B0=D1=9F=D0=B8=D1=9B?= Date: Sun, 14 Sep 2025 09:07:27 -0700 Subject: [PATCH] Updated spirv-cross. --- 3rdparty/spirv-cross/main.cpp | 1 + 3rdparty/spirv-cross/spirv.h | 126 ++- 3rdparty/spirv-cross/spirv.hpp | 130 ++- 3rdparty/spirv-cross/spirv_cfg.cpp | 124 ++- 3rdparty/spirv-cross/spirv_cfg.hpp | 26 +- 3rdparty/spirv-cross/spirv_common.hpp | 87 +- 3rdparty/spirv-cross/spirv_cross.cpp | 31 +- 3rdparty/spirv-cross/spirv_cross.hpp | 2 + 3rdparty/spirv-cross/spirv_cross_c.cpp | 8 + 3rdparty/spirv-cross/spirv_cross_c.h | 3 +- .../spirv-cross/spirv_cross_parsed_ir.cpp | 31 +- .../spirv-cross/spirv_cross_parsed_ir.hpp | 1 + 3rdparty/spirv-cross/spirv_glsl.cpp | 764 ++++++++++++++++-- 3rdparty/spirv-cross/spirv_glsl.hpp | 18 +- 3rdparty/spirv-cross/spirv_hlsl.cpp | 4 +- 3rdparty/spirv-cross/spirv_msl.cpp | 386 ++++++--- 3rdparty/spirv-cross/spirv_msl.hpp | 21 +- 3rdparty/spirv-cross/spirv_parser.cpp | 126 ++- 3rdparty/spirv-cross/spirv_reflect.cpp | 11 +- 19 files changed, 1606 insertions(+), 294 deletions(-) diff --git a/3rdparty/spirv-cross/main.cpp b/3rdparty/spirv-cross/main.cpp index ea3ecf441..605901648 100644 --- a/3rdparty/spirv-cross/main.cpp +++ b/3rdparty/spirv-cross/main.cpp @@ -536,6 +536,7 @@ static void print_resources(const Compiler &compiler, const ShaderResources &res print_resources(compiler, "push", res.push_constant_buffers); print_resources(compiler, "counters", res.atomic_counters); print_resources(compiler, "acceleration structures", res.acceleration_structures); + print_resources(compiler, "tensors", res.tensors); print_resources(compiler, "record buffers", res.shader_record_buffers); print_resources(compiler, spv::StorageClassInput, res.builtin_inputs); print_resources(compiler, spv::StorageClassOutput, res.builtin_outputs); diff --git a/3rdparty/spirv-cross/spirv.h b/3rdparty/spirv-cross/spirv.h index 43dd311f0..005d451d3 100644 --- a/3rdparty/spirv-cross/spirv.h +++ b/3rdparty/spirv-cross/spirv.h @@ -1,27 +1,11 @@ /* -** Copyright (c) 2014-2024 The Khronos Group Inc. +** Copyright: 2014-2024 The Khronos Group Inc. +** License: MIT ** -** Permission is hereby granted, free of charge, to any person obtaining a copy -** of this software and/or associated documentation files (the "Materials"), -** to deal in the Materials without restriction, including without limitation -** the rights to use, copy, modify, merge, publish, distribute, sublicense, -** and/or sell copies of the Materials, and to permit persons to whom the -** Materials are furnished to do so, subject to the following conditions: -** -** The above copyright notice and this permission notice shall be included in -** all copies or substantial portions of the Materials. -** -** MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS KHRONOS -** STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS SPECIFICATIONS AND -** HEADER INFORMATION ARE LOCATED AT https://www.khronos.org/registry/ -** -** THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -** OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -** THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -** FROM,OUT OF OR IN CONNECTION WITH THE MATERIALS OR THE USE OR OTHER DEALINGS -** IN THE MATERIALS. +** MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS +** KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS +** SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT +** https://www.khronos.org/registry/ */ /* @@ -176,6 +160,8 @@ typedef enum SpvExecutionMode_ { SpvExecutionModeSignedZeroInfNanPreserve = 4461, SpvExecutionModeRoundingModeRTE = 4462, SpvExecutionModeRoundingModeRTZ = 4463, + SpvExecutionModeNonCoherentTileAttachmentReadQCOM = 4489, + SpvExecutionModeTileShadingRateQCOM = 4490, SpvExecutionModeEarlyAndLateFragmentTestsAMD = 5017, SpvExecutionModeStencilRefReplacingEXT = 5027, SpvExecutionModeCoalescingAMDX = 5069, @@ -245,6 +231,7 @@ typedef enum SpvStorageClass_ { SpvStorageClassImage = 11, SpvStorageClassStorageBuffer = 12, SpvStorageClassTileImageEXT = 4172, + SpvStorageClassTileAttachmentQCOM = 4491, SpvStorageClassNodePayloadAMDX = 5068, SpvStorageClassCallableDataKHR = 5328, SpvStorageClassCallableDataNV = 5328, @@ -554,6 +541,7 @@ typedef enum SpvDecoration_ { SpvDecorationMaxByteOffset = 45, SpvDecorationAlignmentId = 46, SpvDecorationMaxByteOffsetId = 47, + SpvDecorationSaturatedToLargestFloat8NormalConversionEXT = 4216, SpvDecorationNoSignedWrap = 4469, SpvDecorationNoUnsignedWrap = 4470, SpvDecorationWeightTextureQCOM = 4487, @@ -723,6 +711,9 @@ typedef enum SpvBuiltIn_ { SpvBuiltInDeviceIndex = 4438, SpvBuiltInViewIndex = 4440, SpvBuiltInShadingRateKHR = 4444, + SpvBuiltInTileOffsetQCOM = 4492, + SpvBuiltInTileDimensionQCOM = 4493, + SpvBuiltInTileApronSizeQCOM = 4494, SpvBuiltInBaryCoordNoPerspAMD = 4992, SpvBuiltInBaryCoordNoPerspCentroidAMD = 4993, SpvBuiltInBaryCoordNoPerspSampleAMD = 4994, @@ -1073,7 +1064,13 @@ typedef enum SpvCapability_ { SpvCapabilityTileImageColorReadAccessEXT = 4166, SpvCapabilityTileImageDepthReadAccessEXT = 4167, SpvCapabilityTileImageStencilReadAccessEXT = 4168, + SpvCapabilityTensorsARM = 4174, + SpvCapabilityStorageTensorArrayDynamicIndexingARM = 4175, + SpvCapabilityStorageTensorArrayNonUniformIndexingARM = 4176, + SpvCapabilityGraphARM = 4191, SpvCapabilityCooperativeMatrixLayoutsARM = 4201, + SpvCapabilityFloat8EXT = 4212, + SpvCapabilityFloat8CooperativeMatrixEXT = 4213, SpvCapabilityFragmentShadingRateKHR = 4422, SpvCapabilitySubgroupBallotKHR = 4423, SpvCapabilityDrawParameters = 4427, @@ -1109,6 +1106,7 @@ typedef enum SpvCapability_ { SpvCapabilityTextureSampleWeightedQCOM = 4484, SpvCapabilityTextureBoxFilterQCOM = 4485, SpvCapabilityTextureBlockMatchQCOM = 4486, + SpvCapabilityTileShadingQCOM = 4495, SpvCapabilityTextureBlockMatch2QCOM = 4498, SpvCapabilityFloat16ImageAMD = 5008, SpvCapabilityImageGatherBiasLodAMD = 5009, @@ -1119,6 +1117,8 @@ typedef enum SpvCapability_ { SpvCapabilityShaderClockKHR = 5055, SpvCapabilityShaderEnqueueAMDX = 5067, SpvCapabilityQuadControlKHR = 5087, + SpvCapabilityInt4TypeINTEL = 5112, + SpvCapabilityInt4CooperativeMatrixINTEL = 5114, SpvCapabilityBFloat16TypeKHR = 5116, SpvCapabilityBFloat16DotProductKHR = 5117, SpvCapabilityBFloat16CooperativeMatrixKHR = 5118, @@ -1287,6 +1287,7 @@ typedef enum SpvCapability_ { SpvCapabilityMaskedGatherScatterINTEL = 6427, SpvCapabilityCacheControlsINTEL = 6441, SpvCapabilityRegisterLimitsINTEL = 6460, + SpvCapabilityBindlessImagesINTEL = 6528, 
SpvCapabilityMax = 0x7fffffff, } SpvCapability; @@ -1463,6 +1464,24 @@ typedef enum SpvTensorAddressingOperandsMask_ { SpvTensorAddressingOperandsDecodeFuncMask = 0x00000002, } SpvTensorAddressingOperandsMask; +typedef enum SpvTensorOperandsShift_ { + SpvTensorOperandsNontemporalARMShift = 0, + SpvTensorOperandsOutOfBoundsValueARMShift = 1, + SpvTensorOperandsMakeElementAvailableARMShift = 2, + SpvTensorOperandsMakeElementVisibleARMShift = 3, + SpvTensorOperandsNonPrivateElementARMShift = 4, + SpvTensorOperandsMax = 0x7fffffff, +} SpvTensorOperandsShift; + +typedef enum SpvTensorOperandsMask_ { + SpvTensorOperandsMaskNone = 0, + SpvTensorOperandsNontemporalARMMask = 0x00000001, + SpvTensorOperandsOutOfBoundsValueARMMask = 0x00000002, + SpvTensorOperandsMakeElementAvailableARMMask = 0x00000004, + SpvTensorOperandsMakeElementVisibleARMMask = 0x00000008, + SpvTensorOperandsNonPrivateElementARMMask = 0x00000010, +} SpvTensorOperandsMask; + typedef enum SpvInitializationModeQualifier_ { SpvInitializationModeQualifierInitOnDeviceReprogramINTEL = 0, SpvInitializationModeQualifierInitOnDeviceResetINTEL = 1, @@ -1549,6 +1568,8 @@ typedef enum SpvRawAccessChainOperandsMask_ { typedef enum SpvFPEncoding_ { SpvFPEncodingBFloat16KHR = 0, + SpvFPEncodingFloat8E4M3EXT = 4214, + SpvFPEncodingFloat8E5M2EXT = 4215, SpvFPEncodingMax = 0x7fffffff, } SpvFPEncoding; @@ -1927,6 +1948,17 @@ typedef enum SpvOp_ { SpvOpColorAttachmentReadEXT = 4160, SpvOpDepthAttachmentReadEXT = 4161, SpvOpStencilAttachmentReadEXT = 4162, + SpvOpTypeTensorARM = 4163, + SpvOpTensorReadARM = 4164, + SpvOpTensorWriteARM = 4165, + SpvOpTensorQuerySizeARM = 4166, + SpvOpGraphConstantARM = 4181, + SpvOpGraphEntryPointARM = 4182, + SpvOpGraphARM = 4183, + SpvOpGraphInputARM = 4184, + SpvOpGraphSetOutputARM = 4185, + SpvOpGraphEndARM = 4186, + SpvOpTypeGraphARM = 4190, SpvOpTerminateInvocation = 4416, SpvOpTypeUntypedPointerKHR = 4417, SpvOpUntypedVariableKHR = 4418, @@ -2385,6 +2417,9 @@ typedef enum SpvOp_ { SpvOpRoundFToTF32INTEL = 6426, SpvOpMaskedGatherINTEL = 6428, SpvOpMaskedScatterINTEL = 6429, + SpvOpConvertHandleToImageINTEL = 6529, + SpvOpConvertHandleToSamplerINTEL = 6530, + SpvOpConvertHandleToSampledImageINTEL = 6531, SpvOpMax = 0x7fffffff, } SpvOp; @@ -2743,6 +2778,17 @@ inline void SpvHasResultAndType(SpvOp opcode, bool *hasResult, bool *hasResultTy case SpvOpColorAttachmentReadEXT: *hasResult = true; *hasResultType = true; break; case SpvOpDepthAttachmentReadEXT: *hasResult = true; *hasResultType = true; break; case SpvOpStencilAttachmentReadEXT: *hasResult = true; *hasResultType = true; break; + case SpvOpTypeTensorARM: *hasResult = true; *hasResultType = false; break; + case SpvOpTensorReadARM: *hasResult = true; *hasResultType = true; break; + case SpvOpTensorWriteARM: *hasResult = false; *hasResultType = false; break; + case SpvOpTensorQuerySizeARM: *hasResult = true; *hasResultType = true; break; + case SpvOpGraphConstantARM: *hasResult = true; *hasResultType = true; break; + case SpvOpGraphEntryPointARM: *hasResult = false; *hasResultType = false; break; + case SpvOpGraphARM: *hasResult = true; *hasResultType = true; break; + case SpvOpGraphInputARM: *hasResult = true; *hasResultType = true; break; + case SpvOpGraphSetOutputARM: *hasResult = false; *hasResultType = false; break; + case SpvOpGraphEndARM: *hasResult = false; *hasResultType = false; break; + case SpvOpTypeGraphARM: *hasResult = true; *hasResultType = false; break; case SpvOpTerminateInvocation: *hasResult = false; *hasResultType = false; break; case 
SpvOpTypeUntypedPointerKHR: *hasResult = true; *hasResultType = false; break; case SpvOpUntypedVariableKHR: *hasResult = true; *hasResultType = true; break; @@ -3190,6 +3236,9 @@ inline void SpvHasResultAndType(SpvOp opcode, bool *hasResult, bool *hasResultTy case SpvOpRoundFToTF32INTEL: *hasResult = true; *hasResultType = true; break; case SpvOpMaskedGatherINTEL: *hasResult = true; *hasResultType = true; break; case SpvOpMaskedScatterINTEL: *hasResult = false; *hasResultType = false; break; + case SpvOpConvertHandleToImageINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpConvertHandleToSamplerINTEL: *hasResult = true; *hasResultType = true; break; + case SpvOpConvertHandleToSampledImageINTEL: *hasResult = true; *hasResultType = true; break; } } inline const char* SpvSourceLanguageToString(SpvSourceLanguage value) { @@ -3305,6 +3354,8 @@ inline const char* SpvExecutionModeToString(SpvExecutionMode value) { case SpvExecutionModeSignedZeroInfNanPreserve: return "SignedZeroInfNanPreserve"; case SpvExecutionModeRoundingModeRTE: return "RoundingModeRTE"; case SpvExecutionModeRoundingModeRTZ: return "RoundingModeRTZ"; + case SpvExecutionModeNonCoherentTileAttachmentReadQCOM: return "NonCoherentTileAttachmentReadQCOM"; + case SpvExecutionModeTileShadingRateQCOM: return "TileShadingRateQCOM"; case SpvExecutionModeEarlyAndLateFragmentTestsAMD: return "EarlyAndLateFragmentTestsAMD"; case SpvExecutionModeStencilRefReplacingEXT: return "StencilRefReplacingEXT"; case SpvExecutionModeCoalescingAMDX: return "CoalescingAMDX"; @@ -3371,6 +3422,7 @@ inline const char* SpvStorageClassToString(SpvStorageClass value) { case SpvStorageClassImage: return "Image"; case SpvStorageClassStorageBuffer: return "StorageBuffer"; case SpvStorageClassTileImageEXT: return "TileImageEXT"; + case SpvStorageClassTileAttachmentQCOM: return "TileAttachmentQCOM"; case SpvStorageClassNodePayloadAMDX: return "NodePayloadAMDX"; case SpvStorageClassCallableDataKHR: return "CallableDataKHR"; case SpvStorageClassIncomingCallableDataKHR: return "IncomingCallableDataKHR"; @@ -3619,6 +3671,7 @@ inline const char* SpvDecorationToString(SpvDecoration value) { case SpvDecorationMaxByteOffset: return "MaxByteOffset"; case SpvDecorationAlignmentId: return "AlignmentId"; case SpvDecorationMaxByteOffsetId: return "MaxByteOffsetId"; + case SpvDecorationSaturatedToLargestFloat8NormalConversionEXT: return "SaturatedToLargestFloat8NormalConversionEXT"; case SpvDecorationNoSignedWrap: return "NoSignedWrap"; case SpvDecorationNoUnsignedWrap: return "NoUnsignedWrap"; case SpvDecorationWeightTextureQCOM: return "WeightTextureQCOM"; @@ -3778,6 +3831,9 @@ inline const char* SpvBuiltInToString(SpvBuiltIn value) { case SpvBuiltInDeviceIndex: return "DeviceIndex"; case SpvBuiltInViewIndex: return "ViewIndex"; case SpvBuiltInShadingRateKHR: return "ShadingRateKHR"; + case SpvBuiltInTileOffsetQCOM: return "TileOffsetQCOM"; + case SpvBuiltInTileDimensionQCOM: return "TileDimensionQCOM"; + case SpvBuiltInTileApronSizeQCOM: return "TileApronSizeQCOM"; case SpvBuiltInBaryCoordNoPerspAMD: return "BaryCoordNoPerspAMD"; case SpvBuiltInBaryCoordNoPerspCentroidAMD: return "BaryCoordNoPerspCentroidAMD"; case SpvBuiltInBaryCoordNoPerspSampleAMD: return "BaryCoordNoPerspSampleAMD"; @@ -3958,7 +4014,13 @@ inline const char* SpvCapabilityToString(SpvCapability value) { case SpvCapabilityTileImageColorReadAccessEXT: return "TileImageColorReadAccessEXT"; case SpvCapabilityTileImageDepthReadAccessEXT: return "TileImageDepthReadAccessEXT"; case 
SpvCapabilityTileImageStencilReadAccessEXT: return "TileImageStencilReadAccessEXT"; + case SpvCapabilityTensorsARM: return "TensorsARM"; + case SpvCapabilityStorageTensorArrayDynamicIndexingARM: return "StorageTensorArrayDynamicIndexingARM"; + case SpvCapabilityStorageTensorArrayNonUniformIndexingARM: return "StorageTensorArrayNonUniformIndexingARM"; + case SpvCapabilityGraphARM: return "GraphARM"; case SpvCapabilityCooperativeMatrixLayoutsARM: return "CooperativeMatrixLayoutsARM"; + case SpvCapabilityFloat8EXT: return "Float8EXT"; + case SpvCapabilityFloat8CooperativeMatrixEXT: return "Float8CooperativeMatrixEXT"; case SpvCapabilityFragmentShadingRateKHR: return "FragmentShadingRateKHR"; case SpvCapabilitySubgroupBallotKHR: return "SubgroupBallotKHR"; case SpvCapabilityDrawParameters: return "DrawParameters"; @@ -3992,6 +4054,7 @@ inline const char* SpvCapabilityToString(SpvCapability value) { case SpvCapabilityTextureSampleWeightedQCOM: return "TextureSampleWeightedQCOM"; case SpvCapabilityTextureBoxFilterQCOM: return "TextureBoxFilterQCOM"; case SpvCapabilityTextureBlockMatchQCOM: return "TextureBlockMatchQCOM"; + case SpvCapabilityTileShadingQCOM: return "TileShadingQCOM"; case SpvCapabilityTextureBlockMatch2QCOM: return "TextureBlockMatch2QCOM"; case SpvCapabilityFloat16ImageAMD: return "Float16ImageAMD"; case SpvCapabilityImageGatherBiasLodAMD: return "ImageGatherBiasLodAMD"; @@ -4002,6 +4065,8 @@ inline const char* SpvCapabilityToString(SpvCapability value) { case SpvCapabilityShaderClockKHR: return "ShaderClockKHR"; case SpvCapabilityShaderEnqueueAMDX: return "ShaderEnqueueAMDX"; case SpvCapabilityQuadControlKHR: return "QuadControlKHR"; + case SpvCapabilityInt4TypeINTEL: return "Int4TypeINTEL"; + case SpvCapabilityInt4CooperativeMatrixINTEL: return "Int4CooperativeMatrixINTEL"; case SpvCapabilityBFloat16TypeKHR: return "BFloat16TypeKHR"; case SpvCapabilityBFloat16DotProductKHR: return "BFloat16DotProductKHR"; case SpvCapabilityBFloat16CooperativeMatrixKHR: return "BFloat16CooperativeMatrixKHR"; @@ -4144,6 +4209,7 @@ inline const char* SpvCapabilityToString(SpvCapability value) { case SpvCapabilityMaskedGatherScatterINTEL: return "MaskedGatherScatterINTEL"; case SpvCapabilityCacheControlsINTEL: return "CacheControlsINTEL"; case SpvCapabilityRegisterLimitsINTEL: return "RegisterLimitsINTEL"; + case SpvCapabilityBindlessImagesINTEL: return "BindlessImagesINTEL"; default: return "Unknown"; } } @@ -4299,6 +4365,8 @@ inline const char* SpvNamedMaximumNumberOfRegistersToString(SpvNamedMaximumNumbe inline const char* SpvFPEncodingToString(SpvFPEncoding value) { switch (value) { case SpvFPEncodingBFloat16KHR: return "BFloat16KHR"; + case SpvFPEncodingFloat8E4M3EXT: return "Float8E4M3EXT"; + case SpvFPEncodingFloat8E5M2EXT: return "Float8E5M2EXT"; default: return "Unknown"; } } @@ -4683,6 +4751,17 @@ inline const char* SpvOpToString(SpvOp value) { case SpvOpColorAttachmentReadEXT: return "OpColorAttachmentReadEXT"; case SpvOpDepthAttachmentReadEXT: return "OpDepthAttachmentReadEXT"; case SpvOpStencilAttachmentReadEXT: return "OpStencilAttachmentReadEXT"; + case SpvOpTypeTensorARM: return "OpTypeTensorARM"; + case SpvOpTensorReadARM: return "OpTensorReadARM"; + case SpvOpTensorWriteARM: return "OpTensorWriteARM"; + case SpvOpTensorQuerySizeARM: return "OpTensorQuerySizeARM"; + case SpvOpGraphConstantARM: return "OpGraphConstantARM"; + case SpvOpGraphEntryPointARM: return "OpGraphEntryPointARM"; + case SpvOpGraphARM: return "OpGraphARM"; + case SpvOpGraphInputARM: return "OpGraphInputARM"; + 
case SpvOpGraphSetOutputARM: return "OpGraphSetOutputARM"; + case SpvOpGraphEndARM: return "OpGraphEndARM"; + case SpvOpTypeGraphARM: return "OpTypeGraphARM"; case SpvOpTerminateInvocation: return "OpTerminateInvocation"; case SpvOpTypeUntypedPointerKHR: return "OpTypeUntypedPointerKHR"; case SpvOpUntypedVariableKHR: return "OpUntypedVariableKHR"; @@ -5130,6 +5209,9 @@ inline const char* SpvOpToString(SpvOp value) { case SpvOpRoundFToTF32INTEL: return "OpRoundFToTF32INTEL"; case SpvOpMaskedGatherINTEL: return "OpMaskedGatherINTEL"; case SpvOpMaskedScatterINTEL: return "OpMaskedScatterINTEL"; + case SpvOpConvertHandleToImageINTEL: return "OpConvertHandleToImageINTEL"; + case SpvOpConvertHandleToSamplerINTEL: return "OpConvertHandleToSamplerINTEL"; + case SpvOpConvertHandleToSampledImageINTEL: return "OpConvertHandleToSampledImageINTEL"; default: return "Unknown"; } } diff --git a/3rdparty/spirv-cross/spirv.hpp b/3rdparty/spirv-cross/spirv.hpp index 5fbba32bf..f7a7bf835 100644 --- a/3rdparty/spirv-cross/spirv.hpp +++ b/3rdparty/spirv-cross/spirv.hpp @@ -1,26 +1,10 @@ -// Copyright (c) 2014-2024 The Khronos Group Inc. +// Copyright: 2014-2024 The Khronos Group Inc. +// License: MIT // -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and/or associated documentation files (the "Materials"), -// to deal in the Materials without restriction, including without limitation -// the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Materials, and to permit persons to whom the -// Materials are furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Materials. -// -// MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS KHRONOS -// STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS SPECIFICATIONS AND -// HEADER INFORMATION ARE LOCATED AT https://www.khronos.org/registry/ -// -// THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM,OUT OF OR IN CONNECTION WITH THE MATERIALS OR THE USE OR OTHER DEALINGS -// IN THE MATERIALS. +// MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS +// KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS +// SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT +// https://www.khronos.org/registry/ // This header is automatically generated by the same tool that creates // the Binary Section of the SPIR-V specification. 
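// Illustrative sketch (not part of the patch): these generated switch tables are what
// let a SPIR-V consumer walk a module and keep track of result IDs even for opcodes it
// does not otherwise understand, which is why new instructions such as OpTypeTensorARM
// and OpGraphARM must be added to HasResultAndType/OpToString. Assumes <cstdio> plus
// this header; `words`/`word_count` are a raw module that begins with the standard
// 5-word SPIR-V header.
static void dump_result_ids(const uint32_t *words, size_t word_count)
{
    size_t offset = 5; // skip the SPIR-V module header
    while (offset < word_count)
    {
        spv::Op op = static_cast<spv::Op>(words[offset] & 0xffffu);
        uint32_t len = words[offset] >> 16u; // high 16 bits of word 0 hold the instruction length
        bool has_result = false, has_type = false;
        spv::HasResultAndType(op, &has_result, &has_type);
        if (has_result)
            printf("%s -> %%%u\n", spv::OpToString(op), words[offset + (has_type ? 2 : 1)]);
        offset += len ? len : 1; // guard against a malformed zero-length word
    }
}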
@@ -172,6 +156,8 @@ enum ExecutionMode { ExecutionModeSignedZeroInfNanPreserve = 4461, ExecutionModeRoundingModeRTE = 4462, ExecutionModeRoundingModeRTZ = 4463, + ExecutionModeNonCoherentTileAttachmentReadQCOM = 4489, + ExecutionModeTileShadingRateQCOM = 4490, ExecutionModeEarlyAndLateFragmentTestsAMD = 5017, ExecutionModeStencilRefReplacingEXT = 5027, ExecutionModeCoalescingAMDX = 5069, @@ -241,6 +227,7 @@ enum StorageClass { StorageClassImage = 11, StorageClassStorageBuffer = 12, StorageClassTileImageEXT = 4172, + StorageClassTileAttachmentQCOM = 4491, StorageClassNodePayloadAMDX = 5068, StorageClassCallableDataKHR = 5328, StorageClassCallableDataNV = 5328, @@ -550,6 +537,7 @@ enum Decoration { DecorationMaxByteOffset = 45, DecorationAlignmentId = 46, DecorationMaxByteOffsetId = 47, + DecorationSaturatedToLargestFloat8NormalConversionEXT = 4216, DecorationNoSignedWrap = 4469, DecorationNoUnsignedWrap = 4470, DecorationWeightTextureQCOM = 4487, @@ -719,6 +707,9 @@ enum BuiltIn { BuiltInDeviceIndex = 4438, BuiltInViewIndex = 4440, BuiltInShadingRateKHR = 4444, + BuiltInTileOffsetQCOM = 4492, + BuiltInTileDimensionQCOM = 4493, + BuiltInTileApronSizeQCOM = 4494, BuiltInBaryCoordNoPerspAMD = 4992, BuiltInBaryCoordNoPerspCentroidAMD = 4993, BuiltInBaryCoordNoPerspSampleAMD = 4994, @@ -1069,7 +1060,13 @@ enum Capability { CapabilityTileImageColorReadAccessEXT = 4166, CapabilityTileImageDepthReadAccessEXT = 4167, CapabilityTileImageStencilReadAccessEXT = 4168, + CapabilityTensorsARM = 4174, + CapabilityStorageTensorArrayDynamicIndexingARM = 4175, + CapabilityStorageTensorArrayNonUniformIndexingARM = 4176, + CapabilityGraphARM = 4191, CapabilityCooperativeMatrixLayoutsARM = 4201, + CapabilityFloat8EXT = 4212, + CapabilityFloat8CooperativeMatrixEXT = 4213, CapabilityFragmentShadingRateKHR = 4422, CapabilitySubgroupBallotKHR = 4423, CapabilityDrawParameters = 4427, @@ -1105,6 +1102,7 @@ enum Capability { CapabilityTextureSampleWeightedQCOM = 4484, CapabilityTextureBoxFilterQCOM = 4485, CapabilityTextureBlockMatchQCOM = 4486, + CapabilityTileShadingQCOM = 4495, CapabilityTextureBlockMatch2QCOM = 4498, CapabilityFloat16ImageAMD = 5008, CapabilityImageGatherBiasLodAMD = 5009, @@ -1115,6 +1113,8 @@ enum Capability { CapabilityShaderClockKHR = 5055, CapabilityShaderEnqueueAMDX = 5067, CapabilityQuadControlKHR = 5087, + CapabilityInt4TypeINTEL = 5112, + CapabilityInt4CooperativeMatrixINTEL = 5114, CapabilityBFloat16TypeKHR = 5116, CapabilityBFloat16DotProductKHR = 5117, CapabilityBFloat16CooperativeMatrixKHR = 5118, @@ -1283,6 +1283,7 @@ enum Capability { CapabilityMaskedGatherScatterINTEL = 6427, CapabilityCacheControlsINTEL = 6441, CapabilityRegisterLimitsINTEL = 6460, + CapabilityBindlessImagesINTEL = 6528, CapabilityMax = 0x7fffffff, }; @@ -1459,6 +1460,24 @@ enum TensorAddressingOperandsMask { TensorAddressingOperandsDecodeFuncMask = 0x00000002, }; +enum TensorOperandsShift { + TensorOperandsNontemporalARMShift = 0, + TensorOperandsOutOfBoundsValueARMShift = 1, + TensorOperandsMakeElementAvailableARMShift = 2, + TensorOperandsMakeElementVisibleARMShift = 3, + TensorOperandsNonPrivateElementARMShift = 4, + TensorOperandsMax = 0x7fffffff, +}; + +enum TensorOperandsMask { + TensorOperandsMaskNone = 0, + TensorOperandsNontemporalARMMask = 0x00000001, + TensorOperandsOutOfBoundsValueARMMask = 0x00000002, + TensorOperandsMakeElementAvailableARMMask = 0x00000004, + TensorOperandsMakeElementVisibleARMMask = 0x00000008, + TensorOperandsNonPrivateElementARMMask = 0x00000010, +}; + enum 
InitializationModeQualifier { InitializationModeQualifierInitOnDeviceReprogramINTEL = 0, InitializationModeQualifierInitOnDeviceResetINTEL = 1, @@ -1545,6 +1564,8 @@ enum RawAccessChainOperandsMask { enum FPEncoding { FPEncodingBFloat16KHR = 0, + FPEncodingFloat8E4M3EXT = 4214, + FPEncodingFloat8E5M2EXT = 4215, FPEncodingMax = 0x7fffffff, }; @@ -1923,6 +1944,17 @@ enum Op { OpColorAttachmentReadEXT = 4160, OpDepthAttachmentReadEXT = 4161, OpStencilAttachmentReadEXT = 4162, + OpTypeTensorARM = 4163, + OpTensorReadARM = 4164, + OpTensorWriteARM = 4165, + OpTensorQuerySizeARM = 4166, + OpGraphConstantARM = 4181, + OpGraphEntryPointARM = 4182, + OpGraphARM = 4183, + OpGraphInputARM = 4184, + OpGraphSetOutputARM = 4185, + OpGraphEndARM = 4186, + OpTypeGraphARM = 4190, OpTerminateInvocation = 4416, OpTypeUntypedPointerKHR = 4417, OpUntypedVariableKHR = 4418, @@ -2381,6 +2413,9 @@ enum Op { OpRoundFToTF32INTEL = 6426, OpMaskedGatherINTEL = 6428, OpMaskedScatterINTEL = 6429, + OpConvertHandleToImageINTEL = 6529, + OpConvertHandleToSamplerINTEL = 6530, + OpConvertHandleToSampledImageINTEL = 6531, OpMax = 0x7fffffff, }; @@ -2739,6 +2774,17 @@ inline void HasResultAndType(Op opcode, bool *hasResult, bool *hasResultType) { case OpColorAttachmentReadEXT: *hasResult = true; *hasResultType = true; break; case OpDepthAttachmentReadEXT: *hasResult = true; *hasResultType = true; break; case OpStencilAttachmentReadEXT: *hasResult = true; *hasResultType = true; break; + case OpTypeTensorARM: *hasResult = true; *hasResultType = false; break; + case OpTensorReadARM: *hasResult = true; *hasResultType = true; break; + case OpTensorWriteARM: *hasResult = false; *hasResultType = false; break; + case OpTensorQuerySizeARM: *hasResult = true; *hasResultType = true; break; + case OpGraphConstantARM: *hasResult = true; *hasResultType = true; break; + case OpGraphEntryPointARM: *hasResult = false; *hasResultType = false; break; + case OpGraphARM: *hasResult = true; *hasResultType = true; break; + case OpGraphInputARM: *hasResult = true; *hasResultType = true; break; + case OpGraphSetOutputARM: *hasResult = false; *hasResultType = false; break; + case OpGraphEndARM: *hasResult = false; *hasResultType = false; break; + case OpTypeGraphARM: *hasResult = true; *hasResultType = false; break; case OpTerminateInvocation: *hasResult = false; *hasResultType = false; break; case OpTypeUntypedPointerKHR: *hasResult = true; *hasResultType = false; break; case OpUntypedVariableKHR: *hasResult = true; *hasResultType = true; break; @@ -3186,6 +3232,9 @@ inline void HasResultAndType(Op opcode, bool *hasResult, bool *hasResultType) { case OpRoundFToTF32INTEL: *hasResult = true; *hasResultType = true; break; case OpMaskedGatherINTEL: *hasResult = true; *hasResultType = true; break; case OpMaskedScatterINTEL: *hasResult = false; *hasResultType = false; break; + case OpConvertHandleToImageINTEL: *hasResult = true; *hasResultType = true; break; + case OpConvertHandleToSamplerINTEL: *hasResult = true; *hasResultType = true; break; + case OpConvertHandleToSampledImageINTEL: *hasResult = true; *hasResultType = true; break; } } inline const char* SourceLanguageToString(SourceLanguage value) { @@ -3301,6 +3350,8 @@ inline const char* ExecutionModeToString(ExecutionMode value) { case ExecutionModeSignedZeroInfNanPreserve: return "SignedZeroInfNanPreserve"; case ExecutionModeRoundingModeRTE: return "RoundingModeRTE"; case ExecutionModeRoundingModeRTZ: return "RoundingModeRTZ"; + case ExecutionModeNonCoherentTileAttachmentReadQCOM: return 
"NonCoherentTileAttachmentReadQCOM"; + case ExecutionModeTileShadingRateQCOM: return "TileShadingRateQCOM"; case ExecutionModeEarlyAndLateFragmentTestsAMD: return "EarlyAndLateFragmentTestsAMD"; case ExecutionModeStencilRefReplacingEXT: return "StencilRefReplacingEXT"; case ExecutionModeCoalescingAMDX: return "CoalescingAMDX"; @@ -3367,6 +3418,7 @@ inline const char* StorageClassToString(StorageClass value) { case StorageClassImage: return "Image"; case StorageClassStorageBuffer: return "StorageBuffer"; case StorageClassTileImageEXT: return "TileImageEXT"; + case StorageClassTileAttachmentQCOM: return "TileAttachmentQCOM"; case StorageClassNodePayloadAMDX: return "NodePayloadAMDX"; case StorageClassCallableDataKHR: return "CallableDataKHR"; case StorageClassIncomingCallableDataKHR: return "IncomingCallableDataKHR"; @@ -3615,6 +3667,7 @@ inline const char* DecorationToString(Decoration value) { case DecorationMaxByteOffset: return "MaxByteOffset"; case DecorationAlignmentId: return "AlignmentId"; case DecorationMaxByteOffsetId: return "MaxByteOffsetId"; + case DecorationSaturatedToLargestFloat8NormalConversionEXT: return "SaturatedToLargestFloat8NormalConversionEXT"; case DecorationNoSignedWrap: return "NoSignedWrap"; case DecorationNoUnsignedWrap: return "NoUnsignedWrap"; case DecorationWeightTextureQCOM: return "WeightTextureQCOM"; @@ -3774,6 +3827,9 @@ inline const char* BuiltInToString(BuiltIn value) { case BuiltInDeviceIndex: return "DeviceIndex"; case BuiltInViewIndex: return "ViewIndex"; case BuiltInShadingRateKHR: return "ShadingRateKHR"; + case BuiltInTileOffsetQCOM: return "TileOffsetQCOM"; + case BuiltInTileDimensionQCOM: return "TileDimensionQCOM"; + case BuiltInTileApronSizeQCOM: return "TileApronSizeQCOM"; case BuiltInBaryCoordNoPerspAMD: return "BaryCoordNoPerspAMD"; case BuiltInBaryCoordNoPerspCentroidAMD: return "BaryCoordNoPerspCentroidAMD"; case BuiltInBaryCoordNoPerspSampleAMD: return "BaryCoordNoPerspSampleAMD"; @@ -3954,7 +4010,13 @@ inline const char* CapabilityToString(Capability value) { case CapabilityTileImageColorReadAccessEXT: return "TileImageColorReadAccessEXT"; case CapabilityTileImageDepthReadAccessEXT: return "TileImageDepthReadAccessEXT"; case CapabilityTileImageStencilReadAccessEXT: return "TileImageStencilReadAccessEXT"; + case CapabilityTensorsARM: return "TensorsARM"; + case CapabilityStorageTensorArrayDynamicIndexingARM: return "StorageTensorArrayDynamicIndexingARM"; + case CapabilityStorageTensorArrayNonUniformIndexingARM: return "StorageTensorArrayNonUniformIndexingARM"; + case CapabilityGraphARM: return "GraphARM"; case CapabilityCooperativeMatrixLayoutsARM: return "CooperativeMatrixLayoutsARM"; + case CapabilityFloat8EXT: return "Float8EXT"; + case CapabilityFloat8CooperativeMatrixEXT: return "Float8CooperativeMatrixEXT"; case CapabilityFragmentShadingRateKHR: return "FragmentShadingRateKHR"; case CapabilitySubgroupBallotKHR: return "SubgroupBallotKHR"; case CapabilityDrawParameters: return "DrawParameters"; @@ -3988,6 +4050,7 @@ inline const char* CapabilityToString(Capability value) { case CapabilityTextureSampleWeightedQCOM: return "TextureSampleWeightedQCOM"; case CapabilityTextureBoxFilterQCOM: return "TextureBoxFilterQCOM"; case CapabilityTextureBlockMatchQCOM: return "TextureBlockMatchQCOM"; + case CapabilityTileShadingQCOM: return "TileShadingQCOM"; case CapabilityTextureBlockMatch2QCOM: return "TextureBlockMatch2QCOM"; case CapabilityFloat16ImageAMD: return "Float16ImageAMD"; case CapabilityImageGatherBiasLodAMD: return 
"ImageGatherBiasLodAMD"; @@ -3998,6 +4061,8 @@ inline const char* CapabilityToString(Capability value) { case CapabilityShaderClockKHR: return "ShaderClockKHR"; case CapabilityShaderEnqueueAMDX: return "ShaderEnqueueAMDX"; case CapabilityQuadControlKHR: return "QuadControlKHR"; + case CapabilityInt4TypeINTEL: return "Int4TypeINTEL"; + case CapabilityInt4CooperativeMatrixINTEL: return "Int4CooperativeMatrixINTEL"; case CapabilityBFloat16TypeKHR: return "BFloat16TypeKHR"; case CapabilityBFloat16DotProductKHR: return "BFloat16DotProductKHR"; case CapabilityBFloat16CooperativeMatrixKHR: return "BFloat16CooperativeMatrixKHR"; @@ -4140,6 +4205,7 @@ inline const char* CapabilityToString(Capability value) { case CapabilityMaskedGatherScatterINTEL: return "MaskedGatherScatterINTEL"; case CapabilityCacheControlsINTEL: return "CacheControlsINTEL"; case CapabilityRegisterLimitsINTEL: return "RegisterLimitsINTEL"; + case CapabilityBindlessImagesINTEL: return "BindlessImagesINTEL"; default: return "Unknown"; } } @@ -4295,6 +4361,8 @@ inline const char* NamedMaximumNumberOfRegistersToString(NamedMaximumNumberOfReg inline const char* FPEncodingToString(FPEncoding value) { switch (value) { case FPEncodingBFloat16KHR: return "BFloat16KHR"; + case FPEncodingFloat8E4M3EXT: return "Float8E4M3EXT"; + case FPEncodingFloat8E5M2EXT: return "Float8E5M2EXT"; default: return "Unknown"; } } @@ -4679,6 +4747,17 @@ inline const char* OpToString(Op value) { case OpColorAttachmentReadEXT: return "OpColorAttachmentReadEXT"; case OpDepthAttachmentReadEXT: return "OpDepthAttachmentReadEXT"; case OpStencilAttachmentReadEXT: return "OpStencilAttachmentReadEXT"; + case OpTypeTensorARM: return "OpTypeTensorARM"; + case OpTensorReadARM: return "OpTensorReadARM"; + case OpTensorWriteARM: return "OpTensorWriteARM"; + case OpTensorQuerySizeARM: return "OpTensorQuerySizeARM"; + case OpGraphConstantARM: return "OpGraphConstantARM"; + case OpGraphEntryPointARM: return "OpGraphEntryPointARM"; + case OpGraphARM: return "OpGraphARM"; + case OpGraphInputARM: return "OpGraphInputARM"; + case OpGraphSetOutputARM: return "OpGraphSetOutputARM"; + case OpGraphEndARM: return "OpGraphEndARM"; + case OpTypeGraphARM: return "OpTypeGraphARM"; case OpTerminateInvocation: return "OpTerminateInvocation"; case OpTypeUntypedPointerKHR: return "OpTypeUntypedPointerKHR"; case OpUntypedVariableKHR: return "OpUntypedVariableKHR"; @@ -5126,6 +5205,9 @@ inline const char* OpToString(Op value) { case OpRoundFToTF32INTEL: return "OpRoundFToTF32INTEL"; case OpMaskedGatherINTEL: return "OpMaskedGatherINTEL"; case OpMaskedScatterINTEL: return "OpMaskedScatterINTEL"; + case OpConvertHandleToImageINTEL: return "OpConvertHandleToImageINTEL"; + case OpConvertHandleToSamplerINTEL: return "OpConvertHandleToSamplerINTEL"; + case OpConvertHandleToSampledImageINTEL: return "OpConvertHandleToSampledImageINTEL"; default: return "Unknown"; } } @@ -5186,6 +5268,10 @@ inline TensorAddressingOperandsMask operator|(TensorAddressingOperandsMask a, Te inline TensorAddressingOperandsMask operator&(TensorAddressingOperandsMask a, TensorAddressingOperandsMask b) { return TensorAddressingOperandsMask(unsigned(a) & unsigned(b)); } inline TensorAddressingOperandsMask operator^(TensorAddressingOperandsMask a, TensorAddressingOperandsMask b) { return TensorAddressingOperandsMask(unsigned(a) ^ unsigned(b)); } inline TensorAddressingOperandsMask operator~(TensorAddressingOperandsMask a) { return TensorAddressingOperandsMask(~unsigned(a)); } +inline TensorOperandsMask 
operator|(TensorOperandsMask a, TensorOperandsMask b) { return TensorOperandsMask(unsigned(a) | unsigned(b)); } +inline TensorOperandsMask operator&(TensorOperandsMask a, TensorOperandsMask b) { return TensorOperandsMask(unsigned(a) & unsigned(b)); } +inline TensorOperandsMask operator^(TensorOperandsMask a, TensorOperandsMask b) { return TensorOperandsMask(unsigned(a) ^ unsigned(b)); } +inline TensorOperandsMask operator~(TensorOperandsMask a) { return TensorOperandsMask(~unsigned(a)); } inline MatrixMultiplyAccumulateOperandsMask operator|(MatrixMultiplyAccumulateOperandsMask a, MatrixMultiplyAccumulateOperandsMask b) { return MatrixMultiplyAccumulateOperandsMask(unsigned(a) | unsigned(b)); } inline MatrixMultiplyAccumulateOperandsMask operator&(MatrixMultiplyAccumulateOperandsMask a, MatrixMultiplyAccumulateOperandsMask b) { return MatrixMultiplyAccumulateOperandsMask(unsigned(a) & unsigned(b)); } inline MatrixMultiplyAccumulateOperandsMask operator^(MatrixMultiplyAccumulateOperandsMask a, MatrixMultiplyAccumulateOperandsMask b) { return MatrixMultiplyAccumulateOperandsMask(unsigned(a) ^ unsigned(b)); } diff --git a/3rdparty/spirv-cross/spirv_cfg.cpp b/3rdparty/spirv-cross/spirv_cfg.cpp index 932994798..c68886d98 100644 --- a/3rdparty/spirv-cross/spirv_cfg.cpp +++ b/3rdparty/spirv-cross/spirv_cfg.cpp @@ -81,31 +81,105 @@ bool CFG::is_back_edge(uint32_t to) const // We have a back edge if the visit order is set with the temporary magic value 0. // Crossing edges will have already been recorded with a visit order. auto itr = visit_order.find(to); - return itr != end(visit_order) && itr->second.get() == 0; + return itr != end(visit_order) && itr->second.visited_branches && !itr->second.visited_resolve; } -bool CFG::has_visited_forward_edge(uint32_t to) const +bool CFG::has_visited_branch(uint32_t to) const { - // If > 0, we have visited the edge already, and this is not a back edge branch. auto itr = visit_order.find(to); - return itr != end(visit_order) && itr->second.get() > 0; + return itr != end(visit_order) && itr->second.visited_branches; } -bool CFG::post_order_visit(uint32_t block_id) +void CFG::post_order_visit_entry(uint32_t block) { - // If we have already branched to this block (back edge), stop recursion. - // If our branches are back-edges, we do not record them. - // We have to record crossing edges however. - if (has_visited_forward_edge(block_id)) - return true; - else if (is_back_edge(block_id)) - return false; + visit_stack.push_back(block); - // Block back-edges from recursively revisiting ourselves. - visit_order[block_id].get() = 0; + while (!visit_stack.empty()) + { + bool keep_iterating; + do + { + // Reverse the order to allow for stack-like behavior and preserves the visit order from recursive algorithm. + // Traverse depth first. + uint32_t to_visit = visit_stack.back(); + last_visited_size = visit_stack.size(); + post_order_visit_branches(to_visit); + keep_iterating = last_visited_size != visit_stack.size(); + if (keep_iterating) + std::reverse(visit_stack.begin() + last_visited_size, visit_stack.end()); + } while (keep_iterating); + // We've reached the end of some tree leaf. Resolve the stack. + // Any node which has been visited for real can be popped now. + while (!visit_stack.empty() && visit_order[visit_stack.back()].visited_branches) + { + post_order_visit_resolve(visit_stack.back()); + visit_stack.pop_back(); + } + } +} + +void CFG::visit_branch(uint32_t block_id) +{ + // Prune obvious duplicates. 
+ if (std::find(visit_stack.begin() + last_visited_size, visit_stack.end(), block_id) == visit_stack.end() && + !has_visited_branch(block_id)) + { + visit_stack.push_back(block_id); + } +} + +void CFG::post_order_visit_branches(uint32_t block_id) +{ auto &block = compiler.get(block_id); + auto &visit = visit_order[block_id]; + if (visit.visited_branches) + return; + visit.visited_branches = true; + + if (block.merge == SPIRBlock::MergeLoop) + visit_branch(block.merge_block); + else if (block.merge == SPIRBlock::MergeSelection) + visit_branch(block.next_block); + + // First visit our branch targets. + switch (block.terminator) + { + case SPIRBlock::Direct: + visit_branch(block.next_block); + break; + + case SPIRBlock::Select: + visit_branch(block.true_block); + visit_branch(block.false_block); + break; + + case SPIRBlock::MultiSelect: + { + const auto &cases = compiler.get_case_list(block); + for (const auto &target : cases) + visit_branch(target.block); + if (block.default_block) + visit_branch(block.default_block); + break; + } + + default: + break; + } +} + +void CFG::post_order_visit_resolve(uint32_t block_id) +{ + auto &block = compiler.get(block_id); + + auto &visit_block = visit_order[block_id]; + assert(visit_block.visited_branches); + auto &visited = visit_order[block_id].visited_resolve; + if (visited) + return; + // If this is a loop header, add an implied branch to the merge target. // This is needed to avoid annoying cases with do { ... } while(false) loops often generated by inliners. // To the CFG, this is linear control flow, but we risk picking the do/while scope as our dominating block. @@ -116,21 +190,21 @@ bool CFG::post_order_visit(uint32_t block_id) // is lower than inside the loop, which is going to be key for some traversal algorithms like post-dominance analysis. // For selection constructs true/false blocks will end up visiting the merge block directly and it works out fine, // but for loops, only the header might end up actually branching to merge block. - if (block.merge == SPIRBlock::MergeLoop && post_order_visit(block.merge_block)) + if (block.merge == SPIRBlock::MergeLoop && !is_back_edge(block.merge_block)) add_branch(block_id, block.merge_block); // First visit our branch targets. switch (block.terminator) { case SPIRBlock::Direct: - if (post_order_visit(block.next_block)) + if (!is_back_edge(block.next_block)) add_branch(block_id, block.next_block); break; case SPIRBlock::Select: - if (post_order_visit(block.true_block)) + if (!is_back_edge(block.true_block)) add_branch(block_id, block.true_block); - if (post_order_visit(block.false_block)) + if (!is_back_edge(block.false_block)) add_branch(block_id, block.false_block); break; @@ -139,10 +213,10 @@ bool CFG::post_order_visit(uint32_t block_id) const auto &cases = compiler.get_case_list(block); for (const auto &target : cases) { - if (post_order_visit(target.block)) + if (!is_back_edge(target.block)) add_branch(block_id, target.block); } - if (block.default_block && post_order_visit(block.default_block)) + if (block.default_block && !is_back_edge(block.default_block)) add_branch(block_id, block.default_block); break; } @@ -157,7 +231,7 @@ bool CFG::post_order_visit(uint32_t block_id) // We can use the variable without a Phi since there is only one possible parent here. // However, in this case, we need to hoist out the inner variable to outside the branch. // Use same strategy as loops. 
- if (block.merge == SPIRBlock::MergeSelection && post_order_visit(block.next_block)) + if (block.merge == SPIRBlock::MergeSelection && !is_back_edge(block.next_block)) { // If there is only one preceding edge to the merge block and it's not ourselves, we need a fixup. // Add a fake branch so any dominator in either the if (), or else () block, or a lone case statement @@ -201,10 +275,9 @@ bool CFG::post_order_visit(uint32_t block_id) } } - // Then visit ourselves. Start counting at one, to let 0 be a magic value for testing back vs. crossing edges. - visit_order[block_id].get() = ++visit_count; + visited = true; + visit_block.order = ++visit_count; post_order.push_back(block_id); - return true; } void CFG::build_post_order_visit_order() @@ -213,11 +286,12 @@ void CFG::build_post_order_visit_order() visit_count = 0; visit_order.clear(); post_order.clear(); - post_order_visit(block); + post_order_visit_entry(block); } void CFG::add_branch(uint32_t from, uint32_t to) { + assert(from && to); const auto add_unique = [](SmallVector &l, uint32_t value) { auto itr = find(begin(l), end(l), value); if (itr == end(l)) diff --git a/3rdparty/spirv-cross/spirv_cfg.hpp b/3rdparty/spirv-cross/spirv_cfg.hpp index 1d85fe0a9..1c21ea070 100644 --- a/3rdparty/spirv-cross/spirv_cfg.hpp +++ b/3rdparty/spirv-cross/spirv_cfg.hpp @@ -68,7 +68,7 @@ public: { auto itr = visit_order.find(block); assert(itr != std::end(visit_order)); - int v = itr->second.get(); + int v = itr->second.order; assert(v > 0); return uint32_t(v); } @@ -114,17 +114,9 @@ public: private: struct VisitOrder { - int &get() - { - return v; - } - - const int &get() const - { - return v; - } - - int v = -1; + int order = -1; + bool visited_resolve = false; + bool visited_branches = false; }; Compiler &compiler; @@ -139,11 +131,17 @@ private: void add_branch(uint32_t from, uint32_t to); void build_post_order_visit_order(); void build_immediate_dominators(); - bool post_order_visit(uint32_t block); + void post_order_visit_branches(uint32_t block); + void post_order_visit_resolve(uint32_t block); + void post_order_visit_entry(uint32_t block); uint32_t visit_count = 0; bool is_back_edge(uint32_t to) const; - bool has_visited_forward_edge(uint32_t to) const; + bool has_visited_branch(uint32_t to) const; + void visit_branch(uint32_t block_id); + + SmallVector visit_stack; + size_t last_visited_size = 0; }; class DominatorBuilder diff --git a/3rdparty/spirv-cross/spirv_common.hpp b/3rdparty/spirv-cross/spirv_common.hpp index a4778c29b..854efe5eb 100644 --- a/3rdparty/spirv-cross/spirv_common.hpp +++ b/3rdparty/spirv-cross/spirv_common.hpp @@ -574,6 +574,7 @@ struct SPIRType : IVariant Sampler, AccelerationStructure, RayQuery, + CoopVecNV, // Keep internal types at the end. ControlPointArray, @@ -581,7 +582,11 @@ struct SPIRType : IVariant Char, // MSL specific type, that is used by 'object'(analog of 'task' from glsl) shader. MeshGridProperties, - BFloat16 + BFloat16, + FloatE4M3, + FloatE5M2, + + Tensor }; // Scalar/vector/matrix support. 
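// Illustrative sketch (not part of the patch) of the two FP8 encodings behind the new
// FloatE4M3 / FloatE5M2 basetypes. FloatE5M2 keeps FP16's 5-bit exponent (bias 15), so
// widening it losslessly is just a shift into the top byte of a half -- the same trick
// the scalar_bf8() helper added to SPIRConstant below relies on. FloatE4M3 (4-bit
// exponent, bias 7, no infinities, S.1111.111 reserved for NaN) instead goes through
// the FP16 path and is rescaled by 2^8 in fe4m3_to_f32(). Assumes the default
// SPIRV_CROSS_NAMESPACE (spirv_cross).
static inline float e5m2_to_f32_sketch(uint8_t v)
{
    // s eeeee mm -> s eeeee mm 00000000 : a valid IEEE half with the identical value.
    return spirv_cross::SPIRConstant::f16_to_f32(uint16_t(v) << 8);
}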
@@ -606,13 +611,29 @@ struct SPIRType : IVariant bool pointer = false; bool forward_pointer = false; - struct + union { - uint32_t use_id = 0; - uint32_t rows_id = 0; - uint32_t columns_id = 0; - uint32_t scope_id = 0; - } cooperative; + struct + { + uint32_t use_id; + uint32_t rows_id; + uint32_t columns_id; + uint32_t scope_id; + } cooperative; + + struct + { + uint32_t component_type_id; + uint32_t component_count_id; + } coopVecNV; + + struct + { + uint32_t type; + uint32_t rank; + uint32_t shape; + } tensor; + } ext; spv::StorageClass storage = spv::StorageClassGeneric; @@ -670,6 +691,12 @@ struct SPIRExtension : IVariant NonSemanticGeneric }; + enum ShaderDebugInfoOps + { + DebugLine = 103, + DebugSource = 35 + }; + explicit SPIRExtension(Extension ext_) : ext(ext_) { @@ -695,6 +722,11 @@ struct SPIREntryPoint FunctionID self = 0; std::string name; std::string orig_name; + std::unordered_map fp_fast_math_defaults; + bool signed_zero_inf_nan_preserve_8 = false; + bool signed_zero_inf_nan_preserve_16 = false; + bool signed_zero_inf_nan_preserve_32 = false; + bool signed_zero_inf_nan_preserve_64 = false; SmallVector interface_variables; Bitset flags; @@ -927,6 +959,7 @@ struct SPIRBlock : IVariant // All access to these variables are dominated by this block, // so before branching anywhere we need to make sure that we declare these variables. SmallVector dominated_variables; + SmallVector rearm_dominated_variables; // These are variables which should be declared in a for loop header, if we // fail to use a classic for-loop, @@ -1238,6 +1271,26 @@ struct SPIRConstant : IVariant return u.f32; } + static inline float fe4m3_to_f32(uint8_t v) + { + if ((v & 0x7f) == 0x7f) + { + union + { + float f32; + uint32_t u32; + } u; + + u.u32 = (v & 0x80) ? 0xffffffffu : 0x7fffffffu; + return u.f32; + } + else + { + // Reuse the FP16 to FP32 code. Cute bit-hackery. + return f16_to_f32((int16_t(int8_t(v)) << 7) & (0xffff ^ 0x4000)) * 256.0f; + } + } + inline uint32_t specialization_constant_id(uint32_t col, uint32_t row) const { return m.c[col].id[row]; @@ -1286,6 +1339,16 @@ struct SPIRConstant : IVariant return fp32; } + inline float scalar_floate4m3(uint32_t col = 0, uint32_t row = 0) const + { + return fe4m3_to_f32(scalar_u8(col, row)); + } + + inline float scalar_bf8(uint32_t col = 0, uint32_t row = 0) const + { + return f16_to_f32(scalar_u8(col, row) << 8); + } + inline float scalar_f32(uint32_t col = 0, uint32_t row = 0) const { return m.c[col].r[row].f32; @@ -1356,9 +1419,10 @@ struct SPIRConstant : IVariant SPIRConstant() = default; - SPIRConstant(TypeID constant_type_, const uint32_t *elements, uint32_t num_elements, bool specialized) + SPIRConstant(TypeID constant_type_, const uint32_t *elements, uint32_t num_elements, bool specialized, bool replicated_ = false) : constant_type(constant_type_) , specialization(specialized) + , replicated(replicated_) { subconstants.reserve(num_elements); for (uint32_t i = 0; i < num_elements; i++) @@ -1437,6 +1501,9 @@ struct SPIRConstant : IVariant // For composites which are constant arrays, etc. SmallVector subconstants; + // Whether the subconstants are intended to be replicated (e.g. OpConstantCompositeReplicateEXT) + bool replicated = false; + // Non-Vulkan GLSL, HLSL and sometimes MSL emits defines for each specialization constant, // and uses them to initialize the constant. 
This allows the user // to still be able to specialize the value by supplying corresponding @@ -1732,6 +1799,7 @@ struct Meta uint32_t spec_id = 0; uint32_t index = 0; spv::FPRoundingMode fp_rounding_mode = spv::FPRoundingModeMax; + spv::FPFastMathModeMask fp_fast_math_mode = spv::FPFastMathModeMaskNone; bool builtin = false; bool qualified_alias_explicit_override = false; @@ -1787,7 +1855,8 @@ private: static inline bool type_is_floating_point(const SPIRType &type) { - return type.basetype == SPIRType::Half || type.basetype == SPIRType::Float || type.basetype == SPIRType::Double; + return type.basetype == SPIRType::Half || type.basetype == SPIRType::Float || type.basetype == SPIRType::Double || + type.basetype == SPIRType::BFloat16 || type.basetype == SPIRType::FloatE5M2 || type.basetype == SPIRType::FloatE4M3; } static inline bool type_is_integral(const SPIRType &type) diff --git a/3rdparty/spirv-cross/spirv_cross.cpp b/3rdparty/spirv-cross/spirv_cross.cpp index 8aa4e5e70..4c1d39d98 100644 --- a/3rdparty/spirv-cross/spirv_cross.cpp +++ b/3rdparty/spirv-cross/spirv_cross.cpp @@ -280,6 +280,9 @@ bool Compiler::block_is_pure(const SPIRBlock &block) // This is a global side effect of the function. return false; + case OpTensorReadARM: + return false; + case OpExtInst: { uint32_t extension_set = ops[2]; @@ -373,6 +376,7 @@ void Compiler::register_global_read_dependencies(const SPIRBlock &block, uint32_ case OpLoad: case OpCooperativeMatrixLoadKHR: + case OpCooperativeVectorLoadNV: case OpImageRead: { // If we're in a storage class which does not get invalidated, adding dependencies here is no big deal. @@ -1152,6 +1156,11 @@ ShaderResources Compiler::get_shader_resources(const unordered_set * { res.acceleration_structures.push_back({ var.self, var.basetype, type.self, get_name(var.self) }); } + // Tensors + else if (type.basetype == SPIRType::Tensor) + { + res.tensors.push_back({ var.self, var.basetype, type.self, get_name(var.self) }); + } else { res.gl_plain_uniforms.push_back({ var.self, var.basetype, type.self, get_name(var.self) }); @@ -1169,11 +1178,8 @@ bool Compiler::type_is_top_level_block(const SPIRType &type) const return has_decoration(type.self, DecorationBlock) || has_decoration(type.self, DecorationBufferBlock); } -bool Compiler::type_is_block_like(const SPIRType &type) const +bool Compiler::type_is_explicit_layout(const SPIRType &type) const { - if (type_is_top_level_block(type)) - return true; - if (type.basetype == SPIRType::Struct) { // Block-like types may have Offset decorations. @@ -1185,6 +1191,14 @@ bool Compiler::type_is_block_like(const SPIRType &type) const return false; } +bool Compiler::type_is_block_like(const SPIRType &type) const +{ + if (type_is_top_level_block(type)) + return true; + else + return type_is_explicit_layout(type); +} + void Compiler::parse_fixup() { // Figure out specialization constants for work group sizes. 
@@ -2370,6 +2384,10 @@ void Compiler::set_execution_mode(ExecutionMode mode, uint32_t arg0, uint32_t ar execution.output_primitives = arg0; break; + case ExecutionModeFPFastMathDefault: + execution.fp_fast_math_defaults[arg0] = arg1; + break; + default: break; } @@ -4334,6 +4352,7 @@ bool Compiler::may_read_undefined_variable_in_block(const SPIRBlock &block, uint case OpCopyObject: case OpLoad: + case OpCooperativeVectorLoadNV: case OpCooperativeMatrixLoadKHR: if (ops[2] == var) return true; @@ -5151,7 +5170,7 @@ bool Compiler::is_depth_image(const SPIRType &type, uint32_t id) const bool Compiler::type_is_opaque_value(const SPIRType &type) const { return !type.pointer && (type.basetype == SPIRType::SampledImage || type.basetype == SPIRType::Image || - type.basetype == SPIRType::Sampler); + type.basetype == SPIRType::Sampler || type.basetype == SPIRType::Tensor); } // Make these member functions so we can easily break on any force_recompile events. @@ -5469,6 +5488,7 @@ bool Compiler::InterlockedResourceAccessHandler::handle(Op opcode, const uint32_ { case OpLoad: case OpCooperativeMatrixLoadKHR: + case OpCooperativeVectorLoadNV: { if (length < 3) return false; @@ -5547,6 +5567,7 @@ bool Compiler::InterlockedResourceAccessHandler::handle(Op opcode, const uint32_ case OpImageWrite: case OpAtomicStore: case OpCooperativeMatrixStoreKHR: + case OpCooperativeVectorStoreNV: { if (length < 1) return false; diff --git a/3rdparty/spirv-cross/spirv_cross.hpp b/3rdparty/spirv-cross/spirv_cross.hpp index b65b5ac77..65e4bedfa 100644 --- a/3rdparty/spirv-cross/spirv_cross.hpp +++ b/3rdparty/spirv-cross/spirv_cross.hpp @@ -95,6 +95,7 @@ struct ShaderResources SmallVector atomic_counters; SmallVector acceleration_structures; SmallVector gl_plain_uniforms; + SmallVector tensors; // There can only be one push constant block, // but keep the vector in case this restriction is lifted in the future. 
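// Illustrative usage sketch (not part of the patch): with the new `tensors` list in
// ShaderResources, OpTypeTensorARM bindings reflect like any other opaque resource.
// Assumes <cstdio>, <spirv_glsl.hpp>, and an input module that actually declares tensors.
static void dump_tensor_bindings(std::vector<uint32_t> spirv_binary)
{
    spirv_cross::CompilerGLSL compiler(std::move(spirv_binary));
    spirv_cross::ShaderResources res = compiler.get_shader_resources();
    for (const spirv_cross::Resource &t : res.tensors)
    {
        uint32_t set = compiler.get_decoration(t.id, spv::DecorationDescriptorSet);
        uint32_t binding = compiler.get_decoration(t.id, spv::DecorationBinding);
        printf("tensor %s: set=%u, binding=%u\n", t.name.c_str(), set, binding);
    }
}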
@@ -1171,6 +1172,7 @@ protected: bool type_contains_recursion(const SPIRType &type); bool type_is_array_of_pointers(const SPIRType &type) const; bool type_is_block_like(const SPIRType &type) const; + bool type_is_explicit_layout(const SPIRType &type) const; bool type_is_top_level_block(const SPIRType &type) const; bool type_is_opaque_value(const SPIRType &type) const; diff --git a/3rdparty/spirv-cross/spirv_cross_c.cpp b/3rdparty/spirv-cross/spirv_cross_c.cpp index 6827f6135..8a4a49279 100644 --- a/3rdparty/spirv-cross/spirv_cross_c.cpp +++ b/3rdparty/spirv-cross/spirv_cross_c.cpp @@ -200,6 +200,7 @@ struct spvc_resources_s : ScratchMemoryAllocation SmallVector separate_samplers; SmallVector acceleration_structures; SmallVector gl_plain_uniforms; + SmallVector tensors; SmallVector builtin_inputs; SmallVector builtin_outputs; @@ -1872,6 +1873,8 @@ bool spvc_resources_s::copy_resources(const ShaderResources &resources) return false; if (!copy_resources(gl_plain_uniforms, resources.gl_plain_uniforms)) return false; + if (!copy_resources(tensors, resources.tensors)) + return false; if (!copy_resources(builtin_inputs, resources.builtin_inputs)) return false; if (!copy_resources(builtin_outputs, resources.builtin_outputs)) @@ -2025,6 +2028,11 @@ spvc_result spvc_resources_get_resource_list_for_type(spvc_resources resources, case SPVC_RESOURCE_TYPE_GL_PLAIN_UNIFORM: list = &resources->gl_plain_uniforms; + break; + + case SPVC_RESOURCE_TYPE_TENSOR: + list = &resources->tensors; + break; default: break; diff --git a/3rdparty/spirv-cross/spirv_cross_c.h b/3rdparty/spirv-cross/spirv_cross_c.h index 4eab9225f..f360711e8 100644 --- a/3rdparty/spirv-cross/spirv_cross_c.h +++ b/3rdparty/spirv-cross/spirv_cross_c.h @@ -40,7 +40,7 @@ extern "C" { /* Bumped if ABI or API breaks backwards compatibility. */ #define SPVC_C_API_VERSION_MAJOR 0 /* Bumped if APIs or enumerations are added in a backwards compatible way. */ -#define SPVC_C_API_VERSION_MINOR 66 +#define SPVC_C_API_VERSION_MINOR 67 /* Bumped if internal implementation details change. */ #define SPVC_C_API_VERSION_PATCH 0 @@ -227,6 +227,7 @@ typedef enum spvc_resource_type SPVC_RESOURCE_TYPE_RAY_QUERY = 13, SPVC_RESOURCE_TYPE_SHADER_RECORD_BUFFER = 14, SPVC_RESOURCE_TYPE_GL_PLAIN_UNIFORM = 15, + SPVC_RESOURCE_TYPE_TENSOR = 16, SPVC_RESOURCE_TYPE_INT_MAX = 0x7fffffff } spvc_resource_type; diff --git a/3rdparty/spirv-cross/spirv_cross_parsed_ir.cpp b/3rdparty/spirv-cross/spirv_cross_parsed_ir.cpp index 397e40f4d..1dcd24e9f 100644 --- a/3rdparty/spirv-cross/spirv_cross_parsed_ir.cpp +++ b/3rdparty/spirv-cross/spirv_cross_parsed_ir.cpp @@ -452,6 +452,10 @@ void ParsedIR::set_decoration(ID id, Decoration decoration, uint32_t argument) dec.fp_rounding_mode = static_cast(argument); break; + case DecorationFPFastMathMode: + dec.fp_fast_math_mode = static_cast(argument); + break; + default: break; } @@ -523,8 +527,27 @@ void ParsedIR::mark_used_as_array_length(ID id) switch (ids[id].get_type()) { case TypeConstant: - get(id).is_used_as_array_length = true; + { + auto &c = get(id); + c.is_used_as_array_length = true; + + // Mark composite dependencies as well. 
+ for (auto &sub_id: c.m.id) + if (sub_id) + mark_used_as_array_length(sub_id); + + for (uint32_t col = 0; col < c.m.columns; col++) + { + for (auto &sub_id : c.m.c[col].id) + if (sub_id) + mark_used_as_array_length(sub_id); + } + + for (auto &sub_id : c.subconstants) + if (sub_id) + mark_used_as_array_length(sub_id); break; + } case TypeConstantOp: { @@ -643,6 +666,8 @@ uint32_t ParsedIR::get_decoration(ID id, Decoration decoration) const return dec.index; case DecorationFPRoundingMode: return dec.fp_rounding_mode; + case DecorationFPFastMathMode: + return dec.fp_fast_math_mode; default: return 1; } @@ -730,6 +755,10 @@ void ParsedIR::unset_decoration(ID id, Decoration decoration) dec.fp_rounding_mode = FPRoundingModeMax; break; + case DecorationFPFastMathMode: + dec.fp_fast_math_mode = FPFastMathModeMaskNone; + break; + case DecorationHlslCounterBufferGOOGLE: { auto &counter = meta[id].hlsl_magic_counter_buffer; diff --git a/3rdparty/spirv-cross/spirv_cross_parsed_ir.hpp b/3rdparty/spirv-cross/spirv_cross_parsed_ir.hpp index 3892248aa..8c30ef819 100644 --- a/3rdparty/spirv-cross/spirv_cross_parsed_ir.hpp +++ b/3rdparty/spirv-cross/spirv_cross_parsed_ir.hpp @@ -111,6 +111,7 @@ public: struct Source { + spv::SourceLanguage lang = spv::SourceLanguageUnknown; uint32_t version = 0; bool es = false; bool known = false; diff --git a/3rdparty/spirv-cross/spirv_glsl.cpp b/3rdparty/spirv-cross/spirv_glsl.cpp index ca9d0309d..5392bd796 100644 --- a/3rdparty/spirv-cross/spirv_glsl.cpp +++ b/3rdparty/spirv-cross/spirv_glsl.cpp @@ -41,6 +41,9 @@ using namespace spv; using namespace SPIRV_CROSS_NAMESPACE; using namespace std; +namespace SPIRV_CROSS_NAMESPACE +{ + enum ExtraSubExpressionType { // Create masks above any legal ID range to allow multiple address spaces into the extra_sub_expressions map. 
@@ -48,6 +51,46 @@ enum ExtraSubExpressionType EXTRA_SUB_EXPRESSION_TYPE_AUX = 0x20000000 }; +struct GlslConstantNameMapping +{ + uint32_t value; + const char *alias; +}; + +#define DEF_GLSL_MAPPING(x) { x, "gl_" #x } +#define DEF_GLSL_MAPPING_EXT(x) { x##KHR, "gl_" #x } +static const GlslConstantNameMapping CoopVecComponentTypeNames[] = { + DEF_GLSL_MAPPING(ComponentTypeFloat16NV), + DEF_GLSL_MAPPING(ComponentTypeFloat32NV), + DEF_GLSL_MAPPING(ComponentTypeFloat64NV), + DEF_GLSL_MAPPING(ComponentTypeSignedInt8NV), + DEF_GLSL_MAPPING(ComponentTypeSignedInt16NV), + DEF_GLSL_MAPPING(ComponentTypeSignedInt32NV), + DEF_GLSL_MAPPING(ComponentTypeSignedInt64NV), + DEF_GLSL_MAPPING(ComponentTypeUnsignedInt8NV), + DEF_GLSL_MAPPING(ComponentTypeUnsignedInt16NV), + DEF_GLSL_MAPPING(ComponentTypeUnsignedInt32NV), + DEF_GLSL_MAPPING(ComponentTypeUnsignedInt64NV), + DEF_GLSL_MAPPING(ComponentTypeSignedInt8PackedNV), + DEF_GLSL_MAPPING(ComponentTypeUnsignedInt8PackedNV), + DEF_GLSL_MAPPING(ComponentTypeFloatE4M3NV), + DEF_GLSL_MAPPING(ComponentTypeFloatE5M2NV), +}; + +static const GlslConstantNameMapping CoopVecMatrixLayoutNames[] = { + DEF_GLSL_MAPPING(CooperativeVectorMatrixLayoutRowMajorNV), + DEF_GLSL_MAPPING(CooperativeVectorMatrixLayoutColumnMajorNV), + DEF_GLSL_MAPPING(CooperativeVectorMatrixLayoutInferencingOptimalNV), + DEF_GLSL_MAPPING(CooperativeVectorMatrixLayoutTrainingOptimalNV), +}; + +static const GlslConstantNameMapping CoopMatMatrixLayoutNames[] = { + DEF_GLSL_MAPPING_EXT(CooperativeMatrixLayoutRowMajor), + DEF_GLSL_MAPPING_EXT(CooperativeMatrixLayoutColumnMajor), +}; +#undef DEF_GLSL_MAPPING +#undef DEF_GLSL_MAPPING_EXT + static bool is_unsigned_opcode(Op op) { // Don't have to be exhaustive, only relevant for legacy target checking ... @@ -159,6 +202,7 @@ static BufferPackingStandard packing_to_substruct_packing(BufferPackingStandard return packing; } } +} void CompilerGLSL::init() { @@ -617,6 +661,19 @@ void CompilerGLSL::find_static_extensions() ray_tracing_is_khr = true; break; + case CapabilityRayTracingClusterAccelerationStructureNV: + if (options.es || options.version < 460 || !options.vulkan_semantics) + SPIRV_CROSS_THROW("Cluster AS requires Vulkan GLSL 460."); + require_extension_internal("GL_NV_cluster_acceleration_structure"); + ray_tracing_is_khr = true; + break; + + case CapabilityTensorsARM: + if (options.es || options.version < 460 || !options.vulkan_semantics) + SPIRV_CROSS_THROW("Tensor requires Vulkan GLSL 460."); + require_extension_internal("GL_ARM_tensors"); + break; + default: break; } @@ -1529,9 +1586,12 @@ uint32_t CompilerGLSL::type_to_packed_base_size(const SPIRType &type, BufferPack case SPIRType::Half: case SPIRType::Short: case SPIRType::UShort: + case SPIRType::BFloat16: return 2; case SPIRType::SByte: case SPIRType::UByte: + case SPIRType::FloatE4M3: + case SPIRType::FloatE5M2: return 1; default: @@ -2550,7 +2610,7 @@ void CompilerGLSL::emit_buffer_block_flattened(const SPIRVariable &var) SPIRV_CROSS_THROW("Basic types in a flattened UBO must be float, int or uint."); auto flags = ir.get_buffer_block_flags(var); - statement("uniform ", flags_to_qualifiers_glsl(tmp, flags), type_to_glsl(tmp), " ", buffer_name, "[", + statement("uniform ", flags_to_qualifiers_glsl(tmp, 0, flags), type_to_glsl(tmp), " ", buffer_name, "[", buffer_size, "];"); } else @@ -5791,8 +5851,22 @@ string CompilerGLSL::constant_op_expression(const SPIRConstantOp &cop) case OpCompositeExtract: { - auto expr = access_chain_internal(cop.arguments[0], &cop.arguments[1], 
uint32_t(cop.arguments.size() - 1), - ACCESS_CHAIN_INDEX_IS_LITERAL_BIT, nullptr); + // Trivial vector extracts (of WorkGroupSize typically), + // punch through to the input spec constant if the composite is used as array size. + const auto *c = maybe_get(cop.arguments[0]); + + string expr; + if (c && cop.arguments.size() == 2 && c->is_used_as_array_length && + !backend.supports_spec_constant_array_size && + is_vector(get(c->constant_type))) + { + expr = to_expression(c->specialization_constant_id(0, cop.arguments[1])); + } + else + { + expr = access_chain_internal(cop.arguments[0], &cop.arguments[1], uint32_t(cop.arguments.size() - 1), + ACCESS_CHAIN_INDEX_IS_LITERAL_BIT, nullptr); + } return expr; } @@ -5940,6 +6014,30 @@ string CompilerGLSL::constant_expression(const SPIRConstant &c, require_extension_internal("GL_EXT_null_initializer"); return backend.constant_null_initializer; } + else if (c.replicated && type.op != spv::OpTypeArray) + { + if (type.op == spv::OpTypeMatrix) + { + uint32_t num_elements = type.columns; + // GLSL does not allow the replication constructor for matrices + // mat4(vec4(0.0)) needs to be manually expanded to mat4(vec4(0.0), vec4(0.0), vec4(0.0), vec4(0.0)); + std::string res; + res += type_to_glsl(type); + res += "("; + for (uint32_t i = 0; i < num_elements; i++) + { + res += to_expression(c.subconstants[0]); + if (i < num_elements - 1) + res += ", "; + } + res += ")"; + return res; + } + else + { + return join(type_to_glsl(type), "(", to_expression(c.subconstants[0]), ")"); + } + } else if (!c.subconstants.empty()) { // Handles Arrays and structures. @@ -5989,8 +6087,16 @@ string CompilerGLSL::constant_expression(const SPIRConstant &c, } uint32_t subconstant_index = 0; - for (auto &elem : c.subconstants) + size_t num_elements = c.subconstants.size(); + if (c.replicated) { + if (type.array.size() != 1) + SPIRV_CROSS_THROW("Multidimensional arrays not yet supported as replicated constans"); + num_elements = type.array[0]; + } + for (size_t i = 0; i < num_elements; i++) + { + auto &elem = c.subconstants[c.replicated ? 0 : i]; if (auto *op = maybe_get(elem)) { res += constant_op_expression(*op); @@ -6021,7 +6127,7 @@ string CompilerGLSL::constant_expression(const SPIRConstant &c, } } - if (&elem != &c.subconstants.back()) + if (i != num_elements - 1) res += ", "; subconstant_index++; @@ -6095,17 +6201,44 @@ string CompilerGLSL::constant_expression(const SPIRConstant &c, #pragma warning(disable : 4996) #endif +string CompilerGLSL::convert_floate4m3_to_string(const SPIRConstant &c, uint32_t col, uint32_t row) +{ + string res; + float float_value = c.scalar_floate4m3(col, row); + + // There is no infinity in e4m3. + if (std::isnan(float_value)) + { + SPIRType type { OpTypeFloat }; + type.basetype = SPIRType::Half; + type.vecsize = 1; + type.columns = 1; + res = join(type_to_glsl(type), "(0.0 / 0.0)"); + } + else + { + SPIRType type { OpTypeFloat }; + type.basetype = SPIRType::FloatE4M3; + type.vecsize = 1; + type.columns = 1; + res = join(type_to_glsl(type), "(", format_float(float_value), ")"); + } + + return res; +} + string CompilerGLSL::convert_half_to_string(const SPIRConstant &c, uint32_t col, uint32_t row) { string res; - float float_value = c.scalar_f16(col, row); + bool is_bfloat8 = get(c.constant_type).basetype == SPIRType::FloatE5M2; + float float_value = is_bfloat8 ? c.scalar_bf8(col, row) : c.scalar_f16(col, row); // There is no literal "hf" in GL_NV_gpu_shader5, so to avoid lots // of complicated workarounds, just value-cast to the half type always. 
if (std::isnan(float_value) || std::isinf(float_value)) { SPIRType type { OpTypeFloat }; - type.basetype = SPIRType::Half; + type.basetype = is_bfloat8 ? SPIRType::FloatE5M2 : SPIRType::Half; type.vecsize = 1; type.columns = 1; @@ -6121,7 +6254,7 @@ string CompilerGLSL::convert_half_to_string(const SPIRConstant &c, uint32_t col, else { SPIRType type { OpTypeFloat }; - type.basetype = SPIRType::Half; + type.basetype = is_bfloat8 ? SPIRType::FloatE5M2 : SPIRType::Half; type.vecsize = 1; type.columns = 1; res = join(type_to_glsl(type), "(", format_float(float_value), ")"); @@ -6358,6 +6491,29 @@ string CompilerGLSL::constant_expression_vector(const SPIRConstant &c, uint32_t switch (type.basetype) { + case SPIRType::FloatE4M3: + if (splat || swizzle_splat) + { + res += convert_floate4m3_to_string(c, vector, 0); + if (swizzle_splat) + res = remap_swizzle(get(c.constant_type), 1, res); + } + else + { + for (uint32_t i = 0; i < c.vector_size(); i++) + { + if (c.vector_size() > 1 && c.specialization_constant_id(vector, i) != 0) + res += to_expression(c.specialization_constant_id(vector, i)); + else + res += convert_floate4m3_to_string(c, vector, i); + + if (i + 1 < c.vector_size()) + res += ", "; + } + } + break; + + case SPIRType::FloatE5M2: case SPIRType::Half: if (splat || swizzle_splat) { @@ -6722,7 +6878,7 @@ void CompilerGLSL::emit_uninitialized_temporary(uint32_t result_type, uint32_t r if (options.force_zero_initialized_variables && type_can_zero_initialize(type)) initializer = join(" = ", to_zero_initialized_expression(result_type)); - statement(flags_to_qualifiers_glsl(type, flags), variable_decl(type, to_name(result_id)), initializer, ";"); + statement(flags_to_qualifiers_glsl(type, result_id, flags), variable_decl(type, to_name(result_id)), initializer, ";"); } } @@ -6757,7 +6913,7 @@ string CompilerGLSL::declare_temporary(uint32_t result_type, uint32_t result_id) // The result_id has not been made into an expression yet, so use flags interface. add_local_variable_name(result_id); auto &flags = get_decoration_bitset(result_id); - return join(flags_to_qualifiers_glsl(type, flags), variable_decl(type, to_name(result_id)), " = "); + return join(flags_to_qualifiers_glsl(type, result_id, flags), variable_decl(type, to_name(result_id)), " = "); } } @@ -6830,7 +6986,7 @@ void CompilerGLSL::emit_binary_op(uint32_t result_type, uint32_t result_id, uint { // Various FP arithmetic opcodes such as add, sub, mul will hit this. 
bool force_temporary_precise = backend.support_precise_qualifier && - has_decoration(result_id, DecorationNoContraction) && + has_legacy_nocontract(result_type, result_id) && type_is_floating_point(get(result_type)); bool forward = should_forward(op0) && should_forward(op1) && !force_temporary_precise; @@ -9795,6 +9951,22 @@ string CompilerGLSL::bitcast_glsl_op(const SPIRType &out_type, const SPIRType &i return "bfloat16BitsToUintEXT"; else if (out_type.basetype == SPIRType::Short && in_type.basetype == SPIRType::BFloat16) return "bfloat16BitsToIntEXT"; + else if (out_type.basetype == SPIRType::FloatE4M3 && in_type.basetype == SPIRType::UByte) + return "uintBitsToFloate4m3EXT"; + else if (out_type.basetype == SPIRType::FloatE4M3 && in_type.basetype == SPIRType::SByte) + return "intBitsToFloate4m3EXT"; + else if (out_type.basetype == SPIRType::UByte && in_type.basetype == SPIRType::FloatE4M3) + return "floate4m3BitsToUintEXT"; + else if (out_type.basetype == SPIRType::SByte && in_type.basetype == SPIRType::FloatE4M3) + return "floate4m3BitsToIntEXT"; + else if (out_type.basetype == SPIRType::FloatE5M2 && in_type.basetype == SPIRType::UByte) + return "uintBitsToFloate5m2EXT"; + else if (out_type.basetype == SPIRType::FloatE5M2 && in_type.basetype == SPIRType::SByte) + return "intBitsToFloate5m2EXT"; + else if (out_type.basetype == SPIRType::UByte && in_type.basetype == SPIRType::FloatE5M2) + return "floate5m2BitsToUintEXT"; + else if (out_type.basetype == SPIRType::SByte && in_type.basetype == SPIRType::FloatE5M2) + return "floate5m2BitsToIntEXT"; return ""; } @@ -10198,6 +10370,14 @@ string CompilerGLSL::builtin_to_glsl(BuiltIn builtin, StorageClass storage) case BuiltInCullPrimitiveEXT: return "gl_CullPrimitiveEXT"; + case BuiltInClusterIDNV: + { + if (!options.vulkan_semantics) + SPIRV_CROSS_THROW("Can only use ClusterIDNV in Vulkan GLSL."); + require_extension_internal("GL_NV_cluster_acceleration_structure"); + return "gl_ClusterIDNV"; + } + default: return join("gl_BuiltIn_", convert_to_string(builtin)); } @@ -10441,8 +10621,8 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice if (ptr_chain_array_entry) expr = join("(", expr, ")"); } - // Arrays - else if (!type->array.empty()) + // Arrays and OpTypeCooperativeVectorNV (aka fancy arrays) + else if (!type->array.empty() || type->op == spv::OpTypeCooperativeVectorNV) { // If we are flattening multidimensional arrays, only create opening bracket on first // array index. 
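The hunk above extends bitcast_glsl_op() so that reinterpret casts between the new 8-bit float types (FloatE4M3, FloatE5M2) and 8-bit integers map onto the GL_EXT_float_e4m3 / GL_EXT_float_e5m2 builtins. A minimal sketch of that mapping as a standalone free function; the helper name and signature here are illustrative assumptions, not part of SPIRV-Cross:

// Hypothetical helper condensing the bitcast table added above: pick the GLSL builtin
// for an 8-bit float <-> 8-bit integer reinterpretation.
static const char *fp8_bitcast_glsl_name(bool to_fp8, bool e4m3, bool int_is_signed)
{
    if (to_fp8)
        return e4m3 ? (int_is_signed ? "intBitsToFloate4m3EXT" : "uintBitsToFloate4m3EXT")
                    : (int_is_signed ? "intBitsToFloate5m2EXT" : "uintBitsToFloate5m2EXT");
    return e4m3 ? (int_is_signed ? "floate4m3BitsToIntEXT" : "floate4m3BitsToUintEXT")
                : (int_is_signed ? "floate5m2BitsToIntEXT" : "floate5m2BitsToUintEXT");
}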
@@ -11547,7 +11727,7 @@ void CompilerGLSL::emit_variable_temporary_copies(const SPIRVariable &var) { auto &type = get(var.basetype); auto &flags = get_decoration_bitset(var.self); - statement(flags_to_qualifiers_glsl(type, flags), variable_decl(type, join("_", var.self, "_copy")), ";"); + statement(flags_to_qualifiers_glsl(type, var.self, flags), variable_decl(type, join("_", var.self, "_copy")), ";"); flushed_phi_variables.insert(var.self); } } @@ -13761,13 +13941,42 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) break; } + case OpCooperativeMatrixConvertNV: + if (!options.vulkan_semantics) + SPIRV_CROSS_THROW("CooperativeMatrixConvertNV requires vulkan semantics."); + require_extension_internal("GL_NV_cooperative_matrix2"); + // fallthrough case OpFConvert: { uint32_t result_type = ops[0]; uint32_t id = ops[1]; - auto func = type_to_glsl_constructor(get(result_type)); - emit_unary_func_op(result_type, id, ops[2], func.c_str()); + auto &type = get(result_type); + + if (type.op == OpTypeCooperativeMatrixKHR && opcode == OpFConvert) + { + auto &expr_type = expression_type(ops[2]); + if (get(type.ext.cooperative.use_id).scalar() != + get(expr_type.ext.cooperative.use_id).scalar()) + { + // Somewhat questionable with spec constant uses. + if (!options.vulkan_semantics) + SPIRV_CROSS_THROW("NV_cooperative_matrix2 requires vulkan semantics."); + require_extension_internal("GL_NV_cooperative_matrix2"); + } + } + + if ((type.basetype == SPIRType::FloatE4M3 || type.basetype == SPIRType::FloatE5M2) && + has_decoration(id, spv::DecorationSaturatedToLargestFloat8NormalConversionEXT)) + { + emit_uninitialized_temporary_expression(result_type, id); + statement("saturatedConvertEXT(", to_expression(id), ", ", to_unpacked_expression(ops[2]), ");"); + } + else + { + auto func = type_to_glsl_constructor(type); + emit_unary_func_op(result_type, id, ops[2], func.c_str()); + } break; } @@ -14913,8 +15122,11 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) { emit_spv_amd_gcn_shader_op(ops[0], ops[1], ops[3], &ops[4], length - 4); } + else if (ext == SPIRExtension::NonSemanticShaderDebugInfo) + { + emit_non_semantic_shader_debug_info(ops[0], ops[1], ops[3], &ops[4], length - 4); + } else if (ext == SPIRExtension::SPV_debug_info || - ext == SPIRExtension::NonSemanticShaderDebugInfo || ext == SPIRExtension::NonSemanticGeneric) { break; // Ignore SPIR-V debug information extended instructions. 
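With the change just above, NonSemanticShaderDebugInfo extended instructions are no longer silently dropped; they are forwarded to emit_non_semantic_shader_debug_info(), which (defined further down in this patch) turns DebugLine into #line directives and records DebugSource file-name strings. A rough model of that dispatch, using simplified placeholder types rather than the real SPIRV-Cross signatures:

#include <cstdint>
#include <functional>
#include <string>

// Placeholder enum; the real opcodes come from the NonSemantic.Shader.DebugInfo.100 set.
enum class ShaderDebugOp { DebugLine, DebugSource, Other };

// Sketch mirroring the shape of emit_non_semantic_shader_debug_info() shown later in this patch.
void dispatch_shader_debug_op(ShaderDebugOp op, uint32_t file_id, uint32_t line,
                              const std::function<void(uint32_t, uint32_t)> &emit_line_directive,
                              const std::function<void(uint32_t)> &remember_source_string)
{
    switch (op)
    {
    case ShaderDebugOp::DebugLine:
        emit_line_directive(file_id, line); // becomes a #line directive when emit_line_directives is set
        break;
    case ShaderDebugOp::DebugSource:
        remember_source_string(file_id);    // forward the source file-name string id
        break;
    default:
        break;                              // other debug instructions remain ignored
    }
}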
@@ -15319,7 +15531,81 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) GLSL_RAY_QUERY_GET_OP2(IntersectionWorldToObject); #undef GLSL_RAY_QUERY_GET_OP #undef GLSL_RAY_QUERY_GET_OP2 + case OpRayQueryGetClusterIdNV: + flush_variable_declaration(ops[2]); + emit_op(ops[0], ops[1], join("rayQueryGetIntersectionClusterIdNV(", to_expression(ops[2]), ", ", "bool(", to_expression(ops[3]), "))"), false); + break; + case OpTensorQuerySizeARM: + flush_variable_declaration(ops[1]); + // tensorSizeARM(tensor, dimension) + emit_binary_func_op(ops[0], ops[1], ops[2], ops[3], "tensorSizeARM"); + break; + case OpTensorReadARM: + { + flush_variable_declaration(ops[1]); + emit_uninitialized_temporary_expression(ops[0], ops[1]); + SmallVector args { + to_expression(ops[2]), // tensor + to_expression(ops[3]), // coordinates + to_expression(ops[1]), // out value + }; + if (length > 4) + { + std::string tensor_operands; + if (ops[4] == 0) + tensor_operands = "0x0u"; + else if (ops[4] == spv::TensorOperandsNontemporalARMMask) + tensor_operands = "gl_TensorOperandsNonTemporalARM"; + else if (ops[4] == spv::TensorOperandsOutOfBoundsValueARMMask) + tensor_operands = "gl_TensorOperandsOutOfBoundsValueARM"; + else if (ops[4] == (spv::TensorOperandsNontemporalARMMask | spv::TensorOperandsOutOfBoundsValueARMMask)) + tensor_operands = "gl_TensorOperandsNonTemporalARM | gl_TensorOperandsOutOfBoundsValueARM"; + else + SPIRV_CROSS_THROW("Invalid tensorOperands for tensorReadARM."); + if ((ops[4] & spv::TensorOperandsOutOfBoundsValueARMMask) && length != 6) + SPIRV_CROSS_THROW("gl_TensorOperandsOutOfBoundsValueARM requires an outOfBoundsValue argument."); + args.push_back(tensor_operands); // tensorOperands + } + if (length >= 6) + { + if ((length > 6) || (ops[4] & spv::TensorOperandsOutOfBoundsValueARMMask) == 0) + SPIRV_CROSS_THROW("Too many arguments to tensorReadARM."); + args.push_back(to_expression(ops[5])); // outOfBoundsValue + } + + // tensorRead(tensor, sizeof(type), coordinates, value, operand, ...) 
+ statement("tensorReadARM(", merge(args), ");"); + break; + } + case OpTensorWriteARM: + { + flush_variable_declaration(ops[0]); + + SmallVector args { + to_expression(ops[0]), // tensor + to_expression(ops[1]), // coordinates + to_expression(ops[2]), // out value + }; + + if (length > 3) + { + std::string tensor_operands; + if (ops[3] == 0) + tensor_operands = "0x0u"; + else if (ops[3] == spv::TensorOperandsNontemporalARMMask) + tensor_operands = "gl_TensorOperandsNonTemporalARM"; + else + SPIRV_CROSS_THROW("Invalid tensorOperands for tensorWriteARM."); + args.push_back(tensor_operands); // tensorOperands + } + if (length > 4) + SPIRV_CROSS_THROW("Too many arguments to tensorWriteARM."); + + // tensorWrite(tensor, sizeof(type), coordinates, value) + statement("tensorWriteARM(", merge(args), ");"); + break; + } case OpConvertUToAccelerationStructureKHR: { require_extension_internal("GL_EXT_ray_tracing"); @@ -15465,6 +15751,104 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) break; } + case OpCooperativeVectorLoadNV: + { + uint32_t result_type = ops[0]; + uint32_t id = ops[1]; + + emit_uninitialized_temporary_expression(result_type, id); + + statement("coopVecLoadNV(", to_expression(id), ", ", to_expression(ops[2]), ", ", to_expression(ops[3]), ");"); + register_read(id, ops[2], false); + break; + } + + case OpCooperativeVectorStoreNV: + { + uint32_t id = ops[0]; + + statement("coopVecStoreNV(", to_expression(ops[2]), ", ", to_expression(id), ", ", to_expression(ops[1]), ");"); + register_write(ops[2]); + break; + } + + case OpCooperativeVectorOuterProductAccumulateNV: + { + auto buf = ops[0]; + auto offset = ops[1]; + auto v1 = ops[2]; + auto v2 = ops[3]; + auto matrix_layout_id = ops[4]; + auto matrix_iterpretation_id = ops[5]; + auto matrix_stride_id = length >= 6 ? ops[6] : 0; + statement(join("coopVecOuterProductAccumulateNV(", to_expression(v1), ", ", to_expression(v2), ", ", + to_expression(buf), ", ", to_expression(offset), ", ", + matrix_stride_id ? 
to_expression(matrix_stride_id) : "0", + ", ", to_pretty_expression_if_int_constant( + matrix_layout_id, std::begin(CoopVecMatrixLayoutNames), std::end(CoopVecMatrixLayoutNames)), + ", ", to_pretty_expression_if_int_constant( + matrix_iterpretation_id, std::begin(CoopVecComponentTypeNames), std::end(CoopVecComponentTypeNames)), + ");")); + register_write(ops[0]); + break; + } + + case OpCooperativeVectorReduceSumAccumulateNV: + { + auto buf = ops[0]; + auto offset = ops[1]; + auto v1 = ops[2]; + statement(join("coopVecReduceSumAccumulateNV(", to_expression(v1), ", ", to_expression(buf), ", ", + to_expression(offset), ");")); + register_write(ops[0]); + break; + } + + case OpCooperativeVectorMatrixMulNV: + case OpCooperativeVectorMatrixMulAddNV: + { + uint32_t result_type = ops[0]; + uint32_t id = ops[1]; + + emit_uninitialized_temporary_expression(result_type, id); + + std::string stmt; + switch (opcode) + { + case OpCooperativeVectorMatrixMulAddNV: + stmt += "coopVecMatMulAddNV("; + break; + case OpCooperativeVectorMatrixMulNV: + stmt += "coopVecMatMulNV("; + break; + default: + SPIRV_CROSS_THROW("Invalid op code for coopvec instruction."); + } + for (uint32_t i = 1; i < length; i++) + { + // arguments 3, 6 and in case of MulAddNv also 9 use component type int constants + if (i == 3 || i == 6 || (i == 9 && opcode == OpCooperativeVectorMatrixMulAddNV)) + { + stmt += to_pretty_expression_if_int_constant( + ops[i], std::begin(CoopVecComponentTypeNames), std::end(CoopVecComponentTypeNames)); + } + else if ((i == 12 && opcode == OpCooperativeVectorMatrixMulAddNV) || + (i == 9 && opcode == OpCooperativeVectorMatrixMulNV)) + { + stmt += to_pretty_expression_if_int_constant( + ops[i], std::begin(CoopVecMatrixLayoutNames), std::end(CoopVecMatrixLayoutNames)); + } + else + stmt += to_expression(ops[i]); + + if (i < length - 1) + stmt += ", "; + } + stmt += ");"; + statement(stmt); + break; + } + case OpCooperativeMatrixLengthKHR: { // Need to synthesize a dummy temporary, since the SPIR-V opcode is based on the type. 
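Several of the cooperative-vector and cooperative-matrix cases above route layout and component-type operands through to_pretty_expression_if_int_constant(), whose definition appears near the end of spirv_glsl.cpp in this patch: a non-specialization integer constant is looked up in one of the {value, "gl_..."} tables (CoopVecComponentTypeNames, CoopVecMatrixLayoutNames, CoopMatMatrixLayoutNames) and replaced by its GLSL alias, otherwise the operand falls back to a plain int(...) cast. A self-contained sketch of that lookup, with a simplified signature for illustration only:

#include <cstdint>
#include <string>

struct Mapping { uint32_t value; const char *alias; }; // mirrors GlslConstantNameMapping

static std::string pretty_int_constant(uint32_t value, const Mapping *first, const Mapping *last)
{
    for (const Mapping *m = first; m != last; ++m)
        if (m->value == value)
            return m->alias;                          // e.g. "gl_CooperativeMatrixLayoutRowMajor"
    return "int(" + std::to_string(value) + ")";      // values without an alias keep the integer cast
}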
@@ -15492,27 +15876,10 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) if (!is_forcing_recompilation()) split_expr = split_coopmat_pointer(expr); - string layout_expr; - if (const auto *layout = maybe_get(ops[3])) - { - if (!layout->specialization) - { - if (layout->scalar() == spv::CooperativeMatrixLayoutColumnMajorKHR) - layout_expr = "gl_CooperativeMatrixLayoutColumnMajor"; - else - layout_expr = "gl_CooperativeMatrixLayoutRowMajor"; - } - } - - if (layout_expr.empty()) - layout_expr = join("int(", to_expression(ops[3]), ")"); - - statement("coopMatLoad(", - to_expression(id), ", ", - split_expr.first, ", ", - split_expr.second, ", ", - to_expression(ops[4]), ", ", - layout_expr, ");"); + string layout_expr = to_pretty_expression_if_int_constant( + ops[3], std::begin(CoopMatMatrixLayoutNames), std::end(CoopMatMatrixLayoutNames)); + statement("coopMatLoad(", to_expression(id), ", ", split_expr.first, ", ", split_expr.second, ", ", + to_expression(ops[4]), ", ", layout_expr, ");"); register_read(id, ops[2], false); break; @@ -15532,27 +15899,11 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) if (!is_forcing_recompilation()) split_expr = split_coopmat_pointer(expr); - string layout_expr; - if (const auto *layout = maybe_get(ops[2])) - { - if (!layout->specialization) - { - if (layout->scalar() == spv::CooperativeMatrixLayoutColumnMajorKHR) - layout_expr = "gl_CooperativeMatrixLayoutColumnMajor"; - else - layout_expr = "gl_CooperativeMatrixLayoutRowMajor"; - } - } + string layout_expr = to_pretty_expression_if_int_constant( + ops[2], std::begin(CoopMatMatrixLayoutNames), std::end(CoopMatMatrixLayoutNames)); - if (layout_expr.empty()) - layout_expr = join("int(", to_expression(ops[2]), ")"); - - statement("coopMatStore(", - to_expression(ops[1]), ", ", - split_expr.first, ", ", - split_expr.second, ", ", - to_expression(ops[3]), ", ", - layout_expr, ");"); + statement("coopMatStore(", to_expression(ops[1]), ", ", split_expr.first, ", ", split_expr.second, ", ", + to_expression(ops[3]), ", ", layout_expr, ");"); // TODO: Do we care about memory operands? @@ -15583,6 +15934,51 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) break; } + case OpCompositeConstructReplicateEXT: + { + uint32_t result_type = ops[0]; + uint32_t id = ops[1]; + + auto &type = get(result_type); + auto value_to_replicate = to_expression(ops[2]); + std::string rhs; + // Matrices don't have a replicating constructor for vectors. Need to manually replicate + if (type.op == spv::OpTypeMatrix || type.op == spv::OpTypeArray) + { + if (type.op == spv::OpTypeArray && type.array.size() != 1) + { + SPIRV_CROSS_THROW( + "Multi-dimensional arrays currently not supported for OpCompositeConstructReplicateEXT"); + } + uint32_t num_elements = type.op == spv::OpTypeMatrix ? 
type.columns : type.array[0]; + if (backend.use_initializer_list && type.op == spv::OpTypeArray) + { + rhs += "{"; + } + else + { + rhs += type_to_glsl_constructor(type); + rhs += "("; + } + for (uint32_t i = 0; i < num_elements; i++) + { + rhs += value_to_replicate; + if (i < num_elements - 1) + rhs += ", "; + } + if (backend.use_initializer_list && type.op == spv::OpTypeArray) + rhs += "}"; + else + rhs += ")"; + } + else + { + rhs = join(type_to_glsl(type), "(", to_expression(ops[2]), ")"); + } + emit_op(result_type, id, rhs, true); + break; + } + default: statement("// unimplemented op ", instruction.op); break; @@ -15800,7 +16196,7 @@ void CompilerGLSL::emit_struct_member(const SPIRType &type, uint32_t member_type if (is_block) qualifiers = to_interpolation_qualifiers(memberflags); - statement(layout_for_member(type, index), qualifiers, qualifier, flags_to_qualifiers_glsl(membertype, memberflags), + statement(layout_for_member(type, index), qualifiers, qualifier, flags_to_qualifiers_glsl(membertype, 0, memberflags), variable_decl(membertype, to_member_name(type, index)), ";"); } @@ -15808,7 +16204,7 @@ void CompilerGLSL::emit_struct_padding_target(const SPIRType &) { } -string CompilerGLSL::flags_to_qualifiers_glsl(const SPIRType &type, const Bitset &flags) +string CompilerGLSL::flags_to_qualifiers_glsl(const SPIRType &type, uint32_t id, const Bitset &flags) { // GL_EXT_buffer_reference variables can be marked as restrict. if (flags.get(DecorationRestrictPointerEXT)) @@ -15816,8 +16212,12 @@ string CompilerGLSL::flags_to_qualifiers_glsl(const SPIRType &type, const Bitset string qual; - if (type_is_floating_point(type) && flags.get(DecorationNoContraction) && backend.support_precise_qualifier) + if (type_is_floating_point(type) && + (flags.get(DecorationNoContraction) || (type.self && has_legacy_nocontract(type.self, id))) && + backend.support_precise_qualifier) + { qual = "precise "; + } // Structs do not have precision qualifiers, neither do doubles (desktop only anyways, so no mediump/highp). 
bool type_supports_precision = @@ -15886,7 +16286,7 @@ string CompilerGLSL::to_precision_qualifiers_glsl(uint32_t id) if (result_type.width < 32) return "mediump "; } - return flags_to_qualifiers_glsl(type, ir.meta[id].decoration.decoration_flags); + return flags_to_qualifiers_glsl(type, id, ir.meta[id].decoration.decoration_flags); } void CompilerGLSL::fixup_io_block_patch_primitive_qualifiers(const SPIRVariable &var) @@ -15969,6 +16369,13 @@ string CompilerGLSL::to_qualifiers_glsl(uint32_t id) SPIRV_CROSS_THROW("Cannot use GL_EXT_shader_image_load_formatted in ESSL."); } } + else if (type.basetype == SPIRType::Tensor) + { + if (flags.get(DecorationNonWritable)) + res += "readonly "; + if (flags.get(DecorationNonReadable)) + res += "writeonly "; + } res += to_precision_qualifiers_glsl(id); @@ -16403,6 +16810,14 @@ string CompilerGLSL::type_to_glsl(const SPIRType &type, uint32_t id) case SPIRType::RayQuery: return "rayQueryEXT"; + case SPIRType::Tensor: + if (type.ext.tensor.rank == 0) + SPIRV_CROSS_THROW("GLSL tensors must have a Rank."); + if (type.ext.tensor.shape != 0) + SPIRV_CROSS_THROW("GLSL tensors cannot have a Shape."); + return join("tensorARM<", type_to_glsl(get(type.ext.tensor.type)), ", ", + to_expression(type.ext.tensor.rank), ">"); + case SPIRType::Void: return "void"; @@ -16430,6 +16845,17 @@ string CompilerGLSL::type_to_glsl(const SPIRType &type, uint32_t id) require_extension_internal("GL_ARB_shader_atomic_counters"); } + if (type.op == spv::OpTypeCooperativeVectorNV) + { + require_extension_internal("GL_NV_cooperative_vector"); + if (!options.vulkan_semantics) + SPIRV_CROSS_THROW("Cooperative vector NV only available in Vulkan."); + + std::string component_type_str = type_to_glsl(get(type.ext.coopVecNV.component_type_id)); + + return join("coopvecNV<", component_type_str, ", ", to_expression(type.ext.coopVecNV.component_count_id), ">"); + } + const SPIRType *coop_type = &type; while (is_pointer(*coop_type) || is_array(*coop_type)) coop_type = &get(coop_type->parent_type); @@ -16440,7 +16866,7 @@ string CompilerGLSL::type_to_glsl(const SPIRType &type, uint32_t id) if (!options.vulkan_semantics) SPIRV_CROSS_THROW("Cooperative matrix only available in Vulkan."); // GLSL doesn't support this as spec constant, which makes sense ... 
- uint32_t use_type = get(coop_type->cooperative.use_id).scalar(); + uint32_t use_type = get(coop_type->ext.cooperative.use_id).scalar(); const char *use = nullptr; switch (use_type) @@ -16462,7 +16888,7 @@ string CompilerGLSL::type_to_glsl(const SPIRType &type, uint32_t id) } string scope_expr; - if (const auto *scope = maybe_get(coop_type->cooperative.scope_id)) + if (const auto *scope = maybe_get(coop_type->ext.cooperative.scope_id)) { if (!scope->specialization) { @@ -16477,12 +16903,12 @@ string CompilerGLSL::type_to_glsl(const SPIRType &type, uint32_t id) } if (scope_expr.empty()) - scope_expr = to_expression(coop_type->cooperative.scope_id); + scope_expr = to_expression(coop_type->ext.cooperative.scope_id); return join("coopmat<", type_to_glsl(get(coop_type->parent_type)), ", ", scope_expr, ", ", - to_expression(coop_type->cooperative.rows_id), ", ", - to_expression(coop_type->cooperative.columns_id), ", ", use, ">"); + to_expression(coop_type->ext.cooperative.rows_id), ", ", + to_expression(coop_type->ext.cooperative.columns_id), ", ", use, ">"); } if (type.vecsize == 1 && type.columns == 1) // Scalar builtin @@ -16512,6 +16938,16 @@ string CompilerGLSL::type_to_glsl(const SPIRType &type, uint32_t id) SPIRV_CROSS_THROW("bfloat16 requires Vulkan semantics."); require_extension_internal("GL_EXT_bfloat16"); return "bfloat16_t"; + case SPIRType::FloatE4M3: + if (!options.vulkan_semantics) + SPIRV_CROSS_THROW("floate4m3_t requires Vulkan semantics."); + require_extension_internal("GL_EXT_float_e4m3"); + return "floate4m3_t"; + case SPIRType::FloatE5M2: + if (!options.vulkan_semantics) + SPIRV_CROSS_THROW("floate5m2_t requires Vulkan semantics."); + require_extension_internal("GL_EXT_float_e5m2"); + return "floate5m2_t"; case SPIRType::Float: return "float"; case SPIRType::Double: @@ -16549,6 +16985,16 @@ string CompilerGLSL::type_to_glsl(const SPIRType &type, uint32_t id) SPIRV_CROSS_THROW("bfloat16 requires Vulkan semantics."); require_extension_internal("GL_EXT_bfloat16"); return join("bf16vec", type.vecsize); + case SPIRType::FloatE4M3: + if (!options.vulkan_semantics) + SPIRV_CROSS_THROW("floate4m3_t requires Vulkan semantics."); + require_extension_internal("GL_EXT_float_e4m3"); + return join("fe4m3vec", type.vecsize); + case SPIRType::FloatE5M2: + if (!options.vulkan_semantics) + SPIRV_CROSS_THROW("floate5m2_t requires Vulkan semantics."); + require_extension_internal("GL_EXT_float_e5m2"); + return join("fe5m2vec", type.vecsize); case SPIRType::Float: return join("vec", type.vecsize); case SPIRType::Double: @@ -16785,7 +17231,7 @@ void CompilerGLSL::emit_function_prototype(SPIRFunction &func, const Bitset &ret string decl; auto &type = get(func.return_type); - decl += flags_to_qualifiers_glsl(type, return_flags); + decl += flags_to_qualifiers_glsl(type, 0, return_flags); decl += type_to_glsl(type); decl += type_to_array_glsl(type, 0); decl += " "; @@ -17643,7 +18089,7 @@ void CompilerGLSL::emit_hoisted_temporaries(SmallVector> &tempo if (options.force_zero_initialized_variables && type_can_zero_initialize(type)) initializer = join(" = ", to_zero_initialized_expression(tmp.first)); - statement(flags_to_qualifiers_glsl(type, flags), variable_decl(type, to_name(tmp.second)), initializer, ";"); + statement(flags_to_qualifiers_glsl(type, tmp.second, flags), variable_decl(type, to_name(tmp.second)), initializer, ";"); hoisted_temporaries.insert(tmp.second); forced_temporaries.insert(tmp.second); @@ -17658,7 +18104,7 @@ void CompilerGLSL::emit_hoisted_temporaries(SmallVector> &tempo { 
uint32_t mirror_id = mirrored_precision_itr->second; auto &mirror_flags = get_decoration_bitset(mirror_id); - statement(flags_to_qualifiers_glsl(type, mirror_flags), + statement(flags_to_qualifiers_glsl(type, mirror_id, mirror_flags), variable_decl(type, to_name(mirror_id)), initializer, ";"); // The temporary might be read from before it's assigned, set up the expression now. @@ -17669,6 +18115,26 @@ void CompilerGLSL::emit_hoisted_temporaries(SmallVector> &tempo } void CompilerGLSL::emit_block_chain(SPIRBlock &block) +{ + SmallVector cleanup_stack; + BlockID next_block = emit_block_chain_inner(block); + + while (next_block != 0) + { + cleanup_stack.push_back(next_block); + next_block = emit_block_chain_inner(get(next_block)); + } + + while (!cleanup_stack.empty()) + { + emit_block_chain_cleanup(get(cleanup_stack.back())); + cleanup_stack.pop_back(); + } + + emit_block_chain_cleanup(block); +} + +BlockID CompilerGLSL::emit_block_chain_inner(SPIRBlock &block) { bool select_branch_to_true_block = false; bool select_branch_to_false_block = false; @@ -17719,12 +18185,13 @@ void CompilerGLSL::emit_block_chain(SPIRBlock &block) } // Remember deferred declaration state. We will restore it before returning. - SmallVector rearm_dominated_variables(block.dominated_variables.size()); + assert(block.rearm_dominated_variables.empty()); + block.rearm_dominated_variables.resize(block.dominated_variables.size()); for (size_t i = 0; i < block.dominated_variables.size(); i++) { uint32_t var_id = block.dominated_variables[i]; auto &var = get(var_id); - rearm_dominated_variables[i] = var.deferred_declaration; + block.rearm_dominated_variables[i] = var.deferred_declaration; } // This is the method often used by spirv-opt to implement loops. @@ -18317,6 +18784,8 @@ void CompilerGLSL::emit_block_chain(SPIRBlock &block) SPIRV_CROSS_THROW("Unimplemented block terminator."); } + BlockID trailing_block_id = 0; + if (block.next_block && emit_next_block) { // If we hit this case, we're dealing with an unconditional branch, which means we will output @@ -18324,8 +18793,10 @@ void CompilerGLSL::emit_block_chain(SPIRBlock &block) if (block.merge != SPIRBlock::MergeSelection) { flush_phi(block.self, block.next_block); + // For a direct branch, need to remember to invalidate expressions in the next linear block instead. - get(block.next_block).invalidate_expressions = block.invalidate_expressions; + get(block.next_block).invalidate_expressions.clear(); + std::swap(get(block.next_block).invalidate_expressions, block.invalidate_expressions); } // For switch fallthrough cases, we terminate the chain here, but we still need to handle Phi. @@ -18346,7 +18817,15 @@ void CompilerGLSL::emit_block_chain(SPIRBlock &block) branch_to_continue(block.self, block.next_block); } else if (BlockID(block.self) != block.next_block) - emit_block_chain(get(block.next_block)); + { + // Recursing here is quite scary since it's quite easy to stack overflow if + // the SPIR-V is constructed a particular way. + // We have to simulate the tail call ourselves. 
+ if (block.merge != SPIRBlock::MergeLoop) + trailing_block_id = block.next_block; + else + emit_block_chain(get(block.next_block)); + } } } @@ -18388,20 +18867,28 @@ void CompilerGLSL::emit_block_chain(SPIRBlock &block) if (is_continue(block.merge_block)) branch_to_continue(block.self, block.merge_block); else - emit_block_chain(get(block.merge_block)); + trailing_block_id = block.merge_block; } + return trailing_block_id; +} + +void CompilerGLSL::emit_block_chain_cleanup(SPIRBlock &block) +{ // Forget about control dependent expressions now. block.invalidate_expressions.clear(); - // After we return, we must be out of scope, so if we somehow have to re-emit this function, + // After we return, we must be out of scope, so if we somehow have to re-emit this block, // re-declare variables if necessary. - assert(rearm_dominated_variables.size() == block.dominated_variables.size()); + // We only need one array here for rearm_dominated_variables, + // since it should be impossible for the same block to be remitted in the same chain twice. + assert(block.rearm_dominated_variables.size() == block.dominated_variables.size()); for (size_t i = 0; i < block.dominated_variables.size(); i++) { uint32_t var = block.dominated_variables[i]; - get(var).deferred_declaration = rearm_dominated_variables[i]; + get(var).deferred_declaration = block.rearm_dominated_variables[i]; } + block.rearm_dominated_variables.clear(); // Just like for deferred declaration, we need to forget about loop variable enable // if our block chain is reinstantiated later. @@ -18943,6 +19430,34 @@ void CompilerGLSL::emit_line_directive(uint32_t file_id, uint32_t line_literal) } } +void CompilerGLSL::emit_non_semantic_shader_debug_info(uint32_t, uint32_t result_id, uint32_t eop, + const uint32_t *args, uint32_t) +{ + if (!options.emit_line_directives) + return; + + switch (eop) + { + case SPIRExtension::DebugLine: + { + // We're missing line end and columns here, but I don't think we can emit those in any meaningful way. + emit_line_directive(args[0], get(args[1]).scalar()); + break; + } + + case SPIRExtension::DebugSource: + { + // Forward the string declaration here. We ignore the optional text operand. + auto &str = get(args[0]).str; + set(result_id, str); + break; + } + + default: + break; + } +} + void CompilerGLSL::emit_copy_logical_type(uint32_t lhs_id, uint32_t lhs_type_id, uint32_t rhs_id, uint32_t rhs_type_id, SmallVector chain) { @@ -19582,3 +20097,96 @@ std::string CompilerGLSL::format_double(double value) const return convert_to_string(value, current_locale_radix_character); } +std::string CompilerGLSL::to_pretty_expression_if_int_constant( + uint32_t id, + const GlslConstantNameMapping *mapping_start, const GlslConstantNameMapping *mapping_end, + bool register_expression_read) +{ + auto *c = maybe_get(id); + if (c && !c->specialization) + { + auto value = c->scalar(); + auto pretty_name = std::find_if(mapping_start, mapping_end, + [value](const GlslConstantNameMapping &mapping) { return mapping.value == value; }); + if (pretty_name != mapping_end) + return pretty_name->alias; + } + return join("int(", to_expression(id, register_expression_read), ")"); +} + +uint32_t CompilerGLSL::get_fp_fast_math_flags_for_op(uint32_t result_type, uint32_t id) const +{ + uint32_t fp_flags = ~0; + + if (!type_is_floating_point(get(result_type))) + return fp_flags; + + auto &ep = get_entry_point(); + + // Per-operation flag supersedes all defaults. 
+ if (id != 0 && has_decoration(id, DecorationFPFastMathMode)) + return get_decoration(id, DecorationFPFastMathMode); + + // Handle float_controls1 execution modes. + uint32_t width = get(result_type).width; + + bool szinp = false; + + switch (width) + { + case 8: + szinp = ep.signed_zero_inf_nan_preserve_8; + break; + + case 16: + szinp = ep.signed_zero_inf_nan_preserve_16; + break; + + case 32: + szinp = ep.signed_zero_inf_nan_preserve_32; + break; + + case 64: + szinp = ep.signed_zero_inf_nan_preserve_64; + break; + + default: + break; + } + + if (szinp) + fp_flags &= ~(FPFastMathModeNSZMask | FPFastMathModeNotInfMask | FPFastMathModeNotNaNMask); + + // Legacy NoContraction deals with any kind of transform to the expression. + if (id != 0 && has_decoration(id, DecorationNoContraction)) + fp_flags &= ~(FPFastMathModeAllowContractMask | FPFastMathModeAllowTransformMask | FPFastMathModeAllowReassocMask); + + // Handle float_controls2 execution modes. + bool found_default = false; + for (auto &fp_pair : ep.fp_fast_math_defaults) + { + if (get(fp_pair.first).width == width && fp_pair.second) + { + fp_flags &= get(fp_pair.second).scalar(); + found_default = true; + } + } + + // From SPV_KHR_float_controls2: + // "This definition implies that, if the entry point set any FPFastMathDefault execution mode + // then any type for which a default is not set uses no fast math flags + // (although this can still be overridden on a per-operation basis). + // Modules must not mix setting fast math modes explicitly using this extension and relying on older API defaults." + if (!found_default && !ep.fp_fast_math_defaults.empty()) + fp_flags = 0; + + return fp_flags; +} + +bool CompilerGLSL::has_legacy_nocontract(uint32_t result_type, uint32_t id) const +{ + const auto fp_flags = FPFastMathModeAllowContractMask | + FPFastMathModeAllowTransformMask | + FPFastMathModeAllowReassocMask; + return (get_fp_fast_math_flags_for_op(result_type, id) & fp_flags) != fp_flags; +} diff --git a/3rdparty/spirv-cross/spirv_glsl.hpp b/3rdparty/spirv-cross/spirv_glsl.hpp index cea150f22..776ab8a6c 100644 --- a/3rdparty/spirv-cross/spirv_glsl.hpp +++ b/3rdparty/spirv-cross/spirv_glsl.hpp @@ -32,6 +32,8 @@ namespace SPIRV_CROSS_NAMESPACE { +struct GlslConstantNameMapping; + enum PlsFormat { PlsNone = 0, @@ -426,6 +428,8 @@ protected: const uint32_t *args, uint32_t count); virtual void emit_spv_amd_gcn_shader_op(uint32_t result_type, uint32_t result_id, uint32_t op, const uint32_t *args, uint32_t count); + void emit_non_semantic_shader_debug_info(uint32_t result_type, uint32_t result_id, uint32_t op, + const uint32_t *args, uint32_t count); virtual void emit_header(); void emit_line_directive(uint32_t file_id, uint32_t line_literal); void build_workgroup_size(SmallVector &arguments, const SpecializationConstant &x, @@ -662,6 +666,7 @@ protected: bool workgroup_size_is_hidden = false; bool requires_relaxed_precision_analysis = false; bool implicit_c_integer_promotion_rules = false; + bool supports_spec_constant_array_size = true; } backend; void emit_struct(SPIRType &type); @@ -685,6 +690,8 @@ protected: void emit_flattened_io_block_member(const std::string &basename, const SPIRType &type, const char *qual, const SmallVector &indices); void emit_block_chain(SPIRBlock &block); + BlockID emit_block_chain_inner(SPIRBlock &block); + void emit_block_chain_cleanup(SPIRBlock &block); void emit_hoisted_temporaries(SmallVector> &temporaries); int get_constant_mapping_to_workgroup_component(const SPIRConstant &constant) const; void 
emit_constant(const SPIRConstant &constant); @@ -802,6 +809,10 @@ protected: virtual void append_global_func_args(const SPIRFunction &func, uint32_t index, SmallVector &arglist); std::string to_non_uniform_aware_expression(uint32_t id); std::string to_atomic_ptr_expression(uint32_t id); + std::string to_pretty_expression_if_int_constant( + uint32_t id, + const GlslConstantNameMapping *mapping_start, const GlslConstantNameMapping *mapping_end, + bool register_expression_read = true); std::string to_expression(uint32_t id, bool register_expression_read = true); std::string to_composite_constructor_expression(const SPIRType &parent_type, uint32_t id, bool block_like_type); std::string to_rerolled_array_expression(const SPIRType &parent_type, const std::string &expr, const SPIRType &type); @@ -830,7 +841,7 @@ protected: void emit_output_variable_initializer(const SPIRVariable &var); std::string to_precision_qualifiers_glsl(uint32_t id); virtual const char *to_storage_qualifiers_glsl(const SPIRVariable &var); - std::string flags_to_qualifiers_glsl(const SPIRType &type, const Bitset &flags); + std::string flags_to_qualifiers_glsl(const SPIRType &type, uint32_t id, const Bitset &flags); const char *format_to_glsl(spv::ImageFormat format); virtual std::string layout_for_member(const SPIRType &type, uint32_t index); virtual std::string to_interpolation_qualifiers(const Bitset &flags); @@ -1017,6 +1028,8 @@ protected: const Instruction *get_next_instruction_in_block(const Instruction &instr); static uint32_t mask_relevant_memory_semantics(uint32_t semantics); + std::string convert_floate4m3_to_string(const SPIRConstant &value, uint32_t col, uint32_t row); + std::string convert_floate5m2_to_string(const SPIRConstant &value, uint32_t col, uint32_t row); std::string convert_half_to_string(const SPIRConstant &value, uint32_t col, uint32_t row); std::string convert_float_to_string(const SPIRConstant &value, uint32_t col, uint32_t row); std::string convert_double_to_string(const SPIRConstant &value, uint32_t col, uint32_t row); @@ -1068,6 +1081,9 @@ protected: std::string format_float(float value) const; std::string format_double(double value) const; + uint32_t get_fp_fast_math_flags_for_op(uint32_t result_type, uint32_t id) const; + bool has_legacy_nocontract(uint32_t result_type, uint32_t id) const; + private: void init(); diff --git a/3rdparty/spirv-cross/spirv_hlsl.cpp b/3rdparty/spirv-cross/spirv_hlsl.cpp index 1ec4cb70f..0db81c46e 100644 --- a/3rdparty/spirv-cross/spirv_hlsl.cpp +++ b/3rdparty/spirv-cross/spirv_hlsl.cpp @@ -3029,7 +3029,7 @@ string CompilerHLSL::get_inner_entry_point_name() const SPIRV_CROSS_THROW("Unsupported execution model."); } -uint32_t CompilerHLSL::input_vertices_from_execution_mode(spirv_cross::SPIREntryPoint &execution) const +uint32_t CompilerHLSL::input_vertices_from_execution_mode(SPIREntryPoint &execution) const { uint32_t input_vertices = 1; @@ -3061,7 +3061,7 @@ void CompilerHLSL::emit_function_prototype(SPIRFunction &func, const Bitset &ret auto &type = get(func.return_type); if (type.array.empty()) { - decl += flags_to_qualifiers_glsl(type, return_flags); + decl += flags_to_qualifiers_glsl(type, 0, return_flags); decl += type_to_glsl(type); decl += " "; } diff --git a/3rdparty/spirv-cross/spirv_msl.cpp b/3rdparty/spirv-cross/spirv_msl.cpp index 53f74f177..f60431441 100644 --- a/3rdparty/spirv-cross/spirv_msl.cpp +++ b/3rdparty/spirv-cross/spirv_msl.cpp @@ -510,7 +510,7 @@ void CompilerMSL::build_implicit_builtins() has_local_invocation_index = true; } - if 
(need_workgroup_size && builtin == BuiltInLocalInvocationId) + if (need_workgroup_size && builtin == BuiltInWorkgroupSize) { builtin_workgroup_size_id = var.self; mark_implicit_builtin(StorageClassInput, BuiltInWorkgroupSize, var.self); @@ -684,28 +684,6 @@ void CompilerMSL::build_implicit_builtins() mark_implicit_builtin(StorageClassInput, BuiltInBaseInstance, var_id); } - if (need_multiview) - { - // Multiview shaders are not allowed to write to gl_Layer, ostensibly because - // it is implicitly written from gl_ViewIndex, but we have to do that explicitly. - // Note that we can't just abuse gl_ViewIndex for this purpose: it's an input, but - // gl_Layer is an output in vertex-pipeline shaders. - uint32_t type_ptr_out_id = ir.increase_bound_by(2); - SPIRType uint_type_ptr_out = get_uint_type(); - uint_type_ptr.op = OpTypePointer; - uint_type_ptr_out.pointer = true; - uint_type_ptr_out.pointer_depth++; - uint_type_ptr_out.parent_type = get_uint_type_id(); - uint_type_ptr_out.storage = StorageClassOutput; - auto &ptr_out_type = set(type_ptr_out_id, uint_type_ptr_out); - ptr_out_type.self = get_uint_type_id(); - uint32_t var_id = type_ptr_out_id + 1; - set(var_id, type_ptr_out_id, StorageClassOutput); - set_decoration(var_id, DecorationBuiltIn, BuiltInLayer); - builtin_layer_id = var_id; - mark_implicit_builtin(StorageClassOutput, BuiltInLayer, var_id); - } - if (need_multiview && !has_view_idx) { uint32_t var_id = ir.increase_bound_by(1); @@ -718,6 +696,28 @@ void CompilerMSL::build_implicit_builtins() } } + if (need_multiview) + { + // Multiview shaders are not allowed to write to gl_Layer, ostensibly because + // it is implicitly written from gl_ViewIndex, but we have to do that explicitly. + // Note that we can't just abuse gl_ViewIndex for this purpose: it's an input, but + // gl_Layer is an output in vertex-pipeline shaders. + uint32_t type_ptr_out_id = ir.increase_bound_by(2); + SPIRType uint_type_ptr_out = get_uint_type(); + uint_type_ptr_out.op = OpTypePointer; + uint_type_ptr_out.pointer = true; + uint_type_ptr_out.pointer_depth++; + uint_type_ptr_out.parent_type = get_uint_type_id(); + uint_type_ptr_out.storage = StorageClassOutput; + auto &ptr_out_type = set(type_ptr_out_id, uint_type_ptr_out); + ptr_out_type.self = get_uint_type_id(); + uint32_t var_id = type_ptr_out_id + 1; + set(var_id, type_ptr_out_id, StorageClassOutput); + set_decoration(var_id, DecorationBuiltIn, BuiltInLayer); + builtin_layer_id = var_id; + mark_implicit_builtin(StorageClassOutput, BuiltInLayer, var_id); + } + if ((need_tesc_params && (msl_options.multi_patch_workgroup || !has_invocation_id || !has_primitive_id)) || (need_tese_params && !has_primitive_id) || need_grid_params) { @@ -932,25 +932,55 @@ void CompilerMSL::build_implicit_builtins() if (need_workgroup_size && !has_workgroup_size) { - uint32_t offset = ir.increase_bound_by(2); - uint32_t type_ptr_id = offset; - uint32_t var_id = offset + 1; + auto &execution = get_entry_point(); + // First, check if the workgroup size _constant_ were defined. + // If it were, we don't need to do--in fact, shouldn't do--anything. + builtin_workgroup_size_id = execution.workgroup_size.constant; + if (builtin_workgroup_size_id == 0) + { + uint32_t var_id = ir.increase_bound_by(1); - // Create gl_WorkgroupSize. 
- uint32_t type_id = build_extended_vector_type(get_uint_type_id(), 3); - SPIRType uint_type_ptr = get(type_id); - uint_type_ptr.op = OpTypePointer; - uint_type_ptr.pointer = true; - uint_type_ptr.pointer_depth++; - uint_type_ptr.parent_type = type_id; - uint_type_ptr.storage = StorageClassInput; + // Create gl_WorkgroupSize. + uint32_t type_id = build_extended_vector_type(get_uint_type_id(), 3); + // If we have LocalSize or LocalSizeId, use those to define the workgroup size. + if (execution.flags.get(ExecutionModeLocalSizeId)) + { + const SPIRConstant *init[] = { &get(execution.workgroup_size.id_x), + &get(execution.workgroup_size.id_y), + &get(execution.workgroup_size.id_z) }; + bool specialized = init[0]->specialization || init[1]->specialization || init[2]->specialization; + set(var_id, type_id, init, 3, specialized); + execution.workgroup_size.constant = var_id; + } + else if (execution.flags.get(ExecutionModeLocalSize)) + { + uint32_t offset = ir.increase_bound_by(3); + const SPIRConstant *init[] = { + &set(offset, get_uint_type_id(), execution.workgroup_size.x, false), + &set(offset + 1, get_uint_type_id(), execution.workgroup_size.y, false), + &set(offset + 2, get_uint_type_id(), execution.workgroup_size.z, false) + }; + set(var_id, type_id, init, 3, false); + execution.workgroup_size.constant = var_id; + } + else + { + uint32_t type_ptr_id = ir.increase_bound_by(1); + SPIRType uint_type_ptr = get(type_id); + uint_type_ptr.op = OpTypePointer; + uint_type_ptr.pointer = true; + uint_type_ptr.pointer_depth++; + uint_type_ptr.parent_type = type_id; + uint_type_ptr.storage = StorageClassInput; - auto &ptr_type = set(type_ptr_id, uint_type_ptr); - ptr_type.self = type_id; - set(var_id, type_ptr_id, StorageClassInput); - set_decoration(var_id, DecorationBuiltIn, BuiltInWorkgroupSize); - builtin_workgroup_size_id = var_id; - mark_implicit_builtin(StorageClassInput, BuiltInWorkgroupSize, var_id); + auto &ptr_type = set(type_ptr_id, uint_type_ptr); + ptr_type.self = type_id; + set(var_id, type_ptr_id, StorageClassInput); + mark_implicit_builtin(StorageClassInput, BuiltInWorkgroupSize, var_id); + } + set_decoration(var_id, DecorationBuiltIn, BuiltInWorkgroupSize); + builtin_workgroup_size_id = var_id; + } } if (!has_frag_depth && force_frag_depth_passthrough) @@ -1681,6 +1711,7 @@ string CompilerMSL::compile() backend.array_is_value_type_in_buffer_blocks = false; backend.support_pointer_to_pointer = true; backend.implicit_c_integer_promotion_rules = true; + backend.supports_spec_constant_array_size = false; capture_output_to_buffer = msl_options.capture_output_to_buffer; is_rasterization_disabled = msl_options.disable_rasterization || capture_output_to_buffer; @@ -1841,7 +1872,7 @@ void CompilerMSL::preprocess_op_codes() if (preproc.uses_atomics) { add_header_line("#include "); - add_pragma_line("#pragma clang diagnostic ignored \"-Wunused-variable\""); + add_pragma_line("#pragma clang diagnostic ignored \"-Wunused-variable\"", false); } // Before MSL 2.1 (2.2 for textures), Metal vertex functions that write to @@ -1850,6 +1881,10 @@ void CompilerMSL::preprocess_op_codes() (preproc.uses_image_write && !msl_options.supports_msl_version(2, 2))) is_rasterization_disabled = true; + // FIXME: This currently does not consider BDA side effects, so we cannot deduce const device for BDA. 
+ if (preproc.uses_buffer_write || preproc.uses_image_write) + has_descriptor_side_effects = true; + // Tessellation control shaders are run as compute functions in Metal, and so // must capture their output to a buffer. if (is_tesc_shader() || (get_execution_model() == ExecutionModelVertex && msl_options.vertex_for_tessellation)) @@ -5720,22 +5755,44 @@ void CompilerMSL::emit_header() { // This particular line can be overridden during compilation, so make it a flag and not a pragma line. if (suppress_missing_prototypes) - statement("#pragma clang diagnostic ignored \"-Wmissing-prototypes\""); + add_pragma_line("#pragma clang diagnostic ignored \"-Wmissing-prototypes\"", false); if (suppress_incompatible_pointer_types_discard_qualifiers) - statement("#pragma clang diagnostic ignored \"-Wincompatible-pointer-types-discards-qualifiers\""); + add_pragma_line("#pragma clang diagnostic ignored \"-Wincompatible-pointer-types-discards-qualifiers\"", false); // Disable warning about "sometimes unitialized" when zero-initializing simple threadgroup variables if (suppress_sometimes_unitialized) - statement("#pragma clang diagnostic ignored \"-Wsometimes-uninitialized\""); + add_pragma_line("#pragma clang diagnostic ignored \"-Wsometimes-uninitialized\"", false); // Disable warning about missing braces for array template to make arrays a value type if (spv_function_implementations.count(SPVFuncImplUnsafeArray) != 0) - statement("#pragma clang diagnostic ignored \"-Wmissing-braces\""); + add_pragma_line("#pragma clang diagnostic ignored \"-Wmissing-braces\"", false); + + // Floating point fast math compile declarations + if (msl_options.use_fast_math_pragmas && msl_options.supports_msl_version(3, 2)) + { + uint32_t contract_mask = FPFastMathModeAllowContractMask; + uint32_t relax_mask = (FPFastMathModeNSZMask | FPFastMathModeAllowRecipMask | FPFastMathModeAllowReassocMask); + uint32_t fast_mask = (relax_mask | FPFastMathModeNotNaNMask | FPFastMathModeNotInfMask); + + // FP math mode + uint32_t fp_flags = get_fp_fast_math_flags(true); + const char *math_mode = "safe"; + if ((fp_flags & fast_mask) == fast_mask) // Must have all flags + math_mode = "fast"; + else if ((fp_flags & relax_mask) == relax_mask) // Must have all flags + math_mode = "relaxed"; + + add_pragma_line(join("#pragma metal fp math_mode(", math_mode, ")"), false); + + // FP contraction + const char *contract_mode = ((fp_flags & contract_mask) == contract_mask) ? 
"fast" : "off"; + add_pragma_line(join("#pragma metal fp contract(", contract_mode, ")"), false); + } for (auto &pragma : pragma_lines) statement(pragma); - if (!pragma_lines.empty() || suppress_missing_prototypes) + if (!pragma_lines.empty()) statement(""); statement("#include "); @@ -5755,18 +5812,23 @@ void CompilerMSL::emit_header() statement(""); } -void CompilerMSL::add_pragma_line(const string &line) +void CompilerMSL::add_pragma_line(const string &line, bool recompile_on_unique) { - auto rslt = pragma_lines.insert(line); - if (rslt.second) - force_recompile(); + if (std::find(pragma_lines.begin(), pragma_lines.end(), line) == pragma_lines.end()) + { + pragma_lines.push_back(line); + if (recompile_on_unique) + force_recompile(); + } } void CompilerMSL::add_typedef_line(const string &line) { - auto rslt = typedef_lines.insert(line); - if (rslt.second) + if (std::find(typedef_lines.begin(), typedef_lines.end(), line) == typedef_lines.end()) + { + typedef_lines.push_back(line); force_recompile(); + } } // Template struct like spvUnsafeArray<> need to be declared *before* any resources are declared @@ -8353,9 +8415,22 @@ void CompilerMSL::emit_specialization_constants_and_structs() if (unique_func_constants[constant_id] == c.self) statement("constant ", sc_type_name, " ", sc_tmp_name, " [[function_constant(", constant_id, ")]];"); - statement("constant ", sc_type_name, " ", sc_name, " = is_function_constant_defined(", sc_tmp_name, - ") ? ", bitcast_expression(type, sc_tmp_type, sc_tmp_name), " : ", constant_expression(c), - ";"); + // RenderDoc and other instrumentation may reuse the same SpecId with different base types. + // We deduplicate to one [[function_constant(id)]] temp and then initialize all variants from it. + // Metal forbids as_type to/from 'bool', so if either side is Boolean, avoid bitcasting here and + // prefer a value cast via a constructor instead (e.g. uint(tmp) / float(tmp) / bool(tmp)). + // This preserves expected toggle semantics and prevents illegal MSL like as_type(bool_tmp). + { + string sc_true_expr; + if (sc_tmp_type == type.basetype) + sc_true_expr = sc_tmp_name; + else if (sc_tmp_type == SPIRType::Boolean || type.basetype == SPIRType::Boolean) + sc_true_expr = join(sc_type_name, "(", sc_tmp_name, ")"); + else + sc_true_expr = bitcast_expression(type, sc_tmp_type, sc_tmp_name); + statement("constant ", sc_type_name, " ", sc_name, " = is_function_constant_defined(", sc_tmp_name, + ") ? ", sc_true_expr, " : ", constant_expression(c), ";"); + } } else if (has_decoration(c.self, DecorationSpecId)) { @@ -9161,7 +9236,12 @@ bool CompilerMSL::prepare_access_chain_for_scalar_access(std::string &expr, cons // and there is a risk of concurrent write access to other components, // we must cast the access chain to a plain pointer to ensure we only access the exact scalars we expect. // The MSL compiler refuses to allow component-level access for any non-packed vector types. - if (!is_packed && (storage == StorageClassStorageBuffer || storage == StorageClassWorkgroup)) + // MSL refuses to take address or reference to vector component, even for packed types, so just force + // through the pointer cast. No much we can do sadly. + // For packed types, we could technically omit this if we know the reference does not have to turn into a pointer + // of some kind, but that requires external analysis passes to figure out, and + // this case is likely rare enough that we don't need to bother. 
+	if (storage == StorageClassStorageBuffer || storage == StorageClassWorkgroup)
 	{
 		const char *addr_space = storage == StorageClassWorkgroup ? "threadgroup" : "device";
 		expr = join("((", addr_space, " ", type_to_glsl(type), "*)&", enclose_expression(expr), ")");
@@ -9485,21 +9565,21 @@ void CompilerMSL::emit_instruction(const Instruction &instruction)
 		break;
 
 	case OpFMul:
-		if (msl_options.invariant_float_math || has_decoration(ops[1], DecorationNoContraction))
+		if (msl_options.invariant_float_math || has_legacy_nocontract(ops[0], ops[1]))
 			MSL_BFOP(spvFMul);
 		else
 			MSL_BOP(*);
 		break;
 
 	case OpFAdd:
-		if (msl_options.invariant_float_math || has_decoration(ops[1], DecorationNoContraction))
+		if (msl_options.invariant_float_math || has_legacy_nocontract(ops[0], ops[1]))
 			MSL_BFOP(spvFAdd);
 		else
 			MSL_BOP(+);
 		break;
 
 	case OpFSub:
-		if (msl_options.invariant_float_math || has_decoration(ops[1], DecorationNoContraction))
+		if (msl_options.invariant_float_math || has_legacy_nocontract(ops[0], ops[1]))
 			MSL_BFOP(spvFSub);
 		else
 			MSL_BOP(-);
@@ -9997,7 +10077,7 @@ void CompilerMSL::emit_instruction(const Instruction &instruction)
 	case OpVectorTimesMatrix:
 	case OpMatrixTimesVector:
 	{
-		if (!msl_options.invariant_float_math && !has_decoration(ops[1], DecorationNoContraction))
+		if (!msl_options.invariant_float_math && !has_legacy_nocontract(ops[0], ops[1]))
 		{
 			CompilerGLSL::emit_instruction(instruction);
 			break;
@@ -10039,7 +10119,7 @@ void CompilerMSL::emit_instruction(const Instruction &instruction)
 
 	case OpMatrixTimesMatrix:
 	{
-		if (!msl_options.invariant_float_math && !has_decoration(ops[1], DecorationNoContraction))
+		if (!msl_options.invariant_float_math && !has_legacy_nocontract(ops[0], ops[1]))
 		{
 			CompilerGLSL::emit_instruction(instruction);
 			break;
@@ -10725,13 +10805,13 @@ bool CompilerMSL::emit_array_copy(const char *expr, uint32_t lhs_id, uint32_t rh
 	auto *lhs_var = maybe_get_backing_variable(lhs_id);
 	if (lhs_var && lhs_storage == StorageClassStorageBuffer && storage_class_array_is_thread(lhs_var->storage))
 		lhs_is_array_template = true;
-	else if (lhs_var && lhs_storage != StorageClassGeneric && type_is_block_like(get<SPIRType>(lhs_var->basetype)))
+	else if (lhs_var && lhs_storage != StorageClassGeneric && type_is_explicit_layout(get<SPIRType>(lhs_var->basetype)))
 		lhs_is_array_template = false;
 
 	auto *rhs_var = maybe_get_backing_variable(rhs_id);
 	if (rhs_var && rhs_storage == StorageClassStorageBuffer && storage_class_array_is_thread(rhs_var->storage))
 		rhs_is_array_template = true;
-	else if (rhs_var && rhs_storage != StorageClassGeneric && type_is_block_like(get<SPIRType>(rhs_var->basetype)))
+	else if (rhs_var && rhs_storage != StorageClassGeneric && type_is_explicit_layout(get<SPIRType>(rhs_var->basetype)))
 		rhs_is_array_template = false;
 
 	// If threadgroup storage qualifiers are *not* used:
@@ -11182,35 +11262,76 @@ void CompilerMSL::emit_glsl_op(uint32_t result_type, uint32_t id, uint32_t eop,
 	auto &restype = get<SPIRType>(result_type);
 
+	// Only precise:: preserves NaN in transcendentals (supposedly, cannot find documentation for this).
+	const auto drop_nan_inf = FPFastMathModeNotInfMask | FPFastMathModeNotNaNMask;
+	bool preserve_nan = (get_fp_fast_math_flags_for_op(result_type, id) & drop_nan_inf) != drop_nan_inf;
+	const char *preserve_str = preserve_nan ? "precise" : "fast";
+
+	// TODO: Emit the default behavior to match existing code. Might need to be revisited.
+	// Only fp32 has the precise:: override.
+#define EMIT_PRECISE_OVERRIDE(glsl_op, op) \ + case GLSLstd450##glsl_op: \ + if (restype.basetype == SPIRType::Float && preserve_nan) \ + emit_unary_func_op(result_type, id, args[0], "precise::" op); \ + else \ + CompilerGLSL::emit_glsl_op(result_type, id, eop, args, count); \ + break + switch (op) { + EMIT_PRECISE_OVERRIDE(Cos, "cos"); + EMIT_PRECISE_OVERRIDE(Sin, "sin"); + EMIT_PRECISE_OVERRIDE(Tan, "tan"); + EMIT_PRECISE_OVERRIDE(Acos, "acos"); + EMIT_PRECISE_OVERRIDE(Asin, "asin"); + EMIT_PRECISE_OVERRIDE(Atan, "atan"); + EMIT_PRECISE_OVERRIDE(Exp, "exp"); + EMIT_PRECISE_OVERRIDE(Exp2, "exp2"); + EMIT_PRECISE_OVERRIDE(Log, "log"); + EMIT_PRECISE_OVERRIDE(Log2, "log2"); + EMIT_PRECISE_OVERRIDE(Sqrt, "sqrt"); +#undef EMIT_PRECISE_OVERRIDE + case GLSLstd450Sinh: if (restype.basetype == SPIRType::Half) { + auto ftype = restype; + ftype.basetype = SPIRType::Float; + // MSL does not have overload for half. Force-cast back to half. - auto expr = join("half(fast::sinh(", to_unpacked_expression(args[0]), "))"); + auto expr = join(type_to_glsl(restype), "(", preserve_str, "::sinh(", type_to_glsl(ftype), "(", to_unpacked_expression(args[0]), ")))"); emit_op(result_type, id, expr, should_forward(args[0])); inherit_expression_dependencies(id, args[0]); } + else if (preserve_nan) + emit_unary_func_op(result_type, id, args[0], "precise::sinh"); else emit_unary_func_op(result_type, id, args[0], "fast::sinh"); break; case GLSLstd450Cosh: if (restype.basetype == SPIRType::Half) { + auto ftype = restype; + ftype.basetype = SPIRType::Float; + // MSL does not have overload for half. Force-cast back to half. - auto expr = join("half(fast::cosh(", to_unpacked_expression(args[0]), "))"); + auto expr = join(type_to_glsl(restype), "(", preserve_str, "::cosh(", type_to_glsl(ftype), "(", to_unpacked_expression(args[0]), ")))"); emit_op(result_type, id, expr, should_forward(args[0])); inherit_expression_dependencies(id, args[0]); } + else if (preserve_nan) + emit_unary_func_op(result_type, id, args[0], "precise::cosh"); else emit_unary_func_op(result_type, id, args[0], "fast::cosh"); break; case GLSLstd450Tanh: if (restype.basetype == SPIRType::Half) { + auto ftype = restype; + ftype.basetype = SPIRType::Float; + // MSL does not have overload for half. Force-cast back to half. - auto expr = join("half(fast::tanh(", to_unpacked_expression(args[0]), "))"); + auto expr = join(type_to_glsl(restype), "(", preserve_str, "::tanh(", type_to_glsl(ftype), "(", to_unpacked_expression(args[0]), ")))"); emit_op(result_type, id, expr, should_forward(args[0])); inherit_expression_dependencies(id, args[0]); } @@ -11221,7 +11342,13 @@ void CompilerMSL::emit_glsl_op(uint32_t result_type, uint32_t id, uint32_t eop, if (restype.basetype == SPIRType::Half) { // MSL does not have overload for half. Force-cast back to half. 
- auto expr = join("half(fast::atan2(", to_unpacked_expression(args[0]), ", ", to_unpacked_expression(args[1]), "))"); + auto ftype = restype; + ftype.basetype = SPIRType::Float; + + auto expr = join(type_to_glsl(restype), + "(", preserve_str, "::atan2(", + type_to_glsl(ftype), "(", to_unpacked_expression(args[0]), "), ", + type_to_glsl(ftype), "(", to_unpacked_expression(args[1]), ")))"); emit_op(result_type, id, expr, should_forward(args[0]) && should_forward(args[1])); inherit_expression_dependencies(id, args[0]); inherit_expression_dependencies(id, args[1]); @@ -11230,7 +11357,10 @@ void CompilerMSL::emit_glsl_op(uint32_t result_type, uint32_t id, uint32_t eop, emit_binary_func_op(result_type, id, args[0], args[1], "precise::atan2"); break; case GLSLstd450InverseSqrt: - emit_unary_func_op(result_type, id, args[0], "rsqrt"); + if (restype.basetype == SPIRType::Float && preserve_nan) + emit_unary_func_op(result_type, id, args[0], "precise::rsqrt"); + else + emit_unary_func_op(result_type, id, args[0], "rsqrt"); break; case GLSLstd450RoundEven: emit_unary_func_op(result_type, id, args[0], "rint"); @@ -11462,11 +11592,14 @@ void CompilerMSL::emit_glsl_op(uint32_t result_type, uint32_t id, uint32_t eop, { auto &exp_type = expression_type(args[0]); // MSL does not support scalar versions here. - // MSL has no implementation for normalize in the fast:: namespace for half2 and half3 + // MSL has no implementation for normalize in the fast:: namespace for half // Returns -1 or 1 for valid input, sign() does the job. + + // precise::normalize asm looks ridiculous. + // Don't think this actually matters unless proven otherwise. if (exp_type.vecsize == 1) emit_unary_func_op(result_type, id, args[0], "sign"); - else if (exp_type.vecsize <= 3 && exp_type.basetype == SPIRType::Half) + else if (exp_type.basetype == SPIRType::Half) emit_unary_func_op(result_type, id, args[0], "normalize"); else emit_unary_func_op(result_type, id, args[0], "fast::normalize"); @@ -11529,7 +11662,10 @@ void CompilerMSL::emit_glsl_op(uint32_t result_type, uint32_t id, uint32_t eop, case GLSLstd450Pow: // powr makes x < 0.0 undefined, just like SPIR-V. - emit_binary_func_op(result_type, id, args[0], args[1], "powr"); + if (restype.basetype == SPIRType::Float && preserve_nan) + emit_binary_func_op(result_type, id, args[0], args[1], "precise::powr"); + else + emit_binary_func_op(result_type, id, args[0], args[1], "powr"); break; default: @@ -13753,6 +13889,24 @@ uint32_t CompilerMSL::get_or_allocate_builtin_output_member_location(spv::BuiltI return loc; } +bool CompilerMSL::entry_point_returns_stage_output() const +{ + if (get_execution_model() == ExecutionModelVertex && msl_options.vertex_for_tessellation) + return false; + bool ep_should_return_output = !get_is_rasterization_disabled(); + return stage_out_var_id && ep_should_return_output; +} + +bool CompilerMSL::entry_point_requires_const_device_buffers() const +{ + // For fragment, we don't really need it, but it might help avoid pessimization + // if the compiler deduces it needs to use late-Z for whatever reason. 
+ return (get_execution_model() == ExecutionModelFragment && !has_descriptor_side_effects) || + (entry_point_returns_stage_output() && + (get_execution_model() == ExecutionModelVertex || + get_execution_model() == ExecutionModelTessellationEvaluation)); +} + // Returns the type declaration for a function, including the // entry type if the current function is the entry point function string CompilerMSL::func_type_decl(SPIRType &type) @@ -13763,8 +13917,7 @@ string CompilerMSL::func_type_decl(SPIRType &type) return return_type; // If an outgoing interface block has been defined, and it should be returned, override the entry point return type - bool ep_should_return_output = !get_is_rasterization_disabled(); - if (stage_out_var_id && ep_should_return_output) + if (entry_point_returns_stage_output()) return_type = type_to_glsl(get_stage_out_struct_type()) + type_to_array_glsl(type, 0); // Prepend a entry type, based on the execution model @@ -13888,16 +14041,14 @@ string CompilerMSL::get_type_address_space(const SPIRType &type, uint32_t id, bo case StorageClassStorageBuffer: { - // For arguments from variable pointers, we use the write count deduction, so - // we should not assume any constness here. Only for global SSBOs. - bool readonly = false; - if (!var || has_decoration(type.self, DecorationBlock)) - readonly = flags.get(DecorationNonWritable); - - if (decoration_flags_signal_coherent(flags)) - readonly = false; - - addr_space = readonly ? "const device" : "device"; + // When dealing with descriptor aliasing, it becomes very problematic to make use of + // readonly qualifiers. + // If rasterization is not disabled in vertex/tese, Metal does not allow side effects and refuses to compile "device", + // even if there are no writes. Just force const device. + if (entry_point_requires_const_device_buffers()) + addr_space = "const device"; + else + addr_space = "device"; break; } @@ -13914,7 +14065,12 @@ string CompilerMSL::get_type_address_space(const SPIRType &type, uint32_t id, bo { bool ssbo = has_decoration(type.self, DecorationBufferBlock); if (ssbo) - addr_space = flags.get(DecorationNonWritable) ? "const device" : "device"; + { + if (entry_point_requires_const_device_buffers()) + addr_space = "const device"; + else + addr_space = "device"; + } else addr_space = "constant"; } @@ -16553,7 +16709,7 @@ string CompilerMSL::type_to_array_glsl(const SPIRType &type, uint32_t variable_i default: if (type_is_array_of_pointers(type) || using_builtin_array()) { - const SPIRVariable *var = variable_id ? &get(variable_id) : nullptr; + const SPIRVariable *var = variable_id ? maybe_get(variable_id) : nullptr; if (var && (var->storage == StorageClassUniform || var->storage == StorageClassStorageBuffer) && is_array(get_variable_data_type(*var))) { @@ -16854,6 +17010,8 @@ string CompilerMSL::image_type_glsl(const SPIRType &type, uint32_t id, bool memb if (p_var && p_var->basevariable) p_var = maybe_get(p_var->basevariable); + bool has_access_qualifier = true; + switch (img_type.access) { case AccessQualifierReadOnly: @@ -16879,12 +17037,21 @@ string CompilerMSL::image_type_glsl(const SPIRType &type, uint32_t id, bool memb img_type_name += "write"; } + else + { + has_access_qualifier = false; + } break; } } if (p_var && has_decoration(p_var->self, DecorationCoherent) && msl_options.supports_msl_version(3, 2)) + { + // Cannot declare memory_coherence_device without access qualifier. 
+ if (!has_access_qualifier) + img_type_name += ", access::read"; img_type_name += ", memory_coherence_device"; + } } img_type_name += ">"; @@ -17317,13 +17484,18 @@ void CompilerMSL::emit_subgroup_cluster_op_cast(uint32_t result_type, uint32_t r inherit_expression_dependencies(result_id, op0); } +// Note: Metal forbids bitcasting to/from 'bool' using as_type. This function is used widely +// for generating casts in the backend. To avoid generating illegal MSL when the canonical +// function constant type (from deduplicated SpecId) is Boolean, fall back to value-cast in +// that case by returning type_to_glsl(out_type) instead of as_type<...>. string CompilerMSL::bitcast_glsl_op(const SPIRType &out_type, const SPIRType &in_type) { if (out_type.basetype == in_type.basetype) return ""; - assert(out_type.basetype != SPIRType::Boolean); - assert(in_type.basetype != SPIRType::Boolean); + // Avoid bitcasting to/from booleans in MSL; use value cast instead. + if (out_type.basetype == SPIRType::Boolean || in_type.basetype == SPIRType::Boolean) + return type_to_glsl(out_type); bool integral_cast = type_is_integral(out_type) && type_is_integral(in_type) && (out_type.vecsize == in_type.vecsize); bool same_size_cast = (out_type.width * out_type.vecsize) == (in_type.width * in_type.vecsize); @@ -17679,6 +17851,9 @@ string CompilerMSL::builtin_qualifier(BuiltIn builtin) case BuiltInGlobalInvocationId: return "thread_position_in_grid"; + case BuiltInWorkgroupSize: + return "threads_per_threadgroup"; + case BuiltInWorkgroupId: return "threadgroup_position_in_grid"; @@ -17864,6 +18039,7 @@ string CompilerMSL::builtin_type_decl(BuiltIn builtin, uint32_t id) case BuiltInLocalInvocationId: case BuiltInNumWorkgroups: case BuiltInWorkgroupId: + case BuiltInWorkgroupSize: return "uint3"; case BuiltInLocalInvocationIndex: case BuiltInNumSubgroups: @@ -18531,11 +18707,8 @@ CompilerMSL::SPVFuncImpl CompilerMSL::OpCodePreprocessor::get_spv_func_impl(Op o case OpFAdd: case OpFSub: - if (compiler.msl_options.invariant_float_math || - compiler.has_decoration(args[1], DecorationNoContraction)) - { + if (compiler.msl_options.invariant_float_math || compiler.has_legacy_nocontract(args[0], args[1])) return opcode == OpFAdd ? SPVFuncImplFAdd : SPVFuncImplFSub; - } break; case OpFMul: @@ -18543,11 +18716,8 @@ CompilerMSL::SPVFuncImpl CompilerMSL::OpCodePreprocessor::get_spv_func_impl(Op o case OpMatrixTimesVector: case OpVectorTimesMatrix: case OpMatrixTimesMatrix: - if (compiler.msl_options.invariant_float_math || - compiler.has_decoration(args[1], DecorationNoContraction)) - { + if (compiler.msl_options.invariant_float_math || compiler.has_legacy_nocontract(args[0], args[1])) return SPVFuncImplFMul; - } break; case OpQuantizeToF16: @@ -19572,7 +19742,7 @@ void CompilerMSL::analyze_argument_buffers() SetBindingPair pair = { desc_set, binding }; if (resource.basetype == SPIRType::Image || resource.basetype == SPIRType::Sampler || - resource.basetype == SPIRType::SampledImage) + resource.basetype == SPIRType::SampledImage || resource.basetype == SPIRType::AccelerationStructure) { // Drop pointer information when we emit the resources into a struct. 
 			buffer_type.member_types.push_back(get_variable_data_type_id(var));
@@ -19827,6 +19997,30 @@ bool CompilerMSL::specialization_constant_is_macro(uint32_t const_id) const
 	return constant_macro_ids.find(const_id) != constant_macro_ids.end();
 }
 
+// Start with all fast math flags enabled, and selectively disable based on execution modes and float controls
+uint32_t CompilerMSL::get_fp_fast_math_flags(bool incl_ops) const
+{
+	uint32_t fp_flags = ~0;
+	auto &ep = get_entry_point();
+
+	if (ep.flags.get(ExecutionModeSignedZeroInfNanPreserve))
+		fp_flags &= ~(FPFastMathModeNSZMask | FPFastMathModeNotInfMask | FPFastMathModeNotNaNMask);
+
+	if (ep.flags.get(ExecutionModeContractionOff))
+		fp_flags &= ~(FPFastMathModeAllowContractMask);
+
+	for (auto &fp_pair : ep.fp_fast_math_defaults)
+		if (fp_pair.second)
+			fp_flags &= get<SPIRConstant>(fp_pair.second).scalar();
+
+	if (incl_ops)
+		for (auto &p_m : ir.meta)
+			if (p_m.second.decoration.decoration_flags.get(DecorationFPFastMathMode))
+				fp_flags &= p_m.second.decoration.fp_fast_math_mode;
+
+	return fp_flags;
+}
+
 void CompilerMSL::emit_block_hints(const SPIRBlock &)
 {
 }
diff --git a/3rdparty/spirv-cross/spirv_msl.hpp b/3rdparty/spirv-cross/spirv_msl.hpp
index a3d08bfcc..cd767ea15 100644
--- a/3rdparty/spirv-cross/spirv_msl.hpp
+++ b/3rdparty/spirv-cross/spirv_msl.hpp
@@ -542,6 +542,10 @@ public:
 		// The result can be queried with get_is_rasterization_disabled.
 		bool auto_disable_rasterization = false;
 
+		// Use Fast Math pragmas in MSL code, based on SPIR-V float controls and FP ExecutionModes.
+		// Requires MSL 3.2 or above, and has no effect with earlier MSL versions.
+		bool use_fast_math_pragmas = false;
+
 		bool is_ios() const
 		{
 			return platform == iOS;
@@ -767,6 +771,14 @@ public:
 	// These must only be called after a successful call to CompilerMSL::compile().
 	bool specialization_constant_is_macro(uint32_t constant_id) const;
 
+	// Returns a mask of SPIR-V FP Fast Math Mode flags that represents the set of flags that can be applied
+	// across all floating-point types. Each FPFastMathDefault execution mode operation identifies the flags
+	// for one floating-point type, and the value returned here is a bitwise-AND combination across all types.
+	// If incl_ops is enabled, the FPFastMathMode of any SPIR-V operations is also included in the bitwise-AND
+	// to determine the minimal fast-math set that applies to all default execution modes and all operations.
+	// The returned value is also affected by the SignedZeroInfNanPreserve and ContractionOff execution modes.
+	uint32_t get_fp_fast_math_flags(bool incl_ops) const;
+
 protected:
 	// An enum of SPIR-V functions that are implemented in additional
 	// source code that is added to the shader if necessary.
@@ -1047,6 +1059,8 @@ protected: void fix_up_shader_inputs_outputs(); + bool entry_point_returns_stage_output() const; + bool entry_point_requires_const_device_buffers() const; std::string func_type_decl(SPIRType &type); std::string entry_point_args_classic(bool append_comma); std::string entry_point_args_argument_buffer(bool append_comma); @@ -1126,7 +1140,7 @@ protected: uint32_t mem_order_1, uint32_t mem_order_2, bool has_mem_order_2, uint32_t op0, uint32_t op1 = 0, bool op1_is_pointer = false, bool op1_is_literal = false, uint32_t op2 = 0); const char *get_memory_order(uint32_t spv_mem_sem); - void add_pragma_line(const std::string &line); + void add_pragma_line(const std::string &line, bool recompile_on_unique); void add_typedef_line(const std::string &line); void emit_barrier(uint32_t id_exe_scope, uint32_t id_mem_scope, uint32_t id_mem_sem); bool emit_array_copy(const char *expr, uint32_t lhs_id, uint32_t rhs_id, @@ -1209,8 +1223,8 @@ protected: std::unordered_map fragment_output_components; std::unordered_map builtin_to_automatic_input_location; std::unordered_map builtin_to_automatic_output_location; - std::set pragma_lines; - std::set typedef_lines; + std::vector pragma_lines; + std::vector typedef_lines; SmallVector vars_needing_early_declaration; std::unordered_set constant_macro_ids; @@ -1252,6 +1266,7 @@ protected: bool using_builtin_array() const; bool is_rasterization_disabled = false; + bool has_descriptor_side_effects = false; bool capture_output_to_buffer = false; bool needs_swizzle_buffer_def = false; bool used_swizzle_buffer = false; diff --git a/3rdparty/spirv-cross/spirv_parser.cpp b/3rdparty/spirv-cross/spirv_parser.cpp index 745ab6e81..f3ea7e288 100644 --- a/3rdparty/spirv-cross/spirv_parser.cpp +++ b/3rdparty/spirv-cross/spirv_parser.cpp @@ -213,8 +213,8 @@ void Parser::parse(const Instruction &instruction) case OpSource: { - auto lang = static_cast(ops[0]); - switch (lang) + ir.source.lang = static_cast(ops[0]); + switch (ir.source.lang) { case SourceLanguageESSL: ir.source.es = true; @@ -318,6 +318,19 @@ void Parser::parse(const Instruction &instruction) ir.load_type_width.insert({ ops[1], type->width }); } } + else if (op == OpExtInst) + { + // Don't want to deal with ForwardRefs here. + + auto &ext = get(ops[2]); + if (ext.ext == SPIRExtension::NonSemanticShaderDebugInfo) + { + // Parse global ShaderDebugInfo we care about. + // Just forward the string information. 
+				if (ops[3] == SPIRExtension::DebugSource)
+					set<SPIRString>(ops[1], get<SPIRString>(ops[4]).str);
+			}
+		}
 		break;
 	}
 
@@ -369,6 +382,30 @@ void Parser::parse(const Instruction &instruction)
 			execution.output_primitives = ops[2];
 			break;
 
+		case ExecutionModeSignedZeroInfNanPreserve:
+			switch (ops[2])
+			{
+			case 8:
+				execution.signed_zero_inf_nan_preserve_8 = true;
+				break;
+
+			case 16:
+				execution.signed_zero_inf_nan_preserve_16 = true;
+				break;
+
+			case 32:
+				execution.signed_zero_inf_nan_preserve_32 = true;
+				break;
+
+			case 64:
+				execution.signed_zero_inf_nan_preserve_64 = true;
+				break;
+
+			default:
+				SPIRV_CROSS_THROW("Invalid bit-width for SignedZeroInfNanPreserve.");
+			}
+			break;
+
 		default:
 			break;
 		}
@@ -381,13 +418,21 @@ void Parser::parse(const Instruction &instruction)
 		auto mode = static_cast<ExecutionMode>(ops[1]);
 		execution.flags.set(mode);
 
-		if (mode == ExecutionModeLocalSizeId)
+		switch (mode)
 		{
+		case ExecutionModeLocalSizeId:
 			execution.workgroup_size.id_x = ops[2];
 			execution.workgroup_size.id_y = ops[3];
 			execution.workgroup_size.id_z = ops[4];
-		}
+			break;
 
+		case ExecutionModeFPFastMathDefault:
+			execution.fp_fast_math_defaults[ops[2]] = ops[3];
+			break;
+
+		default:
+			break;
+		}
 		break;
 	}
 
@@ -538,7 +583,7 @@ void Parser::parse(const Instruction &instruction)
 		uint32_t width = ops[1];
 		auto &type = set<SPIRType>(id, op);
 
-		if (width != 16 && length > 2)
+		if (width != 16 && width != 8 && length > 2)
 			SPIRV_CROSS_THROW("Unrecognized FP encoding mode for OpTypeFloat.");
 
 		if (width == 64)
@@ -557,6 +602,17 @@ void Parser::parse(const Instruction &instruction)
 			else
 				type.basetype = SPIRType::Half;
 		}
+		else if (width == 8)
+		{
+			if (length < 2)
+				SPIRV_CROSS_THROW("Missing encoding for OpTypeFloat 8.");
+			else if (ops[2] == spv::FPEncodingFloat8E4M3EXT)
+				type.basetype = SPIRType::FloatE4M3;
+			else if (ops[2] == spv::FPEncodingFloat8E5M2EXT)
+				type.basetype = SPIRType::FloatE5M2;
+			else
+				SPIRV_CROSS_THROW("Invalid encoding for OpTypeFloat 8.");
+		}
 		else
 			SPIRV_CROSS_THROW("Unrecognized bit-width of floating point type.");
 		type.width = width;
@@ -614,15 +670,33 @@ void Parser::parse(const Instruction &instruction)
 		auto &matrixbase = set<SPIRType>(id, base);
 		matrixbase.op = op;
-		matrixbase.cooperative.scope_id = ops[2];
-		matrixbase.cooperative.rows_id = ops[3];
-		matrixbase.cooperative.columns_id = ops[4];
-		matrixbase.cooperative.use_id = ops[5];
+		matrixbase.ext.cooperative.scope_id = ops[2];
+		matrixbase.ext.cooperative.rows_id = ops[3];
+		matrixbase.ext.cooperative.columns_id = ops[4];
+		matrixbase.ext.cooperative.use_id = ops[5];
 		matrixbase.self = id;
 		matrixbase.parent_type = ops[1];
 		break;
 	}
 
+	case OpTypeCooperativeVectorNV:
+	{
+		uint32_t id = ops[0];
+		auto &type = set<SPIRType>(id, op);
+
+		type.basetype = SPIRType::CoopVecNV;
+		type.op = op;
+		type.ext.coopVecNV.component_type_id = ops[1];
+		type.ext.coopVecNV.component_count_id = ops[2];
+		type.parent_type = ops[1];
+
+		// CoopVec-NV can be used with integer operations like SMax where
+		// spirv-opt does explicit checks on integer bitwidth.
+		auto component_type = get<SPIRType>(type.ext.coopVecNV.component_type_id);
+		type.width = component_type.width;
+		break;
+	}
+
 	case OpTypeArray:
 	{
 		uint32_t id = ops[0];
@@ -820,6 +894,20 @@ void Parser::parse(const Instruction &instruction)
 		break;
 	}
 
+	case OpTypeTensorARM:
+	{
+		uint32_t id = ops[0];
+		auto &type = set<SPIRType>(id, op);
+		type.basetype = SPIRType::Tensor;
+		type.ext.tensor = {};
+		type.ext.tensor.type = ops[1];
+		if (length >= 3)
+			type.ext.tensor.rank = ops[2];
+		if (length >= 4)
+			type.ext.tensor.shape = ops[3];
+		break;
+	}
+
 	// Variable declaration
 	//
All variables are essentially pointers with a storage qualifier. case OpVariable: @@ -866,17 +954,27 @@ void Parser::parse(const Instruction &instruction) break; } - // Constants + // Constants case OpSpecConstant: case OpConstant: + case OpConstantCompositeReplicateEXT: + case OpSpecConstantCompositeReplicateEXT: { uint32_t id = ops[1]; auto &type = get(ops[0]); - - if (type.width > 32) - set(id, ops[0], ops[2] | (uint64_t(ops[3]) << 32), op == OpSpecConstant); + if (op == OpConstantCompositeReplicateEXT || op == OpSpecConstantCompositeReplicateEXT) + { + auto subconstant = uint32_t(ops[2]); + set(id, ops[0], &subconstant, 1, op == OpSpecConstantCompositeReplicateEXT, true); + } else - set(id, ops[0], ops[2], op == OpSpecConstant); + { + + if (type.width > 32) + set(id, ops[0], ops[2] | (uint64_t(ops[3]) << 32), op == OpSpecConstant); + else + set(id, ops[0], ops[2], op == OpSpecConstant); + } break; } diff --git a/3rdparty/spirv-cross/spirv_reflect.cpp b/3rdparty/spirv-cross/spirv_reflect.cpp index 552d671a6..ab5a91f9e 100644 --- a/3rdparty/spirv-cross/spirv_reflect.cpp +++ b/3rdparty/spirv-cross/spirv_reflect.cpp @@ -477,6 +477,12 @@ string CompilerReflection::execution_model_to_str(spv::ExecutionModel model) return "rmiss"; case ExecutionModelCallableNV: return "rcall"; + case ExecutionModelMeshNV: + case ExecutionModelMeshEXT: + return "mesh"; + case ExecutionModelTaskNV: + case ExecutionModelTaskEXT: + return "task"; default: return "???"; } @@ -504,7 +510,9 @@ void CompilerReflection::emit_entry_points() json_stream->begin_json_object(); json_stream->emit_json_key_value("name", e.name); json_stream->emit_json_key_value("mode", execution_model_to_str(e.execution_model)); - if (e.execution_model == ExecutionModelGLCompute) + if (e.execution_model == ExecutionModelGLCompute || e.execution_model == spv::ExecutionModelMeshEXT || + e.execution_model == spv::ExecutionModelMeshNV || e.execution_model == spv::ExecutionModelTaskEXT || + e.execution_model == spv::ExecutionModelTaskNV) { const auto &spv_entry = get_entry_point(e.name, e.execution_model); @@ -547,6 +555,7 @@ void CompilerReflection::emit_resources() emit_resources("push_constants", res.push_constant_buffers); emit_resources("counters", res.atomic_counters); emit_resources("acceleration_structures", res.acceleration_structures); + emit_resources("tensors", res.tensors); } void CompilerReflection::emit_resources(const char *tag, const SmallVector &resources)
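
Editor's note, not part of the patch: the fast-math pragma emission added above is opt-in for anyone driving CompilerMSL directly (for example bgfx's shaderc). The sketch below is illustrative only; the helper name to_msl() and the spirv_words input are hypothetical, while the surrounding calls use the standard SPIRV-Cross C++ API plus the use_fast_math_pragmas option introduced in spirv_msl.hpp above.

    #include <cstdint>
    #include <string>
    #include <vector>
    #include "spirv_msl.hpp" // in-tree include path; adjust for an installed package

    // Sketch: cross-compile a SPIR-V module to MSL with the new fast-math pragmas enabled.
    std::string to_msl(const std::vector<uint32_t> &spirv_words)
    {
        spirv_cross::CompilerMSL compiler(spirv_words);

        auto opts = compiler.get_msl_options();
        opts.set_msl_version(3, 2);        // Pragmas are only emitted for MSL 3.2+.
        opts.use_fast_math_pragmas = true; // Emit "#pragma metal fp math_mode(...)" and "fp contract(...)".
        compiler.set_msl_options(opts);

        // compile() returns the MSL source; the pragma selection is derived from the
        // combined FPFastMathMode mask exposed by the new get_fp_fast_math_flags() query.
        return compiler.compile();
    }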