From 6ca5060ce8fe121dc95d4cb99b5242f447f78030 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=91=D1=80=D0=B0=D0=BD=D0=B8=D0=BC=D0=B8=D1=80=20=D0=9A?= =?UTF-8?q?=D0=B0=D1=80=D0=B0=D1=9F=D0=B8=D1=9B?= Date: Fri, 14 May 2021 19:37:10 -0700 Subject: [PATCH] Updated spirv-cross. --- 3rdparty/spirv-cross/main.cpp | 95 ++ 3rdparty/spirv-cross/spirv_common.hpp | 24 + 3rdparty/spirv-cross/spirv_cross.cpp | 120 +- 3rdparty/spirv-cross/spirv_cross.hpp | 27 +- 3rdparty/spirv-cross/spirv_cross_c.cpp | 100 ++ 3rdparty/spirv-cross/spirv_cross_c.h | 26 +- 3rdparty/spirv-cross/spirv_glsl.cpp | 598 ++++++---- 3rdparty/spirv-cross/spirv_glsl.hpp | 43 +- 3rdparty/spirv-cross/spirv_hlsl.cpp | 148 +-- 3rdparty/spirv-cross/spirv_msl.cpp | 1479 +++++++++++++++++------- 3rdparty/spirv-cross/spirv_msl.hpp | 66 +- 11 files changed, 1976 insertions(+), 750 deletions(-) diff --git a/3rdparty/spirv-cross/main.cpp b/3rdparty/spirv-cross/main.cpp index 2b99022e7..953abfadc 100644 --- a/3rdparty/spirv-cross/main.cpp +++ b/3rdparty/spirv-cross/main.cpp @@ -285,6 +285,61 @@ static bool write_string_to_file(const char *path, const char *string) #pragma warning(pop) #endif +static void print_resources(const Compiler &compiler, spv::StorageClass storage, + const SmallVector &resources) +{ + fprintf(stderr, "%s\n", storage == StorageClassInput ? "builtin inputs" : "builtin outputs"); + fprintf(stderr, "=============\n\n"); + for (auto &res : resources) + { + bool active = compiler.has_active_builtin(res.builtin, storage); + const char *basetype = "?"; + auto &type = compiler.get_type(res.value_type_id); + switch (type.basetype) + { + case SPIRType::Float: basetype = "float"; break; + case SPIRType::Int: basetype = "int"; break; + case SPIRType::UInt: basetype = "uint"; break; + default: break; + } + + uint32_t array_size = 0; + bool array_size_literal = false; + if (!type.array.empty()) + { + array_size = type.array.front(); + array_size_literal = type.array_size_literal.front(); + } + + string type_str = basetype; + if (type.vecsize > 1) + type_str += std::to_string(type.vecsize); + + if (array_size) + { + if (array_size_literal) + type_str += join("[", array_size, "]"); + else + type_str += join("[", array_size, " (spec constant ID)]"); + } + + string builtin_str; + switch (res.builtin) + { + case spv::BuiltInPosition: builtin_str = "Position"; break; + case spv::BuiltInPointSize: builtin_str = "PointSize"; break; + case spv::BuiltInCullDistance: builtin_str = "CullDistance"; break; + case spv::BuiltInClipDistance: builtin_str = "ClipDistance"; break; + case spv::BuiltInTessLevelInner: builtin_str = "TessLevelInner"; break; + case spv::BuiltInTessLevelOuter: builtin_str = "TessLevelOuter"; break; + default: builtin_str = string("builtin #") + to_string(res.builtin); + } + + fprintf(stderr, "Builtin %s (%s) (active: %s).\n", builtin_str.c_str(), type_str.c_str(), active ? 
"yes" : "no"); + } + fprintf(stderr, "=============\n\n"); +} + static void print_resources(const Compiler &compiler, const char *tag, const SmallVector &resources) { fprintf(stderr, "%s\n", tag); @@ -475,6 +530,8 @@ static void print_resources(const Compiler &compiler, const ShaderResources &res print_resources(compiler, "push", res.push_constant_buffers); print_resources(compiler, "counters", res.atomic_counters); print_resources(compiler, "acceleration structures", res.acceleration_structures); + print_resources(compiler, spv::StorageClassInput, res.builtin_inputs); + print_resources(compiler, spv::StorageClassOutput, res.builtin_outputs); } static void print_push_constant_resources(const Compiler &compiler, const SmallVector &res) @@ -621,6 +678,8 @@ struct CLIArguments SmallVector variable_type_remaps; SmallVector interface_variable_renames; SmallVector hlsl_attr_remap; + SmallVector> masked_stage_outputs; + SmallVector masked_stage_builtins; string entry; string entry_stage; @@ -845,6 +904,11 @@ static void print_help_common() "\t\tGLSL: Rewrites [0, w] Z range (D3D/Metal/Vulkan) to GL-style [-w, w].\n" "\t\tHLSL/MSL: Rewrites [-w, w] Z range (GL) to D3D/Metal/Vulkan-style [0, w].\n" "\t[--flip-vert-y]:\n\t\tInverts gl_Position.y (or equivalent) at the end of a vertex shader. This is equivalent to using negative viewport height.\n" + "\t[--mask-stage-output-location ]:\n" + "\t\tIf a stage output variable with matching location and component is active, optimize away the variable if applicable.\n" + "\t[--mask-stage-output-builtin ]:\n" + "\t\tIf a stage output variable with matching builtin is active, " + "optimize away the variable if it can affect cross-stage linking correctness.\n" ); // clang-format on } @@ -1103,6 +1167,11 @@ static string compile_iteration(const CLIArguments &args, std::vector compiler->set_variable_type_remap_callback(move(remap_cb)); } + for (auto &masked : args.masked_stage_outputs) + compiler->mask_stage_output_by_location(masked.first, masked.second); + for (auto &masked : args.masked_stage_builtins) + compiler->mask_stage_output_by_builtin(masked); + for (auto &rename : args.entry_point_rename) compiler->rename_entry_point(rename.old_name, rename.new_name, rename.execution_model); @@ -1346,6 +1415,7 @@ static string compile_iteration(const CLIArguments &args, std::vector if (args.dump_resources) { + compiler->update_active_builtins(); print_resources(*compiler, res); print_push_constant_resources(*compiler, res.push_constant_buffers); print_spec_constants(*compiler); @@ -1571,6 +1641,31 @@ static int main_inner(int argc, char *argv[]) cbs.add("--no-support-nonzero-baseinstance", [&](CLIParser &) { args.support_nonzero_baseinstance = false; }); cbs.add("--emit-line-directives", [&args](CLIParser &) { args.emit_line_directives = true; }); + cbs.add("--mask-stage-output-location", [&](CLIParser &parser) { + uint32_t location = parser.next_uint(); + uint32_t component = parser.next_uint(); + args.masked_stage_outputs.push_back({ location, component }); + }); + + cbs.add("--mask-stage-output-builtin", [&](CLIParser &parser) { + BuiltIn masked_builtin = BuiltInMax; + std::string builtin = parser.next_string(); + if (builtin == "Position") + masked_builtin = BuiltInPosition; + else if (builtin == "PointSize") + masked_builtin = BuiltInPointSize; + else if (builtin == "CullDistance") + masked_builtin = BuiltInCullDistance; + else if (builtin == "ClipDistance") + masked_builtin = BuiltInClipDistance; + else + { + print_help(); + exit(EXIT_FAILURE); + } + 
args.masked_stage_builtins.push_back(masked_builtin); + }); + cbs.default_handler = [&args](const char *value) { args.input = value; }; cbs.add("-", [&args](CLIParser &) { args.input = "-"; }); cbs.error_handler = [] { print_help(); }; diff --git a/3rdparty/spirv-cross/spirv_common.hpp b/3rdparty/spirv-cross/spirv_common.hpp index a7a84188f..f0024d7ed 100644 --- a/3rdparty/spirv-cross/spirv_common.hpp +++ b/3rdparty/spirv-cross/spirv_common.hpp @@ -1763,6 +1763,22 @@ struct SetBindingPair } }; +struct LocationComponentPair +{ + uint32_t location; + uint32_t component; + + inline bool operator==(const LocationComponentPair &other) const + { + return location == other.location && component == other.component; + } + + inline bool operator<(const LocationComponentPair &other) const + { + return location < other.location || (location == other.location && component < other.component); + } +}; + struct StageSetBinding { spv::ExecutionModel model; @@ -1785,6 +1801,14 @@ struct InternalHasher return (hash_set * 0x10001b31) ^ hash_binding; } + inline size_t operator()(const LocationComponentPair &value) const + { + // Quality of hash doesn't really matter here. + auto hash_set = std::hash()(value.location); + auto hash_binding = std::hash()(value.component); + return (hash_set * 0x10001b31) ^ hash_binding; + } + inline size_t operator()(const StageSetBinding &value) const { // Quality of hash doesn't really matter here. diff --git a/3rdparty/spirv-cross/spirv_cross.cpp b/3rdparty/spirv-cross/spirv_cross.cpp index f024fa78b..e948820d2 100644 --- a/3rdparty/spirv-cross/spirv_cross.cpp +++ b/3rdparty/spirv-cross/spirv_cross.cpp @@ -284,31 +284,6 @@ SPIRVariable *Compiler::maybe_get_backing_variable(uint32_t chain) return var; } -StorageClass Compiler::get_expression_effective_storage_class(uint32_t ptr) -{ - auto *var = maybe_get_backing_variable(ptr); - - // If the expression has been lowered to a temporary, we need to use the Generic storage class. - // We're looking for the effective storage class of a given expression. - // An access chain or forwarded OpLoads from such access chains - // will generally have the storage class of the underlying variable, but if the load was not forwarded - // we have lost any address space qualifiers. - bool forced_temporary = ir.ids[ptr].get_type() == TypeExpression && !get(ptr).access_chain && - (forced_temporaries.count(ptr) != 0 || forwarded_temporaries.count(ptr) == 0); - - if (var && !forced_temporary) - { - // Normalize SSBOs to StorageBuffer here. - if (var->storage == StorageClassUniform && - has_decoration(get(var->basetype).self, DecorationBufferBlock)) - return StorageClassStorageBuffer; - else - return var->storage; - } - else - return expression_type(ptr).storage; -} - void Compiler::register_read(uint32_t expr, uint32_t chain, bool forwarded) { auto &e = get(expr); @@ -853,19 +828,79 @@ ShaderResources Compiler::get_shader_resources(const unordered_set * // It is possible for uniform storage classes to be passed as function parameters, so detect // that. To detect function parameters, check of StorageClass of variable is function scope. - if (var.storage == StorageClassFunction || !type.pointer || is_builtin_variable(var)) + if (var.storage == StorageClassFunction || !type.pointer) return; if (active_variables && active_variables->find(var.self) == end(*active_variables)) return; + // In SPIR-V 1.4 and up, every global must be present in the entry point interface list, + // not just IO variables. 
+ bool active_in_entry_point = true; + if (ir.get_spirv_version() < 0x10400) + { + if (var.storage == StorageClassInput || var.storage == StorageClassOutput) + active_in_entry_point = interface_variable_exists_in_entry_point(var.self); + } + else + active_in_entry_point = interface_variable_exists_in_entry_point(var.self); + + if (!active_in_entry_point) + return; + + bool is_builtin = is_builtin_variable(var); + + if (is_builtin) + { + if (var.storage != StorageClassInput && var.storage != StorageClassOutput) + return; + + auto &list = var.storage == StorageClassInput ? res.builtin_inputs : res.builtin_outputs; + BuiltInResource resource; + + if (has_decoration(type.self, DecorationBlock)) + { + resource.resource = { var.self, var.basetype, type.self, + get_remapped_declared_block_name(var.self, false) }; + + for (uint32_t i = 0; i < uint32_t(type.member_types.size()); i++) + { + resource.value_type_id = type.member_types[i]; + resource.builtin = BuiltIn(get_member_decoration(type.self, i, DecorationBuiltIn)); + list.push_back(resource); + } + } + else + { + bool strip_array = + !has_decoration(var.self, DecorationPatch) && ( + get_execution_model() == ExecutionModelTessellationControl || + (get_execution_model() == ExecutionModelTessellationEvaluation && + var.storage == StorageClassInput)); + + resource.resource = { var.self, var.basetype, type.self, get_name(var.self) }; + + if (strip_array && !type.array.empty()) + resource.value_type_id = get_variable_data_type(var).parent_type; + else + resource.value_type_id = get_variable_data_type_id(var); + + assert(resource.value_type_id); + + resource.builtin = BuiltIn(get_decoration(var.self, DecorationBuiltIn)); + list.push_back(std::move(resource)); + } + return; + } + // Input - if (var.storage == StorageClassInput && interface_variable_exists_in_entry_point(var.self)) + if (var.storage == StorageClassInput) { if (has_decoration(type.self, DecorationBlock)) { res.stage_inputs.push_back( - { var.self, var.basetype, type.self, get_remapped_declared_block_name(var.self, false) }); + { var.self, var.basetype, type.self, + get_remapped_declared_block_name(var.self, false) }); } else res.stage_inputs.push_back({ var.self, var.basetype, type.self, get_name(var.self) }); @@ -876,12 +911,12 @@ ShaderResources Compiler::get_shader_resources(const unordered_set * res.subpass_inputs.push_back({ var.self, var.basetype, type.self, get_name(var.self) }); } // Outputs - else if (var.storage == StorageClassOutput && interface_variable_exists_in_entry_point(var.self)) + else if (var.storage == StorageClassOutput) { if (has_decoration(type.self, DecorationBlock)) { res.stage_outputs.push_back( - { var.self, var.basetype, type.self, get_remapped_declared_block_name(var.self, false) }); + { var.self, var.basetype, type.self, get_remapped_declared_block_name(var.self, false) }); } else res.stage_outputs.push_back({ var.self, var.basetype, type.self, get_name(var.self) }); @@ -3185,6 +3220,29 @@ bool Compiler::AnalyzeVariableScopeAccessHandler::handle(spv::Op op, const uint3 break; } + case OpSelect: + { + // In case of variable pointers, we might access a variable here. + // We cannot prove anything about these accesses however. + for (uint32_t i = 1; i < length; i++) + { + if (i >= 3) + { + auto *var = compiler.maybe_get_backing_variable(args[i]); + if (var) + { + accessed_variables_to_block[var->self].insert(current_block->self); + // Assume we can get partial writes to this variable. 
+ partial_write_variables_to_block[var->self].insert(current_block->self); + } + } + + // Might try to copy a Phi variable here. + notify_variable_access(args[i], current_block->self); + } + break; + } + case OpExtInst: { for (uint32_t i = 4; i < length; i++) @@ -4071,7 +4129,7 @@ void Compiler::update_active_builtins() } // Returns whether this shader uses a builtin of the storage class -bool Compiler::has_active_builtin(BuiltIn builtin, StorageClass storage) +bool Compiler::has_active_builtin(BuiltIn builtin, StorageClass storage) const { const Bitset *flags; switch (storage) diff --git a/3rdparty/spirv-cross/spirv_cross.hpp b/3rdparty/spirv-cross/spirv_cross.hpp index 84e23ca30..764c6c47b 100644 --- a/3rdparty/spirv-cross/spirv_cross.hpp +++ b/3rdparty/spirv-cross/spirv_cross.hpp @@ -59,6 +59,27 @@ struct Resource std::string name; }; +struct BuiltInResource +{ + // This is mostly here to support reflection of builtins such as Position/PointSize/CullDistance/ClipDistance. + // This needs to be different from Resource since we can collect builtins from blocks. + // A builtin present here does not necessarily mean it's considered an active builtin, + // since variable ID "activeness" is only tracked on OpVariable level, not Block members. + // For that, update_active_builtins() -> has_active_builtin() can be used to further refine the reflection. + spv::BuiltIn builtin; + + // This is the actual value type of the builtin. + // Typically float4, float, array for the gl_PerVertex builtins. + // If the builtin is a control point, the control point array type will be stripped away here as appropriate. + TypeID value_type_id; + + // This refers to the base resource which contains the builtin. + // If resource is a Block, it can hold multiple builtins, or it might not be a block. + // For advanced reflection scenarios, all information in builtin/value_type_id can be deduced, + // it's just more convenient this way. + Resource resource; +}; + struct ShaderResources { SmallVector uniform_buffers; @@ -79,6 +100,9 @@ struct ShaderResources // these correspond to separate texture2D and samplers respectively. SmallVector separate_images; SmallVector separate_samplers; + + SmallVector builtin_inputs; + SmallVector builtin_outputs; }; struct CombinedImageSampler @@ -324,7 +348,7 @@ public: // Traverses all reachable opcodes and sets active_builtins to a bitmask of all builtin variables which are accessed in the shader. void update_active_builtins(); - bool has_active_builtin(spv::BuiltIn builtin, spv::StorageClass storage); + bool has_active_builtin(spv::BuiltIn builtin, spv::StorageClass storage) const; // Query and modify OpExecutionMode. 
const Bitset &get_execution_mode_bitset() const; @@ -647,7 +671,6 @@ protected: bool expression_is_lvalue(uint32_t id) const; bool variable_storage_is_aliased(const SPIRVariable &var); SPIRVariable *maybe_get_backing_variable(uint32_t chain); - spv::StorageClass get_expression_effective_storage_class(uint32_t ptr); void register_read(uint32_t expr, uint32_t chain, bool forwarded); void register_write(uint32_t chain); diff --git a/3rdparty/spirv-cross/spirv_cross_c.cpp b/3rdparty/spirv-cross/spirv_cross_c.cpp index 730bfa3d6..866c25d10 100644 --- a/3rdparty/spirv-cross/spirv_cross_c.cpp +++ b/3rdparty/spirv-cross/spirv_cross_c.cpp @@ -197,8 +197,11 @@ struct spvc_resources_s : ScratchMemoryAllocation SmallVector separate_images; SmallVector separate_samplers; SmallVector acceleration_structures; + SmallVector builtin_inputs; + SmallVector builtin_outputs; bool copy_resources(SmallVector &outputs, const SmallVector &inputs); + bool copy_resources(SmallVector &outputs, const SmallVector &inputs); bool copy_resources(const ShaderResources &resources); }; @@ -818,6 +821,44 @@ spvc_bool spvc_compiler_variable_is_depth_or_compare(spvc_compiler compiler, spv #endif } +spvc_result spvc_compiler_mask_stage_output_by_location(spvc_compiler compiler, + unsigned location, unsigned component) +{ +#if SPIRV_CROSS_C_API_GLSL + if (compiler->backend == SPVC_BACKEND_NONE) + { + compiler->context->report_error("Cross-compilation related option used on NONE backend which only supports reflection."); + return SPVC_ERROR_INVALID_ARGUMENT; + } + + static_cast(compiler->compiler.get())->mask_stage_output_by_location(location, component); + return SPVC_SUCCESS; +#else + (void)location; + (void)component; + compiler->context->report_error("Cross-compilation related option used on NONE backend which only supports reflection."); + return SPVC_ERROR_INVALID_ARGUMENT; +#endif +} + +spvc_result spvc_compiler_mask_stage_output_by_builtin(spvc_compiler compiler, SpvBuiltIn builtin) +{ +#if SPIRV_CROSS_C_API_GLSL + if (compiler->backend == SPVC_BACKEND_NONE) + { + compiler->context->report_error("Cross-compilation related option used on NONE backend which only supports reflection."); + return SPVC_ERROR_INVALID_ARGUMENT; + } + + static_cast(compiler->compiler.get())->mask_stage_output_by_builtin(spv::BuiltIn(builtin)); + return SPVC_SUCCESS; +#else + (void)builtin; + compiler->context->report_error("Cross-compilation related option used on NONE backend which only supports reflection."); + return SPVC_ERROR_INVALID_ARGUMENT; +#endif +} + spvc_result spvc_compiler_hlsl_set_root_constants_layout(spvc_compiler compiler, const spvc_hlsl_root_constants *constant_info, size_t count) @@ -1551,6 +1592,30 @@ bool spvc_resources_s::copy_resources(SmallVector &outp return true; } +bool spvc_resources_s::copy_resources(SmallVector &outputs, + const SmallVector &inputs) +{ + for (auto &i : inputs) + { + spvc_reflected_builtin_resource br; + + br.value_type_id = i.value_type_id; + br.builtin = SpvBuiltIn(i.builtin); + + auto &r = br.resource; + r.base_type_id = i.resource.base_type_id; + r.type_id = i.resource.type_id; + r.id = i.resource.id; + r.name = context->allocate_name(i.resource.name); + if (!r.name) + return false; + + outputs.push_back(br); + } + + return true; +} + bool spvc_resources_s::copy_resources(const ShaderResources &resources) { if (!copy_resources(uniform_buffers, resources.uniform_buffers)) @@ -1577,6 +1642,10 @@ bool spvc_resources_s::copy_resources(const ShaderResources &resources) return false; if 
(!copy_resources(acceleration_structures, resources.acceleration_structures)) return false; + if (!copy_resources(builtin_inputs, resources.builtin_inputs)) + return false; + if (!copy_resources(builtin_outputs, resources.builtin_outputs)) + return false; return true; } @@ -1735,6 +1804,37 @@ spvc_result spvc_resources_get_resource_list_for_type(spvc_resources resources, return SPVC_SUCCESS; } +spvc_result spvc_resources_get_builtin_resource_list_for_type( + spvc_resources resources, spvc_builtin_resource_type type, + const spvc_reflected_builtin_resource **resource_list, + size_t *resource_size) +{ + const SmallVector *list = nullptr; + switch (type) + { + case SPVC_BUILTIN_RESOURCE_TYPE_STAGE_INPUT: + list = &resources->builtin_inputs; + break; + + case SPVC_BUILTIN_RESOURCE_TYPE_STAGE_OUTPUT: + list = &resources->builtin_outputs; + break; + + default: + break; + } + + if (!list) + { + resources->context->report_error("Invalid argument."); + return SPVC_ERROR_INVALID_ARGUMENT; + } + + *resource_size = list->size(); + *resource_list = list->data(); + return SPVC_SUCCESS; +} + void spvc_compiler_set_decoration(spvc_compiler compiler, SpvId id, SpvDecoration decoration, unsigned argument) { compiler->compiler->set_decoration(id, static_cast(decoration), argument); diff --git a/3rdparty/spirv-cross/spirv_cross_c.h b/3rdparty/spirv-cross/spirv_cross_c.h index 12f988019..1495a7eac 100644 --- a/3rdparty/spirv-cross/spirv_cross_c.h +++ b/3rdparty/spirv-cross/spirv_cross_c.h @@ -40,7 +40,7 @@ extern "C" { /* Bumped if ABI or API breaks backwards compatibility. */ #define SPVC_C_API_VERSION_MAJOR 0 /* Bumped if APIs or enumerations are added in a backwards compatible way. */ -#define SPVC_C_API_VERSION_MINOR 46 +#define SPVC_C_API_VERSION_MINOR 47 /* Bumped if internal implementation details change. */ #define SPVC_C_API_VERSION_PATCH 0 @@ -99,6 +99,13 @@ typedef struct spvc_reflected_resource const char *name; } spvc_reflected_resource; +typedef struct spvc_reflected_builtin_resource +{ + SpvBuiltIn builtin; + spvc_type_id value_type_id; + spvc_reflected_resource resource; +} spvc_reflected_builtin_resource; + /* See C++ API. */ typedef struct spvc_entry_point { @@ -221,6 +228,14 @@ typedef enum spvc_resource_type SPVC_RESOURCE_TYPE_INT_MAX = 0x7fffffff } spvc_resource_type; +typedef enum spvc_builtin_resource_type +{ + SPVC_BUILTIN_RESOURCE_TYPE_UNKNOWN = 0, + SPVC_BUILTIN_RESOURCE_TYPE_STAGE_INPUT = 1, + SPVC_BUILTIN_RESOURCE_TYPE_STAGE_OUTPUT = 2, + SPVC_BUILTIN_RESOURCE_TYPE_INT_MAX = 0x7fffffff +} spvc_builtin_resource_type; + /* Maps to spirv_cross::SPIRType::BaseType. */ typedef enum spvc_basetype { @@ -722,6 +737,10 @@ SPVC_PUBLIC_API spvc_result spvc_compiler_flatten_buffer_block(spvc_compiler com SPVC_PUBLIC_API spvc_bool spvc_compiler_variable_is_depth_or_compare(spvc_compiler compiler, spvc_variable_id id); +SPVC_PUBLIC_API spvc_result spvc_compiler_mask_stage_output_by_location(spvc_compiler compiler, + unsigned location, unsigned component); +SPVC_PUBLIC_API spvc_result spvc_compiler_mask_stage_output_by_builtin(spvc_compiler compiler, SpvBuiltIn builtin); + /* * HLSL specifics. * Maps to C++ API. 
@@ -805,6 +824,11 @@ SPVC_PUBLIC_API spvc_result spvc_resources_get_resource_list_for_type(spvc_resou const spvc_reflected_resource **resource_list, size_t *resource_size); +SPVC_PUBLIC_API spvc_result spvc_resources_get_builtin_resource_list_for_type( + spvc_resources resources, spvc_builtin_resource_type type, + const spvc_reflected_builtin_resource **resource_list, + size_t *resource_size); + /* * Decorations. * Maps to C++ API. diff --git a/3rdparty/spirv-cross/spirv_glsl.cpp b/3rdparty/spirv-cross/spirv_glsl.cpp index 31ff85042..a8d0925fc 100644 --- a/3rdparty/spirv-cross/spirv_glsl.cpp +++ b/3rdparty/spirv-cross/spirv_glsl.cpp @@ -559,18 +559,19 @@ string CompilerGLSL::compile() { ir.fixup_reserved_names(); - if (options.vulkan_semantics) - backend.allow_precision_qualifiers = true; - else + if (!options.vulkan_semantics) { // only NV_gpu_shader5 supports divergent indexing on OpenGL, and it does so without extra qualifiers backend.nonuniform_qualifier = ""; backend.needs_row_major_load_workaround = true; } + backend.allow_precision_qualifiers = options.vulkan_semantics || options.es; backend.force_gl_in_out_block = true; backend.supports_extensions = true; backend.use_array_constructor = true; + backend.support_precise_qualifier = (!options.es && options.version >= 400) || (options.es && options.version >= 320); + if (is_legacy_es()) backend.support_case_fallthrough = false; @@ -764,6 +765,8 @@ void CompilerGLSL::emit_header() { statement("#if defined(GL_AMD_gpu_shader_int16)"); statement("#extension GL_AMD_gpu_shader_int16 : require"); + statement("#elif defined(GL_NV_gpu_shader5)"); + statement("#extension GL_NV_gpu_shader5 : require"); statement("#else"); statement("#error No extension available for Int16."); statement("#endif"); @@ -4395,6 +4398,16 @@ string CompilerGLSL::to_composite_constructor_expression(uint32_t id, bool uses_ return to_unpacked_expression(id); } +string CompilerGLSL::to_non_uniform_aware_expression(uint32_t id) +{ + string expr = to_expression(id); + + if (has_decoration(id, DecorationNonUniform)) + convert_non_uniform_expression(expr, id); + + return expr; +} + string CompilerGLSL::to_expression(uint32_t id, bool register_expression_read) { auto itr = invalid_expressions.find(id); @@ -5533,7 +5546,12 @@ void CompilerGLSL::emit_unary_op(uint32_t result_type, uint32_t result_id, uint3 void CompilerGLSL::emit_binary_op(uint32_t result_type, uint32_t result_id, uint32_t op0, uint32_t op1, const char *op) { - bool forward = should_forward(op0) && should_forward(op1); + // Various FP arithmetic opcodes such as add, sub, mul will hit this. 
+ bool force_temporary_precise = backend.support_precise_qualifier && + has_decoration(result_id, DecorationNoContraction) && + type_is_floating_point(get(result_type)); + bool forward = should_forward(op0) && should_forward(op1) && !force_temporary_precise; + emit_op(result_type, result_id, join(to_enclosed_unpacked_expression(op0), " ", op, " ", to_enclosed_unpacked_expression(op1)), forward); @@ -5712,6 +5730,27 @@ void CompilerGLSL::emit_binary_func_op(uint32_t result_type, uint32_t result_id, inherit_expression_dependencies(result_id, op1); } +void CompilerGLSL::emit_atomic_func_op(uint32_t result_type, uint32_t result_id, uint32_t op0, uint32_t op1, + const char *op) +{ + forced_temporaries.insert(result_id); + emit_op(result_type, result_id, + join(op, "(", to_non_uniform_aware_expression(op0), ", ", + to_unpacked_expression(op1), ")"), false); + flush_all_atomic_capable_variables(); +} + +void CompilerGLSL::emit_atomic_func_op(uint32_t result_type, uint32_t result_id, + uint32_t op0, uint32_t op1, uint32_t op2, + const char *op) +{ + forced_temporaries.insert(result_id); + emit_op(result_type, result_id, + join(op, "(", to_non_uniform_aware_expression(op0), ", ", + to_unpacked_expression(op1), ", ", to_unpacked_expression(op2), ")"), false); + flush_all_atomic_capable_variables(); +} + void CompilerGLSL::emit_unary_func_op_cast(uint32_t result_type, uint32_t result_id, uint32_t op0, const char *op, SPIRType::BaseType input_type, SPIRType::BaseType expected_result_type) { @@ -6214,7 +6253,7 @@ string CompilerGLSL::to_combined_image_sampler(VariableID image_id, VariableID s { // Keep track of the array indices we have used to load the image. // We'll need to use the same array index into the combined image sampler array. - auto image_expr = to_expression(image_id); + auto image_expr = to_non_uniform_aware_expression(image_id); string array_expr; auto array_index = image_expr.find_first_of('['); if (array_index != string::npos) @@ -6442,20 +6481,8 @@ std::string CompilerGLSL::to_texture_op(const Instruction &i, bool sparse, bool auto &result_type = get(result_type_id); inherited_expressions.push_back(coord); - - // Make sure non-uniform decoration is back-propagated to where it needs to be. - if (has_decoration(img, DecorationNonUniformEXT)) - { - // In Vulkan GLSL, we cannot back-propgate nonuniform qualifiers if we - // use a combined image sampler constructor. - // We're only interested in back-propagating if we can trace back through access chains. - // If not, we will apply nonuniform to the sampled image expression itself. - auto *backing = maybe_get_backing_variable(img); - if (backing) - propagate_nonuniform_qualifier(img); - else - nonuniform_expression = true; - } + if (has_decoration(img, DecorationNonUniform) && !maybe_get_backing_variable(img)) + nonuniform_expression = true; switch (op) { @@ -6794,7 +6821,7 @@ std::string CompilerGLSL::convert_separate_image_to_expression(uint32_t id) // Don't need to consider Shadow state since the dummy sampler is always non-shadow. 
auto sampled_type = type; sampled_type.basetype = SPIRType::SampledImage; - return join(type_to_glsl(sampled_type), "(", to_expression(id), ", ", + return join(type_to_glsl(sampled_type), "(", to_non_uniform_aware_expression(id), ", ", to_expression(dummy_sampler_id), ")"); } else @@ -6814,7 +6841,7 @@ std::string CompilerGLSL::convert_separate_image_to_expression(uint32_t id) } } - return to_expression(id); + return to_non_uniform_aware_expression(id); } // Returns the function args for a texture sampling function for the specified image and sampling characteristics. @@ -6827,7 +6854,7 @@ string CompilerGLSL::to_function_args(const TextureFunctionArguments &args, bool if (args.base.is_fetch) farg_str = convert_separate_image_to_expression(img); else - farg_str = to_expression(img); + farg_str = to_non_uniform_aware_expression(img); if (args.nonuniform_expression && farg_str.find_first_of('[') != string::npos) { @@ -8317,12 +8344,35 @@ string CompilerGLSL::builtin_to_glsl(BuiltIn builtin, StorageClass storage) SPIRV_CROSS_THROW("Stencil export not supported in GLES."); } + case BuiltInPrimitiveShadingRateKHR: + { + if (!options.vulkan_semantics) + SPIRV_CROSS_THROW("Can only use PrimitiveShadingRateKHR in Vulkan GLSL."); + require_extension_internal("GL_EXT_fragment_shading_rate"); + return "gl_PrimitiveShadingRateEXT"; + } + + case BuiltInShadingRateKHR: + { + if (!options.vulkan_semantics) + SPIRV_CROSS_THROW("Can only use ShadingRateKHR in Vulkan GLSL."); + require_extension_internal("GL_EXT_fragment_shading_rate"); + return "gl_ShadingRateEXT"; + } + case BuiltInDeviceIndex: if (!options.vulkan_semantics) SPIRV_CROSS_THROW("Need Vulkan semantics for device group support."); require_extension_internal("GL_EXT_device_group"); return "gl_DeviceIndex"; + case BuiltInFullyCoveredEXT: + if (!options.es) + require_extension_internal("GL_NV_conservative_raster_underestimation"); + else + SPIRV_CROSS_THROW("Need desktop GL to use GL_NV_conservative_raster_underestimation."); + return "gl_FragFullyCoveredNV"; + default: return join("gl_BuiltIn_", convert_to_string(builtin)); } @@ -8345,7 +8395,7 @@ const char *CompilerGLSL::index_to_swizzle(uint32_t index) } } -void CompilerGLSL::access_chain_internal_append_index(std::string &expr, uint32_t /*base*/, const SPIRType *type, +void CompilerGLSL::access_chain_internal_append_index(std::string &expr, uint32_t /*base*/, const SPIRType * /*type*/, AccessChainFlags flags, bool & /*access_chain_is_arrayed*/, uint32_t index) { @@ -8354,27 +8404,19 @@ void CompilerGLSL::access_chain_internal_append_index(std::string &expr, uint32_ expr += "["; - // If we are indexing into an array of SSBOs or UBOs, we need to index it with a non-uniform qualifier. 
- bool nonuniform_index = - has_decoration(index, DecorationNonUniformEXT) && - (has_decoration(type->self, DecorationBlock) || has_decoration(type->self, DecorationBufferBlock)); - if (nonuniform_index) - { - expr += backend.nonuniform_qualifier; - expr += "("; - } - if (index_is_literal) expr += convert_to_string(index); else expr += to_expression(index, register_expression_read); - if (nonuniform_index) - expr += ")"; - expr += "]"; } +bool CompilerGLSL::access_chain_needs_stage_io_builtin_translation(uint32_t) +{ + return true; +} + string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indices, uint32_t count, AccessChainFlags flags, AccessChainMeta *meta) { @@ -8584,7 +8626,7 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice SPIRV_CROSS_THROW("Member index is out of bounds!"); BuiltIn builtin; - if (is_member_builtin(*type, index, &builtin)) + if (is_member_builtin(*type, index, &builtin) && access_chain_needs_stage_io_builtin_translation(base)) { if (access_chain_is_arrayed) { @@ -8652,13 +8694,30 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice // Internally, access chain implementation can also be used on composites, // ignore scalar access workarounds in this case. - StorageClass effective_storage; - if (expression_type(base).pointer) - effective_storage = get_expression_effective_storage_class(base); - else - effective_storage = StorageClassGeneric; + StorageClass effective_storage = StorageClassGeneric; + bool ignore_potential_sliced_writes = false; + if ((flags & ACCESS_CHAIN_FORCE_COMPOSITE_BIT) == 0) + { + if (expression_type(base).pointer) + effective_storage = get_expression_effective_storage_class(base); - if (!row_major_matrix_needs_conversion) + // Special consideration for control points. + // Control points can only be written by InvocationID, so there is no need + // to consider scalar access chains here. + // Cleans up some cases where it's very painful to determine the accurate storage class + // since blocks can be partially masked ... + auto *var = maybe_get_backing_variable(base); + if (var && var->storage == StorageClassOutput && + get_execution_model() == ExecutionModelTessellationControl && + !has_decoration(var->self, DecorationPatch)) + { + ignore_potential_sliced_writes = true; + } + } + else + ignore_potential_sliced_writes = true; + + if (!row_major_matrix_needs_conversion && !ignore_potential_sliced_writes) { // On some backends, we might not be able to safely access individual scalars in a vector. // To work around this, we might have to cast the access chain reference to something which can, @@ -8698,7 +8757,7 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice expr += "]"; } - if (row_major_matrix_needs_conversion) + if (row_major_matrix_needs_conversion && !ignore_potential_sliced_writes) { prepare_access_chain_for_scalar_access(expr, get(type->parent_type), effective_storage, is_packed); @@ -9682,6 +9741,8 @@ void CompilerGLSL::emit_store_statement(uint32_t lhs_expression, uint32_t rhs_ex if (!unroll_array_to_complex_store(lhs_expression, rhs_expression)) { auto lhs = to_dereferenced_expression(lhs_expression); + if (has_decoration(lhs_expression, DecorationNonUniform)) + convert_non_uniform_expression(lhs, lhs_expression); // We might need to cast in order to store to a builtin. 
cast_to_builtin_store(lhs_expression, rhs, expression_type(rhs_expression)); @@ -9857,12 +9918,10 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) // Also, loading from gl_SampleMask array needs special unroll. unroll_array_from_complex_load(id, ptr, expr); - // Shouldn't need to check for ID, but current glslang codegen requires it in some cases - // when loading Image/Sampler descriptors. It does not hurt to check ID as well. - if (has_decoration(id, DecorationNonUniformEXT) || has_decoration(ptr, DecorationNonUniformEXT)) + if (!type_is_opaque_value(type) && has_decoration(ptr, DecorationNonUniform)) { - propagate_nonuniform_qualifier(ptr); - convert_non_uniform_expression(type, expr); + // If we're loading something non-opaque, we need to handle non-uniform descriptor access. + convert_non_uniform_expression(expr, ptr); } if (forward && ptr_expression) @@ -9885,7 +9944,7 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) // it is an array, and our backend does not support arrays as value types. // Emit the temporary, and copy it explicitly. e = &emit_uninitialized_temporary_expression(result_type, id); - emit_array_copy(to_expression(id), ptr, StorageClassFunction, get_expression_effective_storage_class(ptr)); + emit_array_copy(to_expression(id), id, ptr, StorageClassFunction, get_expression_effective_storage_class(ptr)); } else e = &emit_op(result_type, id, expr, forward, !usage_tracking); @@ -9966,9 +10025,6 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) if (expr.expression_dependencies.empty()) forwarded_temporaries.erase(ops[1]); - if (has_decoration(ops[1], DecorationNonUniformEXT)) - propagate_nonuniform_qualifier(ops[1]); - break; } @@ -10006,6 +10062,8 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) uint32_t result_type = ops[0]; uint32_t id = ops[1]; auto e = access_chain_internal(ops[2], &ops[3], length - 3, ACCESS_CHAIN_INDEX_IS_LITERAL_BIT, nullptr); + if (has_decoration(ops[2], DecorationNonUniform)) + convert_non_uniform_expression(e, ops[2]); set(id, join(type_to_glsl(get(result_type)), "(", e, ".length())"), result_type, true); break; @@ -10307,14 +10365,16 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) // Including the base will prevent this and would trigger multiple reads // from expression causing it to be forced to an actual temporary in GLSL. auto expr = access_chain_internal(ops[2], &ops[3], length, - ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_CHAIN_ONLY_BIT, &meta); + ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_CHAIN_ONLY_BIT | + ACCESS_CHAIN_FORCE_COMPOSITE_BIT, &meta); e = &emit_op(result_type, id, expr, true, should_suppress_usage_tracking(ops[2])); inherit_expression_dependencies(id, ops[2]); e->base_expression = ops[2]; } else { - auto expr = access_chain_internal(ops[2], &ops[3], length, ACCESS_CHAIN_INDEX_IS_LITERAL_BIT, &meta); + auto expr = access_chain_internal(ops[2], &ops[3], length, + ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_FORCE_COMPOSITE_BIT, &meta); e = &emit_op(result_type, id, expr, should_forward(ops[2]), should_suppress_usage_tracking(ops[2])); inherit_expression_dependencies(id, ops[2]); } @@ -11221,9 +11281,8 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) // Ignore semantics for now, probably only relevant to CL. uint32_t val = ops[5]; const char *op = check_atomic_image(ptr) ? 
"imageAtomicExchange" : "atomicExchange"; - forced_temporaries.insert(id); - emit_binary_func_op(result_type, id, ptr, val, op); - flush_all_atomic_capable_variables(); + + emit_atomic_func_op(result_type, id, ptr, val, op); break; } @@ -11236,9 +11295,7 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) uint32_t comp = ops[7]; const char *op = check_atomic_image(ptr) ? "imageAtomicCompSwap" : "atomicCompSwap"; - forced_temporaries.insert(id); - emit_trinary_func_op(result_type, id, ptr, comp, val, op); - flush_all_atomic_capable_variables(); + emit_atomic_func_op(result_type, id, ptr, comp, val, op); break; } @@ -11253,7 +11310,9 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) (atomic_image && get(type.image.type).basetype == SPIRType::UInt); const char *op = atomic_image ? "imageAtomicAdd" : "atomicAdd"; const char *increment = unsigned_type ? "0u" : "0"; - emit_op(ops[0], ops[1], join(op, "(", to_expression(ops[2]), ", ", increment, ")"), false); + emit_op(ops[0], ops[1], + join(op, "(", + to_non_uniform_aware_expression(ops[2]), ", ", increment, ")"), false); flush_all_atomic_capable_variables(); break; } @@ -11266,7 +11325,7 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) // Ignore semantics for now, probably only relevant to CL. uint32_t val = ops[3]; const char *op = check_atomic_image(ptr) ? "imageAtomicExchange" : "atomicExchange"; - statement(op, "(", to_expression(ptr), ", ", to_expression(val), ");"); + statement(op, "(", to_non_uniform_aware_expression(ptr), ", ", to_expression(val), ");"); flush_all_atomic_capable_variables(); break; } @@ -11301,7 +11360,8 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) else increment = "-1"; - emit_op(ops[0], ops[1], join(op, "(", to_expression(ops[2]), ", ", increment, ")"), false); + emit_op(ops[0], ops[1], + join(op, "(", to_non_uniform_aware_expression(ops[2]), ", ", increment, ")"), false); } flush_all_atomic_capable_variables(); @@ -11311,9 +11371,7 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) case OpAtomicIAdd: { const char *op = check_atomic_image(ops[2]) ? "imageAtomicAdd" : "atomicAdd"; - forced_temporaries.insert(ops[1]); - emit_binary_func_op(ops[0], ops[1], ops[2], ops[5], op); - flush_all_atomic_capable_variables(); + emit_atomic_func_op(ops[0], ops[1], ops[2], ops[5], op); break; } @@ -11321,7 +11379,7 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) { const char *op = check_atomic_image(ops[2]) ? "imageAtomicAdd" : "atomicAdd"; forced_temporaries.insert(ops[1]); - auto expr = join(op, "(", to_expression(ops[2]), ", -", to_enclosed_expression(ops[5]), ")"); + auto expr = join(op, "(", to_non_uniform_aware_expression(ops[2]), ", -", to_enclosed_expression(ops[5]), ")"); emit_op(ops[0], ops[1], expr, should_forward(ops[2]) && should_forward(ops[5])); flush_all_atomic_capable_variables(); break; @@ -11331,9 +11389,7 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) case OpAtomicUMin: { const char *op = check_atomic_image(ops[2]) ? "imageAtomicMin" : "atomicMin"; - forced_temporaries.insert(ops[1]); - emit_binary_func_op(ops[0], ops[1], ops[2], ops[5], op); - flush_all_atomic_capable_variables(); + emit_atomic_func_op(ops[0], ops[1], ops[2], ops[5], op); break; } @@ -11341,36 +11397,28 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) case OpAtomicUMax: { const char *op = check_atomic_image(ops[2]) ? 
"imageAtomicMax" : "atomicMax"; - forced_temporaries.insert(ops[1]); - emit_binary_func_op(ops[0], ops[1], ops[2], ops[5], op); - flush_all_atomic_capable_variables(); + emit_atomic_func_op(ops[0], ops[1], ops[2], ops[5], op); break; } case OpAtomicAnd: { const char *op = check_atomic_image(ops[2]) ? "imageAtomicAnd" : "atomicAnd"; - forced_temporaries.insert(ops[1]); - emit_binary_func_op(ops[0], ops[1], ops[2], ops[5], op); - flush_all_atomic_capable_variables(); + emit_atomic_func_op(ops[0], ops[1], ops[2], ops[5], op); break; } case OpAtomicOr: { const char *op = check_atomic_image(ops[2]) ? "imageAtomicOr" : "atomicOr"; - forced_temporaries.insert(ops[1]); - emit_binary_func_op(ops[0], ops[1], ops[2], ops[5], op); - flush_all_atomic_capable_variables(); + emit_atomic_func_op(ops[0], ops[1], ops[2], ops[5], op); break; } case OpAtomicXor: { const char *op = check_atomic_image(ops[2]) ? "imageAtomicXor" : "atomicXor"; - forced_temporaries.insert(ops[1]); - emit_binary_func_op(ops[0], ops[1], ops[2], ops[5], op); - flush_all_atomic_capable_variables(); + emit_atomic_func_op(ops[0], ops[1], ops[2], ops[5], op); break; } @@ -11465,16 +11513,33 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) case OpImageQueryLod: { + const char *op = nullptr; if (!options.es && options.version < 400) { require_extension_internal("GL_ARB_texture_query_lod"); // For some reason, the ARB spec is all-caps. - GLSL_BFOP(textureQueryLOD); + op = "textureQueryLOD"; } else if (options.es) SPIRV_CROSS_THROW("textureQueryLod not supported in ES profile."); else - GLSL_BFOP(textureQueryLod); + op = "textureQueryLod"; + + auto sampler_expr = to_expression(ops[2]); + if (has_decoration(ops[2], DecorationNonUniform)) + { + if (maybe_get_backing_variable(ops[2])) + convert_non_uniform_expression(sampler_expr, ops[2]); + else if (*backend.nonuniform_qualifier != '\0') + sampler_expr = join(backend.nonuniform_qualifier, "(", sampler_expr, ")"); + } + + bool forward = should_forward(ops[3]); + emit_op(ops[0], ops[1], + join(op, "(", sampler_expr, ", ", to_unpacked_expression(ops[3]), ")"), + forward); + inherit_expression_dependencies(ops[1], ops[2]); + inherit_expression_dependencies(ops[1], ops[3]); register_control_dependent_expression(ops[1]); break; } @@ -11504,7 +11569,7 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) string expr; if (type.image.sampled == 2) - expr = join("imageSamples(", to_expression(ops[2]), ")"); + expr = join("imageSamples(", to_non_uniform_aware_expression(ops[2]), ")"); else expr = join("textureSamples(", convert_separate_image_to_expression(ops[2]), ")"); @@ -11615,10 +11680,10 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) "operand mask was used."); uint32_t samples = ops[5]; - imgexpr = join("subpassLoad(", to_expression(ops[2]), ", ", to_expression(samples), ")"); + imgexpr = join("subpassLoad(", to_non_uniform_aware_expression(ops[2]), ", ", to_expression(samples), ")"); } else - imgexpr = join("subpassLoad(", to_expression(ops[2]), ")"); + imgexpr = join("subpassLoad(", to_non_uniform_aware_expression(ops[2]), ")"); } else { @@ -11630,13 +11695,13 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) "operand mask was used."); uint32_t samples = ops[5]; - imgexpr = join("texelFetch(", to_expression(ops[2]), ", ivec2(gl_FragCoord.xy), ", + imgexpr = join("texelFetch(", to_non_uniform_aware_expression(ops[2]), ", ivec2(gl_FragCoord.xy), ", to_expression(samples), ")"); } else { // Implement subpass loads 
via texture barrier style sampling. - imgexpr = join("texelFetch(", to_expression(ops[2]), ", ivec2(gl_FragCoord.xy), 0)"); + imgexpr = join("texelFetch(", to_non_uniform_aware_expression(ops[2]), ", ivec2(gl_FragCoord.xy), 0)"); } } imgexpr = remap_swizzle(get(result_type), 4, imgexpr); @@ -11667,12 +11732,12 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) "operand mask was used."); uint32_t samples = ops[5]; - statement(to_expression(sparse_code_id), " = sparseImageLoadARB(", to_expression(ops[2]), ", ", + statement(to_expression(sparse_code_id), " = sparseImageLoadARB(", to_non_uniform_aware_expression(ops[2]), ", ", coord_expr, ", ", to_expression(samples), ", ", to_expression(sparse_texel_id), ");"); } else { - statement(to_expression(sparse_code_id), " = sparseImageLoadARB(", to_expression(ops[2]), ", ", + statement(to_expression(sparse_code_id), " = sparseImageLoadARB(", to_non_uniform_aware_expression(ops[2]), ", ", coord_expr, ", ", to_expression(sparse_texel_id), ");"); } imgexpr = join(type_to_glsl(get(result_type)), "(", to_expression(sparse_code_id), ", ", @@ -11689,10 +11754,10 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) uint32_t samples = ops[5]; imgexpr = - join("imageLoad(", to_expression(ops[2]), ", ", coord_expr, ", ", to_expression(samples), ")"); + join("imageLoad(", to_non_uniform_aware_expression(ops[2]), ", ", coord_expr, ", ", to_expression(samples), ")"); } else - imgexpr = join("imageLoad(", to_expression(ops[2]), ", ", coord_expr, ")"); + imgexpr = join("imageLoad(", to_non_uniform_aware_expression(ops[2]), ", ", coord_expr, ")"); } if (!sparse) @@ -11733,9 +11798,6 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) coord_expr = bitcast_expression(target_coord_type, expression_type(ops[3]).basetype, coord_expr); auto expr = join(to_expression(ops[2]), ", ", coord_expr); - if (has_decoration(id, DecorationNonUniformEXT) || has_decoration(ops[2], DecorationNonUniformEXT)) - convert_non_uniform_expression(expression_type(ops[2]), expr); - auto &e = set(id, expr, result_type, true); // When using the pointer, we need to know which variable it is actually loaded from. @@ -11778,11 +11840,11 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) if (operands != ImageOperandsSampleMask || length != 5) SPIRV_CROSS_THROW("Multisampled image used in OpImageWrite, but unexpected operand mask was used."); uint32_t samples = ops[4]; - statement("imageStore(", to_expression(ops[0]), ", ", coord_expr, ", ", to_expression(samples), ", ", + statement("imageStore(", to_non_uniform_aware_expression(ops[0]), ", ", coord_expr, ", ", to_expression(samples), ", ", remap_swizzle(store_type, value_type.vecsize, to_expression(ops[2])), ");"); } else - statement("imageStore(", to_expression(ops[0]), ", ", coord_expr, ", ", + statement("imageStore(", to_non_uniform_aware_expression(ops[0]), ", ", coord_expr, ", ", remap_swizzle(store_type, value_type.vecsize, to_expression(ops[2])), ");"); if (var && variable_storage_is_aliased(*var)) @@ -11807,7 +11869,7 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) SPIRV_CROSS_THROW("At least ESSL 3.10 required for imageSize."); // The size of an image is always constant. 
- expr = join("imageSize(", to_expression(ops[2]), ")"); + expr = join("imageSize(", to_non_uniform_aware_expression(ops[2]), ")"); } else { @@ -12332,9 +12394,7 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) flush_control_dependent_expressions(current_emitting_block->self); break; case OpTraceNV: - if (has_decoration(ops[0], DecorationNonUniformEXT)) - propagate_nonuniform_qualifier(ops[0]); - statement("traceNV(", to_expression(ops[0]), ", ", to_expression(ops[1]), ", ", to_expression(ops[2]), ", ", + statement("traceNV(", to_non_uniform_aware_expression(ops[0]), ", ", to_expression(ops[1]), ", ", to_expression(ops[2]), ", ", to_expression(ops[3]), ", ", to_expression(ops[4]), ", ", to_expression(ops[5]), ", ", to_expression(ops[6]), ", ", to_expression(ops[7]), ", ", to_expression(ops[8]), ", ", to_expression(ops[9]), ", ", to_expression(ops[10]), ");"); @@ -12343,9 +12403,7 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) case OpTraceRayKHR: if (!has_decoration(ops[10], DecorationLocation)) SPIRV_CROSS_THROW("A memory declaration object must be used in TraceRayKHR."); - if (has_decoration(ops[0], DecorationNonUniformEXT)) - propagate_nonuniform_qualifier(ops[0]); - statement("traceRayEXT(", to_expression(ops[0]), ", ", to_expression(ops[1]), ", ", to_expression(ops[2]), ", ", + statement("traceRayEXT(", to_non_uniform_aware_expression(ops[0]), ", ", to_expression(ops[1]), ", ", to_expression(ops[2]), ", ", to_expression(ops[3]), ", ", to_expression(ops[4]), ", ", to_expression(ops[5]), ", ", to_expression(ops[6]), ", ", to_expression(ops[7]), ", ", to_expression(ops[8]), ", ", to_expression(ops[9]), ", ", get_decoration(ops[10], DecorationLocation), ");"); @@ -12644,6 +12702,11 @@ string CompilerGLSL::variable_decl(const SPIRType &type, const string &name, uin return join(type_name, " ", name, type_to_array_glsl(type)); } +bool CompilerGLSL::variable_decl_is_remapped_storage(const SPIRVariable &var, StorageClass storage) const +{ + return var.storage == storage; +} + // Emit a structure member. Subclasses may override to modify output, // or to dynamically add a padding member if needed. void CompilerGLSL::emit_struct_member(const SPIRType &type, uint32_t member_type_id, uint32_t index, @@ -12671,7 +12734,7 @@ void CompilerGLSL::emit_struct_padding_target(const SPIRType &) { } -const char *CompilerGLSL::flags_to_qualifiers_glsl(const SPIRType &type, const Bitset &flags) +string CompilerGLSL::flags_to_qualifiers_glsl(const SPIRType &type, const Bitset &flags) { // GL_EXT_buffer_reference variables can be marked as restrict. if (flags.get(DecorationRestrictPointerEXT)) @@ -12683,6 +12746,11 @@ const char *CompilerGLSL::flags_to_qualifiers_glsl(const SPIRType &type, const B type.basetype != SPIRType::Sampler) return ""; + string qual; + + if (flags.get(DecorationNoContraction) && backend.support_precise_qualifier) + qual = "precise "; + if (options.es) { auto &execution = get_entry_point(); @@ -12697,7 +12765,7 @@ const char *CompilerGLSL::flags_to_qualifiers_glsl(const SPIRType &type, const B options.fragment.default_int_precision == Options::Mediump && execution.model == ExecutionModelFragment; - return implied_fmediump || implied_imediump ? "" : "mediump "; + qual += (implied_fmediump || implied_imediump) ? 
"" : "mediump "; } else { @@ -12711,7 +12779,7 @@ const char *CompilerGLSL::flags_to_qualifiers_glsl(const SPIRType &type, const B execution.model == ExecutionModelFragment) || (execution.model != ExecutionModelFragment)); - return implied_fhighp || implied_ihighp ? "" : "highp "; + qual += (implied_fhighp || implied_ihighp) ? "" : "highp "; } } else if (backend.allow_precision_qualifiers) @@ -12719,18 +12787,16 @@ const char *CompilerGLSL::flags_to_qualifiers_glsl(const SPIRType &type, const B // Vulkan GLSL supports precision qualifiers, even in desktop profiles, which is convenient. // The default is highp however, so only emit mediump in the rare case that a shader has these. if (flags.get(DecorationRelaxedPrecision)) - return "mediump "; - else - return ""; + qual += "mediump "; } - else - return ""; + + return qual; } -const char *CompilerGLSL::to_precision_qualifiers_glsl(uint32_t id) +string CompilerGLSL::to_precision_qualifiers_glsl(uint32_t id) { auto &type = expression_type(id); - bool use_precision_qualifiers = backend.allow_precision_qualifiers || options.es; + bool use_precision_qualifiers = backend.allow_precision_qualifiers; if (use_precision_qualifiers && (type.basetype == SPIRType::Image || type.basetype == SPIRType::SampledImage)) { // Force mediump for the sampler type. We cannot declare 16-bit or smaller image types. @@ -12787,10 +12853,24 @@ string CompilerGLSL::to_qualifiers_glsl(uint32_t id) res += "coherent "; if (flags.get(DecorationRestrict)) res += "restrict "; + if (flags.get(DecorationNonWritable)) res += "readonly "; + + bool formatted_load = type.image.format == ImageFormatUnknown; if (flags.get(DecorationNonReadable)) + { res += "writeonly "; + formatted_load = false; + } + + if (formatted_load) + { + if (!options.es) + require_extension_internal("GL_EXT_shader_image_load_formatted"); + else + SPIRV_CROSS_THROW("Cannot use GL_EXT_shader_image_load_formatted in ESSL."); + } } res += to_precision_qualifiers_glsl(id); @@ -12869,7 +12949,7 @@ string CompilerGLSL::variable_decl(const SPIRVariable &variable) else if (options.force_zero_initialized_variables && type_can_zero_initialize(type)) res += join(" = ", to_zero_initialized_expression(get_variable_data_type_id(variable))); } - else if (variable.initializer) + else if (variable.initializer && !variable_decl_is_remapped_storage(variable, StorageClassWorkgroup)) { uint32_t expr = variable.initializer; if (ir.ids[expr].get_type() != TypeUndef) @@ -13559,7 +13639,7 @@ void CompilerGLSL::emit_function(SPIRFunction &func, const Bitset &return_flags) auto &var = get(v); var.deferred_declaration = false; - if (var.storage == StorageClassWorkgroup) + if (variable_decl_is_remapped_storage(var, StorageClassWorkgroup)) { // Special variable type which cannot have initializer, // need to be declared as standalone variables. @@ -14761,7 +14841,7 @@ void CompilerGLSL::emit_block_chain(SPIRBlock &block) // The backend is responsible for setting this up, and redirection the return values as appropriate. 
if (ir.ids[block.return_value].get_type() != TypeUndef) { - emit_array_copy("spvReturnValue", block.return_value, StorageClassFunction, + emit_array_copy("spvReturnValue", 0, block.return_value, StorageClassFunction, get_expression_effective_storage_class(block.return_value)); } @@ -14980,7 +15060,7 @@ uint32_t CompilerGLSL::mask_relevant_memory_semantics(uint32_t semantics) MemorySemanticsCrossWorkgroupMemoryMask | MemorySemanticsSubgroupMemoryMask); } -void CompilerGLSL::emit_array_copy(const string &lhs, uint32_t rhs_id, StorageClass, StorageClass) +void CompilerGLSL::emit_array_copy(const string &lhs, uint32_t, uint32_t rhs_id, StorageClass, StorageClass) { statement(lhs, " = ", to_expression(rhs_id), ";"); } @@ -15120,6 +15200,8 @@ void CompilerGLSL::cast_from_builtin_load(uint32_t source_id, std::string &expr, case BuiltInFragStencilRefEXT: case BuiltInInstanceCustomIndexNV: case BuiltInSampleMask: + case BuiltInPrimitiveShadingRateKHR: + case BuiltInShadingRateKHR: expected_type = SPIRType::Int; break; @@ -15164,6 +15246,8 @@ void CompilerGLSL::cast_to_builtin_store(uint32_t target_id, std::string &expr, case BuiltInViewportIndex: case BuiltInFragStencilRefEXT: case BuiltInSampleMask: + case BuiltInPrimitiveShadingRateKHR: + case BuiltInShadingRateKHR: expected_type = SPIRType::Int; break; @@ -15179,64 +15263,62 @@ void CompilerGLSL::cast_to_builtin_store(uint32_t target_id, std::string &expr, } } -void CompilerGLSL::convert_non_uniform_expression(const SPIRType &type, std::string &expr) +void CompilerGLSL::convert_non_uniform_expression(string &expr, uint32_t ptr_id) { if (*backend.nonuniform_qualifier == '\0') return; - // Handle SPV_EXT_descriptor_indexing. - if (type.basetype == SPIRType::Sampler || type.basetype == SPIRType::SampledImage || - type.basetype == SPIRType::Image || type.basetype == SPIRType::AccelerationStructure) + auto *var = maybe_get_backing_variable(ptr_id); + if (!var) + return; + + if (var->storage != StorageClassUniformConstant && + var->storage != StorageClassStorageBuffer && + var->storage != StorageClassUniform) + return; + + auto &backing_type = get(var->basetype); + if (backing_type.array.empty()) + return; + + // If we get here, we know we're accessing an arrayed resource which + // might require nonuniform qualifier. + + auto start_array_index = expr.find_first_of('['); + + if (start_array_index == string::npos) + return; + + // We've opened a bracket, track expressions until we can close the bracket. + // This must be our resource index. + size_t end_array_index = string::npos; + unsigned bracket_count = 1; + for (size_t index = start_array_index + 1; index < expr.size(); index++) { - // The image/sampler ID must be declared as non-uniform. - // However, it is not legal GLSL to have - // nonuniformEXT(samplers[index]), so we must move the nonuniform qualifier - // to the array indexing, like - // samplers[nonuniformEXT(index)]. - // While the access chain will generally be nonuniformEXT, it's not necessarily so, - // so we might have to fixup the OpLoad-ed expression late. - - auto start_array_index = expr.find_first_of('['); - - if (start_array_index == string::npos) - return; - - // Check for the edge case that a non-arrayed resource was marked to be nonuniform, - // and the bracket we found is actually part of non-resource related data. - if (expr.find_first_of(',') < start_array_index) - return; - - // We've opened a bracket, track expressions until we can close the bracket. - // This must be our image index. 
- size_t end_array_index = string::npos; - unsigned bracket_count = 1; - for (size_t index = start_array_index + 1; index < expr.size(); index++) + if (expr[index] == ']') { - if (expr[index] == ']') + if (--bracket_count == 0) { - if (--bracket_count == 0) - { - end_array_index = index; - break; - } + end_array_index = index; + break; } - else if (expr[index] == '[') - bracket_count++; } - - assert(bracket_count == 0); - - // Doesn't really make sense to declare a non-arrayed image with nonuniformEXT, but there's - // nothing we can do here to express that. - if (start_array_index == string::npos || end_array_index == string::npos || end_array_index < start_array_index) - return; - - start_array_index++; - - expr = join(expr.substr(0, start_array_index), backend.nonuniform_qualifier, "(", - expr.substr(start_array_index, end_array_index - start_array_index), ")", - expr.substr(end_array_index, string::npos)); + else if (expr[index] == '[') + bracket_count++; } + + assert(bracket_count == 0); + + // Doesn't really make sense to declare a non-arrayed image with nonuniformEXT, but there's + // nothing we can do here to express that. + if (start_array_index == string::npos || end_array_index == string::npos || end_array_index < start_array_index) + return; + + start_array_index++; + + expr = join(expr.substr(0, start_array_index), backend.nonuniform_qualifier, "(", + expr.substr(start_array_index, end_array_index - start_array_index), ")", + expr.substr(end_array_index, string::npos)); } void CompilerGLSL::emit_block_hints(const SPIRBlock &) @@ -15342,40 +15424,6 @@ void CompilerGLSL::emit_line_directive(uint32_t file_id, uint32_t line_literal) } } -void CompilerGLSL::propagate_nonuniform_qualifier(uint32_t id) -{ - // SPIR-V might only tag the very last ID with NonUniformEXT, but for codegen, - // we need to know NonUniformEXT a little earlier, when the resource is actually loaded. - // Back-propagate the qualifier based on the expression dependency chain. - - if (!has_decoration(id, DecorationNonUniformEXT)) - { - set_decoration(id, DecorationNonUniformEXT); - force_recompile(); - } - - auto *e = maybe_get(id); - auto *combined = maybe_get(id); - auto *chain = maybe_get(id); - if (e) - { - for (auto &expr : e->expression_dependencies) - propagate_nonuniform_qualifier(expr); - for (auto &expr : e->implied_read_expressions) - propagate_nonuniform_qualifier(expr); - } - else if (combined) - { - propagate_nonuniform_qualifier(combined->image); - propagate_nonuniform_qualifier(combined->sampler); - } - else if (chain) - { - for (auto &expr : chain->implied_read_expressions) - propagate_nonuniform_qualifier(expr); - } -} - void CompilerGLSL::emit_copy_logical_type(uint32_t lhs_id, uint32_t lhs_type_id, uint32_t rhs_id, uint32_t rhs_type_id, SmallVector chain) { @@ -15788,3 +15836,149 @@ void CompilerGLSL::rewrite_load_for_wrapped_row_major(std::string &expr, TypeID expr = join("spvWorkaroundRowMajor(", expr, ")"); } } + +void CompilerGLSL::mask_stage_output_by_location(uint32_t location, uint32_t component) +{ + masked_output_locations.insert({ location, component }); +} + +void CompilerGLSL::mask_stage_output_by_builtin(BuiltIn builtin) +{ + masked_output_builtins.insert(builtin); +} + +bool CompilerGLSL::is_stage_output_variable_masked(const SPIRVariable &var) const +{ + auto &type = get(var.basetype); + bool is_block = has_decoration(type.self, DecorationBlock); + // Blocks by themselves are never masked. Must be masked per-member. 
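// A rough standalone sketch (not the library routine itself) of the index-wrapping rule
// used by convert_non_uniform_expression above: nonuniformEXT(samplers[i]) is not legal
// GLSL, so the qualifier is wrapped around the first top-level array index instead,
// producing samplers[nonuniformEXT(i)].
#include <string>

static std::string wrap_first_index_nonuniform(const std::string &expr)
{
	size_t open = expr.find_first_of('[');
	if (open == std::string::npos)
		return expr;

	unsigned bracket_count = 1;
	size_t close = std::string::npos;
	for (size_t i = open + 1; i < expr.size(); i++)
	{
		if (expr[i] == '[')
			bracket_count++;
		else if (expr[i] == ']' && --bracket_count == 0)
		{
			close = i;
			break;
		}
	}

	if (close == std::string::npos)
		return expr;

	// e.g. "samplers[idx]" -> "samplers[nonuniformEXT(idx)]"
	return expr.substr(0, open + 1) + "nonuniformEXT(" +
	       expr.substr(open + 1, close - open - 1) + ")" + expr.substr(close);
}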
+ if (is_block) + return false; + + bool is_builtin = has_decoration(var.self, DecorationBuiltIn); + + if (is_builtin) + { + return is_stage_output_builtin_masked(BuiltIn(get_decoration(var.self, DecorationBuiltIn))); + } + else + { + if (!has_decoration(var.self, DecorationLocation)) + return false; + + return is_stage_output_location_masked( + get_decoration(var.self, DecorationLocation), + get_decoration(var.self, DecorationComponent)); + } +} + +bool CompilerGLSL::is_stage_output_block_member_masked(const SPIRVariable &var, uint32_t index, bool strip_array) const +{ + auto &type = get(var.basetype); + bool is_block = has_decoration(type.self, DecorationBlock); + if (!is_block) + return false; + + BuiltIn builtin = BuiltInMax; + if (is_member_builtin(type, index, &builtin)) + { + return is_stage_output_builtin_masked(builtin); + } + else + { + uint32_t location = get_declared_member_location(var, index, strip_array); + uint32_t component = get_member_decoration(type.self, index, DecorationComponent); + return is_stage_output_location_masked(location, component); + } +} + +bool CompilerGLSL::is_stage_output_location_masked(uint32_t location, uint32_t component) const +{ + return masked_output_locations.count({ location, component }) != 0; +} + +bool CompilerGLSL::is_stage_output_builtin_masked(spv::BuiltIn builtin) const +{ + return masked_output_builtins.count(builtin) != 0; +} + +uint32_t CompilerGLSL::get_declared_member_location(const SPIRVariable &var, uint32_t mbr_idx, bool strip_array) const +{ + auto &block_type = get(var.basetype); + if (has_member_decoration(block_type.self, mbr_idx, DecorationLocation)) + return get_member_decoration(block_type.self, mbr_idx, DecorationLocation); + else + return get_accumulated_member_location(var, mbr_idx, strip_array); +} + +uint32_t CompilerGLSL::get_accumulated_member_location(const SPIRVariable &var, uint32_t mbr_idx, bool strip_array) const +{ + auto &type = strip_array ? get_variable_element_type(var) : get_variable_data_type(var); + uint32_t location = get_decoration(var.self, DecorationLocation); + + for (uint32_t i = 0; i < mbr_idx; i++) + { + auto &mbr_type = get(type.member_types[i]); + + // Start counting from any place we have a new location decoration. + if (has_member_decoration(type.self, mbr_idx, DecorationLocation)) + location = get_member_decoration(type.self, mbr_idx, DecorationLocation); + + uint32_t location_count = type_to_location_count(mbr_type); + location += location_count; + } + + return location; +} + +StorageClass CompilerGLSL::get_expression_effective_storage_class(uint32_t ptr) +{ + auto *var = maybe_get_backing_variable(ptr); + + // If the expression has been lowered to a temporary, we need to use the Generic storage class. + // We're looking for the effective storage class of a given expression. + // An access chain or forwarded OpLoads from such access chains + // will generally have the storage class of the underlying variable, but if the load was not forwarded + // we have lost any address space qualifiers. + bool forced_temporary = ir.ids[ptr].get_type() == TypeExpression && !get(ptr).access_chain && + (forced_temporaries.count(ptr) != 0 || forwarded_temporaries.count(ptr) == 0); + + if (var && !forced_temporary) + { + if (variable_decl_is_remapped_storage(*var, StorageClassWorkgroup)) + return StorageClassWorkgroup; + if (variable_decl_is_remapped_storage(*var, StorageClassStorageBuffer)) + return StorageClassStorageBuffer; + + // Normalize SSBOs to StorageBuffer here. 
+ if (var->storage == StorageClassUniform && + has_decoration(get(var->basetype).self, DecorationBufferBlock)) + return StorageClassStorageBuffer; + else + return var->storage; + } + else + return expression_type(ptr).storage; +} + +uint32_t CompilerGLSL::type_to_location_count(const SPIRType &type) const +{ + uint32_t count; + if (type.basetype == SPIRType::Struct) + { + uint32_t mbr_count = uint32_t(type.member_types.size()); + count = 0; + for (uint32_t i = 0; i < mbr_count; i++) + count += type_to_location_count(get(type.member_types[i])); + } + else + { + count = type.columns > 1 ? type.columns : 1; + } + + uint32_t dim_count = uint32_t(type.array.size()); + for (uint32_t i = 0; i < dim_count; i++) + count *= to_array_size_literal(type, i); + + return count; +} diff --git a/3rdparty/spirv-cross/spirv_glsl.hpp b/3rdparty/spirv-cross/spirv_glsl.hpp index 0e0303824..a3501ca99 100644 --- a/3rdparty/spirv-cross/spirv_glsl.hpp +++ b/3rdparty/spirv-cross/spirv_glsl.hpp @@ -65,7 +65,8 @@ enum AccessChainFlagBits ACCESS_CHAIN_PTR_CHAIN_BIT = 1 << 2, ACCESS_CHAIN_SKIP_REGISTER_EXPRESSION_READ_BIT = 1 << 3, ACCESS_CHAIN_LITERAL_MSB_FORCE_ID = 1 << 4, - ACCESS_CHAIN_FLATTEN_ALL_MEMBERS_BIT = 1 << 5 + ACCESS_CHAIN_FLATTEN_ALL_MEMBERS_BIT = 1 << 5, + ACCESS_CHAIN_FORCE_COMPOSITE_BIT = 1 << 6 }; typedef uint32_t AccessChainFlags; @@ -250,6 +251,16 @@ public: // - Images which are statically used at least once with Dref opcodes. bool variable_is_depth_or_compare(VariableID id) const; + // If a shader output is active in this stage, but inactive in a subsequent stage, + // this can be signalled here. This can be used to work around certain cross-stage matching problems + // which plagues MSL and HLSL in certain scenarios. + // An output which matches one of these will not be emitted in stage output interfaces, but rather treated as a private + // variable. + // This option is only meaningful for MSL and HLSL, since GLSL matches by location directly. + // Masking builtins only takes effect if the builtin in question is part of the stage output interface. 
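// A short usage sketch of the masking API declared below (compiler construction is
// assumed; CompilerHLSL works the same way). Masking a vertex output that the next
// stage never reads keeps it out of the stage-output struct and demotes it to a
// plain private variable.
spirv_cross::CompilerMSL msl(std::move(spirv_words));
msl.mask_stage_output_by_location(1, 0);                  // location = 1, component = 0
msl.mask_stage_output_by_builtin(spv::BuiltInPointSize);  // e.g. when no point primitives are drawn
std::string msl_source = msl.compile();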
+ void mask_stage_output_by_location(uint32_t location, uint32_t component); + void mask_stage_output_by_builtin(spv::BuiltIn builtin); + protected: struct ShaderSubgroupSupportHelper { @@ -375,6 +386,7 @@ protected: virtual std::string constant_expression_vector(const SPIRConstant &c, uint32_t vector); virtual void emit_fixup(); virtual std::string variable_decl(const SPIRType &type, const std::string &name, uint32_t id = 0); + virtual bool variable_decl_is_remapped_storage(const SPIRVariable &var, spv::StorageClass storage) const; virtual std::string to_func_call_arg(const SPIRFunction::Parameter &arg, uint32_t id); struct TextureFunctionBaseArguments @@ -569,6 +581,7 @@ protected: bool use_array_constructor = false; bool needs_row_major_load_workaround = false; bool support_pointer_to_pointer = false; + bool support_precise_qualifier = false; } backend; void emit_struct(SPIRType &type); @@ -616,6 +629,8 @@ protected: void emit_trinary_func_op(uint32_t result_type, uint32_t result_id, uint32_t op0, uint32_t op1, uint32_t op2, const char *op); void emit_binary_func_op(uint32_t result_type, uint32_t result_id, uint32_t op0, uint32_t op1, const char *op); + void emit_atomic_func_op(uint32_t result_type, uint32_t result_id, uint32_t op0, uint32_t op1, const char *op); + void emit_atomic_func_op(uint32_t result_type, uint32_t result_id, uint32_t op0, uint32_t op1, uint32_t op2, const char *op); void emit_unary_func_op_cast(uint32_t result_type, uint32_t result_id, uint32_t op0, const char *op, SPIRType::BaseType input_type, SPIRType::BaseType expected_result_type); @@ -661,6 +676,9 @@ protected: std::string access_chain_internal(uint32_t base, const uint32_t *indices, uint32_t count, AccessChainFlags flags, AccessChainMeta *meta); + spv::StorageClass get_expression_effective_storage_class(uint32_t ptr); + virtual bool access_chain_needs_stage_io_builtin_translation(uint32_t base); + virtual void prepare_access_chain_for_scalar_access(std::string &expr, const SPIRType &type, spv::StorageClass storage, bool &is_packed); @@ -691,6 +709,7 @@ protected: void emit_uninitialized_temporary(uint32_t type, uint32_t id); SPIRExpression &emit_uninitialized_temporary_expression(uint32_t type, uint32_t id); void append_global_func_args(const SPIRFunction &func, uint32_t index, SmallVector &arglist); + std::string to_non_uniform_aware_expression(uint32_t id); std::string to_expression(uint32_t id, bool register_expression_read = true); std::string to_composite_constructor_expression(uint32_t id, bool uses_buffer_offset); std::string to_rerolled_array_expression(const std::string &expr, const SPIRType &type); @@ -716,17 +735,17 @@ protected: virtual std::string to_qualifiers_glsl(uint32_t id); void fixup_io_block_patch_qualifiers(const SPIRVariable &var); void emit_output_variable_initializer(const SPIRVariable &var); - const char *to_precision_qualifiers_glsl(uint32_t id); + std::string to_precision_qualifiers_glsl(uint32_t id); virtual const char *to_storage_qualifiers_glsl(const SPIRVariable &var); - const char *flags_to_qualifiers_glsl(const SPIRType &type, const Bitset &flags); + std::string flags_to_qualifiers_glsl(const SPIRType &type, const Bitset &flags); const char *format_to_glsl(spv::ImageFormat format); virtual std::string layout_for_member(const SPIRType &type, uint32_t index); virtual std::string to_interpolation_qualifiers(const Bitset &flags); std::string layout_for_variable(const SPIRVariable &variable); std::string to_combined_image_sampler(VariableID image_id, VariableID samp_id); virtual 
bool skip_argument(uint32_t id) const; - virtual void emit_array_copy(const std::string &lhs, uint32_t rhs_id, spv::StorageClass lhs_storage, - spv::StorageClass rhs_storage); + virtual void emit_array_copy(const std::string &lhs, uint32_t lhs_id, uint32_t rhs_id, + spv::StorageClass lhs_storage, spv::StorageClass rhs_storage); virtual void emit_block_hints(const SPIRBlock &block); virtual std::string to_initializer_expression(const SPIRVariable &var); virtual std::string to_zero_initialized_expression(uint32_t type_id); @@ -741,6 +760,7 @@ protected: uint32_t type_to_packed_alignment(const SPIRType &type, const Bitset &flags, BufferPackingStandard packing); uint32_t type_to_packed_array_stride(const SPIRType &type, const Bitset &flags, BufferPackingStandard packing); uint32_t type_to_packed_size(const SPIRType &type, const Bitset &flags, BufferPackingStandard packing); + uint32_t type_to_location_count(const SPIRType &type) const; std::string bitcast_glsl(const SPIRType &result_type, uint32_t arg); virtual std::string bitcast_glsl_op(const SPIRType &result_type, const SPIRType &argument_type); @@ -881,7 +901,7 @@ protected: virtual void cast_from_builtin_load(uint32_t source_id, std::string &expr, const SPIRType &expr_type); void unroll_array_from_complex_load(uint32_t target_id, uint32_t source_id, std::string &expr); bool unroll_array_to_complex_store(uint32_t target_id, uint32_t source_id); - void convert_non_uniform_expression(const SPIRType &type, std::string &expr); + void convert_non_uniform_expression(std::string &expr, uint32_t ptr_id); void handle_store_to_invariant_variable(uint32_t store_id, uint32_t value_id); void disallow_forwarding_in_expression_chain(const SPIRExpression &expr); @@ -900,10 +920,17 @@ protected: void fixup_type_alias(); void reorder_type_alias(); - void propagate_nonuniform_qualifier(uint32_t id); - static const char *vector_swizzle(int vecsize, int index); + bool is_stage_output_location_masked(uint32_t location, uint32_t component) const; + bool is_stage_output_builtin_masked(spv::BuiltIn builtin) const; + bool is_stage_output_variable_masked(const SPIRVariable &var) const; + bool is_stage_output_block_member_masked(const SPIRVariable &var, uint32_t index, bool strip_array) const; + uint32_t get_accumulated_member_location(const SPIRVariable &var, uint32_t mbr_idx, bool strip_array) const; + uint32_t get_declared_member_location(const SPIRVariable &var, uint32_t mbr_idx, bool strip_array) const; + std::unordered_set masked_output_locations; + std::unordered_set masked_output_builtins; + private: void init(); }; diff --git a/3rdparty/spirv-cross/spirv_hlsl.cpp b/3rdparty/spirv-cross/spirv_hlsl.cpp index c0b6d1d9a..9435c4e06 100644 --- a/3rdparty/spirv-cross/spirv_hlsl.cpp +++ b/3rdparty/spirv-cross/spirv_hlsl.cpp @@ -570,7 +570,7 @@ void CompilerHLSL::emit_builtin_outputs_in_struct() switch (builtin) { case BuiltInPosition: - type = "float4"; + type = is_position_invariant() && backend.support_precise_qualifier ? "precise float4" : "float4"; semantic = legacy ? "POSITION" : "SV_Position"; break; @@ -818,8 +818,8 @@ string CompilerHLSL::to_interpolation_qualifiers(const Bitset &flags) res += "patch "; // Seems to be different in actual HLSL. if (flags.get(DecorationSample)) res += "sample "; - if (flags.get(DecorationInvariant)) - res += "invariant "; // Not supported? + if (flags.get(DecorationInvariant) && backend.support_precise_qualifier) + res += "precise "; // Not supported? 
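// Emitted-HLSL illustration of the invariant-to-precise mapping above (output shape
// assumed from the surrounding code): HLSL has no `invariant` keyword, so an invariant
// gl_Position comes out roughly as
//
//   struct SPIRV_Cross_Output
//   {
//       precise float4 gl_Position : SV_Position;
//   };
//
// and only when the target shader model supports the `precise` qualifier.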
return res; } @@ -865,24 +865,10 @@ void CompilerHLSL::emit_io_block(const SPIRVariable &var) begin_scope(); type.member_name_cache.clear(); - uint32_t base_location = get_decoration(var.self, DecorationLocation); - for (uint32_t i = 0; i < uint32_t(type.member_types.size()); i++) { - string semantic; - if (has_member_decoration(type.self, i, DecorationLocation)) - { - uint32_t location = get_member_decoration(type.self, i, DecorationLocation); - semantic = join(" : ", to_semantic(location, execution.model, var.storage)); - } - else - { - // If the block itself has a location, but not its members, use the implicit location. - // There could be a conflict if the block members partially specialize the locations. - // It is unclear how SPIR-V deals with this. Assume this does not happen for now. - uint32_t location = base_location + i; - semantic = join(" : ", to_semantic(location, execution.model, var.storage)); - } + uint32_t location = get_accumulated_member_location(var, i, false); + string semantic = join(" : ", to_semantic(location, execution.model, var.storage)); add_member_name(type, i); @@ -2249,7 +2235,7 @@ void CompilerHLSL::emit_push_constant_block(const SPIRVariable &var) string CompilerHLSL::to_sampler_expression(uint32_t id) { - auto expr = join("_", to_expression(id)); + auto expr = join("_", to_non_uniform_aware_expression(id)); auto index = expr.find_first_of('['); if (index == string::npos) { @@ -2754,14 +2740,17 @@ void CompilerHLSL::emit_texture_op(const Instruction &i, bool sparse) bool proj = false; const uint32_t *opt = nullptr; auto *combined_image = maybe_get(img); - auto img_expr = to_expression(combined_image ? combined_image->image : img); + + if (combined_image && has_decoration(img, DecorationNonUniform)) + { + set_decoration(combined_image->image, DecorationNonUniform); + set_decoration(combined_image->sampler, DecorationNonUniform); + } + + auto img_expr = to_non_uniform_aware_expression(combined_image ? combined_image->image : img); inherited_expressions.push_back(coord); - // Make sure non-uniform decoration is back-propagated to where it needs to be. 
- if (has_decoration(img, DecorationNonUniformEXT)) - propagate_nonuniform_qualifier(img); - switch (op) { case OpImageSampleDrefImplicitLod: @@ -3016,7 +3005,7 @@ void CompilerHLSL::emit_texture_op(const Instruction &i, bool sparse) { string sampler_expr; if (combined_image) - sampler_expr = to_expression(combined_image->sampler); + sampler_expr = to_non_uniform_aware_expression(combined_image->sampler); else sampler_expr = to_sampler_expression(img); expr += sampler_expr; @@ -3812,6 +3801,10 @@ void CompilerHLSL::read_access_chain(string *expr, const string &lhs, const SPIR SPIRV_CROSS_THROW("Reading types other than 32-bit from ByteAddressBuffer not yet supported, unless SM 6.2 and " "native 16-bit types are enabled."); + string base = chain.base; + if (has_decoration(chain.self, DecorationNonUniform)) + convert_non_uniform_expression(base, chain.self); + bool templated_load = hlsl_options.shader_model >= 62; string load_expr; @@ -3844,7 +3837,7 @@ void CompilerHLSL::read_access_chain(string *expr, const string &lhs, const SPIR if (templated_load) load_op = "Load"; - load_expr = join(chain.base, ".", load_op, template_expr, "(", chain.dynamic_index, chain.static_index, ")"); + load_expr = join(base, ".", load_op, template_expr, "(", chain.dynamic_index, chain.static_index, ")"); } else if (type.columns == 1) { @@ -3866,7 +3859,7 @@ void CompilerHLSL::read_access_chain(string *expr, const string &lhs, const SPIR for (uint32_t r = 0; r < type.vecsize; r++) { - load_expr += join(chain.base, ".Load", template_expr, "(", chain.dynamic_index, + load_expr += join(base, ".Load", template_expr, "(", chain.dynamic_index, chain.static_index + r * chain.matrix_stride, ")"); if (r + 1 < type.vecsize) load_expr += ", "; @@ -3915,7 +3908,7 @@ void CompilerHLSL::read_access_chain(string *expr, const string &lhs, const SPIR for (uint32_t c = 0; c < type.columns; c++) { - load_expr += join(chain.base, ".", load_op, template_expr, "(", chain.dynamic_index, + load_expr += join(base, ".", load_op, template_expr, "(", chain.dynamic_index, chain.static_index + c * chain.matrix_stride, ")"); if (c + 1 < type.columns) load_expr += ", "; @@ -3944,7 +3937,7 @@ void CompilerHLSL::read_access_chain(string *expr, const string &lhs, const SPIR { for (uint32_t r = 0; r < type.vecsize; r++) { - load_expr += join(chain.base, ".Load", template_expr, "(", chain.dynamic_index, + load_expr += join(base, ".Load", template_expr, "(", chain.dynamic_index, chain.static_index + c * (type.width / 8) + r * chain.matrix_stride, ")"); if ((r + 1 < type.vecsize) || (c + 1 < type.columns)) @@ -3981,9 +3974,6 @@ void CompilerHLSL::emit_load(const Instruction &instruction) uint32_t id = ops[1]; uint32_t ptr = ops[2]; - if (has_decoration(ptr, DecorationNonUniformEXT)) - propagate_nonuniform_qualifier(ptr); - auto &type = get(result_type); bool composite_load = !type.array.empty() || type.basetype == SPIRType::Struct; @@ -4122,9 +4112,6 @@ void CompilerHLSL::write_access_chain(const SPIRAccessChain &chain, uint32_t val // Make sure we trigger a read of the constituents in the access chain. 
track_expression_read(chain.self); - if (has_decoration(chain.self, DecorationNonUniformEXT)) - propagate_nonuniform_qualifier(chain.self); - SPIRType target_type; target_type.basetype = SPIRType::UInt; target_type.vecsize = type.vecsize; @@ -4148,6 +4135,10 @@ void CompilerHLSL::write_access_chain(const SPIRAccessChain &chain, uint32_t val bool templated_store = hlsl_options.shader_model >= 62; + auto base = chain.base; + if (has_decoration(chain.self, DecorationNonUniform)) + convert_non_uniform_expression(base, chain.self); + string template_expr; if (templated_store) template_expr = join("<", type_to_glsl(type), ">"); @@ -4183,7 +4174,7 @@ void CompilerHLSL::write_access_chain(const SPIRAccessChain &chain, uint32_t val } else store_op = "Store"; - statement(chain.base, ".", store_op, template_expr, "(", chain.dynamic_index, chain.static_index, ", ", + statement(base, ".", store_op, template_expr, "(", chain.dynamic_index, chain.static_index, ", ", store_expr, ");"); } else if (type.columns == 1) @@ -4214,7 +4205,7 @@ void CompilerHLSL::write_access_chain(const SPIRAccessChain &chain, uint32_t val store_expr = join(bitcast_op, "(", store_expr, ")"); } - statement(chain.base, ".Store", template_expr, "(", chain.dynamic_index, + statement(base, ".Store", template_expr, "(", chain.dynamic_index, chain.static_index + chain.matrix_stride * r, ", ", store_expr, ");"); } } @@ -4258,7 +4249,7 @@ void CompilerHLSL::write_access_chain(const SPIRAccessChain &chain, uint32_t val store_expr = join(bitcast_op, "(", store_expr, ")"); } - statement(chain.base, ".", store_op, template_expr, "(", chain.dynamic_index, + statement(base, ".", store_op, template_expr, "(", chain.dynamic_index, chain.static_index + c * chain.matrix_stride, ", ", store_expr, ");"); } } @@ -4282,7 +4273,7 @@ void CompilerHLSL::write_access_chain(const SPIRAccessChain &chain, uint32_t val auto bitcast_op = bitcast_glsl_op(target_type, type); if (!bitcast_op.empty()) store_expr = join(bitcast_op, "(", store_expr, ")"); - statement(chain.base, ".Store", template_expr, "(", chain.dynamic_index, + statement(base, ".Store", template_expr, "(", chain.dynamic_index, chain.static_index + c * (type.width / 8) + r * chain.matrix_stride, ", ", store_expr, ");"); } } @@ -4384,9 +4375,6 @@ void CompilerHLSL::emit_access_chain(const Instruction &instruction) inherit_expression_dependencies(ops[1], ops[i]); add_implied_read_expression(e, ops[i]); } - - if (has_decoration(ops[1], DecorationNonUniformEXT)) - propagate_nonuniform_qualifier(ops[1]); } else { @@ -4486,13 +4474,16 @@ void CompilerHLSL::emit_atomic(const uint32_t *ops, uint32_t length, spv::Op op) if (data_type.storage == StorageClassImage || !chain) { - statement(atomic_op, "(", to_expression(ops[0]), ", ", to_expression(ops[3]), ", ", to_expression(tmp_id), - ");"); + statement(atomic_op, "(", to_non_uniform_aware_expression(ops[0]), ", ", + to_expression(ops[3]), ", ", to_expression(tmp_id), ");"); } else { + string base = chain->base; + if (has_decoration(chain->self, DecorationNonUniform)) + convert_non_uniform_expression(base, chain->self); // RWByteAddress buffer is always uint in its underlying type. 
- statement(chain->base, ".", atomic_op, "(", chain->dynamic_index, chain->static_index, ", ", + statement(base, ".", atomic_op, "(", chain->dynamic_index, chain->static_index, ", ", to_expression(ops[3]), ", ", to_expression(tmp_id), ");"); } } @@ -4510,14 +4501,17 @@ void CompilerHLSL::emit_atomic(const uint32_t *ops, uint32_t length, spv::Op op) SPIRType::BaseType expr_type; if (data_type.storage == StorageClassImage || !chain) { - statement(atomic_op, "(", to_expression(ops[2]), ", ", value_expr, ", ", to_name(id), ");"); + statement(atomic_op, "(", to_non_uniform_aware_expression(ops[2]), ", ", value_expr, ", ", to_name(id), ");"); expr_type = data_type.basetype; } else { // RWByteAddress buffer is always uint in its underlying type. + string base = chain->base; + if (has_decoration(chain->self, DecorationNonUniform)) + convert_non_uniform_expression(base, chain->self); expr_type = SPIRType::UInt; - statement(chain->base, ".", atomic_op, "(", chain->dynamic_index, chain->static_index, ", ", value_expr, + statement(base, ".", atomic_op, "(", chain->dynamic_index, chain->static_index, ", ", value_expr, ", ", to_name(id), ");"); } @@ -4618,13 +4612,35 @@ void CompilerHLSL::emit_subgroup_op(const Instruction &i) } case OpGroupNonUniformShuffle: - SPIRV_CROSS_THROW("Cannot trivially implement Shuffle in HLSL."); + emit_binary_func_op(result_type, id, ops[3], ops[4], "WaveReadLaneAt"); + break; case OpGroupNonUniformShuffleXor: - SPIRV_CROSS_THROW("Cannot trivially implement ShuffleXor in HLSL."); + { + bool forward = should_forward(ops[3]); + emit_op(ops[0], ops[1], + join("WaveReadLaneAt(", to_unpacked_expression(ops[3]), ", ", + "WaveGetLaneIndex() ^ ", to_enclosed_expression(ops[4]), ")"), forward); + inherit_expression_dependencies(ops[1], ops[3]); + break; + } case OpGroupNonUniformShuffleUp: - SPIRV_CROSS_THROW("Cannot trivially implement ShuffleUp in HLSL."); + { + bool forward = should_forward(ops[3]); + emit_op(ops[0], ops[1], + join("WaveReadLaneAt(", to_unpacked_expression(ops[3]), ", ", + "WaveGetLaneIndex() - ", to_enclosed_expression(ops[4]), ")"), forward); + inherit_expression_dependencies(ops[1], ops[3]); + break; + } case OpGroupNonUniformShuffleDown: - SPIRV_CROSS_THROW("Cannot trivially implement ShuffleDown in HLSL."); + { + bool forward = should_forward(ops[3]); + emit_op(ops[0], ops[1], + join("WaveReadLaneAt(", to_unpacked_expression(ops[3]), ", ", + "WaveGetLaneIndex() + ", to_enclosed_expression(ops[4]), ")"), forward); + inherit_expression_dependencies(ops[1], ops[3]); + break; + } case OpGroupNonUniformAll: emit_unary_func_op(result_type, id, ops[3], "WaveActiveAllTrue"); @@ -5150,7 +5166,7 @@ void CompilerHLSL::emit_instruction(const Instruction &instruction) auto dummy_samples_levels = join(get_fallback_name(id), "_dummy_parameter"); statement("uint ", dummy_samples_levels, ";"); - auto expr = join("spvTextureSize(", to_expression(ops[2]), ", ", + auto expr = join("spvTextureSize(", to_non_uniform_aware_expression(ops[2]), ", ", bitcast_expression(SPIRType::UInt, ops[3]), ", ", dummy_samples_levels, ")"); auto &restype = get(ops[0]); @@ -5176,9 +5192,9 @@ void CompilerHLSL::emit_instruction(const Instruction &instruction) string expr; if (uav) - expr = join("spvImageSize(", to_expression(ops[2]), ", ", dummy_samples_levels, ")"); + expr = join("spvImageSize(", to_non_uniform_aware_expression(ops[2]), ", ", dummy_samples_levels, ")"); else - expr = join("spvTextureSize(", to_expression(ops[2]), ", 0u, ", dummy_samples_levels, ")"); + expr = 
join("spvTextureSize(", to_non_uniform_aware_expression(ops[2]), ", 0u, ", dummy_samples_levels, ")"); auto &restype = get(ops[0]); expr = bitcast_expression(restype, SPIRType::UInt, expr); @@ -5208,9 +5224,9 @@ void CompilerHLSL::emit_instruction(const Instruction &instruction) statement(variable_decl(type, to_name(id)), ";"); if (uav) - statement("spvImageSize(", to_expression(ops[2]), ", ", to_name(id), ");"); + statement("spvImageSize(", to_non_uniform_aware_expression(ops[2]), ", ", to_name(id), ");"); else - statement("spvTextureSize(", to_expression(ops[2]), ", 0u, ", to_name(id), ");"); + statement("spvTextureSize(", to_non_uniform_aware_expression(ops[2]), ", 0u, ", to_name(id), ");"); auto &restype = get(ops[0]); auto expr = bitcast_expression(restype, SPIRType::UInt, to_name(id)); @@ -5241,16 +5257,16 @@ void CompilerHLSL::emit_instruction(const Instruction &instruction) if (operands != ImageOperandsSampleMask || instruction.length != 6) SPIRV_CROSS_THROW("Multisampled image used in OpImageRead, but unexpected operand mask was used."); uint32_t sample = ops[5]; - imgexpr = join(to_expression(ops[2]), ".Load(int2(gl_FragCoord.xy), ", to_expression(sample), ")"); + imgexpr = join(to_non_uniform_aware_expression(ops[2]), ".Load(int2(gl_FragCoord.xy), ", to_expression(sample), ")"); } else - imgexpr = join(to_expression(ops[2]), ".Load(int3(int2(gl_FragCoord.xy), 0))"); + imgexpr = join(to_non_uniform_aware_expression(ops[2]), ".Load(int3(int2(gl_FragCoord.xy), 0))"); pure = true; } else { - imgexpr = join(to_expression(ops[2]), "[", to_expression(ops[3]), "]"); + imgexpr = join(to_non_uniform_aware_expression(ops[2]), "[", to_expression(ops[3]), "]"); // The underlying image type in HLSL depends on the image format, unlike GLSL, where all images are "vec4", // except that the underlying type changes how the data is interpreted. @@ -5299,7 +5315,7 @@ void CompilerHLSL::emit_instruction(const Instruction &instruction) value_expr = remap_swizzle(narrowed_type, expression_type(ops[2]).vecsize, value_expr); } - statement(to_expression(ops[0]), "[", to_expression(ops[1]), "] = ", value_expr, ";"); + statement(to_non_uniform_aware_expression(ops[0]), "[", to_expression(ops[1]), "] = ", value_expr, ";"); if (var && variable_storage_is_aliased(*var)) flush_all_aliased_variables(); break; @@ -5311,10 +5327,7 @@ void CompilerHLSL::emit_instruction(const Instruction &instruction) uint32_t id = ops[1]; auto expr = to_expression(ops[2]); - if (has_decoration(id, DecorationNonUniformEXT) || has_decoration(ops[2], DecorationNonUniformEXT)) - convert_non_uniform_expression(expression_type(ops[2]), expr); expr += join("[", to_expression(ops[3]), "]"); - auto &e = set(id, expr, result_type, true); // When using the pointer, we need to know which variable it is actually loaded from. @@ -5492,7 +5505,7 @@ void CompilerHLSL::emit_instruction(const Instruction &instruction) case OpArrayLength: { - auto *var = maybe_get(ops[2]); + auto *var = maybe_get_backing_variable(ops[2]); if (!var) SPIRV_CROSS_THROW("Array length must point directly to an SSBO block."); @@ -5502,7 +5515,7 @@ void CompilerHLSL::emit_instruction(const Instruction &instruction) // This must be 32-bit uint, so we're good to go. 
emit_uninitialized_temporary_expression(ops[0], ops[1]); - statement(to_expression(ops[2]), ".GetDimensions(", to_expression(ops[1]), ");"); + statement(to_non_uniform_aware_expression(ops[2]), ".GetDimensions(", to_expression(ops[1]), ");"); uint32_t offset = type_struct_member_offset(type, ops[3]); uint32_t stride = type_struct_member_array_stride(type, ops[3]); statement(to_expression(ops[1]), " = (", to_expression(ops[1]), " - ", offset, ") / ", stride, ";"); @@ -5718,6 +5731,9 @@ string CompilerHLSL::compile() backend.nonuniform_qualifier = "NonUniformResourceIndex"; backend.support_case_fallthrough = false; + // SM 4.1 does not support precise for some reason. + backend.support_precise_qualifier = hlsl_options.shader_model >= 50 || hlsl_options.shader_model == 40; + fixup_type_alias(); reorder_type_alias(); build_function_control_flow_graphs_and_analyze(); diff --git a/3rdparty/spirv-cross/spirv_msl.cpp b/3rdparty/spirv-cross/spirv_msl.cpp index b428afdf6..5570a211e 100644 --- a/3rdparty/spirv-cross/spirv_msl.cpp +++ b/3rdparty/spirv-cross/spirv_msl.cpp @@ -67,6 +67,52 @@ void CompilerMSL::add_msl_resource_binding(const MSLResourceBinding &binding) { StageSetBinding tuple = { binding.stage, binding.desc_set, binding.binding }; resource_bindings[tuple] = { binding, false }; + + // If we might need to pad argument buffer members to positionally align + // arg buffer indexes, also maintain a lookup by argument buffer index. + if (msl_options.pad_argument_buffer_resources) + { + StageSetBinding arg_idx_tuple = { binding.stage, binding.desc_set, k_unknown_component }; + +#define ADD_ARG_IDX_TO_BINDING_NUM_LOOKUP(rez) \ + arg_idx_tuple.binding = binding.msl_##rez; \ + resource_arg_buff_idx_to_binding_number[arg_idx_tuple] = binding.binding + + switch (binding.basetype) + { + case SPIRType::Void: + case SPIRType::Boolean: + case SPIRType::SByte: + case SPIRType::UByte: + case SPIRType::Short: + case SPIRType::UShort: + case SPIRType::Int: + case SPIRType::UInt: + case SPIRType::Int64: + case SPIRType::UInt64: + case SPIRType::AtomicCounter: + case SPIRType::Half: + case SPIRType::Float: + case SPIRType::Double: + ADD_ARG_IDX_TO_BINDING_NUM_LOOKUP(buffer); + break; + case SPIRType::Image: + ADD_ARG_IDX_TO_BINDING_NUM_LOOKUP(texture); + break; + case SPIRType::Sampler: + ADD_ARG_IDX_TO_BINDING_NUM_LOOKUP(sampler); + break; + case SPIRType::SampledImage: + ADD_ARG_IDX_TO_BINDING_NUM_LOOKUP(texture); + ADD_ARG_IDX_TO_BINDING_NUM_LOOKUP(sampler); + break; + default: + SPIRV_CROSS_THROW("Unexpected argument buffer resource base type. When padding argument buffer elements, " + "all descriptor set resources must be supplied with a base type by the app."); + break; + } +#undef ADD_ARG_IDX_TO_BINDING_NUM_LOOKUP + } } void CompilerMSL::add_dynamic_buffer(uint32_t desc_set, uint32_t binding, uint32_t index) @@ -100,7 +146,9 @@ void CompilerMSL::set_argument_buffer_device_address_space(uint32_t desc_set, bo bool CompilerMSL::is_msl_shader_input_used(uint32_t location) { - return location_inputs_in_use.count(location) != 0; + // Don't report internal location allocations to app. 
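// A rough sketch of supplying a base type with a binding, which the argument-buffer
// padding logic above relies on when msl_options.pad_argument_buffer_resources is
// enabled. A CompilerMSL instance `msl` and the binding values are assumed.
spirv_cross::MSLResourceBinding rb = {};
rb.stage = spv::ExecutionModelFragment;
rb.desc_set = 0;
rb.binding = 3;
rb.basetype = spirv_cross::SPIRType::SampledImage; // lets the index lookup pick texture vs. sampler vs. buffer
rb.msl_texture = 3;
rb.msl_sampler = 3;
msl.add_msl_resource_binding(rb);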
+ return location_inputs_in_use.count(location) != 0 && + location_inputs_in_use_fallback.count(location) == 0; } uint32_t CompilerMSL::get_automatic_builtin_input_location(spv::BuiltIn builtin) const @@ -1685,8 +1733,11 @@ void CompilerMSL::extract_global_variables_from_function(uint32_t func_id, std:: // Add the global variables as arguments to the function if (func_id != ir.default_entry_point) { - bool added_in = false; - bool added_out = false; + bool control_point_added_in = false; + bool control_point_added_out = false; + bool patch_added_in = false; + bool patch_added_out = false; + for (uint32_t arg_id : added_arg_ids) { auto &var = get(arg_id); @@ -1694,34 +1745,72 @@ void CompilerMSL::extract_global_variables_from_function(uint32_t func_id, std:: auto *p_type = &get(type_id); BuiltIn bi_type = BuiltIn(get_decoration(arg_id, DecorationBuiltIn)); - if (((is_tessellation_shader() && var.storage == StorageClassInput) || - (get_execution_model() == ExecutionModelTessellationControl && var.storage == StorageClassOutput)) && - !(has_decoration(arg_id, DecorationPatch) || is_patch_block(*p_type)) && - (!is_builtin_variable(var) || bi_type == BuiltInPosition || bi_type == BuiltInPointSize || - bi_type == BuiltInClipDistance || bi_type == BuiltInCullDistance || - p_type->basetype == SPIRType::Struct)) + bool is_patch = has_decoration(arg_id, DecorationPatch) || is_patch_block(*p_type); + bool is_block = has_decoration(p_type->self, DecorationBlock); + bool is_control_point_storage = + !is_patch && + ((is_tessellation_shader() && var.storage == StorageClassInput) || + (get_execution_model() == ExecutionModelTessellationControl && var.storage == StorageClassOutput)); + bool is_patch_block_storage = is_patch && is_block && var.storage == StorageClassOutput; + bool is_builtin = is_builtin_variable(var); + bool variable_is_stage_io = + !is_builtin || bi_type == BuiltInPosition || bi_type == BuiltInPointSize || + bi_type == BuiltInClipDistance || bi_type == BuiltInCullDistance || + p_type->basetype == SPIRType::Struct; + bool is_redirected_to_global_stage_io = (is_control_point_storage || is_patch_block_storage) && + variable_is_stage_io; + + // If output is masked it is not considered part of the global stage IO interface. + if (is_redirected_to_global_stage_io && var.storage == StorageClassOutput) + is_redirected_to_global_stage_io = !is_stage_output_variable_masked(var); + + if (is_redirected_to_global_stage_io) { // Tessellation control shaders see inputs and per-vertex outputs as arrays. // Similarly, tessellation evaluation shaders see per-vertex inputs as arrays. // We collected them into a structure; we must pass the array of this // structure to the function. std::string name; + if (is_patch) + name = var.storage == StorageClassInput ? patch_stage_in_var_name : patch_stage_out_var_name; + else + name = var.storage == StorageClassInput ? "gl_in" : "gl_out"; + + if (var.storage == StorageClassOutput && has_decoration(p_type->self, DecorationBlock)) + { + // If we're redirecting a block, we might still need to access the original block + // variable if we're masking some members. + for (uint32_t mbr_idx = 0; mbr_idx < uint32_t(p_type->member_types.size()); mbr_idx++) + { + if (is_stage_output_block_member_masked(var, mbr_idx, true)) + { + func.add_parameter(var.basetype, var.self, true); + break; + } + } + } + + // Tessellation control shaders see inputs and per-vertex outputs as arrays. + // Similarly, tessellation evaluation shaders see per-vertex inputs as arrays. 
+ // We collected them into a structure; we must pass the array of this + // structure to the function. if (var.storage == StorageClassInput) { + auto &added_in = is_patch ? patch_added_in : control_point_added_in; if (added_in) continue; - name = "gl_in"; - arg_id = stage_in_ptr_var_id; + arg_id = is_patch ? patch_stage_in_var_id : stage_in_ptr_var_id; added_in = true; } else if (var.storage == StorageClassOutput) { + auto &added_out = is_patch ? patch_added_out : control_point_added_out; if (added_out) continue; - name = "gl_out"; - arg_id = stage_out_ptr_var_id; + arg_id = is_patch ? patch_stage_out_var_id : stage_out_ptr_var_id; added_out = true; } + type_id = get(arg_id).basetype; uint32_t next_id = ir.increase_bound_by(1); func.add_parameter(type_id, next_id, true); @@ -1729,7 +1818,7 @@ void CompilerMSL::extract_global_variables_from_function(uint32_t func_id, std:: set_name(next_id, name); } - else if (is_builtin_variable(var) && p_type->basetype == SPIRType::Struct) + else if (is_builtin && has_decoration(p_type->self, DecorationBlock)) { // Get the pointee type type_id = get_pointee_type_id(type_id); @@ -1739,7 +1828,7 @@ void CompilerMSL::extract_global_variables_from_function(uint32_t func_id, std:: for (auto &mbr_type_id : p_type->member_types) { BuiltIn builtin = BuiltInMax; - bool is_builtin = is_member_builtin(*p_type, mbr_idx, &builtin); + is_builtin = is_member_builtin(*p_type, mbr_idx, &builtin); if (is_builtin && has_active_builtin(builtin, var.storage)) { // Add a arg variable with the same type and decorations as the member @@ -1825,28 +1914,20 @@ void CompilerMSL::mark_as_packable(SPIRType &type) } } -uint32_t CompilerMSL::type_to_location_count(const SPIRType &type) const -{ - // In MSL, we cannot place structs in any context where we need locations. 
- assert(type.basetype != SPIRType::Struct); - - uint32_t dim = 1; - for (uint32_t i = 0; i < type.array.size(); i++) - dim *= to_array_size_literal(type, i); - - uint32_t count = dim * type.columns; - return count; -} - // If a shader input exists at the location, it is marked as being used by this shader -void CompilerMSL::mark_location_as_used_by_shader(uint32_t location, const SPIRType &type, StorageClass storage) +void CompilerMSL::mark_location_as_used_by_shader(uint32_t location, const SPIRType &type, + StorageClass storage, bool fallback) { if (storage != StorageClassInput) return; uint32_t count = type_to_location_count(type); for (uint32_t i = 0; i < count; i++) + { location_inputs_in_use.insert(location + i); + if (fallback) + location_inputs_in_use_fallback.insert(location + i); + } } uint32_t CompilerMSL::get_target_components_for_fragment_location(uint32_t location) const @@ -2066,10 +2147,11 @@ void CompilerMSL::add_plain_variable_to_interface_block(StorageClass storage, co { entry_func.fixup_hooks_in.push_back([=, &var]() { uint32_t index = get_extended_decoration(var.self, SPIRVCrossDecorationInterfaceMemberIndex); + auto invocation = to_tesc_invocation_id(); statement(to_expression(stage_out_ptr_var_id), "[", - builtin_to_glsl(BuiltInInvocationId, StorageClassInput), "].", + invocation, "].", to_member_name(ib_type, index), " = ", to_expression(var.initializer), "[", - builtin_to_glsl(BuiltInInvocationId, StorageClassInput), "];"); + invocation, "];"); }); } else @@ -2087,7 +2169,9 @@ void CompilerMSL::add_plain_variable_to_interface_block(StorageClass storage, co uint32_t locn = get_decoration(var.self, DecorationLocation); if (storage == StorageClassInput) { - type_id = ensure_correct_input_type(var.basetype, locn, location_meta ? location_meta->num_components : 0); + type_id = ensure_correct_input_type(var.basetype, locn, + location_meta ? location_meta->num_components : 0, + meta.strip_array); if (!location_meta) var.basetype = type_id; @@ -2257,8 +2341,8 @@ void CompilerMSL::add_composite_variable_to_interface_block(StorageClass storage uint32_t locn = get_decoration(var.self, DecorationLocation) + i; if (storage == StorageClassInput) { - var.basetype = ensure_correct_input_type(var.basetype, locn); - uint32_t mbr_type_id = ensure_correct_input_type(usable_type->self, locn); + var.basetype = ensure_correct_input_type(var.basetype, locn, 0, meta.strip_array); + uint32_t mbr_type_id = ensure_correct_input_type(usable_type->self, locn, 0, meta.strip_array); if (storage == StorageClassInput && pull_model_inputs.count(var.self)) ib_type.member_types[ib_mbr_idx] = build_msl_interpolant_type(mbr_type_id, is_noperspective); else @@ -2273,11 +2357,11 @@ void CompilerMSL::add_composite_variable_to_interface_block(StorageClass storage set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, locn); mark_location_as_used_by_shader(locn, *usable_type, storage); } - else if (is_builtin && builtin == BuiltInClipDistance) + else if (is_builtin && (builtin == BuiltInClipDistance || builtin == BuiltInCullDistance)) { - // Declare the ClipDistance as [[user(clipN)]]. - set_member_decoration(ib_type.self, ib_mbr_idx, DecorationBuiltIn, BuiltInClipDistance); - set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, i); + // Declare the Clip/CullDistance as [[user(clip/cullN)]]. 
+ set_member_decoration(ib_type.self, ib_mbr_idx, DecorationBuiltIn, builtin); + set_member_decoration(ib_type.self, ib_mbr_idx, DecorationIndex, i); } if (get_decoration_bitset(var.self).get(DecorationIndex)) @@ -2351,34 +2435,6 @@ void CompilerMSL::add_composite_variable_to_interface_block(StorageClass storage } } -uint32_t CompilerMSL::get_accumulated_member_location(const SPIRVariable &var, uint32_t mbr_idx, bool strip_array) -{ - auto &type = strip_array ? get_variable_element_type(var) : get_variable_data_type(var); - uint32_t location = get_decoration(var.self, DecorationLocation); - - for (uint32_t i = 0; i < mbr_idx; i++) - { - auto &mbr_type = get(type.member_types[i]); - - // Start counting from any place we have a new location decoration. - if (has_member_decoration(type.self, mbr_idx, DecorationLocation)) - location = get_member_decoration(type.self, mbr_idx, DecorationLocation); - - uint32_t location_count = 1; - - if (mbr_type.columns > 1) - location_count = mbr_type.columns; - - if (!mbr_type.array.empty()) - for (uint32_t j = 0; j < uint32_t(mbr_type.array.size()); j++) - location_count *= to_array_size_literal(mbr_type, j); - - location += location_count; - } - - return location; -} - void CompilerMSL::add_composite_member_variable_to_interface_block(StorageClass storage, const string &ib_var_ref, SPIRType &ib_type, SPIRVariable &var, uint32_t mbr_idx, InterfaceBlockMeta &meta) @@ -2474,11 +2530,11 @@ void CompilerMSL::add_composite_member_variable_to_interface_block(StorageClass set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, locn); mark_location_as_used_by_shader(locn, *usable_type, storage); } - else if (is_builtin && builtin == BuiltInClipDistance) + else if (is_builtin && (builtin == BuiltInClipDistance || builtin == BuiltInCullDistance)) { - // Declare the ClipDistance as [[user(clipN)]]. - set_member_decoration(ib_type.self, ib_mbr_idx, DecorationBuiltIn, BuiltInClipDistance); - set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, i); + // Declare the Clip/CullDistance as [[user(clip/cullN)]]. + set_member_decoration(ib_type.self, ib_mbr_idx, DecorationBuiltIn, builtin); + set_member_decoration(ib_type.self, ib_mbr_idx, DecorationIndex, i); } if (has_member_decoration(var_type.self, mbr_idx, DecorationComponent)) @@ -2501,7 +2557,7 @@ void CompilerMSL::add_composite_member_variable_to_interface_block(StorageClass set_extended_member_decoration(ib_type.self, ib_mbr_idx, SPIRVCrossDecorationInterfaceMemberIndex, mbr_idx); // Unflatten or flatten from [[stage_in]] or [[stage_out]] as appropriate. - if (!meta.strip_array) + if (!meta.strip_array && meta.allow_local_declaration) { switch (storage) { @@ -2602,7 +2658,7 @@ void CompilerMSL::add_plain_member_variable_to_interface_block(StorageClass stor // so redirect to qualified name. set_member_qualified_name(var_type.self, mbr_idx, qual_var_name); } - else if (!meta.strip_array) + else if (!meta.strip_array && meta.allow_local_declaration) { // Unflatten or flatten from [[stage_in]] or [[stage_out]] as appropriate. 
switch (storage) @@ -2631,7 +2687,7 @@ void CompilerMSL::add_plain_member_variable_to_interface_block(StorageClass stor uint32_t locn = get_member_decoration(var_type.self, mbr_idx, DecorationLocation); if (storage == StorageClassInput) { - mbr_type_id = ensure_correct_input_type(mbr_type_id, locn); + mbr_type_id = ensure_correct_input_type(mbr_type_id, locn, 0, meta.strip_array); var_type.member_types[mbr_idx] = mbr_type_id; if (storage == StorageClassInput && pull_model_inputs.count(var.self)) ib_type.member_types[ib_mbr_idx] = build_msl_interpolant_type(mbr_type_id, is_noperspective); @@ -2648,7 +2704,7 @@ void CompilerMSL::add_plain_member_variable_to_interface_block(StorageClass stor uint32_t locn = get_accumulated_member_location(var, mbr_idx, meta.strip_array); if (storage == StorageClassInput) { - mbr_type_id = ensure_correct_input_type(mbr_type_id, locn); + mbr_type_id = ensure_correct_input_type(mbr_type_id, locn, 0, meta.strip_array); var_type.member_types[mbr_idx] = mbr_type_id; if (storage == StorageClassInput && pull_model_inputs.count(var.self)) ib_type.member_types[ib_mbr_idx] = build_msl_interpolant_type(mbr_type_id, is_noperspective); @@ -2691,14 +2747,12 @@ void CompilerMSL::add_plain_member_variable_to_interface_block(StorageClass stor { entry_func.fixup_hooks_in.push_back([=, &var]() { auto &type = this->get(var.basetype); - uint32_t index = get_extended_decoration(var.self, SPIRVCrossDecorationInterfaceMemberIndex); - index += mbr_idx; - - AccessChainMeta chain_meta; - auto constant_chain = access_chain_internal(var.initializer, &builtin_invocation_id_id, 1, 0, &chain_meta); + uint32_t index = get_extended_member_decoration(var.self, mbr_idx, SPIRVCrossDecorationInterfaceMemberIndex); + auto invocation = to_tesc_invocation_id(); + auto constant_chain = join(to_expression(var.initializer), "[", invocation, "]"); statement(to_expression(stage_out_ptr_var_id), "[", - builtin_to_glsl(BuiltInInvocationId, StorageClassInput), "].", + invocation, "].", to_member_name(ib_type, index), " = ", constant_chain, ".", to_member_name(type, mbr_idx), ";"); }); @@ -2743,100 +2797,19 @@ void CompilerMSL::add_tess_level_input_to_interface_block(const std::string &ib_ BuiltIn builtin = BuiltIn(get_decoration(var.self, DecorationBuiltIn)); // Force the variable to have the proper name. - set_name(var.self, builtin_to_glsl(builtin, StorageClassFunction)); + string var_name = builtin_to_glsl(builtin, StorageClassFunction); + set_name(var.self, var_name); - if (get_execution_mode_bitset().get(ExecutionModeTriangles)) - { - // Triangles are tricky, because we want only one member in the struct. + // We need to declare the variable early and at entry-point scope. + entry_func.add_local_variable(var.self); + vars_needing_early_declaration.push_back(var.self); + bool triangles = get_execution_mode_bitset().get(ExecutionModeTriangles); + string mbr_name; - // We need to declare the variable early and at entry-point scope. - entry_func.add_local_variable(var.self); - vars_needing_early_declaration.push_back(var.self); - - string mbr_name = "gl_TessLevel"; - - // If we already added the other one, we can skip this step. - if (!added_builtin_tess_level) - { - // Add a reference to the variable type to the interface struct. 
- uint32_t ib_mbr_idx = uint32_t(ib_type.member_types.size()); - - uint32_t type_id = build_extended_vector_type(var_type.self, 4); - - ib_type.member_types.push_back(type_id); - - // Give the member a name - set_member_name(ib_type.self, ib_mbr_idx, mbr_name); - - // We cannot decorate both, but the important part is that - // it's marked as builtin so we can get automatic attribute assignment if needed. - set_member_decoration(ib_type.self, ib_mbr_idx, DecorationBuiltIn, builtin); - - // There is no qualified alias since we need to flatten the internal array on return. - if (get_decoration_bitset(var.self).get(DecorationLocation)) - { - uint32_t locn = get_decoration(var.self, DecorationLocation); - set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, locn); - mark_location_as_used_by_shader(locn, var_type, StorageClassInput); - } - else if (inputs_by_builtin.count(builtin)) - { - uint32_t locn = inputs_by_builtin[builtin].location; - set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, locn); - mark_location_as_used_by_shader(locn, var_type, StorageClassInput); - } - - added_builtin_tess_level = true; - } - - switch (builtin) - { - case BuiltInTessLevelOuter: - entry_func.fixup_hooks_in.push_back([=, &var]() { - statement(to_name(var.self), "[0] = ", ib_var_ref, ".", mbr_name, ".x;"); - statement(to_name(var.self), "[1] = ", ib_var_ref, ".", mbr_name, ".y;"); - statement(to_name(var.self), "[2] = ", ib_var_ref, ".", mbr_name, ".z;"); - }); - break; - - case BuiltInTessLevelInner: - entry_func.fixup_hooks_in.push_back( - [=, &var]() { statement(to_name(var.self), "[0] = ", ib_var_ref, ".", mbr_name, ".w;"); }); - break; - - default: - assert(false); - break; - } - } - else - { - // Add a reference to the variable type to the interface struct. - uint32_t ib_mbr_idx = uint32_t(ib_type.member_types.size()); - - uint32_t type_id = build_extended_vector_type(var_type.self, builtin == BuiltInTessLevelOuter ? 4 : 2); - // Change the type of the variable, too. - uint32_t ptr_type_id = ir.increase_bound_by(1); - auto &new_var_type = set(ptr_type_id, get(type_id)); - new_var_type.pointer = true; - new_var_type.pointer_depth++; - new_var_type.storage = StorageClassInput; - new_var_type.parent_type = type_id; - var.basetype = ptr_type_id; - - ib_type.member_types.push_back(type_id); - - // Give the member a name - string mbr_name = to_expression(var.self); - set_member_name(ib_type.self, ib_mbr_idx, mbr_name); - - // Since vectors can be indexed like arrays, there is no need to unpack this. We can - // just refer to the vector directly. So give it a qualified alias. - string qual_var_name = ib_var_ref + "." + mbr_name; - ir.meta[var.self].decoration.qualified_alias = qual_var_name; - - set_member_decoration(ib_type.self, ib_mbr_idx, DecorationBuiltIn, builtin); + // Add a reference to the variable type to the interface struct. + uint32_t ib_mbr_idx = uint32_t(ib_type.member_types.size()); + const auto mark_locations = [&](const SPIRType &new_var_type) { if (get_decoration_bitset(var.self).get(DecorationLocation)) { uint32_t locn = get_decoration(var.self, DecorationLocation); @@ -2849,6 +2822,170 @@ void CompilerMSL::add_tess_level_input_to_interface_block(const std::string &ib_ set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, locn); mark_location_as_used_by_shader(locn, new_var_type, StorageClassInput); } + }; + + if (triangles) + { + // Triangles are tricky, because we want only one member in the struct. 
+ mbr_name = "gl_TessLevel"; + + // If we already added the other one, we can skip this step. + if (!added_builtin_tess_level) + { + uint32_t type_id = build_extended_vector_type(var_type.self, 4); + + ib_type.member_types.push_back(type_id); + + // Give the member a name + set_member_name(ib_type.self, ib_mbr_idx, mbr_name); + + // We cannot decorate both, but the important part is that + // it's marked as builtin so we can get automatic attribute assignment if needed. + set_member_decoration(ib_type.self, ib_mbr_idx, DecorationBuiltIn, builtin); + + mark_locations(var_type); + added_builtin_tess_level = true; + } + } + else + { + mbr_name = var_name; + + uint32_t type_id = build_extended_vector_type(var_type.self, builtin == BuiltInTessLevelOuter ? 4 : 2); + + uint32_t ptr_type_id = ir.increase_bound_by(1); + auto &new_var_type = set(ptr_type_id, get(type_id)); + new_var_type.pointer = true; + new_var_type.pointer_depth++; + new_var_type.storage = StorageClassInput; + new_var_type.parent_type = type_id; + + ib_type.member_types.push_back(type_id); + + // Give the member a name + set_member_name(ib_type.self, ib_mbr_idx, mbr_name); + set_member_decoration(ib_type.self, ib_mbr_idx, DecorationBuiltIn, builtin); + + mark_locations(new_var_type); + } + + if (builtin == BuiltInTessLevelOuter) + { + entry_func.fixup_hooks_in.push_back([=]() { + statement(var_name, "[0] = ", ib_var_ref, ".", mbr_name, ".x;"); + statement(var_name, "[1] = ", ib_var_ref, ".", mbr_name, ".y;"); + statement(var_name, "[2] = ", ib_var_ref, ".", mbr_name, ".z;"); + if (!triangles) + statement(var_name, "[3] = ", ib_var_ref, ".", mbr_name, ".w;"); + }); + } + else + { + entry_func.fixup_hooks_in.push_back([=]() { + if (triangles) + { + statement(var_name, "[0] = ", ib_var_ref, ".", mbr_name, ".w;"); + } + else + { + statement(var_name, "[0] = ", ib_var_ref, ".", mbr_name, ".x;"); + statement(var_name, "[1] = ", ib_var_ref, ".", mbr_name, ".y;"); + } + }); + } +} + +bool CompilerMSL::variable_storage_requires_stage_io(spv::StorageClass storage) const +{ + if (storage == StorageClassOutput) + return !capture_output_to_buffer; + else if (storage == StorageClassInput) + return !(get_execution_model() == ExecutionModelTessellationControl && msl_options.multi_patch_workgroup); + else + return false; +} + +string CompilerMSL::to_tesc_invocation_id() +{ + if (msl_options.multi_patch_workgroup) + { + // n.b. builtin_invocation_id_id here is the dispatch global invocation ID, + // not the TC invocation ID. + return join(to_expression(builtin_invocation_id_id), ".x % ", get_entry_point().output_vertices); + } + else + return builtin_to_glsl(BuiltInInvocationId, StorageClassInput); +} + +void CompilerMSL::emit_local_masked_variable(const SPIRVariable &masked_var, bool strip_array) +{ + auto &entry_func = get(ir.default_entry_point); + bool threadgroup_storage = variable_decl_is_remapped_storage(masked_var, StorageClassWorkgroup); + + if (threadgroup_storage && msl_options.multi_patch_workgroup) + { + // We need one threadgroup block per patch, so fake this. 
+ entry_func.fixup_hooks_in.push_back([this, &masked_var]() { + auto &type = get_variable_data_type(masked_var); + add_local_variable_name(masked_var.self); + + bool old_is_builtin = is_using_builtin_array; + is_using_builtin_array = true; + + const uint32_t max_control_points_per_patch = 32u; + uint32_t max_num_instances = + (max_control_points_per_patch + get_entry_point().output_vertices - 1u) / + get_entry_point().output_vertices; + statement("threadgroup ", type_to_glsl(type), " ", + "spvStorage", to_name(masked_var.self), "[", max_num_instances, "]", + type_to_array_glsl(type), ";"); + + // Assign a threadgroup slice to each PrimitiveID. + // We assume here that workgroup size is rounded to 32, + // since that's the maximum number of control points per patch. + // We cannot size the array based on fixed dispatch parameters, + // since Metal does not allow that. :( + // FIXME: We will likely need an option to support passing down target workgroup size, + // so we can emit appropriate size here. + statement("threadgroup ", type_to_glsl(type), " ", + "(&", to_name(masked_var.self), ")", + type_to_array_glsl(type), " = spvStorage", to_name(masked_var.self), "[", + "(", to_expression(builtin_invocation_id_id), ".x / ", + get_entry_point().output_vertices, ") % ", + max_num_instances, "];"); + + is_using_builtin_array = old_is_builtin; + }); + } + else + { + entry_func.add_local_variable(masked_var.self); + } + + if (!threadgroup_storage) + { + vars_needing_early_declaration.push_back(masked_var.self); + } + else if (masked_var.initializer) + { + // Cannot directly initialize threadgroup variables. Need fixup hooks. + ID initializer = masked_var.initializer; + if (strip_array) + { + entry_func.fixup_hooks_in.push_back([this, &masked_var, initializer]() { + auto invocation = to_tesc_invocation_id(); + statement(to_expression(masked_var.self), "[", + invocation, "] = ", + to_expression(initializer), "[", + invocation, "];"); + }); + } + else + { + entry_func.fixup_hooks_in.push_back([this, &masked_var, initializer]() { + statement(to_expression(masked_var.self), " = ", to_expression(initializer), ";"); + }); + } } } @@ -2862,11 +2999,25 @@ void CompilerMSL::add_variable_to_interface_block(StorageClass storage, const st auto &var_type = meta.strip_array ? get_variable_element_type(var) : get_variable_data_type(var); bool is_builtin = is_builtin_variable(var); auto builtin = BuiltIn(get_decoration(var.self, DecorationBuiltIn)); + bool is_block = has_decoration(var_type.self, DecorationBlock); + + // If stage variables are masked out, emit them as plain variables instead. + // For builtins, we query them one by one later. + // IO blocks are not masked here, we need to mask them per-member instead. + if (storage == StorageClassOutput && is_stage_output_variable_masked(var)) + { + // If we ignore an output, we must still emit it, since it might be used by app. + // Instead, just emit it as early declaration. 
+ emit_local_masked_variable(var, meta.strip_array); + return; + } if (var_type.basetype == SPIRType::Struct) { - if (!is_builtin_type(var_type) && (!capture_output_to_buffer || storage == StorageClassInput) && - !meta.strip_array) + bool block_requires_flattening = variable_storage_requires_stage_io(storage) || is_block; + bool needs_local_declaration = !is_builtin && block_requires_flattening && meta.allow_local_declaration; + + if (needs_local_declaration) { // For I/O blocks or structs, we will need to pass the block itself around // to functions if they are used globally in leaf functions. @@ -2874,11 +3025,10 @@ void CompilerMSL::add_variable_to_interface_block(StorageClass storage, const st // we unflatten I/O blocks while running the shader, // and pass the actual struct type down to leaf functions. // We then unflatten inputs, and flatten outputs in the "fixup" stages. - entry_func.add_local_variable(var.self); - vars_needing_early_declaration.push_back(var.self); + emit_local_masked_variable(var, meta.strip_array); } - if (capture_output_to_buffer && storage != StorageClassInput && !has_decoration(var_type.self, DecorationBlock)) + if (!block_requires_flattening) { // In Metal tessellation shaders, the interface block itself is arrayed. This makes things // very complicated, since stage-in structures in MSL don't support nested structures. @@ -2889,6 +3039,8 @@ void CompilerMSL::add_variable_to_interface_block(StorageClass storage, const st } else { + bool masked_block = false; + // Flatten the struct members into the interface struct for (uint32_t mbr_idx = 0; mbr_idx < uint32_t(var_type.member_types.size()); mbr_idx++) { @@ -2896,18 +3048,48 @@ void CompilerMSL::add_variable_to_interface_block(StorageClass storage, const st is_builtin = is_member_builtin(var_type, mbr_idx, &builtin); auto &mbr_type = get(var_type.member_types[mbr_idx]); - if (!is_builtin || has_active_builtin(builtin, storage)) + if (storage == StorageClassOutput && is_stage_output_block_member_masked(var, mbr_idx, meta.strip_array)) + { + if (is_block) + masked_block = true; + + // Non-builtin block output variables are just ignored, since they will still access + // the block variable as-is. They're just not flattened. + if (is_builtin && !meta.strip_array) + { + // Emit a fake variable instead. 
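+ // For example, masking gl_Position in an output block should still yield a local
+ // `float4 gl_Position;` declaration, so stores through the block keep compiling;
+ // they just never reach the stage-out interface (illustrative summary of the code below).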
+ uint32_t ids = ir.increase_bound_by(2); + uint32_t ptr_type_id = ids + 0; + uint32_t var_id = ids + 1; + + auto ptr_type = mbr_type; + ptr_type.pointer = true; + ptr_type.pointer_depth++; + ptr_type.parent_type = var_type.member_types[mbr_idx]; + ptr_type.storage = StorageClassOutput; + + uint32_t initializer = 0; + if (var.initializer) + if (auto *c = maybe_get(var.initializer)) + initializer = c->subconstants[mbr_idx]; + + set(ptr_type_id, ptr_type); + set(var_id, ptr_type_id, StorageClassOutput, initializer); + entry_func.add_local_variable(var_id); + vars_needing_early_declaration.push_back(var_id); + set_name(var_id, builtin_to_glsl(builtin, StorageClassOutput)); + set_decoration(var_id, DecorationBuiltIn, builtin); + } + } + else if (!is_builtin || has_active_builtin(builtin, storage)) { bool is_composite_type = is_matrix(mbr_type) || is_array(mbr_type); bool attribute_load_store = storage == StorageClassInput && get_execution_model() != ExecutionModelFragment; - bool storage_is_stage_io = - (storage == StorageClassInput && !(get_execution_model() == ExecutionModelTessellationControl && - msl_options.multi_patch_workgroup)) || - storage == StorageClassOutput; + bool storage_is_stage_io = variable_storage_requires_stage_io(storage); - // ClipDistance always needs to be declared as user attributes. - if (builtin == BuiltInClipDistance) + // Clip/CullDistance always need to be declared as user attributes. + if (builtin == BuiltInClipDistance || builtin == BuiltInCullDistance) is_builtin = false; if ((!is_builtin || attribute_load_store) && storage_is_stage_io && is_composite_type) @@ -2921,6 +3103,29 @@ void CompilerMSL::add_variable_to_interface_block(StorageClass storage, const st } } } + + // If we're redirecting a block, we might still need to access the original block + // variable if we're masking some members. + if (masked_block && !needs_local_declaration && + (!is_builtin_variable(var) || get_execution_model() == ExecutionModelTessellationControl)) + { + if (is_builtin_variable(var)) + { + // Ensure correct names for the block members if we're actually going to + // declare gl_PerVertex. + for (uint32_t mbr_idx = 0; mbr_idx < uint32_t(var_type.member_types.size()); mbr_idx++) + { + set_member_name(var_type.self, mbr_idx, builtin_to_glsl( + BuiltIn(get_member_decoration(var_type.self, mbr_idx, DecorationBuiltIn)), + StorageClassOutput)); + } + + set_name(var_type.self, "gl_PerVertex"); + set_name(var.self, "gl_out_masked"); + stage_out_masked_builtin_type_id = var_type.self; + } + emit_local_masked_variable(var, meta.strip_array); + } } } else if (get_execution_model() == ExecutionModelTessellationEvaluation && storage == StorageClassInput && @@ -2934,14 +3139,11 @@ void CompilerMSL::add_variable_to_interface_block(StorageClass storage, const st if (!is_builtin || has_active_builtin(builtin, storage)) { bool is_composite_type = is_matrix(var_type) || is_array(var_type); - bool storage_is_stage_io = - (storage == StorageClassInput && - !(get_execution_model() == ExecutionModelTessellationControl && msl_options.multi_patch_workgroup)) || - (storage == StorageClassOutput && !capture_output_to_buffer); + bool storage_is_stage_io = variable_storage_requires_stage_io(storage); bool attribute_load_store = storage == StorageClassInput && get_execution_model() != ExecutionModelFragment; - // ClipDistance always needs to be declared as user attributes. - if (builtin == BuiltInClipDistance) + // Clip/CullDistance always needs to be declared as user attributes. 
+ if (builtin == BuiltInClipDistance || builtin == BuiltInCullDistance) is_builtin = false; // MSL does not allow matrices or arrays in input or output variables, so need to handle it specially. @@ -2979,10 +3181,16 @@ void CompilerMSL::fix_up_interface_member_indices(StorageClass storage, uint32_t auto &var = get(var_id); auto &type = get_variable_element_type(var); - if (storage == StorageClassInput && type.basetype == SPIRType::Struct) - { - uint32_t mbr_idx = get_extended_member_decoration(ib_type_id, i, SPIRVCrossDecorationInterfaceMemberIndex); + bool flatten_composites = variable_storage_requires_stage_io(var.storage); + bool is_block = has_decoration(type.self, DecorationBlock); + + uint32_t mbr_idx = uint32_t(-1); + if (type.basetype == SPIRType::Struct && (flatten_composites || is_block)) + mbr_idx = get_extended_member_decoration(ib_type_id, i, SPIRVCrossDecorationInterfaceMemberIndex); + + if (mbr_idx != uint32_t(-1)) + { // Only set the lowest InterfaceMemberIndex for each variable member. // IB struct members will be emitted in-order w.r.t. interface member index. if (!has_extended_member_decoration(var_id, mbr_idx, SPIRVCrossDecorationInterfaceMemberIndex)) @@ -3024,23 +3232,49 @@ uint32_t CompilerMSL::add_interface_block(StorageClass storage, bool patch) auto &type = this->get(var.basetype); bool is_builtin = is_builtin_variable(var); - auto bi_type = BuiltIn(get_decoration(var_id, DecorationBuiltIn)); + bool is_block = has_decoration(type.self, DecorationBlock); + + auto bi_type = BuiltInMax; + bool builtin_is_gl_in_out = false; + if (is_builtin && !is_block) + { + bi_type = BuiltIn(get_decoration(var_id, DecorationBuiltIn)); + builtin_is_gl_in_out = bi_type == BuiltInPosition || bi_type == BuiltInPointSize || + bi_type == BuiltInClipDistance || bi_type == BuiltInCullDistance; + } + + if (is_builtin && is_block) + builtin_is_gl_in_out = true; + uint32_t location = get_decoration(var_id, DecorationLocation); + bool builtin_is_stage_in_out = builtin_is_gl_in_out || + bi_type == BuiltInLayer || bi_type == BuiltInViewportIndex || + bi_type == BuiltInBaryCoordNV || bi_type == BuiltInBaryCoordNoPerspNV || + bi_type == BuiltInFragDepth || + bi_type == BuiltInFragStencilRefEXT || bi_type == BuiltInSampleMask; + // These builtins are part of the stage in/out structs. bool is_interface_block_builtin = - (bi_type == BuiltInPosition || bi_type == BuiltInPointSize || bi_type == BuiltInClipDistance || - bi_type == BuiltInCullDistance || bi_type == BuiltInLayer || bi_type == BuiltInViewportIndex || - bi_type == BuiltInBaryCoordNV || bi_type == BuiltInBaryCoordNoPerspNV || bi_type == BuiltInFragDepth || - bi_type == BuiltInFragStencilRefEXT || bi_type == BuiltInSampleMask) || - (get_execution_model() == ExecutionModelTessellationEvaluation && - (bi_type == BuiltInTessLevelOuter || bi_type == BuiltInTessLevelInner)); + builtin_is_stage_in_out || + (get_execution_model() == ExecutionModelTessellationEvaluation && + (bi_type == BuiltInTessLevelOuter || bi_type == BuiltInTessLevelInner)); bool is_active = interface_variable_exists_in_entry_point(var.self); if (is_builtin && is_active) { // Only emit the builtin if it's active in this entry point. Interface variable list might lie. - is_active = has_active_builtin(bi_type, storage); + if (is_block) + { + // If any builtin is active, the block is active. 
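+ // For example, a gl_PerVertex output block in which only gl_Position is statically
+ // used still counts as active here.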
+ uint32_t mbr_cnt = uint32_t(type.member_types.size()); + for (uint32_t i = 0; !is_active && i < mbr_cnt; i++) + is_active = has_active_builtin(BuiltIn(get_member_decoration(type.self, i, DecorationBuiltIn)), storage); + } + else + { + is_active = has_active_builtin(bi_type, storage); + } } bool filter_patch_decoration = (has_decoration(var_id, DecorationPatch) || is_patch_block(type)) == patch; @@ -3048,7 +3282,7 @@ uint32_t CompilerMSL::add_interface_block(StorageClass storage, bool patch) bool hidden = is_hidden_variable(var, incl_builtins); // ClipDistance is never hidden, we need to emulate it when used as an input. - if (bi_type == BuiltInClipDistance) + if (bi_type == BuiltInClipDistance || bi_type == BuiltInCullDistance) hidden = false; // It's not enough to simply avoid marking fragment outputs if the pipeline won't @@ -3283,7 +3517,11 @@ uint32_t CompilerMSL::add_interface_block(StorageClass storage, bool patch) (get_execution_model() == ExecutionModelTessellationEvaluation && storage == StorageClassInput)) && !patch; + // Fixing up flattened stores in TESC is impossible since the memory is group shared either via + // device (not masked) or threadgroup (masked) storage classes and it's race condition city. meta.strip_array = strip_array; + meta.allow_local_declaration = !strip_array && !(get_execution_model() == ExecutionModelTessellationControl && + storage == StorageClassOutput); add_variable_to_interface_block(storage, ib_var_ref, ib_type, *p_var, meta); } @@ -3294,7 +3532,7 @@ uint32_t CompilerMSL::add_interface_block(StorageClass storage, bool patch) // the struct containing them is the correct size and layout. for (auto &input : inputs_by_location) { - if (is_msl_shader_input_used(input.first)) + if (location_inputs_in_use.count(input.first) != 0) continue; // Create a fake variable to put at the location. @@ -3336,18 +3574,18 @@ uint32_t CompilerMSL::add_interface_block(StorageClass storage, bool patch) auto &fake_var = set(var_id, ptr_type_id, storage); set_decoration(var_id, DecorationLocation, input.first); meta.strip_array = true; + meta.allow_local_declaration = false; add_variable_to_interface_block(storage, ib_var_ref, ib_type, fake_var, meta); } } // Sort the members of the structure by their locations. - MemberSorter member_sorter(ib_type, ir.meta[ib_type_id], MemberSorter::Location); + MemberSorter member_sorter(ib_type, ir.meta[ib_type_id], MemberSorter::LocationThenBuiltInType); member_sorter.sort(); // The member indices were saved to the original variables, but after the members // were sorted, those indices are now likely incorrect. Fix those up now. - if (!patch) - fix_up_interface_member_indices(storage, ib_type_id); + fix_up_interface_member_indices(storage, ib_type_id); // For patch inputs, add one more member, holding the array of control point data. if (get_execution_model() == ExecutionModelTessellationEvaluation && storage == StorageClassInput && patch && @@ -3461,10 +3699,16 @@ uint32_t CompilerMSL::ensure_correct_builtin_type(uint32_t type_id, BuiltIn buil // Ensure that the type is compatible with the shader input. // If it is, simply return the given type ID. // Otherwise, create a new type, and return its ID. -uint32_t CompilerMSL::ensure_correct_input_type(uint32_t type_id, uint32_t location, uint32_t num_components) +uint32_t CompilerMSL::ensure_correct_input_type(uint32_t type_id, uint32_t location, uint32_t num_components, bool strip_array) { auto &type = get(type_id); + uint32_t max_array_dimensions = strip_array ? 
1 : 0; + + // Struct and array types must match exactly. + if (type.basetype == SPIRType::Struct || type.array.size() > max_array_dimensions) + return type_id; + auto p_va = inputs_by_location.find(location); if (p_va == end(inputs_by_location)) { @@ -4599,6 +4843,16 @@ void CompilerMSL::emit_custom_functions() statement(""); break; + // "fsub" intrinsic support + case SPVFuncImplFSub: + statement("template"); + statement("T spvFSub(T l, T r)"); + begin_scope(); + statement("return fma(T(-1), r, l);"); + end_scope(); + statement(""); + break; + // "fmul' intrinsic support case SPVFuncImplFMul: statement("template"); @@ -6361,6 +6615,10 @@ void CompilerMSL::emit_specialization_constants_and_structs() if (patch_stage_in_var_id && get_patch_stage_in_struct_type().self == type_id) is_declarable_struct = false; + // Special case. Declare builtin struct anyways if we need to emit a threadgroup version of it. + if (stage_out_masked_builtin_type_id == type_id) + is_declarable_struct = true; + // Align and emit declarable structs...but avoid declaring each more than once. if (is_declarable_struct && declared_structs.count(type_id) == 0) { @@ -6406,16 +6664,22 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id if (ptr_type.storage == StorageClassOutput && get_execution_model() == ExecutionModelTessellationEvaluation) return false; - bool multi_patch_tess_ctl = get_execution_model() == ExecutionModelTessellationControl && - msl_options.multi_patch_workgroup && ptr_type.storage == StorageClassInput; - bool flat_matrix = is_matrix(result_type) && ptr_type.storage == StorageClassInput && !multi_patch_tess_ctl; - bool flat_struct = result_type.basetype == SPIRType::Struct && ptr_type.storage == StorageClassInput; - bool flat_data_type = flat_matrix || is_array(result_type) || flat_struct; - if (!flat_data_type) - return false; - if (has_decoration(ptr, DecorationPatch)) return false; + bool ptr_is_io_variable = ir.ids[ptr].get_type() == TypeVariable; + + bool flattened_io = variable_storage_requires_stage_io(ptr_type.storage); + + bool flat_data_type = flattened_io && + (is_matrix(result_type) || is_array(result_type) || result_type.basetype == SPIRType::Struct); + + // Edge case, even with multi-patch workgroups, we still need to unroll load + // if we're loading control points directly. + if (ptr_is_io_variable && is_array(result_type)) + flat_data_type = true; + + if (!flat_data_type) + return false; // Now, we must unflatten a composite type and take care of interleaving array access with gl_in/gl_out. // Lots of painful code duplication since we *really* should not unroll these kinds of loads in entry point fixup @@ -6424,12 +6688,31 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id uint32_t interface_index = get_extended_decoration(ptr, SPIRVCrossDecorationInterfaceMemberIndex); auto *var = maybe_get_backing_variable(ptr); - bool ptr_is_io_variable = ir.ids[ptr].get_type() == TypeVariable; auto &expr_type = get_pointee_type(ptr_type.self); const auto &iface_type = expression_type(stage_in_ptr_var_id); - if (result_type.array.size() > 2) + if (!flattened_io) + { + // Simplest case for multi-patch workgroups, just unroll array as-is. 
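+ // Sketch of the expression built below for a three-control-point load of a float4
+ // per-vertex input (the member name "m" is illustrative):
+ //
+ //   spvUnsafeArray<float4, 3>({ gl_in[0].m, gl_in[1].m, gl_in[2].m })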
+ if (interface_index == uint32_t(-1)) + return false; + + expr += type_to_glsl(result_type) + "({ "; + uint32_t num_control_points = to_array_size_literal(result_type, uint32_t(result_type.array.size()) - 1); + + for (uint32_t i = 0; i < num_control_points; i++) + { + const uint32_t indices[2] = { i, interface_index }; + AccessChainMeta meta; + expr += access_chain_internal(stage_in_ptr_var_id, indices, 2, + ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta); + if (i + 1 < num_control_points) + expr += ", "; + } + expr += " })"; + } + else if (result_type.array.size() > 2) { SPIRV_CROSS_THROW("Cannot load tessellation IO variables with more than 2 dimensions."); } @@ -6439,7 +6722,7 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id SPIRV_CROSS_THROW("Loading an array-of-array must be loaded directly from an IO variable."); if (interface_index == uint32_t(-1)) SPIRV_CROSS_THROW("Interface index is unknown. Cannot continue."); - if (result_type.basetype == SPIRType::Struct || flat_matrix) + if (result_type.basetype == SPIRType::Struct || is_matrix(result_type)) SPIRV_CROSS_THROW("Cannot load array-of-array of composite type in tessellation IO."); expr += type_to_glsl(result_type) + "({ "; @@ -6453,44 +6736,19 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id expr += type_to_glsl(sub_type) + "({ "; interface_index = base_interface_index; uint32_t array_size = to_array_size_literal(result_type, 0); - if (multi_patch_tess_ctl) + for (uint32_t j = 0; j < array_size; j++, interface_index++) { - for (uint32_t j = 0; j < array_size; j++) - { - const uint32_t indices[3] = { i, interface_index, j }; + const uint32_t indices[2] = { i, interface_index }; - AccessChainMeta meta; - expr += - access_chain_internal(stage_in_ptr_var_id, indices, 3, - ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta); - // If the expression has more vector components than the result type, insert - // a swizzle. This shouldn't happen normally on valid SPIR-V, but it might - // happen if we replace the type of an input variable. 
- if (!is_matrix(sub_type) && sub_type.basetype != SPIRType::Struct && - expr_type.vecsize > sub_type.vecsize) - expr += vector_swizzle(sub_type.vecsize, 0); + AccessChainMeta meta; + expr += access_chain_internal(stage_in_ptr_var_id, indices, 2, + ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta); + if (!is_matrix(sub_type) && sub_type.basetype != SPIRType::Struct && + expr_type.vecsize > sub_type.vecsize) + expr += vector_swizzle(sub_type.vecsize, 0); - if (j + 1 < array_size) - expr += ", "; - } - } - else - { - for (uint32_t j = 0; j < array_size; j++, interface_index++) - { - const uint32_t indices[2] = { i, interface_index }; - - AccessChainMeta meta; - expr += - access_chain_internal(stage_in_ptr_var_id, indices, 2, - ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta); - if (!is_matrix(sub_type) && sub_type.basetype != SPIRType::Struct && - expr_type.vecsize > sub_type.vecsize) - expr += vector_swizzle(sub_type.vecsize, 0); - - if (j + 1 < array_size) - expr += ", "; - } + if (j + 1 < array_size) + expr += ", "; } expr += " })"; if (i + 1 < num_control_points) @@ -6498,7 +6756,7 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id } expr += " })"; } - else if (flat_struct) + else if (result_type.basetype == SPIRType::Struct) { bool is_array_of_struct = is_array(result_type); if (is_array_of_struct && !ptr_is_io_variable) @@ -6531,7 +6789,7 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id const auto &mbr_type = get(struct_type.member_types[j]); const auto &expr_mbr_type = get(expr_type.member_types[j]); - if (is_matrix(mbr_type) && ptr_type.storage == StorageClassInput && !multi_patch_tess_ctl) + if (is_matrix(mbr_type) && ptr_type.storage == StorageClassInput) { expr += type_to_glsl(mbr_type) + "("; for (uint32_t k = 0; k < mbr_type.columns; k++, interface_index++) @@ -6541,8 +6799,8 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id const uint32_t indices[2] = { i, interface_index }; AccessChainMeta meta; expr += access_chain_internal( - stage_in_ptr_var_id, indices, 2, - ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta); + stage_in_ptr_var_id, indices, 2, + ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta); } else expr += to_expression(ptr) + "." 
+ to_member_name(iface_type, interface_index); @@ -6558,48 +6816,23 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id { expr += type_to_glsl(mbr_type) + "({ "; uint32_t array_size = to_array_size_literal(mbr_type, 0); - if (multi_patch_tess_ctl) + for (uint32_t k = 0; k < array_size; k++, interface_index++) { - for (uint32_t k = 0; k < array_size; k++) + if (is_array_of_struct) { - if (is_array_of_struct) - { - const uint32_t indices[3] = { i, interface_index, k }; - AccessChainMeta meta; - expr += access_chain_internal( - stage_in_ptr_var_id, indices, 3, - ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta); - } - else - expr += join(to_expression(ptr), ".", to_member_name(iface_type, interface_index), "[", - k, "]"); - if (expr_mbr_type.vecsize > mbr_type.vecsize) - expr += vector_swizzle(mbr_type.vecsize, 0); - - if (k + 1 < array_size) - expr += ", "; + const uint32_t indices[2] = { i, interface_index }; + AccessChainMeta meta; + expr += access_chain_internal( + stage_in_ptr_var_id, indices, 2, + ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta); } - } - else - { - for (uint32_t k = 0; k < array_size; k++, interface_index++) - { - if (is_array_of_struct) - { - const uint32_t indices[2] = { i, interface_index }; - AccessChainMeta meta; - expr += access_chain_internal( - stage_in_ptr_var_id, indices, 2, - ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta); - } - else - expr += to_expression(ptr) + "." + to_member_name(iface_type, interface_index); - if (expr_mbr_type.vecsize > mbr_type.vecsize) - expr += vector_swizzle(mbr_type.vecsize, 0); + else + expr += to_expression(ptr) + "." + to_member_name(iface_type, interface_index); + if (expr_mbr_type.vecsize > mbr_type.vecsize) + expr += vector_swizzle(mbr_type.vecsize, 0); - if (k + 1 < array_size) - expr += ", "; - } + if (k + 1 < array_size) + expr += ", "; } expr += " })"; } @@ -6629,7 +6862,7 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id if (is_array_of_struct) expr += " })"; } - else if (flat_matrix) + else if (is_matrix(result_type)) { bool is_array_of_matrix = is_array(result_type); if (is_array_of_matrix && !ptr_is_io_variable) @@ -6655,9 +6888,8 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id const uint32_t indices[2] = { i, interface_index }; AccessChainMeta meta; - expr += - access_chain_internal(stage_in_ptr_var_id, indices, 2, - ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta); + expr += access_chain_internal(stage_in_ptr_var_id, indices, 2, + ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta); if (expr_type.vecsize > result_type.vecsize) expr += vector_swizzle(result_type.vecsize, 0); if (j + 1 < result_type.columns) @@ -6748,48 +6980,102 @@ bool CompilerMSL::emit_tessellation_access_chain(const uint32_t *ops, uint32_t l bool patch = false; bool flat_data = false; bool ptr_is_chain = false; - bool multi_patch = get_execution_model() == ExecutionModelTessellationControl && msl_options.multi_patch_workgroup; + bool flatten_composites = false; + + bool is_block = false; + + if (var) + is_block = has_decoration(get_variable_data_type(*var).self, DecorationBlock); if (var) { + flatten_composites = variable_storage_requires_stage_io(var->storage); patch = has_decoration(ops[2], DecorationPatch) || is_patch_block(get_variable_data_type(*var)); // Should match strip_array in add_interface_block. 
flat_data = var->storage == StorageClassInput || (var->storage == StorageClassOutput && get_execution_model() == ExecutionModelTessellationControl); + // Patch inputs are treated as normal block IO variables, so they don't deal with this path at all. + if (patch && (!is_block || var->storage == StorageClassInput)) + flat_data = false; + // We might have a chained access chain, where // we first take the access chain to the control point, and then we chain into a member or something similar. // In this case, we need to skip gl_in/gl_out remapping. + // Also, skip ptr chain for patches. ptr_is_chain = var->self != ID(ops[2]); } - BuiltIn bi_type = BuiltIn(get_decoration(ops[2], DecorationBuiltIn)); - if (var && flat_data && !patch && - (!is_builtin_variable(*var) || bi_type == BuiltInPosition || bi_type == BuiltInPointSize || - bi_type == BuiltInClipDistance || bi_type == BuiltInCullDistance || - get_variable_data_type(*var).basetype == SPIRType::Struct)) + bool builtin_variable = false; + bool variable_is_flat = false; + + if (var && flat_data) { + builtin_variable = is_builtin_variable(*var); + + BuiltIn bi_type = BuiltInMax; + if (builtin_variable && !is_block) + bi_type = BuiltIn(get_decoration(var->self, DecorationBuiltIn)); + + variable_is_flat = !builtin_variable || is_block || + bi_type == BuiltInPosition || bi_type == BuiltInPointSize || + bi_type == BuiltInClipDistance || bi_type == BuiltInCullDistance; + } + + if (variable_is_flat) + { + // If output is masked, it is emitted as a "normal" variable, just go through normal code paths. + // Only check this for the first level of access chain. + // Dealing with this for partial access chains should be possible, but awkward. + if (var->storage == StorageClassOutput && !ptr_is_chain) + { + bool masked = false; + if (is_block) + { + uint32_t relevant_member_index = patch ? 3 : 4; + // FIXME: This won't work properly if the application first access chains into gl_out element, + // then access chains into the member. Super weird, but theoretically possible ... + if (length > relevant_member_index) + { + uint32_t mbr_idx = get(ops[relevant_member_index]).scalar(); + masked = is_stage_output_block_member_masked(*var, mbr_idx, true); + } + } + else if (var) + masked = is_stage_output_variable_masked(*var); + + if (masked) + return false; + } + AccessChainMeta meta; SmallVector indices; uint32_t next_id = ir.increase_bound_by(1); indices.reserve(length - 3 + 1); - uint32_t first_non_array_index = ptr_is_chain ? 3 : 4; - VariableID stage_var_id = var->storage == StorageClassInput ? stage_in_ptr_var_id : stage_out_ptr_var_id; + uint32_t first_non_array_index = (ptr_is_chain ? 3 : 4) - (patch ? 1 : 0); + + VariableID stage_var_id; + if (patch) + stage_var_id = var->storage == StorageClassInput ? patch_stage_in_var_id : patch_stage_out_var_id; + else + stage_var_id = var->storage == StorageClassInput ? stage_in_ptr_var_id : stage_out_ptr_var_id; + VariableID ptr = ptr_is_chain ? VariableID(ops[2]) : stage_var_id; - if (!ptr_is_chain) + if (!ptr_is_chain && !patch) { // Index into gl_in/gl_out with first array index. 
- indices.push_back(ops[3]); + indices.push_back(ops[first_non_array_index - 1]); } auto &result_ptr_type = get(ops[0]); uint32_t const_mbr_id = next_id++; - uint32_t index = get_extended_decoration(var->self, SPIRVCrossDecorationInterfaceMemberIndex); - if (var->storage == StorageClassInput || has_decoration(get_variable_element_type(*var).self, DecorationBlock)) + uint32_t index = get_extended_decoration(ops[2], SPIRVCrossDecorationInterfaceMemberIndex); + + if (flatten_composites || is_block) { uint32_t i = first_non_array_index; auto *type = &get_variable_element_type(*var); @@ -6797,19 +7083,20 @@ bool CompilerMSL::emit_tessellation_access_chain(const uint32_t *ops, uint32_t l { // Maybe this is a struct type in the input class, in which case // we put it as a decoration on the corresponding member. - index = get_extended_member_decoration(var->self, get_constant(ops[first_non_array_index]).scalar(), + uint32_t mbr_idx = get_constant(ops[first_non_array_index]).scalar(); + index = get_extended_member_decoration(var->self, mbr_idx, SPIRVCrossDecorationInterfaceMemberIndex); assert(index != uint32_t(-1)); i++; - type = &get(type->member_types[get_constant(ops[first_non_array_index]).scalar()]); + type = &get(type->member_types[mbr_idx]); } // In this case, we're poking into flattened structures and arrays, so now we have to // combine the following indices. If we encounter a non-constant index, // we're hosed. - for (; i < length; ++i) + for (; flatten_composites && i < length; ++i) { - if ((multi_patch || (!is_array(*type) && !is_matrix(*type))) && type->basetype != SPIRType::Struct) + if (!is_array(*type) && !is_matrix(*type) && type->basetype != SPIRType::Struct) break; auto *c = maybe_get(ops[i]); @@ -6820,7 +7107,16 @@ bool CompilerMSL::emit_tessellation_access_chain(const uint32_t *ops, uint32_t l // We're in flattened space, so just increment the member index into IO block. // We can only do this once in the current implementation, so either: // Struct, Matrix or 1-dimensional array for a control point. - index += c->scalar(); + if (type->basetype == SPIRType::Struct && var->storage == StorageClassOutput) + { + // Need to consider holes, since individual block members might be masked away. + uint32_t mbr_idx = c->scalar(); + for (uint32_t j = 0; j < mbr_idx; j++) + if (!is_stage_output_block_member_masked(*var, j, true)) + index++; + } + else + index += c->scalar(); if (type->parent_type) type = &get(type->parent_type); @@ -6828,31 +7124,48 @@ bool CompilerMSL::emit_tessellation_access_chain(const uint32_t *ops, uint32_t l type = &get(type->member_types[c->scalar()]); } - if ((!multi_patch && (is_matrix(result_ptr_type) || is_array(result_ptr_type))) || - result_ptr_type.basetype == SPIRType::Struct) - { - // We're not going to emit the actual member name, we let any further OpLoad take care of that. - // Tag the access chain with the member index we're referencing. - set_extended_decoration(ops[1], SPIRVCrossDecorationInterfaceMemberIndex, index); - } - else + // We're not going to emit the actual member name, we let any further OpLoad take care of that. + // Tag the access chain with the member index we're referencing. + bool defer_access_chain = flatten_composites && (is_matrix(result_ptr_type) || is_array(result_ptr_type) || + result_ptr_type.basetype == SPIRType::Struct); + + if (!defer_access_chain) { // Access the appropriate member of gl_in/gl_out. set(const_mbr_id, get_uint_type_id(), index, false); indices.push_back(const_mbr_id); + // Member index is now irrelevant. 
+ index = uint32_t(-1); + // Append any straggling access chain indices. if (i < length) indices.insert(indices.end(), ops + i, ops + length); } + else + { + // We must have consumed the entire access chain if we're deferring it. + assert(i == length); + } + + if (index != uint32_t(-1)) + set_extended_decoration(ops[1], SPIRVCrossDecorationInterfaceMemberIndex, index); + else + unset_extended_decoration(ops[1], SPIRVCrossDecorationInterfaceMemberIndex); } else { - assert(index != uint32_t(-1)); - set(const_mbr_id, get_uint_type_id(), index, false); - indices.push_back(const_mbr_id); + if (index != uint32_t(-1)) + { + set(const_mbr_id, get_uint_type_id(), index, false); + indices.push_back(const_mbr_id); + } - indices.insert(indices.end(), ops + 4, ops + length); + // Member index is now irrelevant. + index = uint32_t(-1); + unset_extended_decoration(ops[1], SPIRVCrossDecorationInterfaceMemberIndex); + + indices.insert(indices.end(), ops + first_non_array_index, ops + length); } // We use the pointer to the base of the input/output array here, @@ -6862,7 +7175,7 @@ bool CompilerMSL::emit_tessellation_access_chain(const uint32_t *ops, uint32_t l if (!ptr_is_chain) { // This is the start of an access chain, use ptr_chain to index into control point array. - e = access_chain(ptr, indices.data(), uint32_t(indices.size()), result_ptr_type, &meta, true); + e = access_chain(ptr, indices.data(), uint32_t(indices.size()), result_ptr_type, &meta, !patch); } else { @@ -6878,7 +7191,7 @@ bool CompilerMSL::emit_tessellation_access_chain(const uint32_t *ops, uint32_t l // First one is the gl_in/gl_out struct itself, then an index into that array. // If we have traversed further, we use a normal access chain formulation. auto *ptr_expr = maybe_get(ptr); - if (ptr_expr && ptr_expr->implied_read_expressions.size() == 2) + if (flatten_composites && ptr_expr && ptr_expr->implied_read_expressions.size() == 2) { e = join(to_expression(ptr), access_chain_internal(stage_var_id, indices.data(), uint32_t(indices.size()), @@ -7011,6 +7324,21 @@ void CompilerMSL::prepare_access_chain_for_scalar_access(std::string &expr, cons } } +bool CompilerMSL::access_chain_needs_stage_io_builtin_translation(uint32_t base) +{ + auto *var = maybe_get_backing_variable(base); + if (!var || !is_tessellation_shader()) + return true; + + // We only need to rewrite builtin access chains when accessing flattened builtins like gl_ClipDistance_N. + // Avoid overriding it back to just gl_ClipDistance. + // This can only happen in scenarios where we cannot flatten/unflatten access chains, so, the only case + // where this triggers is evaluation shader inputs. + bool redirect_builtin = get_execution_model() == ExecutionModelTessellationEvaluation ? + var->storage == StorageClassOutput : false; + return redirect_builtin; +} + // Sets the interface member index for an access chain to a pull-model interpolant. 
void CompilerMSL::fix_up_interpolant_access_chain(const uint32_t *ops, uint32_t length) { @@ -7261,19 +7589,26 @@ void CompilerMSL::emit_instruction(const Instruction &instruction) break; case OpFMul: - if (msl_options.invariant_float_math) + if (msl_options.invariant_float_math || has_decoration(ops[1], DecorationNoContraction)) MSL_BFOP(spvFMul); else MSL_BOP(*); break; case OpFAdd: - if (msl_options.invariant_float_math) + if (msl_options.invariant_float_math || has_decoration(ops[1], DecorationNoContraction)) MSL_BFOP(spvFAdd); else MSL_BOP(+); break; + case OpFSub: + if (msl_options.invariant_float_math || has_decoration(ops[1], DecorationNoContraction)) + MSL_BFOP(spvFSub); + else + MSL_BOP(-); + break; + // Atomics case OpAtomicExchange: { @@ -7715,7 +8050,7 @@ void CompilerMSL::emit_instruction(const Instruction &instruction) case OpVectorTimesMatrix: case OpMatrixTimesVector: { - if (!msl_options.invariant_float_math) + if (!msl_options.invariant_float_math && !has_decoration(ops[1], DecorationNoContraction)) { CompilerGLSL::emit_instruction(instruction); break; @@ -7757,7 +8092,7 @@ void CompilerMSL::emit_instruction(const Instruction &instruction) case OpMatrixTimesMatrix: { - if (!msl_options.invariant_float_math) + if (!msl_options.invariant_float_math && !has_decoration(ops[1], DecorationNoContraction)) { CompilerGLSL::emit_instruction(instruction); break; @@ -8046,20 +8381,46 @@ void CompilerMSL::emit_barrier(uint32_t id_exe_scope, uint32_t id_mem_scope, uin flush_all_active_variables(); } -void CompilerMSL::emit_array_copy(const string &lhs, uint32_t rhs_id, StorageClass lhs_storage, - StorageClass rhs_storage) +static bool storage_class_array_is_thread(StorageClass storage) +{ + switch (storage) + { + case StorageClassInput: + case StorageClassOutput: + case StorageClassGeneric: + case StorageClassFunction: + case StorageClassPrivate: + return true; + + default: + return false; + } +} + +void CompilerMSL::emit_array_copy(const string &lhs, uint32_t lhs_id, uint32_t rhs_id, + StorageClass lhs_storage, StorageClass rhs_storage) { // Allow Metal to use the array template to make arrays a value type. // This, however, cannot be used for threadgroup address specifiers, so consider the custom array copy as fallback. - bool lhs_thread = (lhs_storage == StorageClassOutput || lhs_storage == StorageClassFunction || - lhs_storage == StorageClassGeneric || lhs_storage == StorageClassPrivate); - bool rhs_thread = (rhs_storage == StorageClassInput || rhs_storage == StorageClassFunction || - rhs_storage == StorageClassOutput || - rhs_storage == StorageClassGeneric || rhs_storage == StorageClassPrivate); + bool lhs_is_thread_storage = storage_class_array_is_thread(lhs_storage); + bool rhs_is_thread_storage = storage_class_array_is_thread(rhs_storage); + + bool lhs_is_array_template = lhs_is_thread_storage; + bool rhs_is_array_template = rhs_is_thread_storage; + + // Special considerations for stage IO variables. + // If the variable is actually backed by non-user visible device storage, we use array templates for those. 
+ auto *lhs_var = maybe_get_backing_variable(lhs_id); + if (lhs_var && lhs_storage == StorageClassStorageBuffer && storage_class_array_is_thread(lhs_var->storage)) + lhs_is_array_template = true; + + auto *rhs_var = maybe_get_backing_variable(rhs_id); + if (rhs_var && rhs_storage == StorageClassStorageBuffer && storage_class_array_is_thread(rhs_var->storage)) + rhs_is_array_template = true; // If threadgroup storage qualifiers are *not* used: // Avoid spvCopy* wrapper functions; Otherwise, spvUnsafeArray<> template cannot be used with that storage qualifier. - if (lhs_thread && rhs_thread && !using_builtin_array()) + if (lhs_is_array_template && rhs_is_array_template && !using_builtin_array()) { statement(lhs, " = ", to_expression(rhs_id), ";"); } @@ -8101,15 +8462,15 @@ void CompilerMSL::emit_array_copy(const string &lhs, uint32_t rhs_id, StorageCla add_spv_func_and_recompile(SPVFuncImplArrayCopy); const char *tag = nullptr; - if (lhs_thread && is_constant) + if (lhs_is_thread_storage && is_constant) tag = "FromConstantToStack"; else if (lhs_storage == StorageClassWorkgroup && is_constant) tag = "FromConstantToThreadGroup"; - else if (lhs_thread && rhs_thread) + else if (lhs_is_thread_storage && rhs_is_thread_storage) tag = "FromStackToStack"; - else if (lhs_storage == StorageClassWorkgroup && rhs_thread) + else if (lhs_storage == StorageClassWorkgroup && rhs_is_thread_storage) tag = "FromStackToThreadGroup"; - else if (lhs_thread && rhs_storage == StorageClassWorkgroup) + else if (lhs_is_thread_storage && rhs_storage == StorageClassWorkgroup) tag = "FromThreadGroupToStack"; else if (lhs_storage == StorageClassWorkgroup && rhs_storage == StorageClassWorkgroup) tag = "FromThreadGroupToThreadGroup"; @@ -8119,19 +8480,21 @@ void CompilerMSL::emit_array_copy(const string &lhs, uint32_t rhs_id, StorageCla tag = "FromConstantToDevice"; else if (lhs_storage == StorageClassStorageBuffer && rhs_storage == StorageClassWorkgroup) tag = "FromThreadGroupToDevice"; - else if (lhs_storage == StorageClassStorageBuffer && rhs_thread) + else if (lhs_storage == StorageClassStorageBuffer && rhs_is_thread_storage) tag = "FromStackToDevice"; else if (lhs_storage == StorageClassWorkgroup && rhs_storage == StorageClassStorageBuffer) tag = "FromDeviceToThreadGroup"; - else if (lhs_thread && rhs_storage == StorageClassStorageBuffer) + else if (lhs_is_thread_storage && rhs_storage == StorageClassStorageBuffer) tag = "FromDeviceToStack"; else SPIRV_CROSS_THROW("Unknown storage class used for copying arrays."); // Pass internal array of spvUnsafeArray<> into wrapper functions - if (lhs_thread && !msl_options.force_native_arrays) + if (lhs_is_array_template && rhs_is_array_template && !msl_options.force_native_arrays) + statement("spvArrayCopy", tag, type.array.size(), "(", lhs, ".elements, ", to_expression(rhs_id), ".elements);"); + if (lhs_is_array_template && !msl_options.force_native_arrays) statement("spvArrayCopy", tag, type.array.size(), "(", lhs, ".elements, ", to_expression(rhs_id), ");"); - else if (rhs_thread && !msl_options.force_native_arrays) + else if (rhs_is_array_template && !msl_options.force_native_arrays) statement("spvArrayCopy", tag, type.array.size(), "(", lhs, ", ", to_expression(rhs_id), ".elements);"); else statement("spvArrayCopy", tag, type.array.size(), "(", lhs, ", ", to_expression(rhs_id), ");"); @@ -8198,8 +8561,9 @@ bool CompilerMSL::maybe_emit_array_assignment(uint32_t id_lhs, uint32_t id_rhs) if (p_v_lhs) flush_variable_declaration(p_v_lhs->self); - 
emit_array_copy(to_expression(id_lhs), id_rhs, get_expression_effective_storage_class(id_lhs), - get_expression_effective_storage_class(id_rhs)); + auto lhs_storage = get_expression_effective_storage_class(id_lhs); + auto rhs_storage = get_expression_effective_storage_class(id_rhs); + emit_array_copy(to_expression(id_lhs), id_lhs, id_rhs, lhs_storage, rhs_storage); register_write(id_lhs); return true; @@ -8651,7 +9015,8 @@ void CompilerMSL::emit_interface_block(uint32_t ib_var_id) { auto &ib_var = get(ib_var_id); auto &ib_type = get_variable_data_type(ib_var); - assert(ib_type.basetype == SPIRType::Struct && !ib_type.member_types.empty()); + //assert(ib_type.basetype == SPIRType::Struct && !ib_type.member_types.empty()); + assert(ib_type.basetype == SPIRType::Struct); emit_struct(ib_type); } } @@ -10045,8 +10410,10 @@ string CompilerMSL::to_struct_member(const SPIRType &type, uint32_t member_type_ // we need flat arrays, but if we're somehow declaring gl_PerVertex for constant array reasons, we want // template array types to be declared. bool is_ib_in_out = - ((stage_out_var_id && get_stage_out_struct_type().self == type.self) || - (stage_in_var_id && get_stage_in_struct_type().self == type.self)); + ((stage_out_var_id && get_stage_out_struct_type().self == type.self && + variable_storage_requires_stage_io(StorageClassOutput)) || + (stage_in_var_id && get_stage_in_struct_type().self == type.self && + variable_storage_requires_stage_io(StorageClassInput))); if (is_ib_in_out && is_member_builtin(type, index, &builtin)) is_using_builtin_array = true; array_type = type_to_array_glsl(physical_type); @@ -10168,8 +10535,14 @@ string CompilerMSL::member_attribute_qualifier(const SPIRType &type, uint32_t in return string(" [[") + builtin_qualifier(builtin) + "]]" + (mbr_type.array.empty() ? "" : " "); case BuiltInClipDistance: - if (has_member_decoration(type.self, index, DecorationLocation)) - return join(" [[user(clip", get_member_decoration(type.self, index, DecorationLocation), ")]]"); + if (has_member_decoration(type.self, index, DecorationIndex)) + return join(" [[user(clip", get_member_decoration(type.self, index, DecorationIndex), ")]]"); + else + return string(" [[") + builtin_qualifier(builtin) + "]]" + (mbr_type.array.empty() ? "" : " "); + + case BuiltInCullDistance: + if (has_member_decoration(type.self, index, DecorationIndex)) + return join(" [[user(cull", get_member_decoration(type.self, index, DecorationIndex), ")]]"); else return string(" [[") + builtin_qualifier(builtin) + "]]" + (mbr_type.array.empty() ? "" : " "); @@ -10290,7 +10663,9 @@ string CompilerMSL::member_attribute_qualifier(const SPIRType &type, uint32_t in break; case BuiltInClipDistance: - return join(" [[user(clip", get_member_decoration(type.self, index, DecorationLocation), ")]]"); + return join(" [[user(clip", get_member_decoration(type.self, index, DecorationIndex), ")]]"); + case BuiltInCullDistance: + return join(" [[user(cull", get_member_decoration(type.self, index, DecorationIndex), ")]]"); default: break; @@ -10482,12 +10857,16 @@ uint32_t CompilerMSL::get_or_allocate_builtin_input_member_location(spv::BuiltIn auto &mbr_type = get(get(type_id).member_types[index]); uint32_t count = type_to_location_count(mbr_type); - // This should always be 1. 
- if (count != 1) - return k_unknown_location; - loc = 0; - while (location_inputs_in_use.count(loc) != 0) + + const auto location_range_in_use = [this](uint32_t location, uint32_t location_count) -> bool { + for (uint32_t i = 0; i < location_count; i++) + if (location_inputs_in_use.count(location + i) != 0) + return true; + return false; + }; + + while (location_range_in_use(loc, count)) loc++; set_member_decoration(type_id, index, DecorationLocation, loc); @@ -10503,7 +10882,7 @@ uint32_t CompilerMSL::get_or_allocate_builtin_input_member_location(spv::BuiltIn else builtin_to_automatic_input_location[builtin] = loc; - mark_location_as_used_by_shader(loc, mbr_type, StorageClassInput); + mark_location_as_used_by_shader(loc, mbr_type, StorageClassInput, true); return loc; } @@ -10640,7 +11019,25 @@ string CompilerMSL::get_type_address_space(const SPIRType &type, uint32_t id, bo case StorageClassOutput: if (capture_output_to_buffer) - addr_space = "device"; + { + if (var && type.storage == StorageClassOutput) + { + bool is_masked = is_stage_output_variable_masked(*var); + + if (is_masked) + { + if (is_tessellation_shader()) + addr_space = "threadgroup"; + else + addr_space = "thread"; + } + else if (variable_decl_is_remapped_storage(*var, StorageClassWorkgroup)) + addr_space = "threadgroup"; + } + + if (!addr_space) + addr_space = "device"; + } break; default: @@ -11416,7 +11813,7 @@ void CompilerMSL::fix_up_shader_inputs_outputs() entry_func.fixup_hooks_in.push_back([=]() { statement(builtin_type_decl(bi_type), " ", to_expression(var_id), " = min(", to_expression(builtin_invocation_id_id), ".x / ", this->get_entry_point().output_vertices, - ", spvIndirectParams[1]);"); + ", spvIndirectParams[1] - 1);"); }); break; case BuiltInPatchVertices: @@ -12003,7 +12400,7 @@ string CompilerMSL::argument_decl(const SPIRFunction::Parameter &arg) auto &var = get(arg.id); auto &type = get_variable_data_type(var); auto &var_type = get(arg.type); - StorageClass storage = var_type.storage; + StorageClass type_storage = var_type.storage; bool is_pointer = var_type.pointer; // If we need to modify the name of the variable, make sure we use the original variable. @@ -12038,17 +12435,41 @@ string CompilerMSL::argument_decl(const SPIRFunction::Parameter &arg) // Allow Metal to use the array template to make arrays a value type string address_space = get_argument_address_space(var); - bool builtin = is_builtin_variable(var); + bool builtin = has_decoration(var.self, DecorationBuiltIn); auto builtin_type = BuiltIn(get_decoration(arg.id, DecorationBuiltIn)); - is_using_builtin_array = builtin; + if (address_space == "threadgroup") is_using_builtin_array = true; if (var.basevariable && (var.basevariable == stage_in_ptr_var_id || var.basevariable == stage_out_ptr_var_id)) decl = join(cv_qualifier, type_to_glsl(type, arg.id)); else if (builtin) - decl = join(cv_qualifier, builtin_type_decl(builtin_type, arg.id)); - else if ((storage == StorageClassUniform || storage == StorageClassStorageBuffer) && is_array(type)) + { + // Only use templated array for Clip/Cull distance when feasible. + // In other scenarios, we need need to override array length for tess levels (if used as outputs), + // or we need to emit the expected type for builtins (uint vs int). 
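+ // For instance, a gl_ClipDistance parameter can keep the spvUnsafeArray<float, N>
+ // template form, whereas gl_TessLevelOuter used as an output needs its builtin array
+ // length, and integer builtins need their exact uint/int type (illustrative; the actual
+ // declarations come from builtin_type_decl / type_to_glsl).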
+ auto storage = get(var.basetype).storage; + + if (storage == StorageClassInput && + (builtin_type == BuiltInTessLevelInner || builtin_type == BuiltInTessLevelOuter)) + { + is_using_builtin_array = false; + } + else if (builtin_type != BuiltInClipDistance && builtin_type != BuiltInCullDistance) + { + is_using_builtin_array = true; + } + + if (storage == StorageClassOutput && variable_storage_requires_stage_io(storage) && + !is_stage_output_builtin_masked(builtin_type)) + is_using_builtin_array = true; + + if (is_using_builtin_array) + decl = join(cv_qualifier, builtin_type_decl(builtin_type, arg.id)); + else + decl = join(cv_qualifier, type_to_glsl(type, arg.id)); + } + else if ((type_storage == StorageClassUniform || type_storage == StorageClassStorageBuffer) && is_array(type)) { is_using_builtin_array = true; decl += join(cv_qualifier, type_to_glsl(type, arg.id), "*"); @@ -12072,10 +12493,10 @@ string CompilerMSL::argument_decl(const SPIRFunction::Parameter &arg) decl = join(cv_qualifier, type_to_glsl(type, arg.id)); } - bool opaque_handle = storage == StorageClassUniformConstant; + bool opaque_handle = type_storage == StorageClassUniformConstant; if (!builtin && !opaque_handle && !is_pointer && - (storage == StorageClassFunction || storage == StorageClassGeneric)) + (type_storage == StorageClassFunction || type_storage == StorageClassGeneric)) { // If the argument is a pure value and not an opaque type, we will pass by value. if (msl_options.force_native_arrays && is_array(type)) @@ -12116,7 +12537,7 @@ string CompilerMSL::argument_decl(const SPIRFunction::Parameter &arg) if (msl_options.argument_buffers) { uint32_t desc_set = get_decoration(name_id, DecorationDescriptorSet); - if ((storage == StorageClassUniform || storage == StorageClassStorageBuffer) && + if ((type_storage == StorageClassUniform || type_storage == StorageClassStorageBuffer) && descriptor_set_is_argument_buffer(desc_set)) { // An awkward case where we need to emit *more* address space declarations (yay!). @@ -12158,7 +12579,12 @@ string CompilerMSL::argument_decl(const SPIRFunction::Parameter &arg) } else { - decl += " (&"; + auto array_size_decl = type_to_array_glsl(type); + if (array_size_decl.empty()) + decl += "& "; + else + decl += " (&"; + const char *restrict_kw = to_restrict(name_id); if (*restrict_kw) { @@ -12166,8 +12592,12 @@ string CompilerMSL::argument_decl(const SPIRFunction::Parameter &arg) decl += restrict_kw; } decl += to_expression(name_id); - decl += ")"; - decl += type_to_array_glsl(type); + + if (!array_size_decl.empty()) + { + decl += ")"; + decl += array_size_decl; + } } } else if (!opaque_handle && (!pull_model_inputs.count(var.basevariable) || type.basetype == SPIRType::Struct)) @@ -12612,8 +13042,10 @@ string CompilerMSL::to_qualifiers_glsl(uint32_t id) { string quals; + auto *var = maybe_get(id); auto &type = expression_type(id); - if (type.storage == StorageClassWorkgroup) + + if (type.storage == StorageClassWorkgroup || (var && variable_decl_is_remapped_storage(*var, StorageClassWorkgroup))) quals += "threadgroup "; return quals; @@ -12794,18 +13226,55 @@ string CompilerMSL::type_to_array_glsl(const SPIRType &type) } } -// Threadgroup arrays can't have a wrapper type +bool CompilerMSL::variable_decl_is_remapped_storage(const SPIRVariable &variable, spv::StorageClass storage) const +{ + if (variable.storage == storage) + return true; + + if (storage == StorageClassWorkgroup) + { + auto model = get_execution_model(); + + // Specially masked IO block variable. 
+ // Normally, we will never access IO blocks directly here. + // The only scenario which that should occur is with a masked IO block. + if (model == ExecutionModelTessellationControl && variable.storage == StorageClassOutput && + has_decoration(get(variable.basetype).self, DecorationBlock)) + { + return true; + } + + return variable.storage == StorageClassOutput && + model == ExecutionModelTessellationControl && + is_stage_output_variable_masked(variable); + } + else if (storage == StorageClassStorageBuffer) + { + // We won't be able to catch writes to control point outputs here since variable + // refers to a function local pointer. + // This is fine, as there cannot be concurrent writers to that memory anyways, + // so we just ignore that case. + + return (variable.storage == StorageClassOutput || variable.storage == StorageClassInput) && + !variable_storage_requires_stage_io(variable.storage) && + (variable.storage != StorageClassOutput || !is_stage_output_variable_masked(variable)); + } + else + { + return false; + } +} + std::string CompilerMSL::variable_decl(const SPIRVariable &variable) { - if (variable.storage == StorageClassWorkgroup) - { + bool old_is_using_builtin_array = is_using_builtin_array; + + // Threadgroup arrays can't have a wrapper type. + if (variable_decl_is_remapped_storage(variable, StorageClassWorkgroup)) is_using_builtin_array = true; - } + std::string expr = CompilerGLSL::variable_decl(variable); - if (variable.storage == StorageClassWorkgroup) - { - is_using_builtin_array = false; - } + is_using_builtin_array = old_is_using_builtin_array; return expr; } @@ -13372,7 +13841,6 @@ string CompilerMSL::builtin_to_glsl(BuiltIn builtin, StorageClass storage) { switch (builtin) { - // Handle HLSL-style 0-based vertex/instance index. // Override GLSL compiler strictness case BuiltInVertexId: @@ -13505,9 +13973,9 @@ string CompilerMSL::builtin_to_glsl(BuiltIn builtin, StorageClass storage) case BuiltInSampleMask: if (get_execution_model() == ExecutionModelTessellationControl) break; - if (storage != StorageClassInput && current_function && (current_function->self == ir.default_entry_point)) + if (storage != StorageClassInput && current_function && (current_function->self == ir.default_entry_point) && + !is_stage_output_builtin_masked(builtin)) return stage_out_var_name + "." 
+ CompilerGLSL::builtin_to_glsl(builtin, storage); - break; case BuiltInBaryCoordNV: @@ -13517,31 +13985,21 @@ string CompilerMSL::builtin_to_glsl(BuiltIn builtin, StorageClass storage) break; case BuiltInTessLevelOuter: - if (get_execution_model() == ExecutionModelTessellationEvaluation) + if (get_execution_model() == ExecutionModelTessellationControl && + storage != StorageClassInput && current_function && (current_function->self == ir.default_entry_point)) { - if (storage != StorageClassOutput && !get_entry_point().flags.get(ExecutionModeTriangles) && - current_function && (current_function->self == ir.default_entry_point)) - return join(patch_stage_in_var_name, ".", CompilerGLSL::builtin_to_glsl(builtin, storage)); - else - break; - } - if (storage != StorageClassInput && current_function && (current_function->self == ir.default_entry_point)) return join(tess_factor_buffer_var_name, "[", to_expression(builtin_primitive_id_id), "].edgeTessellationFactor"); + } break; case BuiltInTessLevelInner: - if (get_execution_model() == ExecutionModelTessellationEvaluation) + if (get_execution_model() == ExecutionModelTessellationControl && + storage != StorageClassInput && current_function && (current_function->self == ir.default_entry_point)) { - if (storage != StorageClassOutput && !get_entry_point().flags.get(ExecutionModeTriangles) && - current_function && (current_function->self == ir.default_entry_point)) - return join(patch_stage_in_var_name, ".", CompilerGLSL::builtin_to_glsl(builtin, storage)); - else - break; - } - if (storage != StorageClassInput && current_function && (current_function->self == ir.default_entry_point)) return join(tess_factor_buffer_var_name, "[", to_expression(builtin_primitive_id_id), "].insideTessellationFactor"); + } break; default: @@ -13795,6 +14253,7 @@ string CompilerMSL::builtin_type_decl(BuiltIn builtin, uint32_t id) // Vertex function out case BuiltInClipDistance: + case BuiltInCullDistance: return "float"; case BuiltInPointSize: return "float"; @@ -14414,9 +14873,11 @@ CompilerMSL::SPVFuncImpl CompilerMSL::OpCodePreprocessor::get_spv_func_impl(Op o return SPVFuncImplMod; case OpFAdd: - if (compiler.msl_options.invariant_float_math) + case OpFSub: + if (compiler.msl_options.invariant_float_math || + compiler.has_decoration(args[1], DecorationNoContraction)) { - return SPVFuncImplFAdd; + return opcode == OpFAdd ? SPVFuncImplFAdd : SPVFuncImplFSub; } break; @@ -14425,7 +14886,8 @@ CompilerMSL::SPVFuncImpl CompilerMSL::OpCodePreprocessor::get_spv_func_impl(Op o case OpMatrixTimesVector: case OpVectorTimesMatrix: case OpMatrixTimesMatrix: - if (compiler.msl_options.invariant_float_math) + if (compiler.msl_options.invariant_float_math || + compiler.has_decoration(args[1], DecorationNoContraction)) { return SPVFuncImplFMul; } @@ -14638,30 +15100,23 @@ void CompilerMSL::MemberSorter::sort() } } -// Sort first by builtin status (put builtins at end), then by the sorting aspect. bool CompilerMSL::MemberSorter::operator()(uint32_t mbr_idx1, uint32_t mbr_idx2) { auto &mbr_meta1 = meta.members[mbr_idx1]; auto &mbr_meta2 = meta.members[mbr_idx2]; - if (mbr_meta1.builtin != mbr_meta2.builtin) - return mbr_meta2.builtin; - else - switch (sort_aspect) - { - case Location: + + if (sort_aspect == LocationThenBuiltInType) + { + // Sort first by builtin status (put builtins at end), then by the sorting aspect. 
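+ // E.g. user members stay in ascending location order, and builtin members are moved
+ // to the end ordered by their BuiltIn enum value, which gives the usual
+ // Position, PointSize, ClipDistance, CullDistance layout for gl_PerVertex.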
+ if (mbr_meta1.builtin != mbr_meta2.builtin) + return mbr_meta2.builtin; + else if (mbr_meta1.builtin) + return mbr_meta1.builtin_type < mbr_meta2.builtin_type; + else return mbr_meta1.location < mbr_meta2.location; - case LocationReverse: - return mbr_meta1.location > mbr_meta2.location; - case Offset: - return mbr_meta1.offset < mbr_meta2.offset; - case OffsetThenLocationReverse: - return (mbr_meta1.offset < mbr_meta2.offset) || - ((mbr_meta1.offset == mbr_meta2.offset) && (mbr_meta1.location > mbr_meta2.location)); - case Alphabetical: - return mbr_meta1.alias < mbr_meta2.alias; - default: - return false; - } + } + else + return mbr_meta1.offset < mbr_meta2.offset; } CompilerMSL::MemberSorter::MemberSorter(SPIRType &t, Meta &m, SortAspect sa) @@ -15094,10 +15549,63 @@ void CompilerMSL::analyze_argument_buffers() }); uint32_t member_index = 0; + uint32_t next_arg_buff_index = 0; for (auto &resource : resources) { auto &var = *resource.var; auto &type = get_variable_data_type(var); + + // If needed, synthesize and add padding members. + // member_index and next_arg_buff_index are incremented when padding members are added. + if (msl_options.pad_argument_buffer_resources) + { + while (resource.index > next_arg_buff_index) + { + auto &rez_bind = get_argument_buffer_resource(desc_set, next_arg_buff_index); + switch (rez_bind.basetype) + { + case SPIRType::Void: + case SPIRType::Boolean: + case SPIRType::SByte: + case SPIRType::UByte: + case SPIRType::Short: + case SPIRType::UShort: + case SPIRType::Int: + case SPIRType::UInt: + case SPIRType::Int64: + case SPIRType::UInt64: + case SPIRType::AtomicCounter: + case SPIRType::Half: + case SPIRType::Float: + case SPIRType::Double: + add_argument_buffer_padding_buffer_type(buffer_type, member_index, next_arg_buff_index, rez_bind); + break; + case SPIRType::Image: + add_argument_buffer_padding_image_type(buffer_type, member_index, next_arg_buff_index, rez_bind); + break; + case SPIRType::Sampler: + add_argument_buffer_padding_sampler_type(buffer_type, member_index, next_arg_buff_index, rez_bind); + break; + case SPIRType::SampledImage: + if (next_arg_buff_index == rez_bind.msl_sampler) + add_argument_buffer_padding_sampler_type(buffer_type, member_index, next_arg_buff_index, rez_bind); + else + add_argument_buffer_padding_image_type(buffer_type, member_index, next_arg_buff_index, rez_bind); + break; + default: + break; + } + } + + // Adjust the number of slots consumed by current member itself. + // If actual member is an array, allow runtime array resolution as well. + uint32_t elem_cnt = type.array.empty() ? 1 : to_array_size_literal(type); + if (elem_cnt == 0) + elem_cnt = get_resource_array_size(var.self); + + next_arg_buff_index += elem_cnt; + } + string mbr_name = ensure_valid_name(resource.name, "m"); if (resource.plane > 0) mbr_name += join(plane_name_suffix, resource.plane); @@ -15196,6 +15704,125 @@ void CompilerMSL::analyze_argument_buffers() } } +// Return the resource type of the app-provided resources for the descriptor set, +// that matches the resource index of the argument buffer index. +// This is a two-step lookup, first lookup the resource binding number from the argument buffer index, +// then lookup the resource binding using the binding number. 
+MSLResourceBinding &CompilerMSL::get_argument_buffer_resource(uint32_t desc_set, uint32_t arg_idx)
+{
+	auto stage = get_entry_point().model;
+	StageSetBinding arg_idx_tuple = { stage, desc_set, arg_idx };
+	auto arg_itr = resource_arg_buff_idx_to_binding_number.find(arg_idx_tuple);
+	if (arg_itr != end(resource_arg_buff_idx_to_binding_number))
+	{
+		StageSetBinding bind_tuple = { stage, desc_set, arg_itr->second };
+		auto bind_itr = resource_bindings.find(bind_tuple);
+		if (bind_itr != end(resource_bindings))
+			return bind_itr->second.first;
+	}
+	SPIRV_CROSS_THROW("Argument buffer resource base type could not be determined. When padding argument buffer "
+	                  "elements, all descriptor set resources must be supplied with a base type by the app.");
+}
+
+// Adds an argument buffer padding argument buffer type as one or more members of the struct type at the member index.
+// Metal does not support arrays of buffers, so these are emitted as multiple struct members.
+void CompilerMSL::add_argument_buffer_padding_buffer_type(SPIRType &struct_type, uint32_t &mbr_idx,
+                                                          uint32_t &arg_buff_index, MSLResourceBinding &rez_bind)
+{
+	if (!argument_buffer_padding_buffer_type_id)
+	{
+		uint32_t buff_type_id = ir.increase_bound_by(2);
+		auto &buff_type = set<SPIRType>(buff_type_id);
+		buff_type.basetype = rez_bind.basetype;
+		buff_type.storage = StorageClassUniformConstant;
+
+		uint32_t ptr_type_id = buff_type_id + 1;
+		auto &ptr_type = set<SPIRType>(ptr_type_id);
+		ptr_type = buff_type;
+		ptr_type.pointer = true;
+		ptr_type.pointer_depth++;
+		ptr_type.parent_type = buff_type_id;
+
+		argument_buffer_padding_buffer_type_id = ptr_type_id;
+	}
+
+	for (uint32_t rez_idx = 0; rez_idx < rez_bind.count; rez_idx++)
+		add_argument_buffer_padding_type(argument_buffer_padding_buffer_type_id, struct_type, mbr_idx, arg_buff_index, 1);
+}
+
+// Adds an argument buffer padding argument image type as a member of the struct type at the member index.
+void CompilerMSL::add_argument_buffer_padding_image_type(SPIRType &struct_type, uint32_t &mbr_idx,
+                                                         uint32_t &arg_buff_index, MSLResourceBinding &rez_bind)
+{
+	if (!argument_buffer_padding_image_type_id)
+	{
+		uint32_t base_type_id = ir.increase_bound_by(2);
+		auto &base_type = set<SPIRType>(base_type_id);
+		base_type.basetype = SPIRType::Float;
+		base_type.width = 32;
+
+		uint32_t img_type_id = base_type_id + 1;
+		auto &img_type = set<SPIRType>(img_type_id);
+		img_type.basetype = SPIRType::Image;
+		img_type.storage = StorageClassUniformConstant;
+
+		img_type.image.type = base_type_id;
+		img_type.image.dim = Dim2D;
+		img_type.image.depth = false;
+		img_type.image.arrayed = false;
+		img_type.image.ms = false;
+		img_type.image.sampled = 1;
+		img_type.image.format = ImageFormatUnknown;
+		img_type.image.access = AccessQualifierMax;
+
+		argument_buffer_padding_image_type_id = img_type_id;
+	}
+
+	add_argument_buffer_padding_type(argument_buffer_padding_image_type_id, struct_type, mbr_idx, arg_buff_index, rez_bind.count);
+}
+
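These helpers only affect how the final argument buffer struct is laid out. As a rough illustration (not output produced verbatim by this patch; the struct and resource names below are hypothetical), a descriptor set whose sampler at argument index 1 is never referenced by the current shader could be emitted in MSL roughly as:

struct spvDescriptorSetBuffer0
{
	texture2d<float> uTexture [[id(0)]];
	sampler _m1_pad [[id(1)]];    // synthesized padding member for the unused sampler; name follows join("_m", arg_buff_index, "_pad")
	constant UBO* uUBO [[id(2)]]; // keeps id(2) in every shader sharing this descriptor set
};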
+// Adds an argument buffer padding argument sampler type as a member of the struct type at the member index.
+void CompilerMSL::add_argument_buffer_padding_sampler_type(SPIRType &struct_type, uint32_t &mbr_idx,
+                                                           uint32_t &arg_buff_index, MSLResourceBinding &rez_bind)
+{
+	if (!argument_buffer_padding_sampler_type_id)
+	{
+		uint32_t samp_type_id = ir.increase_bound_by(1);
+		auto &samp_type = set<SPIRType>(samp_type_id);
+		samp_type.basetype = SPIRType::Sampler;
+		samp_type.storage = StorageClassUniformConstant;
+
+		argument_buffer_padding_sampler_type_id = samp_type_id;
+	}
+
+	add_argument_buffer_padding_type(argument_buffer_padding_sampler_type_id, struct_type, mbr_idx, arg_buff_index, rez_bind.count);
+}
+
+// Adds the argument buffer padding argument type as a member of the struct type at the member index.
+// Advances both arg_buff_index and mbr_idx to next argument slots.
+void CompilerMSL::add_argument_buffer_padding_type(uint32_t mbr_type_id, SPIRType &struct_type, uint32_t &mbr_idx,
+                                                   uint32_t &arg_buff_index, uint32_t count)
+{
+	uint32_t type_id = mbr_type_id;
+	if (count > 1)
+	{
+		uint32_t ary_type_id = ir.increase_bound_by(1);
+		auto &ary_type = set<SPIRType>(ary_type_id);
+		ary_type = get<SPIRType>(type_id);
+		ary_type.array.push_back(count);
+		ary_type.array_size_literal.push_back(true);
+		ary_type.parent_type = type_id;
+		type_id = ary_type_id;
+	}
+
+	set_member_name(struct_type.self, mbr_idx, join("_m", arg_buff_index, "_pad"));
+	set_extended_member_decoration(struct_type.self, mbr_idx, SPIRVCrossDecorationResourceIndexPrimary, arg_buff_index);
+	struct_type.member_types.push_back(type_id);
+
+	arg_buff_index += count;
+	mbr_idx++;
+}
+
 void CompilerMSL::activate_argument_buffer_resources()
 {
 	// For ABI compatibility, force-enable all resources which are part of argument buffers.
diff --git a/3rdparty/spirv-cross/spirv_msl.hpp b/3rdparty/spirv-cross/spirv_msl.hpp
index 52e96761e..95e0c966c 100644
--- a/3rdparty/spirv-cross/spirv_msl.hpp
+++ b/3rdparty/spirv-cross/spirv_msl.hpp
@@ -71,15 +71,23 @@ struct MSLShaderInput
 // resources consumed by this binding, if the binding represents an array of resources.
 // If the resource array is a run-time-sized array, which are legal in GLSL or SPIR-V, this value
 // will be used to declare the array size in MSL, which does not support run-time-sized arrays.
-// For resources that are not held in a run-time-sized array, the count field does not need to be populated.
+// If pad_argument_buffer_resources is enabled, the basetype and count values are used to
+// specify the base type and array size of the resource in the argument buffer, if that resource
+// is not defined and used by the shader. With pad_argument_buffer_resources enabled, this
+// information will be used to pad the argument buffer structure, in order to align that
+// structure consistently for all uses, across all shaders, of the descriptor set represented
+// by the argument buffer. If pad_argument_buffer_resources is disabled, basetype does not
+// need to be populated, and if the resource is also not a run-time-sized array, the count
+// field does not need to be populated.
 // If using MSL 2.0 argument buffers, the descriptor set is not marked as a discrete descriptor set,
 // and (for iOS only) the resource is not a storage image (sampled != 2), the binding reference we
 // remap to will become an [[id(N)]] attribute within the "descriptor set" argument buffer structure.
-// For resources which are bound in the "classic" MSL 1.0 way or discrete descriptors, the remap will become a
-// [[buffer(N)]], [[texture(N)]] or [[sampler(N)]] depending on the resource types used.
+// For resources which are bound in the "classic" MSL 1.0 way or discrete descriptors, the remap will
+// become a [[buffer(N)]], [[texture(N)]] or [[sampler(N)]] depending on the resource types used.
 struct MSLResourceBinding
 {
 	spv::ExecutionModel stage = spv::ExecutionModelMax;
+	SPIRType::BaseType basetype = SPIRType::Unknown;
 	uint32_t desc_set = 0;
 	uint32_t binding = 0;
 	uint32_t count = 0;
@@ -346,6 +354,19 @@ public:
 	// and would otherwise declare a different IAB.
 	bool force_active_argument_buffer_resources = false;

+	// Aligns each resource in an argument buffer to its assigned index value, id(N),
+	// by adding synthetic padding members in the argument buffer struct for any resources
+	// in the argument buffer that are not defined and used by the shader. This allows
+	// the shader to index into the correct argument in a descriptor set argument buffer
+	// that is shared across shaders, where not all resources in the argument buffer are
+	// defined in each shader. For this to work, an MSLResourceBinding must be provided for
+	// all descriptors in any descriptor set held in an argument buffer in the shader, and
+	// that MSLResourceBinding must have the basetype and count members populated correctly.
+	// The implementation here assumes any inline blocks in the argument buffer are provided
+	// in a Metal buffer, and doesn't take into consideration inline blocks that are
+	// optionally embedded directly into the argument buffer via add_inline_uniform_block().
+	bool pad_argument_buffer_resources = false;
+
 	// Forces the use of plain arrays, which works around certain driver bugs on certain versions
 	// of Intel Macbooks. See https://github.com/KhronosGroup/SPIRV-Cross/issues/1210.
 	// May reduce performance in scenarios where arrays are copied around as value-types.
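A minimal host-side sketch of how an application might drive this option (illustrative only, not part of the patch; the shader, set, and binding values are assumptions). Every descriptor in the set gets an MSLResourceBinding with basetype and count filled in, so the compiler can synthesize padding for the ones this particular shader never references:

	// spirv_binary is assumed to be a std::vector<uint32_t> holding the SPIR-V module.
	spirv_cross::CompilerMSL compiler(std::move(spirv_binary));

	auto opts = compiler.get_msl_options();
	opts.argument_buffers = true;
	opts.pad_argument_buffer_resources = true;
	compiler.set_msl_options(opts);

	// Describe a descriptor this shader does not use, so a padding member can be emitted for it.
	spirv_cross::MSLResourceBinding unused_sampler = {};
	unused_sampler.stage = spv::ExecutionModelFragment;
	unused_sampler.desc_set = 0;
	unused_sampler.binding = 1;
	unused_sampler.basetype = spirv_cross::SPIRType::Sampler;
	unused_sampler.count = 1;
	unused_sampler.msl_sampler = 1; // argument buffer slot reserved for it
	compiler.add_msl_resource_binding(unused_sampler);

	std::string msl_source = compiler.compile();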
@@ -634,6 +655,7 @@ protected:
 		SPVFuncImplImage2DAtomicCoords, // Emulate texture2D atomic operations
 		SPVFuncImplFMul,
 		SPVFuncImplFAdd,
+		SPVFuncImplFSub,
 		SPVFuncImplCubemapTo2DArrayFace,
 		SPVFuncImplUnsafeArray, // Allow Metal to use the array<T> template to make arrays a value type
 		SPVFuncImplInverse4x4,
@@ -715,6 +737,8 @@ protected:
 	// Threadgroup arrays can't have a wrapper type
 	std::string variable_decl(const SPIRVariable &variable) override;

+	bool variable_decl_is_remapped_storage(const SPIRVariable &variable, spv::StorageClass storage) const override;
+
 	// GCC workaround of lambdas calling protected functions (for older GCC versions)
 	std::string variable_decl(const SPIRType &type, const std::string &name, uint32_t id = 0) override;

@@ -780,8 +804,11 @@ protected:
 		};
 		std::unordered_map<uint32_t, LocationMeta> location_meta;
 		bool strip_array = false;
+		bool allow_local_declaration = false;
 	};

+	std::string to_tesc_invocation_id();
+	void emit_local_masked_variable(const SPIRVariable &masked_var, bool strip_array);
 	void add_variable_to_interface_block(spv::StorageClass storage, const std::string &ib_var_ref, SPIRType &ib_type,
 	                                     SPIRVariable &var, InterfaceBlockMeta &meta);
 	void add_composite_variable_to_interface_block(spv::StorageClass storage, const std::string &ib_var_ref,
@@ -794,14 +821,15 @@ protected:
 	void add_composite_member_variable_to_interface_block(spv::StorageClass storage, const std::string &ib_var_ref,
 	                                                      SPIRType &ib_type, SPIRVariable &var, uint32_t index, InterfaceBlockMeta &meta);
-	uint32_t get_accumulated_member_location(const SPIRVariable &var, uint32_t mbr_idx, bool strip_array);
 	void add_tess_level_input_to_interface_block(const std::string &ib_var_ref, SPIRType &ib_type, SPIRVariable &var);
 	void fix_up_interface_member_indices(spv::StorageClass storage, uint32_t ib_type_id);

-	void mark_location_as_used_by_shader(uint32_t location, const SPIRType &type, spv::StorageClass storage);
+	void mark_location_as_used_by_shader(uint32_t location, const SPIRType &type,
+	                                     spv::StorageClass storage, bool fallback = false);
 	uint32_t ensure_correct_builtin_type(uint32_t type_id, spv::BuiltIn builtin);
-	uint32_t ensure_correct_input_type(uint32_t type_id, uint32_t location, uint32_t num_components = 0);
+	uint32_t ensure_correct_input_type(uint32_t type_id, uint32_t location,
+	                                   uint32_t num_components, bool strip_array);

 	void emit_custom_templates();
 	void emit_custom_functions();
@@ -886,8 +914,8 @@ protected:
 	void add_pragma_line(const std::string &line);
 	void add_typedef_line(const std::string &line);
 	void emit_barrier(uint32_t id_exe_scope, uint32_t id_mem_scope, uint32_t id_mem_sem);
-	void emit_array_copy(const std::string &lhs, uint32_t rhs_id, spv::StorageClass lhs_storage,
-	                     spv::StorageClass rhs_storage) override;
+	void emit_array_copy(const std::string &lhs, uint32_t lhs_id, uint32_t rhs_id,
+	                     spv::StorageClass lhs_storage, spv::StorageClass rhs_storage) override;
 	void build_implicit_builtins();
 	uint32_t build_constant_uint_array_pointer();
 	void emit_entry_point_declarations() override;
@@ -913,6 +941,9 @@
 	uint32_t view_mask_buffer_id = 0;
 	uint32_t dynamic_offsets_buffer_id = 0;
 	uint32_t uint_type_id = 0;
+	uint32_t argument_buffer_padding_buffer_type_id = 0;
+	uint32_t argument_buffer_padding_image_type_id = 0;
+	uint32_t argument_buffer_padding_sampler_type_id = 0;

 	bool does_shader_write_sample_mask = false;

@@ -922,6 +953,7 @@
 	void analyze_sampled_image_usage();

+	bool access_chain_needs_stage_io_builtin_translation(uint32_t base) override;
 	void prepare_access_chain_for_scalar_access(std::string &expr, const SPIRType &type, spv::StorageClass storage,
 	                                            bool &is_packed) override;
 	void fix_up_interpolant_access_chain(const uint32_t *ops, uint32_t length);
@@ -941,6 +973,7 @@
 	std::map<uint32_t, MSLShaderInput> inputs_by_location;
 	std::unordered_map<uint32_t, MSLShaderInput> inputs_by_builtin;
 	std::unordered_set<uint32_t> location_inputs_in_use;
+	std::unordered_set<uint32_t> location_inputs_in_use_fallback;
 	std::unordered_map<uint32_t, uint32_t> fragment_output_components;
 	std::unordered_map<uint32_t, uint32_t> builtin_to_automatic_input_location;
 	std::set<std::string> pragma_lines;
@@ -948,7 +981,7 @@
 	SmallVector<uint32_t> vars_needing_early_declaration;
 	std::unordered_map<StageSetBinding, std::pair<MSLResourceBinding, bool>, InternalHasher> resource_bindings;
-	uint32_t type_to_location_count(const SPIRType &type) const;
+	std::unordered_map<StageSetBinding, uint32_t, InternalHasher> resource_arg_buff_idx_to_binding_number;
 	uint32_t next_metal_resource_index_buffer = 0;
 	uint32_t next_metal_resource_index_texture = 0;
@@ -962,6 +995,7 @@
 	VariableID patch_stage_out_var_id = 0;
 	VariableID stage_in_ptr_var_id = 0;
 	VariableID stage_out_ptr_var_id = 0;
+	VariableID stage_out_masked_builtin_type_id = 0;

 	// Handle HLSL-style 0-based vertex/instance index.
 	enum class TriState
@@ -1027,6 +1061,11 @@
 	void analyze_argument_buffers();
 	bool descriptor_set_is_argument_buffer(uint32_t desc_set) const;
+	MSLResourceBinding &get_argument_buffer_resource(uint32_t desc_set, uint32_t arg_idx);
+	void add_argument_buffer_padding_buffer_type(SPIRType &struct_type, uint32_t &mbr_idx, uint32_t &arg_buff_index, MSLResourceBinding &rez_bind);
+	void add_argument_buffer_padding_image_type(SPIRType &struct_type, uint32_t &mbr_idx, uint32_t &arg_buff_index, MSLResourceBinding &rez_bind);
+	void add_argument_buffer_padding_sampler_type(SPIRType &struct_type, uint32_t &mbr_idx, uint32_t &arg_buff_index, MSLResourceBinding &rez_bind);
+	void add_argument_buffer_padding_type(uint32_t mbr_type_id, SPIRType &struct_type, uint32_t &mbr_idx, uint32_t &arg_buff_index, uint32_t count);
 	uint32_t get_target_components_for_fragment_location(uint32_t location) const;
 	uint32_t build_extended_vector_type(uint32_t type_id, uint32_t components,
@@ -1044,6 +1083,8 @@
 	bool type_is_pointer_to_pointer(const SPIRType &type) const;
 	bool is_supported_argument_buffer_type(const SPIRType &type) const;

+	bool variable_storage_requires_stage_io(spv::StorageClass storage) const;
+
 	// OpcodeHandler that handles several MSL preprocessing operations.
 	struct OpCodePreprocessor : OpcodeHandler
 	{
@@ -1087,11 +1128,8 @@
 	{
 		enum SortAspect
 		{
-			Location,
-			LocationReverse,
-			Offset,
-			OffsetThenLocationReverse,
-			Alphabetical
+			LocationThenBuiltInType,
+			Offset
 		};

 		void sort();
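For context, a minimal sketch (not part of the patch) of how the reduced SortAspect set is driven through the constructor shown earlier; ib_type and ib_meta are assumed to refer to an interface block's SPIRType and its corresponding Meta entry:

	// Order user members by location, then group builtins at the end, ordered by builtin type.
	CompilerMSL::MemberSorter member_sorter(ib_type, ib_meta, CompilerMSL::MemberSorter::LocationThenBuiltInType);
	member_sorter.sort();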