Updated astc-encoder.

2026-02-17 20:52:38 +01:00 · 2024-05-24 16:41:07 -07:00
parent e9fa0ceff2
commit 98a40e8533
19 changed files with 373 additions and 160 deletions
--- a/3rdparty/astc-encoder/include/astcenc.h
+++ b/3rdparty/astc-encoder/include/astcenc.h
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2020-2023 Arm Limited
+// Copyright 2020-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -215,6 +215,8 @@ enum astcenc_error {
 	ASTCENC_ERR_BAD_CONTEXT,
 	/** @brief The call failed due to unimplemented functionality. */
 	ASTCENC_ERR_NOT_IMPLEMENTED,
+	/** @brief The call failed due to an out-of-spec decode mode flag set. */
+	ASTCENC_ERR_BAD_DECODE_MODE,
 #if defined(ASTCENC_DIAGNOSTICS)
 	/** @brief The call failed due to an issue with diagnostic tracing. */
 	ASTCENC_ERR_DTRACE_FAILURE,
@@ -302,6 +304,11 @@ enum astcenc_type
 	ASTCENC_TYPE_F32 = 2
 };

+/**
+ * @brief Function pointer type for compression progress reporting callback.
+ */
+extern "C" typedef void (*astcenc_progress_callback)(float);
+
 /**
 * @brief Enable normal map compression.
 *
@@ -312,6 +319,19 @@ enum astcenc_type
 */
 static const unsigned int ASTCENC_FLG_MAP_NORMAL          = 1 << 0;

+/**
+ * @brief Enable compression heuristics that assume use of decode_unorm8 decode mode.
+ *
+ * The decode_unorm8 decode mode rounds differently to the decode_fp16 decode mode, so enabling this
+ * flag during compression will allow the compressor to use the correct rounding when selecting
+ * encodings. This will improve the compressed image quality if your application is using the
+ * decode_unorm8 decode mode, but will reduce image quality if using decode_fp16.
+ *
+ * Note that LDR_SRGB images will always use decode_unorm8 for the RGB channels, irrespective of
+ * this setting.
+ */
+static const unsigned int ASTCENC_FLG_USE_DECODE_UNORM8        = 1 << 1;
+
 /**
 * @brief Enable alpha weighting.
 *
@@ -378,6 +398,7 @@ static const unsigned int ASTCENC_ALL_FLAGS =
                              ASTCENC_FLG_MAP_RGBM |
                              ASTCENC_FLG_USE_ALPHA_WEIGHT |
                              ASTCENC_FLG_USE_PERCEPTUAL |
+                              ASTCENC_FLG_USE_DECODE_UNORM8 |
                              ASTCENC_FLG_DECOMPRESS_ONLY |
                              ASTCENC_FLG_SELF_DECOMPRESS_ONLY;

@@ -550,6 +571,16 @@ struct astcenc_config
 	 */
 	float tune_search_mode0_enable;

+	/**
+	 * @brief The progress callback, can be @c nullptr.
+	 *
+	 * If this is specified the codec will periodically report progress for
+	 * compression as a percentage between 0 and 100. The callback is called from one
+	 * of the compressor threads, so doing significant work in the callback will
+	 * reduce compression performance.
+	 */
+	astcenc_progress_callback progress_callback;
+
 #if defined(ASTCENC_DIAGNOSTICS)
 	/**
 	 * @brief The path to save the diagnostic trace data to.
--- a/3rdparty/astc-encoder/source/astcenc_color_unquantize.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_color_unquantize.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2021 Arm Limited
+// Copyright 2011-2023 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -894,32 +894,55 @@ void unpack_color_endpoints(
 		}
 	}

-	vint4 ldr_scale(257);
-	vint4 hdr_scale(1);
-	vint4 output_scale = ldr_scale;
+	// Handle endpoint errors and expansion

-	// An LDR profile image
-	if ((decode_mode == ASTCENC_PRF_LDR) ||
-	    (decode_mode == ASTCENC_PRF_LDR_SRGB))
+	// Linear LDR 8-bit endpoints are expanded to 16-bit by replication
+	if (decode_mode == ASTCENC_PRF_LDR)
 	{
-		// Also matches HDR alpha, as cannot have HDR alpha without HDR RGB
-		if (rgb_hdr == true)
+		// Error color - HDR endpoint in an LDR encoding
+		if (rgb_hdr || alpha_hdr)
 		{
-			output0 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00);
-			output1 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00);
-			output_scale = hdr_scale;
-
+			output0 = vint4(0xFF, 0x00, 0xFF, 0xFF);
+			output1 = vint4(0xFF, 0x00, 0xFF, 0xFF);
 			rgb_hdr = false;
 			alpha_hdr = false;
 		}
+
+		output0 = output0 * 257;
+		output1 = output1 * 257;
 	}
-	// An HDR profile image
+	// sRGB LDR 8-bit endpoints are expanded to 16 bit by:
+	//  - RGB = shift left by 8 bits and OR with 0x80
+	//  - A = replication
+	else if (decode_mode == ASTCENC_PRF_LDR_SRGB)
+	{
+		// Error color - HDR endpoint in an LDR encoding
+		if (rgb_hdr || alpha_hdr)
+		{
+			output0 = vint4(0xFF, 0x00, 0xFF, 0xFF);
+			output1 = vint4(0xFF, 0x00, 0xFF, 0xFF);
+			rgb_hdr = false;
+			alpha_hdr = false;
+		}
+
+		vmask4 mask(true, true, true, false);
+
+		vint4 output0rgb = lsl<8>(output0) | vint4(0x80);
+		vint4 output0a = output0 * 257;
+		output0 = select(output0a, output0rgb, mask);
+
+		vint4 output1rgb = lsl<8>(output1) | vint4(0x80);
+		vint4 output1a = output1 * 257;
+		output1 = select(output1a, output1rgb, mask);
+	}
+	// An HDR profile decode, but may be using linear LDR endpoints
+	// Linear LDR 8-bit endpoints are expanded to 16-bit by replication
+	// HDR endpoints are already 16-bit
 	else
 	{
 		vmask4 hdr_lanes(rgb_hdr, rgb_hdr, rgb_hdr, alpha_hdr);
-		output_scale = select(ldr_scale, hdr_scale, hdr_lanes);
+		vint4 output_scale = select(vint4(257), vint4(1), hdr_lanes);
+		output0 = output0 * output_scale;
+		output1 = output1 * output_scale;
 	}
-
-	output0 = output0 * output_scale;
-	output1 = output1 * output_scale;
 }
--- a/3rdparty/astc-encoder/source/astcenc_compress_symbolic.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_compress_symbolic.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2023 Arm Limited
+// Copyright 2011-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -247,7 +247,7 @@ static bool realign_weights_decimated(
 		}

 		// Create an unquantized weight grid for this decimation level
-		alignas(ASTCENC_VECALIGN) float uq_weightsf[BLOCK_MAX_WEIGHTS];
+		ASTCENC_ALIGNAS float uq_weightsf[BLOCK_MAX_WEIGHTS];
 		for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH)
 		{
 			vint unquant_value(dec_weights_uquant + we_idx);
@@ -467,7 +467,7 @@ static float compress_symbolic_block_for_partition_1plane(

 		qwt_bitcounts[i] = static_cast<int8_t>(bitcount);

-		alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
+		ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];

 		// Generate the optimized set of weights for the weight mode
 		compute_quantized_weights_for_decimation(
@@ -830,7 +830,7 @@ static float compress_symbolic_block_for_partition_2planes(
 		unsigned int decimation_mode = bm.decimation_mode;
 		const auto& di = bsd.get_decimation_info(decimation_mode);

-		alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
+		ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];

 		// Generate the optimized set of weights for the mode
 		compute_quantized_weights_for_decimation(
--- a/3rdparty/astc-encoder/source/astcenc_decompress_symbolic.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_decompress_symbolic.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2023 Arm Limited
+// Copyright 2011-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -27,15 +27,15 @@
 /**
 * @brief Compute the integer linear interpolation of two color endpoints.
 *
- * @param decode_mode   The ASTC profile (linear or sRGB)
+ * @param u8_mask       The mask for lanes using decode_unorm8 rather than decode_fp16.
 * @param color0        The endpoint0 color.
 * @param color1        The endpoint1 color.
- * @param weights        The interpolation weight (between 0 and 64).
+ * @param weights       The interpolation weight (between 0 and 64).
 *
 * @return The interpolated color.
 */
 static vint4 lerp_color_int(
-	astcenc_profile decode_mode,
+	vmask4 u8_mask,
 	vint4 color0,
 	vint4 color1,
 	vint4 weights
@@ -43,24 +43,18 @@ static vint4 lerp_color_int(
 	vint4 weight1 = weights;
 	vint4 weight0 = vint4(64) - weight1;

-	if (decode_mode == ASTCENC_PRF_LDR_SRGB)
-	{
-		color0 = asr<8>(color0);
-		color1 = asr<8>(color1);
-	}
-
 	vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32);
 	color = asr<6>(color);

-	if (decode_mode == ASTCENC_PRF_LDR_SRGB)
-	{
-		color = color * vint4(257);
-	}
+	// For decode_unorm8 values force the codec to bit replicate. This allows the
+	// rest of the codec to assume the full 0xFFFF range for everything and ignore
+	// the decode_mode setting
+	vint4 color_u8 = asr<8>(color) * vint4(257);
+	color = select(color, color_u8, u8_mask);

 	return color;
 }

-
 /**
 * @brief Convert integer color value into a float value for the decoder.
 *
@@ -229,12 +223,13 @@ void decompress_symbolic_block(
 		{
 			vint4 colori(scb.constant_color);

-			// For sRGB decoding a real decoder would just use the top 8 bits for color conversion.
-			// We don't color convert, so rescale the top 8 bits into the full 16 bit dynamic range.
-			if (decode_mode == ASTCENC_PRF_LDR_SRGB)
-			{
-				colori = asr<8>(colori) * 257;
-			}
+			// Determine the UNORM8 rounding on the decode
+			vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
+
+			// The real decoder would just use the top 8 bits, but we rescale
+			// into a 16-bit value that rounds correctly.
+			vint4 colori_u8 = asr<8>(colori) * 257;
+			colori = select(colori, colori_u8, u8_mask);

 			vint4 colorf16 = unorm16_to_sf16(colori);
 			color = float16_to_float(colorf16);
@@ -289,6 +284,8 @@ void decompress_symbolic_block(
 	int plane2_component = scb.plane2_component;
 	vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component);

+	vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
+
 	for (int i = 0; i < partition_count; i++)
 	{
 		// Decode the color endpoints for this partition
@@ -310,7 +307,7 @@ void decompress_symbolic_block(
 		{
 			int tix = pi.texels_of_partition[i][j];
 			vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask);
-			vint4 color = lerp_color_int(decode_mode, ep0, ep1, weight);
+			vint4 color = lerp_color_int(u8_mask, ep0, ep1, weight);
 			vfloat4 colorf = decode_texel(color, lns_mask);

 			blk.data_r[tix] = colorf.lane<0>();
@@ -365,12 +362,14 @@ float compute_symbolic_block_difference_2plane(
 	                       rgb_lns, a_lns,
 	                       ep0, ep1);

+	vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
+
 	// Unpack and compute error for each texel in the partition
 	unsigned int texel_count = bsd.texel_count;
 	for (unsigned int i = 0; i < texel_count; i++)
 	{
 		vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask);
-		vint4 colori = lerp_color_int(config.profile, ep0, ep1, weight);
+		vint4 colori = lerp_color_int(u8_mask, ep0, ep1, weight);

 		vfloat4 color = int_to_float(colori);
 		vfloat4 oldColor = blk.texel(i);
@@ -444,6 +443,8 @@ float compute_symbolic_block_difference_1plane(
 	int plane1_weights[BLOCK_MAX_TEXELS];
 	unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);

+	vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
+
 	vfloat4 summa = vfloat4::zero();
 	for (unsigned int i = 0; i < partition_count; i++)
 	{
@@ -464,7 +465,7 @@ float compute_symbolic_block_difference_1plane(
 		for (unsigned int j = 0; j < texel_count; j++)
 		{
 			unsigned int tix = pi.texels_of_partition[i][j];
-			vint4 colori = lerp_color_int(config.profile, ep0, ep1,
+			vint4 colori = lerp_color_int(u8_mask, ep0, ep1,
 			                              vint4(plane1_weights[tix]));

 			vfloat4 color = int_to_float(colori);
@@ -532,7 +533,7 @@ float compute_symbolic_block_difference_1plane_1partition(
 	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);

 	// Unquantize and undecimate the weights
-	alignas(ASTCENC_VECALIGN) int plane1_weights[BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS int plane1_weights[BLOCK_MAX_TEXELS];
 	unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);

 	// Decode the color endpoints for this partition
@@ -547,19 +548,12 @@ float compute_symbolic_block_difference_1plane_1partition(
 	                       rgb_lns, a_lns,
 	                       ep0, ep1);

-
-	// Pre-shift sRGB so things round correctly
-	if (config.profile == ASTCENC_PRF_LDR_SRGB)
-	{
-		ep0 = asr<8>(ep0);
-		ep1 = asr<8>(ep1);
-	}
+	vmask4 u8_mask = get_u8_component_mask(config.profile, blk);

 	// Unpack and compute error for each texel in the partition
 	vfloatacc summav = vfloatacc::zero();

 	vint lane_id = vint::lane_id();
-	vint srgb_scale(config.profile == ASTCENC_PRF_LDR_SRGB ? 257 : 1);

 	unsigned int texel_count = bsd.texel_count;
 	for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
@@ -578,11 +572,25 @@ float compute_symbolic_block_difference_1plane_1partition(
 		vint ep0_b = vint(ep0.lane<2>()) * weight0;
 		vint ep0_a = vint(ep0.lane<3>()) * weight0;

-		// Shift so things round correctly
-		vint colori_r = asr<6>(ep0_r + ep1_r + vint(32)) * srgb_scale;
-		vint colori_g = asr<6>(ep0_g + ep1_g + vint(32)) * srgb_scale;
-		vint colori_b = asr<6>(ep0_b + ep1_b + vint(32)) * srgb_scale;
-		vint colori_a = asr<6>(ep0_a + ep1_a + vint(32)) * srgb_scale;
+		// Combine contributions
+		vint colori_r = asr<6>(ep0_r + ep1_r + vint(32));
+		vint colori_g = asr<6>(ep0_g + ep1_g + vint(32));
+		vint colori_b = asr<6>(ep0_b + ep1_b + vint(32));
+		vint colori_a = asr<6>(ep0_a + ep1_a + vint(32));
+
+		// If using a U8 decode mode bit replicate top 8 bits
+		// so rest of codec can assume 0xFFFF max range everywhere
+		vint colori_r8 = asr<8>(colori_r) * vint(257);
+		colori_r = select(colori_r, colori_r8, vmask(u8_mask.lane<0>()));
+
+		vint colori_g8 = asr<8>(colori_g) * vint(257);
+		colori_g = select(colori_g, colori_g8, vmask(u8_mask.lane<1>()));
+
+		vint colori_b8 = asr<8>(colori_b) * vint(257);
+		colori_b = select(colori_b, colori_b8, vmask(u8_mask.lane<2>()));
+
+		vint colori_a8 = asr<8>(colori_a) * vint(257);
+		colori_a = select(colori_a, colori_a8, vmask(u8_mask.lane<3>()));

 		// Compute color diff
 		vfloat color_r = int_to_float(colori_r);
--- a/3rdparty/astc-encoder/source/astcenc_entry.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_entry.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2023 Arm Limited
+// Copyright 2011-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -217,11 +217,13 @@ static astcenc_error validate_block_size(
 /**
 * @brief Validate flags.
 *
- * @param flags   The flags to check.
+ * @param profile   The profile to check.
+ * @param flags     The flags to check.
 *
 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
 */
 static astcenc_error validate_flags(
+	astcenc_profile profile,
 	unsigned int flags
 ) {
 	// Flags field must not contain any unknown flag bits
@@ -239,6 +241,14 @@ static astcenc_error validate_flags(
 		return ASTCENC_ERR_BAD_FLAGS;
 	}

+	// Decode_unorm8 must only be used with an LDR profile
+	bool is_unorm8 = flags & ASTCENC_FLG_USE_DECODE_UNORM8;
+	bool is_hdr = (profile == ASTCENC_PRF_HDR) || (profile == ASTCENC_PRF_HDR_RGB_LDR_A);
+	if (is_unorm8 && is_hdr)
+	{
+		return ASTCENC_ERR_BAD_DECODE_MODE;
+	}
+
 	return ASTCENC_SUCCESS;
 }

@@ -364,7 +374,7 @@ static astcenc_error validate_config(
 		return status;
 	}

-	status = validate_flags(config.flags);
+	status = validate_flags(config.profile, config.flags);
 	if (status != ASTCENC_SUCCESS)
 	{
 		return status;
@@ -591,7 +601,7 @@ astcenc_error astcenc_config_init(
 	}

 	// Flags field must not contain any unknown flag bits
-	status = validate_flags(flags);
+	status = validate_flags(profile, flags);
 	if (status != ASTCENC_SUCCESS)
 	{
 		return status;
@@ -689,6 +699,12 @@ astcenc_error astcenc_context_alloc(
 	}

 	ctx->bsd = aligned_malloc<block_size_descriptor>(sizeof(block_size_descriptor), ASTCENC_VECALIGN);
+	if (!ctx->bsd)
+	{
+		delete ctxo;
+		return ASTCENC_ERR_OUT_OF_MEM;
+	}
+
 	bool can_omit_modes = static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY);
 	init_block_size_descriptor(config.block_x, config.block_y, config.block_z,
 	                           can_omit_modes,
@@ -698,7 +714,7 @@ astcenc_error astcenc_context_alloc(

 #if !defined(ASTCENC_DECOMPRESS_ONLY)
 	// Do setup only needed by compression
-	if (!(status & ASTCENC_FLG_DECOMPRESS_ONLY))
+	if (!(ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY))
 	{
 		// Turn a dB limit into a per-texel error for faster use later
 		if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB))
@@ -712,7 +728,7 @@ astcenc_error astcenc_context_alloc(

 		size_t worksize = sizeof(compression_working_buffers) * thread_count;
 		ctx->working_buffers = aligned_malloc<compression_working_buffers>(worksize, ASTCENC_VECALIGN);
-		static_assert((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0,
+		static_assert((ASTCENC_VECALIGN == 0) || ((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0),
 		              "compression_working_buffers size must be multiple of vector alignment");
 		if (!ctx->working_buffers)
 		{
@@ -802,6 +818,8 @@ static void compress_image(
 	int row_blocks = xblocks;
 	int plane_blocks = xblocks * yblocks;

+	blk.decode_unorm8 = ctxo.context.config.flags & ASTCENC_FLG_USE_DECODE_UNORM8;
+
 	// Populate the block channel weights
 	blk.channel_weight = vfloat4(ctx.config.cw_r_weight,
 	                             ctx.config.cw_g_weight,
@@ -812,7 +830,7 @@ static void compress_image(
 	auto& temp_buffers = ctx.working_buffers[thread_index];

 	// Only the first thread actually runs the initializer
-	ctxo.manage_compress.init(block_count);
+	ctxo.manage_compress.init(block_count, ctx.config.progress_callback);

 	// Determine if we can use an optimized load function
 	bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) ||
@@ -1137,6 +1155,7 @@ astcenc_error astcenc_decompress_image(
 	unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x;
 	unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y;
 	unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z;
+	unsigned int block_count = zblocks * yblocks * xblocks;

 	int row_blocks = xblocks;
 	int plane_blocks = xblocks * yblocks;
@@ -1148,9 +1167,12 @@ astcenc_error astcenc_decompress_image(
 		return ASTCENC_ERR_OUT_OF_MEM;
 	}

-	image_block blk;
+	image_block blk {};
 	blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);

+	// Decode mode inferred from the output data type
+	blk.decode_unorm8 = image_out.data_type == ASTCENC_TYPE_U8;
+
 	// If context thread count is one then implicitly reset
 	if (ctx->thread_count == 1)
 	{
@@ -1158,7 +1180,7 @@ astcenc_error astcenc_decompress_image(
 	}

 	// Only the first thread actually runs the initializer
-	ctxo->manage_decompress.init(zblocks * yblocks * xblocks);
+	ctxo->manage_decompress.init(block_count, nullptr);

 	// All threads run this processing loop until there is no work remaining
 	while (true)
@@ -1356,6 +1378,8 @@ const char* astcenc_get_error_string(
 		return "ASTCENC_ERR_BAD_CONTEXT";
 	case ASTCENC_ERR_NOT_IMPLEMENTED:
 		return "ASTCENC_ERR_NOT_IMPLEMENTED";
+	case ASTCENC_ERR_BAD_DECODE_MODE:
+		return "ASTCENC_ERR_BAD_DECODE_MODE";
 #if defined(ASTCENC_DIAGNOSTICS)
 	case ASTCENC_ERR_DTRACE_FAILURE:
 		return "ASTCENC_ERR_DTRACE_FAILURE";
--- a/3rdparty/astc-encoder/source/astcenc_ideal_endpoints_and_weights.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_ideal_endpoints_and_weights.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2023 Arm Limited
+// Copyright 2011-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -873,7 +873,7 @@ void compute_ideal_weights_for_decimation(
 	}

 	// Otherwise compute an estimate and perform single refinement iteration
-	alignas(ASTCENC_VECALIGN) float infilled_weights[BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS float infilled_weights[BLOCK_MAX_TEXELS];

 	// Compute an initial average for each decimated weight
 	bool constant_wes = ei.is_constant_weight_error_scale;
@@ -1171,7 +1171,7 @@ void recompute_ideal_colors_1plane(
 	promise(total_texel_count > 0);
 	promise(partition_count > 0);

-	alignas(ASTCENC_VECALIGN) float dec_weight[BLOCK_MAX_WEIGHTS];
+	ASTCENC_ALIGNAS float dec_weight[BLOCK_MAX_WEIGHTS];
 	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
 	{
 		vint unquant_value(dec_weights_uquant + i);
@@ -1179,7 +1179,7 @@ void recompute_ideal_colors_1plane(
 		storea(unquant_valuef, dec_weight + i);
 	}

-	alignas(ASTCENC_VECALIGN) float undec_weight[BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS float undec_weight[BLOCK_MAX_TEXELS];
 	float* undec_weight_ref;
 	if (di.max_texel_weight_count == 1)
 	{
@@ -1394,8 +1394,8 @@ void recompute_ideal_colors_2planes(
 	promise(total_texel_count > 0);
 	promise(weight_count > 0);

-	alignas(ASTCENC_VECALIGN) float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE];
-	alignas(ASTCENC_VECALIGN) float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE];
+	ASTCENC_ALIGNAS float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE];
+	ASTCENC_ALIGNAS float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE];

 	assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE);

@@ -1410,8 +1410,8 @@ void recompute_ideal_colors_2planes(
 		storea(unquant_value2f, dec_weight_plane2 + i);
 	}

-	alignas(ASTCENC_VECALIGN) float undec_weight_plane1[BLOCK_MAX_TEXELS];
-	alignas(ASTCENC_VECALIGN) float undec_weight_plane2[BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS float undec_weight_plane1[BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS float undec_weight_plane2[BLOCK_MAX_TEXELS];

 	float* undec_weight_plane1_ref;
 	float* undec_weight_plane2_ref;
--- a/3rdparty/astc-encoder/source/astcenc_image.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_image.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2022 Arm Limited
+// Copyright 2011-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -109,7 +109,7 @@ static vfloat4 swz_texel(
 	vfloat4 data,
 	const astcenc_swizzle& swz
 ) {
-	alignas(16) float datas[6];
+	ASTCENC_ALIGNAS float datas[6];

 	storea(data, datas);
 	datas[ASTCENC_SWZ_0] = 0.0f;
--- a/3rdparty/astc-encoder/source/astcenc_integer_sequence.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_integer_sequence.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2021 Arm Limited
+// Copyright 2011-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -464,10 +464,10 @@ static inline void write_bits(
 }

 /**
- * @brief Read up to 8 bits at an arbitrary bit offset.
+ * @brief Read up to 16 bits from two bytes.
 *
- * The stored value is at most 8 bits, but can be stored at an offset of between 0 and 7 bits so may
- * span two separate bytes in memory.
+ * This function reads a packed N-bit field from two bytes in memory. The stored value must exist
+ * within the two bytes, but can start at an arbitrary bit offset and span the two bytes in memory.
 *
 * @param         bitcount    The number of bits to read.
 * @param         bitoffset   The bit offset to read from, between 0 and 7.
--- a/3rdparty/astc-encoder/source/astcenc_internal.h
+++ b/3rdparty/astc-encoder/source/astcenc_internal.h
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2023 Arm Limited
+// Copyright 2011-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -29,6 +29,7 @@
 	#include <cstdio>
 #endif
 #include <cstdlib>
+#include <limits>

 #include "astcenc.h"
 #include "astcenc_mathlib.h"
@@ -325,10 +326,10 @@ struct partition_info
 	uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS];

 	/** @brief The partition of each texel in the block. */
-	uint8_t partition_of_texel[BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS uint8_t partition_of_texel[BLOCK_MAX_TEXELS];

 	/** @brief The list of texels in each partition. */
-	uint8_t texels_of_partition[BLOCK_MAX_PARTITIONS][BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS uint8_t texels_of_partition[BLOCK_MAX_PARTITIONS][BLOCK_MAX_TEXELS];
 };

 /**
@@ -366,40 +367,40 @@ struct decimation_info
 	 * @brief The number of weights that contribute to each texel.
 	 * Value is between 1 and 4.
 	 */
-	uint8_t texel_weight_count[BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS uint8_t texel_weight_count[BLOCK_MAX_TEXELS];

 	/**
 	 * @brief The weight index of the N weights that are interpolated for each texel.
 	 * Stored transposed to improve vectorization.
 	 */
-	uint8_t texel_weights_tr[4][BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS uint8_t texel_weights_tr[4][BLOCK_MAX_TEXELS];

 	/**
 	 * @brief The bilinear contribution of the N weights that are interpolated for each texel.
 	 * Value is between 0 and 16, stored transposed to improve vectorization.
 	 */
-	uint8_t texel_weight_contribs_int_tr[4][BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS uint8_t texel_weight_contribs_int_tr[4][BLOCK_MAX_TEXELS];

 	/**
 	 * @brief The bilinear contribution of the N weights that are interpolated for each texel.
 	 * Value is between 0 and 1, stored transposed to improve vectorization.
 	 */
-	alignas(ASTCENC_VECALIGN) float texel_weight_contribs_float_tr[4][BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS float texel_weight_contribs_float_tr[4][BLOCK_MAX_TEXELS];

 	/** @brief The number of texels that each stored weight contributes to. */
-	uint8_t weight_texel_count[BLOCK_MAX_WEIGHTS];
+	ASTCENC_ALIGNAS uint8_t weight_texel_count[BLOCK_MAX_WEIGHTS];

 	/**
 	 * @brief The list of texels that use a specific weight index.
 	 * Stored transposed to improve vectorization.
 	 */
-	uint8_t weight_texels_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
+	ASTCENC_ALIGNAS uint8_t weight_texels_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];

 	/**
 	 * @brief The bilinear contribution to the N texels that use each weight.
 	 * Value is between 0 and 1, stored transposed to improve vectorization.
 	 */
-	alignas(ASTCENC_VECALIGN) float weights_texel_contribs_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
+	ASTCENC_ALIGNAS float weights_texel_contribs_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];

 	/**
 	 * @brief The bilinear contribution to the Nth texel that uses each weight.
@@ -579,7 +580,7 @@ struct block_size_descriptor
 	decimation_mode decimation_modes[WEIGHTS_MAX_DECIMATION_MODES];

 	/** @brief The active decimation tables, stored in low indices. */
-	alignas(ASTCENC_VECALIGN) decimation_info decimation_tables[WEIGHTS_MAX_DECIMATION_MODES];
+	ASTCENC_ALIGNAS decimation_info decimation_tables[WEIGHTS_MAX_DECIMATION_MODES];

 	/** @brief The packed block mode array index, or @c BLOCK_BAD_BLOCK_MODE if not active. */
 	uint16_t block_mode_packed_index[WEIGHTS_MAX_BLOCK_MODES];
@@ -731,7 +732,11 @@ struct block_size_descriptor
 *
 * The @c data_[rgba] fields store the image data in an encoded SoA float form designed for easy
 * vectorization. Input data is converted to float and stored as values between 0 and 65535. LDR
- * data is stored as direct UNORM data, HDR data is stored as LNS data.
+ * data is stored as direct UNORM data, HDR data is stored as LNS data. They are allocated SIMD
+ * elements over-size to allow vectorized stores of unaligned and partial SIMD lanes (e.g. in a
+ * 6x6x6 block the final row write will read elements 210-217 (vec8) or 214-217 (vec4), which is
+ * two elements above the last real data element). The overspill values are never written to memory,
+ * and would be benign, but the padding avoids hitting undefined behavior.
 *
 * The @c rgb_lns and @c alpha_lns fields that assigned a per-texel use of HDR are only used during
 * decompression. The current compressor will always use HDR endpoint formats when in HDR mode.
@@ -739,16 +744,16 @@ struct block_size_descriptor
 struct image_block
 {
 	/** @brief The input (compress) or output (decompress) data for the red color component. */
-	alignas(ASTCENC_VECALIGN) float data_r[BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS float data_r[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];

 	/** @brief The input (compress) or output (decompress) data for the green color component. */
-	alignas(ASTCENC_VECALIGN) float data_g[BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS float data_g[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];

 	/** @brief The input (compress) or output (decompress) data for the blue color component. */
-	alignas(ASTCENC_VECALIGN) float data_b[BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS float data_b[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];

 	/** @brief The input (compress) or output (decompress) data for the alpha color component. */
-	alignas(ASTCENC_VECALIGN) float data_a[BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS float data_a[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];

 	/** @brief The number of texels in the block. */
 	uint8_t texel_count;
@@ -771,6 +776,9 @@ struct image_block
 	/** @brief Is this grayscale block where R == G == B for all texels? */
 	bool grayscale;

+	/** @brief Is the eventual decode using decode_unorm8 rounding? */
+	bool decode_unorm8;
+
 	/** @brief Set to 1 if a texel is using HDR RGB endpoints (decompression only). */
 	uint8_t rgb_lns[BLOCK_MAX_TEXELS];

@@ -897,10 +905,10 @@ struct endpoints_and_weights
 	endpoints ep;

 	/** @brief The ideal weight for each texel; may be undecimated or decimated. */
-	alignas(ASTCENC_VECALIGN) float weights[BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS float weights[BLOCK_MAX_TEXELS];

 	/** @brief The ideal weight error scaling for each texel; may be undecimated or decimated. */
-	alignas(ASTCENC_VECALIGN) float weight_error_scale[BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS float weight_error_scale[BLOCK_MAX_TEXELS];
 };

 /**
@@ -930,7 +938,7 @@ struct encoding_choice_errors
 /**
 * @brief Preallocated working buffers, allocated per thread during context creation.
 */
-struct alignas(ASTCENC_VECALIGN) compression_working_buffers
+struct ASTCENC_ALIGNAS compression_working_buffers
 {
 	/** @brief Ideal endpoints and weights for plane 1. */
 	endpoints_and_weights ei1;
@@ -946,17 +954,17 @@ struct alignas(ASTCENC_VECALIGN) compression_working_buffers
 	 *
 	 * For two planes, second plane starts at @c WEIGHTS_PLANE2_OFFSET offsets.
 	 */
-	alignas(ASTCENC_VECALIGN) float dec_weights_ideal[WEIGHTS_MAX_DECIMATION_MODES * BLOCK_MAX_WEIGHTS];
+	ASTCENC_ALIGNAS float dec_weights_ideal[WEIGHTS_MAX_DECIMATION_MODES * BLOCK_MAX_WEIGHTS];

 	/**
 	 * @brief Decimated quantized weight values in the unquantized 0-64 range.
 	 *
 	 * For two planes, second plane starts at @c WEIGHTS_PLANE2_OFFSET offsets.
 	 */
-	uint8_t dec_weights_uquant[WEIGHTS_MAX_BLOCK_MODES * BLOCK_MAX_WEIGHTS];
+	ASTCENC_ALIGNAS uint8_t dec_weights_uquant[WEIGHTS_MAX_BLOCK_MODES * BLOCK_MAX_WEIGHTS];

 	/** @brief Error of the best encoding combination for each block mode. */
-	alignas(ASTCENC_VECALIGN) float errors_of_best_combination[WEIGHTS_MAX_BLOCK_MODES];
+	ASTCENC_ALIGNAS float errors_of_best_combination[WEIGHTS_MAX_BLOCK_MODES];

 	/** @brief The best color quant for each block mode. */
 	uint8_t best_quant_levels[WEIGHTS_MAX_BLOCK_MODES];
@@ -1107,7 +1115,7 @@ struct symbolic_compressed_block
 	 *
 	 * If dual plane, the second plane starts at @c weights[WEIGHTS_PLANE2_OFFSET].
 	 */
-	uint8_t weights[BLOCK_MAX_WEIGHTS];
+	ASTCENC_ALIGNAS uint8_t weights[BLOCK_MAX_WEIGHTS];

 	/**
 	 * @brief Get the weight quantization used by this block mode.
@@ -1563,6 +1571,33 @@ unsigned int find_best_partition_candidates(
  Functionality for managing images and image related data.
 ============================================================================ */

+/**
+ * @brief Get a vector mask flagging the lanes that decompress into a UNORM8 value.
+ *
+ * @param decode_mode   The color profile; @c ASTCENC_PRF_LDR_SRGB flags the RGB lanes only.
+ * @param blk           The image block; its @c decode_unorm8 setting flags all four lanes.
+ *
+ * @return The component mask vector; no lanes are set if neither condition applies.
+ */
+static inline vmask4 get_u8_component_mask(
+	astcenc_profile decode_mode,
+	const image_block& blk
+) {
+	vmask4 u8_mask(false);
+	// Block is decoded with decode_unorm8 rounding, so every component is UNORM8
+	if (blk.decode_unorm8)
+	{
+		u8_mask = vmask4(true);
+	}
+	// LDR sRGB profile writes RGB as UNORM8; the fourth (alpha) lane stays clear
+	else if (decode_mode == ASTCENC_PRF_LDR_SRGB)
+	{
+		u8_mask = vmask4(true, true, true, false);
+	}
+
+	return u8_mask;
+}
+
 /**
 * @brief Setup computation of regional averages in an image.
 *
@@ -1816,7 +1851,7 @@ uint8_t pack_color_endpoints(
 *
 * Endpoints must be unscrambled and converted into the 0-255 range before calling this functions.
 *
- * @param      decode_mode   The decode mode (LDR, HDR).
+ * @param      decode_mode   The decode mode (LDR, HDR, etc).
 * @param      format        The color endpoint mode used.
 * @param      input         The raw array of encoded input integers. The length of this array
 *                           depends on @c format; it can be safely assumed to be large enough.
@@ -2142,10 +2177,11 @@ Platform-specific functions.
 /**
 * @brief Allocate an aligned memory buffer.
 *
- * Allocated memory must be freed by aligned_free;
+ * Allocated memory must be freed by aligned_free.
 *
 * @param size    The desired buffer size.
- * @param align   The desired buffer alignment; must be 2^N.
+ * @param align   The desired buffer alignment; must be 2^N, may be increased
+ *                by the implementation to a minimum allowable alignment.
 *
 * @return The memory buffer pointer or nullptr on allocation failure.
 */
@@ -2155,10 +2191,14 @@ T* aligned_malloc(size_t size, size_t align)
 	void* ptr;
 	int error = 0;

+	// Don't allow this to under-align a type
+	size_t min_align = astc::max(alignof(T), sizeof(void*));
+	size_t real_align = astc::max(min_align, align);
+
 #if defined(_WIN32)
-	ptr = _aligned_malloc(size, align);
+	ptr = _aligned_malloc(size, real_align);
 #else
-	error = posix_memalign(&ptr, align, size);
+	error = posix_memalign(&ptr, real_align, size);
 #endif

 	if (error || (!ptr))
--- a/3rdparty/astc-encoder/source/astcenc_internal_entry.h
+++ b/3rdparty/astc-encoder/source/astcenc_internal_entry.h
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2022 Arm Limited
+// Copyright 2011-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -118,6 +118,18 @@ private:
 	/** @brief Number of tasks that need to be processed. */
 	unsigned int m_task_count;

+	/** @brief Progress callback (optional). */
+	astcenc_progress_callback m_callback;
+
+	/** @brief Lock used for callback synchronization. */
+	std::mutex m_callback_lock;
+
+	/** @brief Minimum progress before making a callback. */
+	float m_callback_min_diff;
+
+	/** @brief Last progress callback value. */
+	float m_callback_last_value;
+
 public:
 	/** @brief Create a new ParallelManager. */
 	ParallelManager()
@@ -138,6 +150,9 @@ public:
 		m_start_count = 0;
 		m_done_count = 0;
 		m_task_count = 0;
+		m_callback = nullptr;
+		m_callback_last_value = 0.0f;
+		m_callback_min_diff = 1.0f;
 	}

 	/**
@@ -166,14 +181,20 @@ public:
 	 * initialization. Other threads will block and wait for it to complete.
 	 *
 	 * @param task_count   Total number of tasks needing processing.
+	 * @param callback     Function pointer for progress status callbacks.
 	 */
-	void init(unsigned int task_count)
+	void init(unsigned int task_count, astcenc_progress_callback callback)
 	{
 		std::lock_guard<std::mutex> lck(m_lock);
 		if (!m_init_done)
 		{
+			m_callback = callback;
 			m_task_count = task_count;
 			m_init_done = true;
+
+			// Report every 1% or 4096 blocks, whichever is larger, to avoid callback overhead
+			float min_diff = (4096.0f / static_cast<float>(task_count)) * 100.0f;
+			m_callback_min_diff = astc::max(min_diff, 1.0f);
 		}
 	}

@@ -212,12 +233,49 @@ public:
 	{
 		// Note: m_done_count cannot use an atomic without the mutex; this has a race between the
 		// update here and the wait() for other threads
-		std::unique_lock<std::mutex> lck(m_lock);
-		this->m_done_count += count;
-		if (m_done_count == m_task_count)
+		unsigned int local_count;
+		float local_last_value;
 		{
-			lck.unlock();
-			m_complete.notify_all();
+			std::unique_lock<std::mutex> lck(m_lock);
+			m_done_count += count;
+			local_count = m_done_count;
+			local_last_value = m_callback_last_value;
+
+			if (m_done_count == m_task_count)
+			{
+				// Ensure the progress bar hits 100%
+				if (m_callback)
+				{
+					std::unique_lock<std::mutex> cblck(m_callback_lock);
+					m_callback(100.0f);
+					m_callback_last_value = 100.0f;
+				}
+
+				lck.unlock();
+				m_complete.notify_all();
+			}
+		}
+
+		// Process progress callback if we have one
+		if (m_callback)
+		{
+			// Initial lockless test - have we progressed enough to emit?
+			float num = static_cast<float>(local_count);
+			float den = static_cast<float>(m_task_count);
+			float this_value =  (num / den) * 100.0f;
+			bool report_test = (this_value - local_last_value) > m_callback_min_diff;
+
+			// Recheck under lock, because another thread might report first
+			if (report_test)
+			{
+				std::unique_lock<std::mutex> cblck(m_callback_lock);
+				bool report_retest = (this_value - m_callback_last_value) > m_callback_min_diff;
+				if (report_retest)
+				{
+					m_callback(this_value);
+					m_callback_last_value = this_value;
+				}
+			}
 		}
 	}

--- a/3rdparty/astc-encoder/source/astcenc_mathlib.h
+++ b/3rdparty/astc-encoder/source/astcenc_mathlib.h
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2021 Arm Limited
+// Copyright 2011-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -73,10 +73,22 @@
  #endif
 #endif

+// Force vector-sized SIMD alignment
 #if ASTCENC_AVX
  #define ASTCENC_VECALIGN 32
-#else
+#elif ASTCENC_SSE || ASTCENC_NEON
  #define ASTCENC_VECALIGN 16
+// Use default alignment for non-SIMD builds
+#else
+  #define ASTCENC_VECALIGN 0
+#endif
+
+// C++11 states that alignas(0) should be ignored but GCC doesn't do
+// this on some versions, so workaround and avoid emitting alignas(0)
+#if ASTCENC_VECALIGN > 0
+	#define ASTCENC_ALIGNAS alignas(ASTCENC_VECALIGN)
+#else
+	#define ASTCENC_ALIGNAS
 #endif

 #if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 || ASTCENC_POPCNT != 0
--- a/3rdparty/astc-encoder/source/astcenc_mathlib_softfloat.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_mathlib_softfloat.cpp
@@ -15,13 +15,13 @@
 // under the License.
 // ----------------------------------------------------------------------------

-#include "astcenc_mathlib.h"
-
 /**
 * @brief Soft-float library for IEEE-754.
 */
 #if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)

+#include "astcenc_mathlib.h"
+
 /*	sized soft-float types. These are mapped to the sized integer
    types of C99, instead of C's floating-point types; this is because
    the library needs to maintain exact, bit-level control on all
--- a/3rdparty/astc-encoder/source/astcenc_symbolic_physical.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_symbolic_physical.cpp
@@ -330,12 +330,14 @@ void physical_to_symbolic(
 				return;
 			}

+			// Low values span 3 bytes so need two read_bits calls
 			int vx_low_s = read_bits(8, 12, pcb) | (read_bits(5, 12 + 8, pcb) << 8);
-			int vx_high_s = read_bits(8, 25, pcb) | (read_bits(5, 25 + 8, pcb) << 8);
+			int vx_high_s = read_bits(13, 25, pcb);
 			int vx_low_t = read_bits(8, 38, pcb) | (read_bits(5, 38 + 8, pcb) << 8);
-			int vx_high_t = read_bits(8, 51, pcb) | (read_bits(5, 51 + 8, pcb) << 8);
+			int vx_high_t = read_bits(13, 51, pcb);

-			int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF && vx_low_t == 0x1FFF && vx_high_t == 0x1FFF;
+			int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF &&
+			               vx_low_t == 0x1FFF && vx_high_t == 0x1FFF;

 			if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t) && !all_ones)
 			{
@@ -350,12 +352,14 @@ void physical_to_symbolic(
 			int vx_high_s = read_bits(9, 19, pcb);
 			int vx_low_t = read_bits(9, 28, pcb);
 			int vx_high_t = read_bits(9, 37, pcb);
-			int vx_low_p = read_bits(9, 46, pcb);
-			int vx_high_p = read_bits(9, 55, pcb);
+			int vx_low_r = read_bits(9, 46, pcb);
+			int vx_high_r = read_bits(9, 55, pcb);

-			int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF && vx_low_t == 0x1FF && vx_high_t == 0x1FF && vx_low_p == 0x1FF && vx_high_p == 0x1FF;
+			int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF &&
+			               vx_low_t == 0x1FF && vx_high_t == 0x1FF &&
+			               vx_low_r == 0x1FF && vx_high_r == 0x1FF;

-			if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t || vx_low_p >= vx_high_p) && !all_ones)
+			if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t || vx_low_r >= vx_high_r) && !all_ones)
 			{
 				scb.block_type = SYM_BTYPE_ERROR;
 				return;
@@ -470,8 +474,7 @@ void physical_to_symbolic(
 				bitpos += 2;
 			}
 		}
-		scb.partition_index = static_cast<uint16_t>(read_bits(6, 13, pcb) |
-		                                            (read_bits(PARTITION_INDEX_BITS - 6, 19, pcb) << 6));
+		scb.partition_index = static_cast<uint16_t>(read_bits(10, 13, pcb));
 	}

 	for (int i = 0; i < partition_count; i++)
--- a/3rdparty/astc-encoder/source/astcenc_vecmathlib_avx2_8.h
+++ b/3rdparty/astc-encoder/source/astcenc_vecmathlib_avx2_8.h
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2019-2022 Arm Limited
+// Copyright 2019-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -1170,7 +1170,7 @@ ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint8 data, vmask8 ma
 */
 ASTCENC_SIMD_INLINE void print(vint8 a)
 {
-	alignas(ASTCENC_VECALIGN) int v[8];
+	alignas(32) int v[8];
 	storea(a, v);
 	printf("v8_i32:\n  %8d %8d %8d %8d %8d %8d %8d %8d\n",
 	       v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
@@ -1181,7 +1181,7 @@ ASTCENC_SIMD_INLINE void print(vint8 a)
 */
 ASTCENC_SIMD_INLINE void printx(vint8 a)
 {
-	alignas(ASTCENC_VECALIGN) int v[8];
+	alignas(32) int v[8];
 	storea(a, v);
 	printf("v8_i32:\n  %08x %08x %08x %08x %08x %08x %08x %08x\n",
 	       v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
@@ -1192,7 +1192,7 @@ ASTCENC_SIMD_INLINE void printx(vint8 a)
 */
 ASTCENC_SIMD_INLINE void print(vfloat8 a)
 {
-	alignas(ASTCENC_VECALIGN) float v[8];
+	alignas(32) float v[8];
 	storea(a, v);
 	printf("v8_f32:\n  %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n",
 	       static_cast<double>(v[0]), static_cast<double>(v[1]),
--- a/3rdparty/astc-encoder/source/astcenc_vecmathlib_common_4.h
+++ b/3rdparty/astc-encoder/source/astcenc_vecmathlib_common_4.h
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2020-2021 Arm Limited
+// Copyright 2020-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -383,7 +383,7 @@ static ASTCENC_SIMD_INLINE void bit_transfer_signed(
 */
 ASTCENC_SIMD_INLINE void print(vint4 a)
 {
-	alignas(16) int v[4];
+	ASTCENC_ALIGNAS int v[4];
 	storea(a, v);
 	printf("v4_i32:\n  %8d %8d %8d %8d\n",
 	       v[0], v[1], v[2], v[3]);
@@ -394,7 +394,7 @@ ASTCENC_SIMD_INLINE void print(vint4 a)
 */
 ASTCENC_SIMD_INLINE void printx(vint4 a)
 {
-	alignas(16) int v[4];
+	ASTCENC_ALIGNAS int v[4];
 	storea(a, v);
 	printf("v4_i32:\n  %08x %08x %08x %08x\n",
 	       v[0], v[1], v[2], v[3]);
@@ -405,7 +405,7 @@ ASTCENC_SIMD_INLINE void printx(vint4 a)
 */
 ASTCENC_SIMD_INLINE void print(vfloat4 a)
 {
-	alignas(16) float v[4];
+	ASTCENC_ALIGNAS float v[4];
 	storea(a, v);
 	printf("v4_f32:\n  %0.4f %0.4f %0.4f %0.4f\n",
 	       static_cast<double>(v[0]), static_cast<double>(v[1]),
--- a/3rdparty/astc-encoder/source/astcenc_vecmathlib_neon_4.h
+++ b/3rdparty/astc-encoder/source/astcenc_vecmathlib_neon_4.h
@@ -359,9 +359,9 @@ struct vmask4
 	/**
 	 * @brief Get the scalar from a single lane.
 	 */
-	template <int32_t l> ASTCENC_SIMD_INLINE uint32_t lane() const
+	template <int32_t l> ASTCENC_SIMD_INLINE bool lane() const
 	{
-		return vgetq_lane_u32(m, l);
+		return vgetq_lane_u32(m, l) != 0;
 	}

 	/**
--- a/3rdparty/astc-encoder/source/astcenc_vecmathlib_none_4.h
+++ b/3rdparty/astc-encoder/source/astcenc_vecmathlib_none_4.h
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2019-2022 Arm Limited
+// Copyright 2019-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -351,6 +351,13 @@ struct vmask4
 		m[3] = d == false ? 0 : -1;
 	}

+	/**
+	 * @brief Get the boolean state of a single lane; true if lane @c l is non-zero.
+	 */
+	template <int l> ASTCENC_SIMD_INLINE bool lane() const
+	{
+		return m[l] != 0;
+	}

 	/**
 	 * @brief The vector ...
@@ -549,10 +556,16 @@ ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
 */
 template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
 {
-	return vint4(a.m[0] << s,
-	             a.m[1] << s,
-	             a.m[2] << s,
-	             a.m[3] << s);
+	// Cast to unsigned to avoid shift in/out of sign bit undefined behavior
+	unsigned int as0 = static_cast<unsigned int>(a.m[0]) << s;
+	unsigned int as1 = static_cast<unsigned int>(a.m[1]) << s;
+	unsigned int as2 = static_cast<unsigned int>(a.m[2]) << s;
+	unsigned int as3 = static_cast<unsigned int>(a.m[3]) << s;
+
+	return vint4(static_cast<int>(as0),
+	             static_cast<int>(as1),
+	             static_cast<int>(as2),
+	             static_cast<int>(as3));
 }

 /**
@@ -560,6 +573,7 @@ template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
 */
 template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
 {
+	// Cast to unsigned so the shift is logical; signed right shift is implementation-defined
 	unsigned int as0 = static_cast<unsigned int>(a.m[0]) >> s;
 	unsigned int as1 = static_cast<unsigned int>(a.m[1]) >> s;
 	unsigned int as2 = static_cast<unsigned int>(a.m[2]) >> s;
--- a/3rdparty/astc-encoder/source/astcenc_vecmathlib_sse_4.h
+++ b/3rdparty/astc-encoder/source/astcenc_vecmathlib_sse_4.h
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2019-2022 Arm Limited
+// Copyright 2019-2023 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -379,9 +379,9 @@ struct vmask4
 	/**
 	 * @brief Get the scalar value of a single lane.
 	 */
-	template <int l> ASTCENC_SIMD_INLINE float lane() const
+	template <int l> ASTCENC_SIMD_INLINE bool lane() const
 	{
-		return _mm_cvtss_f32(_mm_shuffle_ps(m, m, l));
+		return _mm_cvtss_f32(_mm_shuffle_ps(m, m, l)) != 0.0f;
 	}

 	/**
--- a/3rdparty/astc-encoder/source/astcenc_weight_align.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_weight_align.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2023 Arm Limited
+// Copyright 2011-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -60,8 +60,8 @@ static const uint8_t steps_for_quant_level[12] {
 	2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32
 };

-alignas(ASTCENC_VECALIGN) static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
-alignas(ASTCENC_VECALIGN) static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
+ASTCENC_ALIGNAS static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
+ASTCENC_ALIGNAS static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];

 #if defined(ASTCENC_DIAGNOSTICS)
 	static bool print_once { true };
@@ -99,7 +99,7 @@ static void compute_angular_offsets(
 	promise(weight_count > 0);
 	promise(max_angular_steps > 0);

-	alignas(ASTCENC_VECALIGN) int isamplev[BLOCK_MAX_WEIGHTS];
+	ASTCENC_ALIGNAS int isamplev[BLOCK_MAX_WEIGHTS];

 	// Precompute isample; arrays are always allocated 64 elements long
 	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
@@ -242,16 +242,16 @@ static void compute_angular_endpoints_for_quant_levels(
 	unsigned int max_quant_steps = steps_for_quant_level[max_quant_level];
 	unsigned int max_angular_steps = steps_for_quant_level[max_quant_level];

-	alignas(ASTCENC_VECALIGN) float angular_offsets[ANGULAR_STEPS];
+	ASTCENC_ALIGNAS float angular_offsets[ANGULAR_STEPS];

 	compute_angular_offsets(weight_count, dec_weight_ideal_value,
 	                        max_angular_steps, angular_offsets);

-	alignas(ASTCENC_VECALIGN) float lowest_weight[ANGULAR_STEPS];
-	alignas(ASTCENC_VECALIGN) int32_t weight_span[ANGULAR_STEPS];
-	alignas(ASTCENC_VECALIGN) float error[ANGULAR_STEPS];
-	alignas(ASTCENC_VECALIGN) float cut_low_weight_error[ANGULAR_STEPS];
-	alignas(ASTCENC_VECALIGN) float cut_high_weight_error[ANGULAR_STEPS];
+	ASTCENC_ALIGNAS float lowest_weight[ANGULAR_STEPS];
+	ASTCENC_ALIGNAS int32_t weight_span[ANGULAR_STEPS];
+	ASTCENC_ALIGNAS float error[ANGULAR_STEPS];
+	ASTCENC_ALIGNAS float cut_low_weight_error[ANGULAR_STEPS];
+	ASTCENC_ALIGNAS float cut_high_weight_error[ANGULAR_STEPS];

 	compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value,
 	                                  max_angular_steps, max_quant_steps,