Updated astc-encoder.

2026-02-17 20:52:38 +01:00 · 2022-11-10 17:27:01 -08:00
parent 225dad7441
commit 8c3aabc3c1
13 changed files with 496 additions and 512 deletions
--- a/3rdparty/astc-encoder/include/astcenc.h
+++ b/3rdparty/astc-encoder/include/astcenc.h
@@ -241,6 +241,9 @@ static const float ASTCENC_PRE_MEDIUM = 60.0f;
 /** @brief The thorough quality search preset. */
 static const float ASTCENC_PRE_THOROUGH = 98.0f;

+/** @brief The very thorough quality search preset. */
+static const float ASTCENC_PRE_VERYTHOROUGH = 99.0f;
+
 /** @brief The exhaustive, highest quality, search preset. */
 static const float ASTCENC_PRE_EXHAUSTIVE = 100.0f;

@@ -440,11 +443,25 @@ struct astcenc_config
 	unsigned int tune_partition_count_limit;

 	/**
-	 * @brief The maximum number of partitions searched (-partitionindexlimit).
+	 * @brief The maximum number of partitionings searched when using 2 partitions (-2partitionindexlimit).
 	 *
 	 * Valid values are between 1 and 1024.
 	 */
-	unsigned int tune_partition_index_limit;
+	unsigned int tune_2partition_index_limit;
+
+	/**
+	 * @brief The maximum number of partitionings searched when using 3 partitions (-3partitionindexlimit).
+	 *
+	 * Valid values are between 1 and 1024.
+	 */
+	unsigned int tune_3partition_index_limit;
+
+	/**
+	 * @brief The maximum number of partitionings searched when using 4 partitions (-4partitionindexlimit).
+	 *
+	 * Valid values are between 1 and 1024.
+	 */
+	unsigned int tune_4partition_index_limit;

 	/**
 	 * @brief The maximum centile for block modes searched (-blockmodelimit).
@@ -468,6 +485,27 @@ struct astcenc_config
 	 */
 	unsigned int tune_candidate_limit;

+	/**
+	 * @brief The number of trial partitionings per search (-2partitioncandidatelimit).
+	 *
+	 * Valid values are between 1 and TUNE_MAX_PARTITIIONING_CANDIDATES.
+	 */
+	unsigned int tune_2partitioning_candidate_limit;
+
+	/**
+	 * @brief The number of trial partitionings per search (-3partitioncandidatelimit).
+	 *
+	 * Valid values are between 1 and TUNE_MAX_PARTITIIONING_CANDIDATES.
+	 */
+	unsigned int tune_3partitioning_candidate_limit;
+
+	/**
+	 * @brief The number of trial partitionings per search (-4partitioncandidatelimit).
+	 *
+	 * Valid values are between 1 and TUNE_MAX_PARTITIIONING_CANDIDATES.
+	 */
+	unsigned int tune_4partitioning_candidate_limit;
+
 	/**
 	 * @brief The dB threshold for stopping block search (-dblimit).
 	 *
@@ -517,11 +555,6 @@ struct astcenc_config
 	 */
 	float tune_2_plane_early_out_limit_correlation;

-	/**
-	 * @brief The threshold below which (inclusive) we stop testing low/high/low+high cutoffs.
-	 */
-	unsigned int tune_low_weight_count_limit;
-
 #if defined(ASTCENC_DIAGNOSTICS)
 	/**
 	 * @brief The path to save the diagnostic trace data to.
--- a/3rdparty/astc-encoder/source/astcenc_color_quantize.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_color_quantize.cpp
@@ -334,13 +334,13 @@ static bool try_quantize_rgb_delta(
 	int g0be = quant_color(quant_level, g0b);
 	int b0be = quant_color(quant_level, b0b);

-	r0b = unquant_color(quant_level, r0be);
-	g0b = unquant_color(quant_level, g0be);
-	b0b = unquant_color(quant_level, b0be);
+	int r0bu = unquant_color(quant_level, r0be);
+	int g0bu = unquant_color(quant_level, g0be);
+	int b0bu = unquant_color(quant_level, b0be);

-	r0b |= r0a & 0x100;
-	g0b |= g0a & 0x100;
-	b0b |= b0a & 0x100;
+	r0b = r0bu | (r0a & 0x100);
+	g0b = g0bu | (g0a & 0x100);
+	b0b = b0bu | (b0a & 0x100);

 	// Get hold of the second value
 	int r1d = astc::flt2int_rtn(r1);
@@ -386,36 +386,18 @@ static bool try_quantize_rgb_delta(
 		return false;
 	}

-	// Check that the sum of the encoded offsets is nonnegative, else encoding fails
-	r1du &= 0x7f;
-	g1du &= 0x7f;
-	b1du &= 0x7f;
-
-	if (r1du & 0x40)
-	{
-		r1du -= 0x80;
-	}
-
-	if (g1du & 0x40)
-	{
-		g1du -= 0x80;
-	}
-
-	if (b1du & 0x40)
-	{
-		b1du -= 0x80;
-	}
-
-	if (r1du + g1du + b1du < 0)
+	// If the sum of offsets triggers blue-contraction then encoding fails
+	vint4 ep0(r0bu, g0bu, b0bu, 0);
+	vint4 ep1(r1du, g1du, b1du, 0);
+	bit_transfer_signed(ep1, ep0);
+	if (hadd_rgb_s(ep1) < 0)
 	{
 		return false;
 	}

 	// Check that the offsets produce legitimate sums as well
-	r1du += r0b;
-	g1du += g0b;
-	b1du += b0b;
-	if (r1du < 0 || r1du > 0x1FF || g1du < 0 || g1du > 0x1FF || b1du < 0 || b1du > 0x1FF)
+	ep0 = ep0 + ep1;
+	if (any((ep0 < vint4(0)) | (ep0 > vint4(0xFF))))
 	{
 		return false;
 	}
@@ -477,13 +459,13 @@ static bool try_quantize_rgb_delta_blue_contract(
 	int g0be = quant_color(quant_level, g0b);
 	int b0be = quant_color(quant_level, b0b);

-	r0b = unquant_color(quant_level, r0be);
-	g0b = unquant_color(quant_level, g0be);
-	b0b = unquant_color(quant_level, b0be);
+	int r0bu = unquant_color(quant_level, r0be);
+	int g0bu = unquant_color(quant_level, g0be);
+	int b0bu = unquant_color(quant_level, b0be);

-	r0b |= r0a & 0x100;
-	g0b |= g0a & 0x100;
-	b0b |= b0a & 0x100;
+	r0b = r0bu | (r0a & 0x100);
+	g0b = g0bu | (g0a & 0x100);
+	b0b = b0bu | (b0a & 0x100);

 	// Get hold of the second value
 	int r1d = astc::flt2int_rtn(r1);
@@ -530,38 +512,18 @@ static bool try_quantize_rgb_delta_blue_contract(
 		return false;
 	}

-	// Check that the sum of the encoded offsets is negative, else encoding fails
-	// Note that this is inverse of the test for non-blue-contracted RGB.
-	r1du &= 0x7f;
-	g1du &= 0x7f;
-	b1du &= 0x7f;
-
-	if (r1du & 0x40)
-	{
-		r1du -= 0x80;
-	}
-
-	if (g1du & 0x40)
-	{
-		g1du -= 0x80;
-	}
-
-	if (b1du & 0x40)
-	{
-		b1du -= 0x80;
-	}
-
-	if (r1du + g1du + b1du >= 0)
+	// If the sum of offsets does not trigger blue-contraction then encoding fails
+	vint4 ep0(r0bu, g0bu, b0bu, 0);
+	vint4 ep1(r1du, g1du, b1du, 0);
+	bit_transfer_signed(ep1, ep0);
+	if (hadd_rgb_s(ep1) >= 0)
 	{
 		return false;
 	}

 	// Check that the offsets produce legitimate sums as well
-	r1du += r0b;
-	g1du += g0b;
-	b1du += b0b;
-
-	if (r1du < 0 || r1du > 0x1FF || g1du < 0 || g1du > 0x1FF || b1du < 0 || b1du > 0x1FF)
+	ep0 = ep0 + ep1;
+	if (any((ep0 < vint4(0)) | (ep0 > vint4(0xFF))))
 	{
 		return false;
 	}
--- a/3rdparty/astc-encoder/source/astcenc_color_unquantize.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_color_unquantize.cpp
@@ -97,15 +97,8 @@ static void rgba_delta_unpack(
 	vint4 input0 = unquant_color(quant_level, input0q);
 	vint4 input1 = unquant_color(quant_level, input1q);

-	// Perform bit-transfer
-	input0 = input0 | lsl<1>(input1 & 0x80);
-	input1 = input1 & 0x7F;
-	vmask4 mask = (input1 & 0x40) != vint4::zero();
-	input1 = select(input1, input1 - 0x80, mask);
-
-	// Scale
-	input0 = asr<1>(input0);
-	input1 = asr<1>(input1);
+	// Apply bit transfer
+	bit_transfer_signed(input1, input0);

 	// Apply blue-uncontraction if needed
 	int rgb_sum = hadd_rgb_s(input1);
--- a/3rdparty/astc-encoder/source/astcenc_compress_symbolic.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_compress_symbolic.cpp
@@ -424,11 +424,7 @@ static float compress_symbolic_block_for_partition_1plane(

 	// For each mode, use the angular method to compute a shift
 	compute_angular_endpoints_1plane(
-	    config.tune_low_weight_count_limit,
-	    only_always, bsd,
-	    dec_weights_ideal,
-	    max_weight_quant,
-	    tmpbuf);
+	    only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);

 	float* weight_low_value = tmpbuf.weight_low_value1;
 	float* weight_high_value = tmpbuf.weight_high_value1;
@@ -795,9 +791,7 @@ static float compress_symbolic_block_for_partition_2planes(
 	float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask));

 	compute_angular_endpoints_2planes(
-	    config.tune_low_weight_count_limit,
-	    bsd, dec_weights_ideal, max_weight_quant,
-	    tmpbuf);
+	    bsd, dec_weights_ideal, max_weight_quant, tmpbuf);

 	// For each mode (which specifies a decimation and a quantization):
 	//     * Compute number of bits needed for the quantized weights
@@ -1130,12 +1124,13 @@ static float prepare_block_statistics(

 	aa_var -= as * (as * rpt);

-	rg_cov *= astc::rsqrt(astc::max(rr_var * gg_var, 1e-30f));
-	rb_cov *= astc::rsqrt(astc::max(rr_var * bb_var, 1e-30f));
-	ra_cov *= astc::rsqrt(astc::max(rr_var * aa_var, 1e-30f));
-	gb_cov *= astc::rsqrt(astc::max(gg_var * bb_var, 1e-30f));
-	ga_cov *= astc::rsqrt(astc::max(gg_var * aa_var, 1e-30f));
-	ba_cov *= astc::rsqrt(astc::max(bb_var * aa_var, 1e-30f));
+	// These will give a NaN if a channel is constant - these are fixed up in the next step
+	rg_cov *= astc::rsqrt(rr_var * gg_var);
+	rb_cov *= astc::rsqrt(rr_var * bb_var);
+	ra_cov *= astc::rsqrt(rr_var * aa_var);
+	gb_cov *= astc::rsqrt(gg_var * bb_var);
+	ga_cov *= astc::rsqrt(gg_var * aa_var);
+	ba_cov *= astc::rsqrt(bb_var * aa_var);

 	if (astc::isnan(rg_cov)) rg_cov = 1.0f;
 	if (astc::isnan(rb_cov)) rb_cov = 1.0f;
@@ -1144,7 +1139,7 @@ static float prepare_block_statistics(
 	if (astc::isnan(ga_cov)) ga_cov = 1.0f;
 	if (astc::isnan(ba_cov)) ba_cov = 1.0f;

-	float lowest_correlation = astc::min(fabsf(rg_cov), fabsf(rb_cov));
+	float lowest_correlation = astc::min(fabsf(rg_cov),      fabsf(rb_cov));
 	lowest_correlation       = astc::min(lowest_correlation, fabsf(ra_cov));
 	lowest_correlation       = astc::min(lowest_correlation, fabsf(gb_cov));
 	lowest_correlation       = astc::min(lowest_correlation, fabsf(ga_cov));
@@ -1197,6 +1192,18 @@ void compress_block(
 	bool block_skip_two_plane = false;
 	int max_partitions = ctx.config.tune_partition_count_limit;

+	unsigned int requested_partition_indices[3] {
+		ctx.config.tune_2partition_index_limit,
+		ctx.config.tune_3partition_index_limit,
+		ctx.config.tune_4partition_index_limit
+	};
+
+	unsigned int requested_partition_trials[3] {
+		ctx.config.tune_2partitioning_candidate_limit,
+		ctx.config.tune_3partitioning_candidate_limit,
+		ctx.config.tune_4partitioning_candidate_limit
+	};
+
 #if defined(ASTCENC_DIAGNOSTICS)
 	// Do this early in diagnostic builds so we can dump uniform metrics
 	// for every block. Do it later in release builds to avoid redundant work!
@@ -1366,13 +1373,19 @@ void compress_block(
 	// Find best blocks for 2, 3 and 4 partitions
 	for (int partition_count = 2; partition_count <= max_partitions; partition_count++)
 	{
-		unsigned int partition_indices[2] { 0 };
+		unsigned int partition_indices[TUNE_MAX_PARTITIIONING_CANDIDATES];

-		find_best_partition_candidates(bsd, blk, partition_count,
-		                               ctx.config.tune_partition_index_limit,
-		                               partition_indices);
+		unsigned int requested_indices = requested_partition_indices[partition_count - 2];

-		for (unsigned int i = 0; i < 2; i++)
+		unsigned int requested_trials = requested_partition_trials[partition_count - 2];
+		requested_trials = astc::min(requested_trials, requested_indices);
+
+		unsigned int actual_trials = find_best_partition_candidates(
+		    bsd, blk, partition_count, requested_indices, partition_indices, requested_trials);
+
+		float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];
+
+		for (unsigned int i = 0; i < actual_trials; i++)
 		{
 			TRACE_NODE(node1, "pass");
 			trace_add_data("partition_count", partition_count);
@@ -1387,6 +1400,20 @@ void compress_block(
 			    scb, tmpbuf, quant_limit);

 			best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval);
+
+			// If using N partitions doesn't improve much over using N-1 partitions then skip trying
+			// N+1. Error can dramatically improve if the data is correlated or non-correlated and
+			// aligns with a partitioning that suits that encoding, so for this inner loop check add
+			// a large error scale because the "other" trial could be a lot better. In total the
+			// error must be at least 2x worse than the best existing error to early-out.
+			float best_error = best_errorvals_for_pcount[partition_count - 1];
+			float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 2.0f;
+			if (best_error > (best_error_in_prev * best_error_scale))
+			{
+				trace_add_data("skip", "tune_partition_early_out_limit_factor");
+				goto END_OF_TESTS;
+			}
+
 			if (errorval < error_threshold)
 			{
 				trace_add_data("exit", "quality hit");
@@ -1396,7 +1423,6 @@ void compress_block(

 		// If using N partitions doesn't improve much over using N-1 partitions then skip trying N+1
 		float best_error = best_errorvals_for_pcount[partition_count - 1];
-		float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];
 		float best_error_scale = exit_thresholds_for_pcount[partition_count - 1];
 		if (best_error > (best_error_in_prev * best_error_scale))
 		{
--- a/3rdparty/astc-encoder/source/astcenc_entry.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_entry.cpp
@@ -40,10 +40,15 @@ struct astcenc_preset_config
 {
 	float quality;
 	unsigned int tune_partition_count_limit;
-	unsigned int tune_partition_index_limit;
+	unsigned int tune_2partition_index_limit;
+	unsigned int tune_3partition_index_limit;
+	unsigned int tune_4partition_index_limit;
 	unsigned int tune_block_mode_limit;
 	unsigned int tune_refinement_limit;
 	unsigned int tune_candidate_limit;
+	unsigned int tune_2partitioning_candidate_limit;
+	unsigned int tune_3partitioning_candidate_limit;
+	unsigned int tune_4partitioning_candidate_limit;
 	float tune_db_limit_a_base;
 	float tune_db_limit_b_base;
 	float tune_mode0_mse_overshoot;
@@ -51,7 +56,6 @@ struct astcenc_preset_config
 	float tune_2_partition_early_out_limit_factor;
 	float tune_3_partition_early_out_limit_factor;
 	float tune_2_plane_early_out_limit_correlation;
-	unsigned int tune_low_weight_count_limit;
 };


@@ -59,22 +63,25 @@ struct astcenc_preset_config
 * @brief The static quality presets that are built-in for high bandwidth
 * presets (x < 25 texels per block).
 */
-static const std::array<astcenc_preset_config, 5> preset_configs_high {{
+static const std::array<astcenc_preset_config, 6> preset_configs_high {{
 	{
 		ASTCENC_PRE_FASTEST,
-		2, 10, 43, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 25
+		2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.85f
 	}, {
 		ASTCENC_PRE_FAST,
-		3, 14, 55, 3, 3, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.1f, 0.65f, 20
+		3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.90f
 	}, {
 		ASTCENC_PRE_MEDIUM,
-		4, 28, 76, 3, 3, 95.0f, 70.0f, 2.5f, 2.5f, 1.2f, 1.25f, 0.85f, 16
+		4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 2.5f, 1.1f, 1.05f, 0.95f
 	}, {
 		ASTCENC_PRE_THOROUGH,
-		4, 76, 93, 4, 4, 105.0f, 77.0f, 10.0f, 10.0f, 2.5f, 1.25f, 0.95f, 12
+		4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 10.0f, 1.35f, 1.15f, 0.97f
+	}, {
+		ASTCENC_PRE_VERYTHOROUGH,
+		4, 256, 128, 64, 98, 4, 6, 20, 14, 8, 200.0f, 200.0f, 10.0f, 10.0f, 1.6f, 1.4f, 0.98f
 	}, {
 		ASTCENC_PRE_EXHAUSTIVE,
-		4, 1024, 100, 4, 4, 200.0f, 200.0f, 10.0f, 10.0f, 10.0f, 10.0f, 0.99f, 0
+		4, 512, 512, 512, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 10.0f, 2.0f, 2.0f, 0.99f
 	}
 }};

@@ -82,46 +89,51 @@ static const std::array<astcenc_preset_config, 5> preset_configs_high {{
 * @brief The static quality presets that are built-in for medium bandwidth
 * presets (25 <= x < 64 texels per block).
 */
-static const std::array<astcenc_preset_config, 5> preset_configs_mid {{
+static const std::array<astcenc_preset_config, 6> preset_configs_mid {{
 	{
 		ASTCENC_PRE_FASTEST,
-		2, 10, 43, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 20
+		2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.80f
 	}, {
 		ASTCENC_PRE_FAST,
-		3, 15, 55, 3, 3, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.1f, 0.5f, 16
+		3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.85f
 	}, {
 		ASTCENC_PRE_MEDIUM,
-		4, 30, 76, 3, 3, 95.0f, 70.0f, 3.0f, 3.0f, 1.2f, 1.25f, 0.75f, 14
+		4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 3.0f, 1.1f, 1.05f, 0.90f
 	}, {
 		ASTCENC_PRE_THOROUGH,
-		4, 76, 93, 4, 4, 105.0f, 77.0f, 10.0f, 10.0f, 2.5f, 1.25f, 0.95f, 10
+		4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 10.0f, 1.4f, 1.2f, 0.95f
+	}, {
+		ASTCENC_PRE_VERYTHOROUGH,
+		4, 256, 128, 64, 98, 4, 6, 12, 8, 3, 200.0f, 200.0f, 10.0f, 10.0f, 1.6f, 1.4f, 0.98f
 	}, {
 		ASTCENC_PRE_EXHAUSTIVE,
-		4, 1024, 100, 4, 4, 200.0f, 200.0f, 10.0f, 10.0f, 10.0f, 10.0f, 0.99f, 0
+		4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 10.0f, 2.0f, 2.0f, 0.99f
 	}
 }};

-
 /**
 * @brief The static quality presets that are built-in for low bandwidth
 * presets (64 <= x texels per block).
 */
-static const std::array<astcenc_preset_config, 5> preset_configs_low {{
+static const std::array<astcenc_preset_config, 6> preset_configs_low {{
 	{
 		ASTCENC_PRE_FASTEST,
-		2, 10, 40, 2, 2, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 20
+		2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.0f, 0.80f
 	}, {
 		ASTCENC_PRE_FAST,
-		2, 15, 55, 3, 3, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.1f, 0.5f, 16
+		2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.0f, 0.85f
 	}, {
 		ASTCENC_PRE_MEDIUM,
-		3, 30, 76, 3, 3, 95.0f, 70.0f, 3.5f, 3.5f, 1.2f, 1.25f, 0.65f, 12
+		3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 3.5f, 1.1f, 1.05f, 0.90f
 	}, {
 		ASTCENC_PRE_THOROUGH,
-		4, 75, 92, 4, 4, 105.0f, 77.0f, 10.0f, 10.0f, 2.5f, 1.25f, 0.85f, 10
+		4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 10.0f, 1.3f, 1.2f, 0.97f
+	}, {
+		ASTCENC_PRE_VERYTHOROUGH,
+		4, 256, 128, 64, 98, 4, 6, 9, 5, 2, 200.0f, 200.0f, 10.0f, 10.0f, 1.6f, 1.4f, 0.98f
 	}, {
 		ASTCENC_PRE_EXHAUSTIVE,
-		4, 1024, 100, 4, 4, 200.0f, 200.0f, 10.0f, 10.0f, 10.0f, 10.0f, 0.99f, 0
+		4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 10.0f, 2.0f, 2.0f, 0.99f
 	}
 }};

@@ -422,10 +434,15 @@ static astcenc_error validate_config(
 	config.rgbm_m_scale = astc::max(config.rgbm_m_scale, 1.0f);

 	config.tune_partition_count_limit = astc::clamp(config.tune_partition_count_limit, 1u, 4u);
-	config.tune_partition_index_limit = astc::clamp(config.tune_partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
+	config.tune_2partition_index_limit = astc::clamp(config.tune_2partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
+	config.tune_3partition_index_limit = astc::clamp(config.tune_3partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
+	config.tune_4partition_index_limit = astc::clamp(config.tune_4partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
 	config.tune_block_mode_limit = astc::clamp(config.tune_block_mode_limit, 1u, 100u);
 	config.tune_refinement_limit = astc::max(config.tune_refinement_limit, 1u);
 	config.tune_candidate_limit = astc::clamp(config.tune_candidate_limit, 1u, TUNE_MAX_TRIAL_CANDIDATES);
+	config.tune_2partitioning_candidate_limit = astc::clamp(config.tune_2partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIIONING_CANDIDATES);
+	config.tune_3partitioning_candidate_limit = astc::clamp(config.tune_3partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIIONING_CANDIDATES);
+	config.tune_4partitioning_candidate_limit = astc::clamp(config.tune_4partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIIONING_CANDIDATES);
 	config.tune_db_limit = astc::max(config.tune_db_limit, 0.0f);
 	config.tune_mode0_mse_overshoot = astc::max(config.tune_mode0_mse_overshoot, 1.0f);
 	config.tune_refinement_mse_overshoot = astc::max(config.tune_refinement_mse_overshoot, 1.0f);
@@ -464,9 +481,23 @@ astcenc_error astcenc_config_init(
 	astcenc_config* configp
 ) {
 	astcenc_error status;
-	astcenc_config& config = *configp;
+
+	// Check basic library compatibility options here so they are checked early. Note, these checks
+	// are repeated in context_alloc for cases where callers use a manually defined config struct
+	status = validate_cpu_isa();
+	if (status != ASTCENC_SUCCESS)
+	{
+		return status;
+	}
+
+	status = validate_cpu_float();
+	if (status != ASTCENC_SUCCESS)
+	{
+		return status;
+	}

 	// Zero init all config fields; although most of will be over written
+	astcenc_config& config = *configp;
 	std::memset(&config, 0, sizeof(config));

 	// Process the block size
@@ -493,7 +524,7 @@ astcenc_error astcenc_config_init(
 		return ASTCENC_ERR_BAD_QUALITY;
 	}

-	static const std::array<astcenc_preset_config, 5>* preset_configs;
+	static const std::array<astcenc_preset_config, 6>* preset_configs;
 	int texels_int = block_x * block_y * block_z;
 	if (texels_int < 25)
 	{
@@ -525,11 +556,15 @@ astcenc_error astcenc_config_init(
 	if (start == end)
 	{
 		config.tune_partition_count_limit = (*preset_configs)[start].tune_partition_count_limit;
-		config.tune_partition_index_limit = (*preset_configs)[start].tune_partition_index_limit;
+		config.tune_2partition_index_limit = (*preset_configs)[start].tune_2partition_index_limit;
+		config.tune_3partition_index_limit = (*preset_configs)[start].tune_3partition_index_limit;
+		config.tune_4partition_index_limit = (*preset_configs)[start].tune_4partition_index_limit;
 		config.tune_block_mode_limit = (*preset_configs)[start].tune_block_mode_limit;
 		config.tune_refinement_limit = (*preset_configs)[start].tune_refinement_limit;
-		config.tune_candidate_limit = astc::min((*preset_configs)[start].tune_candidate_limit,
-		                                        TUNE_MAX_TRIAL_CANDIDATES);
+		config.tune_candidate_limit = astc::min((*preset_configs)[start].tune_candidate_limit, TUNE_MAX_TRIAL_CANDIDATES);
+		config.tune_2partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_2partitioning_candidate_limit, TUNE_MAX_PARTITIIONING_CANDIDATES);
+		config.tune_3partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_3partitioning_candidate_limit, TUNE_MAX_PARTITIIONING_CANDIDATES);
+		config.tune_4partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_4partitioning_candidate_limit, TUNE_MAX_PARTITIIONING_CANDIDATES);
 		config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels,
 		                                 (*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels);

@@ -539,7 +574,6 @@ astcenc_error astcenc_config_init(
 		config.tune_2_partition_early_out_limit_factor = (*preset_configs)[start].tune_2_partition_early_out_limit_factor;
 		config.tune_3_partition_early_out_limit_factor =(*preset_configs)[start].tune_3_partition_early_out_limit_factor;
 		config.tune_2_plane_early_out_limit_correlation = (*preset_configs)[start].tune_2_plane_early_out_limit_correlation;
-		config.tune_low_weight_count_limit = (*preset_configs)[start].tune_low_weight_count_limit;
 	}
 	// Start and end node are not the same - so interpolate between them
 	else
@@ -561,11 +595,19 @@ astcenc_error astcenc_config_init(
 		#define LERPUI(param) static_cast<unsigned int>(LERPI(param))

 		config.tune_partition_count_limit = LERPI(tune_partition_count_limit);
-		config.tune_partition_index_limit = LERPI(tune_partition_index_limit);
+		config.tune_2partition_index_limit = LERPI(tune_2partition_index_limit);
+		config.tune_3partition_index_limit = LERPI(tune_3partition_index_limit);
+		config.tune_4partition_index_limit = LERPI(tune_4partition_index_limit);
 		config.tune_block_mode_limit = LERPI(tune_block_mode_limit);
 		config.tune_refinement_limit = LERPI(tune_refinement_limit);
 		config.tune_candidate_limit = astc::min(LERPUI(tune_candidate_limit),
 		                                        TUNE_MAX_TRIAL_CANDIDATES);
+		config.tune_2partitioning_candidate_limit = astc::min(LERPUI(tune_2partitioning_candidate_limit),
+		                                                      BLOCK_MAX_PARTITIONINGS);
+		config.tune_3partitioning_candidate_limit = astc::min(LERPUI(tune_3partitioning_candidate_limit),
+		                                                      BLOCK_MAX_PARTITIONINGS);
+		config.tune_4partitioning_candidate_limit = astc::min(LERPUI(tune_4partitioning_candidate_limit),
+		                                                      BLOCK_MAX_PARTITIONINGS);
 		config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels,
 		                                 LERP(tune_db_limit_b_base) - 19 * ltexels);

@@ -575,7 +617,6 @@ astcenc_error astcenc_config_init(
 		config.tune_2_partition_early_out_limit_factor = LERP(tune_2_partition_early_out_limit_factor);
 		config.tune_3_partition_early_out_limit_factor = LERP(tune_3_partition_early_out_limit_factor);
 		config.tune_2_plane_early_out_limit_correlation = LERP(tune_2_plane_early_out_limit_correlation);
-		config.tune_low_weight_count_limit = LERPI(tune_low_weight_count_limit);
 		#undef LERP
 		#undef LERPI
 		#undef LERPUI
@@ -676,13 +717,13 @@ astcenc_error astcenc_context_alloc(
 	astcenc_error status;
 	const astcenc_config& config = *configp;

-	status = validate_cpu_float();
+	status = validate_cpu_isa();
 	if (status != ASTCENC_SUCCESS)
 	{
 		return status;
 	}

-	status = validate_cpu_isa();
+	status = validate_cpu_float();
 	if (status != ASTCENC_SUCCESS)
 	{
 		return status;
@@ -714,7 +755,7 @@ astcenc_error astcenc_context_alloc(
 	status = validate_config(ctx->config);
 	if (status != ASTCENC_SUCCESS)
 	{
-		delete ctx;
+		delete ctxo;
 		return status;
 	}

--- a/3rdparty/astc-encoder/source/astcenc_find_best_partitioning.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_find_best_partitioning.cpp
@@ -485,13 +485,59 @@ static unsigned int compute_kmeans_partition_ordering(
 	    mismatch_counts, partition_ordering);
 }

+/**
+ * @brief Insert a partitioning into an ordered list of results, sorted by error.
+ *
+ * @param      max_values      The max number of entries in the best result arrays.
+ * @param      this_error      The error of the new entry.
+ * @param      this_partition  The partition ID of the new entry.
+ * @param[out] best_errors     The array of best error values.
+ * @param[out] best_partitions The array of best partition values.
+ */
+static void insert_result(
+	unsigned int max_values,
+	float this_error,
+	unsigned int this_partition,
+	float* best_errors,
+	unsigned int* best_partitions)
+{
+	// Don't bother searching if the current worst error beats the new error
+	if (this_error >= best_errors[max_values - 1])
+	{
+		return;
+	}
+
+	// Else insert into the list in error-order
+	for (unsigned int i = 0; i < max_values;  i++)
+	{
+		// Existing result is better - move on ...
+		if (this_error > best_errors[i])
+		{
+			continue;
+		}
+
+		// Move existing results down one
+		for (unsigned int j = max_values - 1; j > i; j--)
+		{
+			best_errors[j] = best_errors[j - 1];
+			best_partitions[j] = best_partitions[j - 1];
+		}
+
+		// Insert new result
+		best_errors[i] = this_error;
+		best_partitions[i] = this_partition;
+		break;
+	}
+}
+
 /* See header for documentation. */
-void find_best_partition_candidates(
+unsigned int find_best_partition_candidates(
 	const block_size_descriptor& bsd,
 	const image_block& blk,
 	unsigned int partition_count,
 	unsigned int partition_search_limit,
-	unsigned int best_partitions[2]
+	unsigned int best_partitions[BLOCK_MAX_PARTITIONINGS],
+	unsigned int requested_candidates
 ) {
 	// Constant used to estimate quantization error for a given partitioning; the optimal value for
 	// this depends on bitrate. These values have been determined empirically.
@@ -518,17 +564,23 @@ void find_best_partition_candidates(
 	unsigned int partition_sequence[BLOCK_MAX_PARTITIONINGS];
 	unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence);
 	partition_search_limit = astc::min(partition_search_limit, sequence_len);
+	requested_candidates = astc::min(partition_search_limit, requested_candidates);

 	bool uses_alpha = !blk.is_constant_channel(3);

 	// Partitioning errors assuming uncorrelated-chrominance endpoints
-	float uncor_best_error { ERROR_CALC_DEFAULT };
-	unsigned int uncor_best_partition { 0 };
+	float uncor_best_errors[TUNE_MAX_PARTITIIONING_CANDIDATES];
+	unsigned int uncor_best_partitions[TUNE_MAX_PARTITIIONING_CANDIDATES];

 	// Partitioning errors assuming same-chrominance endpoints
-	// Store two so we can always return one different to uncorr
-	float samec_best_errors[2] { ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT };
-	unsigned int samec_best_partitions[2] { 0, 0 };
+	float samec_best_errors[TUNE_MAX_PARTITIIONING_CANDIDATES];
+	unsigned int samec_best_partitions[TUNE_MAX_PARTITIIONING_CANDIDATES];
+
+	for (unsigned int i = 0; i < requested_candidates; i++)
+	{
+		uncor_best_errors[i] = ERROR_CALC_DEFAULT;
+		samec_best_errors[i] = ERROR_CALC_DEFAULT;
+	}

 	if (uses_alpha)
 	{
@@ -602,25 +654,8 @@ void find_best_partition_candidates(
 				samec_error += dot_s(samec_vector * samec_vector, error_weights);
 			}

-			if (uncor_error < uncor_best_error)
-			{
-				uncor_best_error = uncor_error;
-				uncor_best_partition = partition;
-			}
-
-			if (samec_error < samec_best_errors[0])
-			{
-				samec_best_errors[1] = samec_best_errors[0];
-				samec_best_partitions[1] = samec_best_partitions[0];
-
-				samec_best_errors[0] = samec_error;
-				samec_best_partitions[0] = partition;
-			}
-			else if (samec_error < samec_best_errors[1])
-			{
-				samec_best_errors[1] = samec_error;
-				samec_best_partitions[1] = partition;
-			}
+			insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
+			insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
 		}
 	}
 	else
@@ -687,50 +722,55 @@ void find_best_partition_candidates(
 				samec_error += dot3_s(samec_vector * samec_vector, error_weights);
 			}

-			if (uncor_error < uncor_best_error)
-			{
-				uncor_best_error = uncor_error;
-				uncor_best_partition = partition;
-			}
+			insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
+			insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
+		}
+	}

-			if (samec_error < samec_best_errors[0])
-			{
-				samec_best_errors[1] = samec_best_errors[0];
-				samec_best_partitions[1] = samec_best_partitions[0];
+	bool best_is_uncor = uncor_best_partitions[0] > samec_best_partitions[0];

-				samec_best_errors[0] = samec_error;
-				samec_best_partitions[0] = partition;
-			}
-			else if (samec_error < samec_best_errors[1])
+	unsigned int interleave[2 * TUNE_MAX_PARTITIIONING_CANDIDATES];
+	for (unsigned int i = 0; i < requested_candidates; i++)
+	{
+		if (best_is_uncor)
+		{
+			interleave[2 * i] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;
+			interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;
+		}
+		else
+		{
+			interleave[2 * i] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;
+			interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;
+		}
+	}
+
+	uint64_t bitmasks[1024/64] { 0 };
+	unsigned int emitted = 0;
+
+	// Deduplicate the first "requested" entries
+	for (unsigned int i = 0; i < requested_candidates * 2;  i++)
+	{
+		unsigned int partition = interleave[i];
+
+		unsigned int word = partition / 64;
+		unsigned int bit = partition % 64;
+
+		bool written = bitmasks[word] & (1ull << bit);
+
+		if (!written)
+		{
+			best_partitions[emitted] = partition;
+			bitmasks[word] |= 1ull << bit;
+			emitted++;
+
+			if (emitted == requested_candidates)
 			{
-				samec_best_errors[1] = samec_error;
-				samec_best_partitions[1] = partition;
+				break;
 			}
 		}
 	}

-	// Same partition is best for both, so use this first unconditionally
-	if (uncor_best_partition == samec_best_partitions[0])
-	{
-		best_partitions[0] = samec_best_partitions[0];
-		best_partitions[1] = samec_best_partitions[1];
-	}
-	// Uncor is best
-	else if (uncor_best_error <= samec_best_errors[0])
-	{
-		best_partitions[0] = uncor_best_partition;
-		best_partitions[1] = samec_best_partitions[0];
-	}
-	// Samec is best
-	else
-	{
-		best_partitions[0] = samec_best_partitions[0];
-		best_partitions[1] = uncor_best_partition;
-	}
-
-	// Convert these back into canonical partition IDs for the rest of the codec
-	best_partitions[0] = bsd.get_raw_partition_info(partition_count, best_partitions[0]).partition_index;
-	best_partitions[1] = bsd.get_raw_partition_info(partition_count, best_partitions[1]).partition_index;
+	return emitted;
 }

 #endif
--- a/3rdparty/astc-encoder/source/astcenc_internal.h
+++ b/3rdparty/astc-encoder/source/astcenc_internal.h
@@ -130,7 +130,14 @@ static constexpr unsigned int TUNE_MIN_TEXELS_MODE0_FASTPATH { 24 };
 *
 * This can be dynamically reduced by the compression quality preset.
 */
-static constexpr unsigned int TUNE_MAX_TRIAL_CANDIDATES { 4 };
+static constexpr unsigned int TUNE_MAX_TRIAL_CANDIDATES { 8 };
+
+/**
+ * @brief The maximum number of candidate partitionings tested for each encoding mode.
+ *
+ * This can be dynamically reduced by the compression quality preset.
+ */
+static constexpr unsigned int TUNE_MAX_PARTITIIONING_CANDIDATES { 32 };

 /**
 * @brief The maximum quant level using full angular endpoint search method.
@@ -1345,11 +1352,11 @@ extern const int8_t quant_mode_table[10][128];
 * Note that BISE can return strings that are not a whole number of bytes in length, and ASTC can
 * start storing strings in a block at arbitrary bit offsets in the encoded data.
 *
- * @param         quant_level      The BISE alphabet size.
- * @param         character_count  The number of characters in the string.
- * @param         input_data       The unpacked string, one byte per character.
- * @param[in,out] output_data      The output packed string.
- * @param         bit_offset       The starting offset in the output storage.
+ * @param         quant_level       The BISE alphabet size.
+ * @param         character_count   The number of characters in the string.
+ * @param         input_data        The unpacked string, one byte per character.
+ * @param[in,out] output_data       The output packed string.
+ * @param         bit_offset        The starting offset in the output storage.
 */
 void encode_ise(
 	quant_method quant_level,
@@ -1436,11 +1443,11 @@ void compute_avgs_and_dirs_3_comp(
 * This is a specialization of @c compute_avgs_and_dirs_3_comp where the omitted component is
 * always alpha, a common case during partition search.
 *
- * @param      pi                  The partition info for the current trial.
- * @param      blk                 The image block color data to be compressed.
- * @param[out] pm                  The output partition metrics.
- *                                 - Only pi.partition_count array entries actually get initialized.
- *                                 - Direction vectors @c pm.dir are not normalized.
+ * @param      pi    The partition info for the current trial.
+ * @param      blk   The image block color data to be compressed.
+ * @param[out] pm    The output partition metrics.
+ *                   - Only pi.partition_count array entries actually get initialized.
+ *                   - Direction vectors @c pm.dir are not normalized.
 */
 void compute_avgs_and_dirs_3_comp_rgb(
 	const partition_info& pi,
@@ -1471,11 +1478,11 @@ void compute_avgs_and_dirs_4_comp(
 *
 * This function computes the squared error when using these two representations.
 *
- * @param         pi              The partition info for the current trial.
- * @param         blk             The image block color data to be compressed.
- * @param[in,out] plines          Processed line inputs, and line length outputs.
- * @param[out]    uncor_error     The cumulative error for using the uncorrelated line.
- * @param[out]    samec_error     The cumulative error for using the same chroma line.
+ * @param         pi            The partition info for the current trial.
+ * @param         blk           The image block color data to be compressed.
+ * @param[in,out] plines        Processed line inputs, and line length outputs.
+ * @param[out]    uncor_error   The cumulative error for using the uncorrelated line.
+ * @param[out]    samec_error   The cumulative error for using the same chroma line.
 */
 void compute_error_squared_rgb(
 	const partition_info& pi,
@@ -1520,18 +1527,23 @@ void compute_error_squared_rgba(
 * candidates; one assuming data has uncorrelated chroma and one assuming the
 * data has correlated chroma. The best candidate is returned first in the list.
 *
- * @param      bsd                        The block size information.
- * @param      blk                        The image block color data to compress.
- * @param      partition_count            The number of partitions in the block.
- * @param      partition_search_limit     The number of candidate partition encodings to trial.
- * @param[out] best_partitions            The best partition candidates.
+ * @param      bsd                      The block size information.
+ * @param      blk                      The image block color data to compress.
+ * @param      partition_count          The number of partitions in the block.
+ * @param      partition_search_limit   The number of candidate partition encodings to trial.
+ * @param[out] best_partitions          The best partition candidates.
+ * @param      requested_candidates     The number of requested partitionings. May return fewer if
+ *                                      candidates are not available.
+ *
+ * @return The actual number of candidates returned.
 */
-void find_best_partition_candidates(
+unsigned int find_best_partition_candidates(
 	const block_size_descriptor& bsd,
 	const image_block& blk,
 	unsigned int partition_count,
 	unsigned int partition_search_limit,
-	unsigned int best_partitions[2]);
+	unsigned int best_partitions[TUNE_MAX_PARTITIIONING_CANDIDATES],
+	unsigned int requested_candidates);

 /* ============================================================================
  Functionality for managing images and image related data.
@@ -1545,10 +1557,10 @@ void find_best_partition_candidates(
 *
 * Results are written back into @c img->input_alpha_averages.
 *
- * @param      img                     The input image data, also holds output data.
- * @param      alpha_kernel_radius     The kernel radius (in pixels) for alpha mods.
- * @param      swz                     Input data component swizzle.
- * @param[out] ag                      The average variance arguments to init.
+ * @param      img                   The input image data, also holds output data.
+ * @param      alpha_kernel_radius   The kernel radius (in pixels) for alpha mods.
+ * @param      swz                   Input data component swizzle.
+ * @param[out] ag                    The average variance arguments to init.
 *
 * @return The number of tasks in the processing stage.
 */
@@ -1766,13 +1778,13 @@ float compute_error_of_weight_set_2planes(
 * The user requests a base color endpoint mode in @c format, but the quantizer may choose a
 * delta-based representation. It will report back the format variant it actually used.
 *
- * @param      color0       The input unquantized color0 endpoint for absolute endpoint pairs.
- * @param      color1       The input unquantized color1 endpoint for absolute endpoint pairs.
- * @param      rgbs_color   The input unquantized RGBS variant endpoint for same chroma endpoints.
- * @param      rgbo_color   The input unquantized RGBS variant endpoint for HDR endpoints..
- * @param      format       The desired base format.
- * @param[out] output       The output storage for the quantized colors/
- * @param      quant_level  The quantization level requested.
+ * @param      color0        The input unquantized color0 endpoint for absolute endpoint pairs.
+ * @param      color1        The input unquantized color1 endpoint for absolute endpoint pairs.
+ * @param      rgbs_color    The input unquantized RGBS variant endpoint for same chroma endpoints.
+ * @param      rgbo_color    The input unquantized RGBO variant endpoint for HDR endpoints.
+ * @param      format        The desired base format.
+ * @param[out] output        The output storage for the quantized colors.
+ * @param      quant_level   The quantization level requested.
 *
 * @return The actual endpoint mode used.
 */
@@ -1873,13 +1885,13 @@ unsigned int compute_ideal_endpoint_formats(
 * As we quantize and decimate weights the optimal endpoint colors may change slightly, so we must
 * recompute the ideal colors for a specific weight set.
 *
- * @param         blk                        The image block color data to compress.
- * @param         pi                         The partition info for the current trial.
- * @param         di                         The weight grid decimation table.
+ * @param         blk                  The image block color data to compress.
+ * @param         pi                   The partition info for the current trial.
+ * @param         di                   The weight grid decimation table.
 * @param         dec_weights_uquant   The quantized weight set.
- * @param[in,out] ep                         The color endpoints (modifed in place).
- * @param[out]    rgbs_vectors               The RGB+scale vectors for LDR blocks.
- * @param[out]    rgbo_vectors               The RGB+offset vectors for HDR blocks.
+ * @param[in,out] ep                   The color endpoints (modified in place).
+ * @param[out]    rgbs_vectors         The RGB+scale vectors for LDR blocks.
+ * @param[out]    rgbo_vectors         The RGB+offset vectors for HDR blocks.
 */
 void recompute_ideal_colors_1plane(
 	const image_block& blk,
@@ -1896,15 +1908,15 @@ void recompute_ideal_colors_1plane(
 * As we quantize and decimate weights the optimal endpoint colors may change slightly, so we must
 * recompute the ideal colors for a specific weight set.
 *
- * @param         blk                               The image block color data to compress.
- * @param         bsd                               The block_size descriptor.
- * @param         di                                The weight grid decimation table.
+ * @param         blk                         The image block color data to compress.
+ * @param         bsd                         The block_size descriptor.
+ * @param         di                          The weight grid decimation table.
 * @param         dec_weights_uquant_plane1   The quantized weight set for plane 1.
 * @param         dec_weights_uquant_plane2   The quantized weight set for plane 2.
- * @param[in,out] ep                                The color endpoints (modifed in place).
- * @param[out]    rgbs_vector                       The RGB+scale color for LDR blocks.
- * @param[out]    rgbo_vector                       The RGB+offset color for HDR blocks.
- * @param         plane2_component                  The component assigned to plane 2.
+ * @param[in,out] ep                          The color endpoints (modified in place).
+ * @param[out]    rgbs_vector                 The RGB+scale color for LDR blocks.
+ * @param[out]    rgbo_vector                 The RGB+offset color for HDR blocks.
+ * @param         plane2_component            The component assigned to plane 2.
 */
 void recompute_ideal_colors_2planes(
 	const image_block& blk,
@@ -1925,15 +1937,13 @@ void prepare_angular_tables();
 /**
 * @brief Compute the angular endpoints for one plane for each block mode.
 *
- * @param      tune_low_weight_limit     Weight count cutoff below which we use simpler searches.
- * @param      only_always               Only consider block modes that are always enabled.
- * @param      bsd                       The block size descriptor for the current trial.
- * @param      dec_weight_ideal_value    The ideal decimated unquantized weight values.
- * @param      max_weight_quant          The maximum block mode weight quantization allowed.
- * @param[out] tmpbuf                    Preallocated scratch buffers for the compressor.
+ * @param      only_always              Only consider block modes that are always enabled.
+ * @param      bsd                      The block size descriptor for the current trial.
+ * @param      dec_weight_ideal_value   The ideal decimated unquantized weight values.
+ * @param      max_weight_quant         The maximum block mode weight quantization allowed.
+ * @param[out] tmpbuf                   Preallocated scratch buffers for the compressor.
 */
 void compute_angular_endpoints_1plane(
-	unsigned int tune_low_weight_limit,
 	bool only_always,
 	const block_size_descriptor& bsd,
 	const float* dec_weight_ideal_value,
@@ -1943,14 +1953,12 @@ void compute_angular_endpoints_1plane(
 /**
 * @brief Compute the angular endpoints for two planes for each block mode.
 *
- * @param      tune_low_weight_limit     Weight count cutoff below which we use simpler searches.
- * @param      bsd                       The block size descriptor for the current trial.
- * @param      dec_weight_ideal_value    The ideal decimated unquantized weight values.
- * @param      max_weight_quant          The maximum block mode weight quantization allowed.
- * @param[out] tmpbuf                    Preallocated scratch buffers for the compressor.
+ * @param      bsd                      The block size descriptor for the current trial.
+ * @param      dec_weight_ideal_value   The ideal decimated unquantized weight values.
+ * @param      max_weight_quant         The maximum block mode weight quantization allowed.
+ * @param[out] tmpbuf                   Preallocated scratch buffers for the compressor.
 */
 void compute_angular_endpoints_2planes(
-	unsigned int tune_low_weight_limit,
 	const block_size_descriptor& bsd,
 	const float* dec_weight_ideal_value,
 	unsigned int max_weight_quant,
@@ -2162,18 +2170,4 @@ void aligned_free(T* ptr)
 #endif
 }

-static inline void dump_weights(const char* label, uint8_t* weights, int weight_count)
-{
-	printf("%s\n", label);
-	vint lane = vint::lane_id();
-	for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
-	{
-		vmask mask = lane < vint(weight_count);
-		vint val(weights + i);
-		val = select(vint::zero(), val, mask);
-		print(val);
-		lane += vint(ASTCENC_SIMD_WIDTH);
-	}
-}
-
 #endif
--- a/3rdparty/astc-encoder/source/astcenc_vecmathlib_common_4.h
+++ b/3rdparty/astc-encoder/source/astcenc_vecmathlib_common_4.h
@@ -361,6 +361,23 @@ static inline int popcount(uint64_t v)

 #endif

+/**
+ * @brief Apply signed bit transfer, updating both endpoint vectors in place.
+ *
+ * @param[in,out] input0   The first encoded endpoint; becomes a sign-extended 6-bit value.
+ * @param[in,out] input1   The second encoded endpoint; shifted right, taking bit 7 of input0.
+ */
+static ASTCENC_SIMD_INLINE void bit_transfer_signed(
+	vint4& input0,
+	vint4& input1
+) {
+	input1 = lsr<1>(input1) | (input0 & 0x80);
+	input0 = lsr<1>(input0) & 0x3F;
+
+	vmask4 mask = (input0 & 0x20) != vint4::zero();
+	input0 = select(input0, input0 - 0x40, mask);
+}
+
 /**
 * @brief Debug function to print a vector of ints.
 */
--- a/3rdparty/astc-encoder/source/astcenc_weight_align.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_weight_align.cpp
@@ -333,156 +333,8 @@ static void compute_angular_endpoints_for_quant_levels(
 	}
 }

-/**
- * @brief For a given step size compute the lowest and highest weight, variant for low weight count.
- *
- * Compute the lowest and highest weight that results from quantizing using the given stepsize and
- * offset, and then compute the resulting error. The cut errors indicate the error that results from
- * forcing samples that should have had one weight value one step up or down.
- *
- * @param      weight_count              The number of (decimated) weights.
- * @param      dec_weight_quant_uvalue   The decimated and quantized weight values.
- * @param      max_angular_steps         The maximum number of steps to be tested.
- * @param      max_quant_steps           The maximum quantization level to be tested.
- * @param      offsets                   The angular offsets array.
- * @param[out] lowest_weight             Per angular step, the lowest weight.
- * @param[out] weight_span               Per angular step, the span between lowest and highest weight.
- * @param[out] error                     Per angular step, the error.
- */
-static void compute_lowest_and_highest_weight_lwc(
-	unsigned int weight_count,
-	const float* dec_weight_quant_uvalue,
-	unsigned int max_angular_steps,
-	unsigned int max_quant_steps,
-	const float* offsets,
-	float* lowest_weight,
-	int* weight_span,
-	float* error
-) {
-	promise(weight_count > 0);
-	promise(max_angular_steps > 0);
-
-	vfloat rcp_stepsize = vfloat::lane_id() + vfloat(1.0f);
-
-	// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
-	for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
-	{
-		vfloat minidx(128.0f);
-		vfloat maxidx(-128.0f);
-		vfloat errval = vfloat::zero();
-		vfloat offset = loada(offsets + sp);
-
-		for (unsigned int j = 0; j < weight_count; j++)
-		{
-			vfloat sval = load1(dec_weight_quant_uvalue + j) * rcp_stepsize - offset;
-			vfloat svalrte = round(sval);
-			vfloat diff = sval - svalrte;
-			errval += diff * diff;
-
-			// Compute min and max quantized weight spans for each step
-			minidx = min(minidx, svalrte);
-			maxidx = max(maxidx, svalrte);
-		}
-
-		// Write out min weight and weight span; clamp span to a usable range
-		vint span = float_to_int(maxidx - minidx + vfloat(1.0f));
-		span = min(span, vint(max_quant_steps + 3));
-		span = max(span, vint(2));
-		storea(minidx, lowest_weight + sp);
-		storea(span, weight_span + sp);
-
-		vfloat ssize = 1.0f / rcp_stepsize;
-		vfloat errscale = ssize * ssize;
-		storea(errval * errscale, error + sp);
-
-		rcp_stepsize = rcp_stepsize + vfloat(ASTCENC_SIMD_WIDTH);
-	}
-}
-
-/**
- * @brief The main function for the angular algorithm, variant for low weight count.
- *
- * @param      weight_count              The number of (decimated) weights.
- * @param      dec_weight_ideal_value    The ideal decimated unquantized weight values.
- * @param      max_quant_level           The maximum quantization level to be tested.
- * @param[out] low_value                 Per angular step, the lowest weight value.
- * @param[out] high_value                Per angular step, the highest weight value.
- */
-static void compute_angular_endpoints_for_quant_levels_lwc(
-	unsigned int weight_count,
-	const float* dec_weight_ideal_value,
-	unsigned int max_quant_level,
-	float low_value[TUNE_MAX_ANGULAR_QUANT + 1],
-	float high_value[TUNE_MAX_ANGULAR_QUANT + 1]
-) {
-	unsigned int max_quant_steps = steps_for_quant_level[max_quant_level];
-	unsigned int max_angular_steps = steps_for_quant_level[max_quant_level];
-
-	alignas(ASTCENC_VECALIGN) float angular_offsets[ANGULAR_STEPS];
-	alignas(ASTCENC_VECALIGN) float lowest_weight[ANGULAR_STEPS];
-	alignas(ASTCENC_VECALIGN) int32_t weight_span[ANGULAR_STEPS];
-	alignas(ASTCENC_VECALIGN) float error[ANGULAR_STEPS];
-
-	compute_angular_offsets(weight_count, dec_weight_ideal_value,
-	                        max_angular_steps, angular_offsets);
-
-
-	compute_lowest_and_highest_weight_lwc(weight_count, dec_weight_ideal_value,
-	                                      max_angular_steps, max_quant_steps,
-	                                      angular_offsets, lowest_weight, weight_span, error);
-
-	// For each quantization level, find the best error terms. Use packed vectors so data-dependent
-	// branches can become selects. This involves some integer to float casts, but the values are
-	// small enough so they never round the wrong way.
-	vfloat4 best_results[36];
-
-	// Initialize the array to some safe defaults
-	promise(max_quant_steps > 0);
-	for (unsigned int i = 0; i < (max_quant_steps + 4); i++)
-	{
-		best_results[i] = vfloat4(ERROR_CALC_DEFAULT, -1.0f, 0.0f, 0.0f);
-	}
-
-	promise(max_angular_steps > 0);
-	for (unsigned int i = 0; i < max_angular_steps; i++)
-	{
-		int idx_span = weight_span[i];
-
-		// Check best error against record N
-		vfloat4 current_best = best_results[idx_span];
-		vfloat4 candidate = vfloat4(error[i], static_cast<float>(i), 0.0f, 0.0f);
-		vmask4 mask = vfloat4(current_best.lane<0>()) > vfloat4(error[i]);
-		best_results[idx_span] = select(current_best, candidate, mask);
-	}
-
-	for (unsigned int i = 0; i <= max_quant_level; i++)
-	{
-		unsigned int q = steps_for_quant_level[i];
-		int bsi = static_cast<int>(best_results[q].lane<1>());
-
-		// Did we find anything?
-#if defined(ASTCENC_DIAGNOSTICS)
-		if ((bsi < 0) && print_once)
-		{
-			print_once = false;
-			printf("INFO: Unable to find low weight encoding within search error limit.\n\n");
-		}
-#endif
-
-		bsi = astc::max(0, bsi);
-
-		float lwi = lowest_weight[bsi];
-		float hwi = lwi + static_cast<float>(q) - 1.0f;
-
-		float stepsize = 1.0f / (1.0f + static_cast<float>(bsi));
-		low_value[i]  = (angular_offsets[bsi] + lwi) * stepsize;
-		high_value[i] = (angular_offsets[bsi] + hwi) * stepsize;
-	}
-}
-
 /* See header for documentation. */
 void compute_angular_endpoints_1plane(
-	unsigned int tune_low_weight_limit,
 	bool only_always,
 	const block_size_descriptor& bsd,
 	const float* dec_weight_ideal_value,
@@ -519,20 +371,10 @@ void compute_angular_endpoints_1plane(
 			max_precision = max_weight_quant;
 		}

-		if (weight_count < tune_low_weight_limit)
-		{
-			compute_angular_endpoints_for_quant_levels_lwc(
-				weight_count,
-				dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
-				max_precision, low_values[i], high_values[i]);
-		}
-		else
-		{
-			compute_angular_endpoints_for_quant_levels(
-				weight_count,
-				dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
-				max_precision, low_values[i], high_values[i]);
-		}
+		compute_angular_endpoints_for_quant_levels(
+		    weight_count,
+		    dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
+		    max_precision, low_values[i], high_values[i]);
 	}

 	unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
@@ -561,7 +403,6 @@ void compute_angular_endpoints_1plane(

 /* See header for documentation. */
 void compute_angular_endpoints_2planes(
-	unsigned int tune_low_weight_limit,
 	const block_size_descriptor& bsd,
 	const float* dec_weight_ideal_value,
 	unsigned int max_weight_quant,
@@ -599,30 +440,15 @@ void compute_angular_endpoints_2planes(
 			max_precision = max_weight_quant;
 		}

-		if (weight_count < tune_low_weight_limit)
-		{
-			compute_angular_endpoints_for_quant_levels_lwc(
-				weight_count,
-				dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
-				max_precision, low_values1[i], high_values1[i]);
+		compute_angular_endpoints_for_quant_levels(
+		    weight_count,
+		    dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
+		    max_precision, low_values1[i], high_values1[i]);

-			compute_angular_endpoints_for_quant_levels_lwc(
-				weight_count,
-				dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET,
-				max_precision, low_values2[i], high_values2[i]);
-		}
-		else
-		{
-			compute_angular_endpoints_for_quant_levels(
-				weight_count,
-				dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
-				max_precision, low_values1[i], high_values1[i]);
-
-			compute_angular_endpoints_for_quant_levels(
-				weight_count,
-				dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET,
-				max_precision, low_values2[i], high_values2[i]);
-		}
+		compute_angular_endpoints_for_quant_levels(
+		    weight_count,
+		    dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET,
+		    max_precision, low_values2[i], high_values2[i]);
 	}

 	unsigned int start = bsd.block_mode_count_1plane_selected;