Updated astc-encoder.

2026-02-17 20:52:38 +01:00 · 2023-01-01 19:16:52 -08:00
parent 6a15b31f19
commit 1395f4e969
17 changed files with 201 additions and 232 deletions
--- a/3rdparty/astc-encoder/include/astcenc.h
+++ b/3rdparty/astc-encoder/include/astcenc.h
@@ -514,25 +514,15 @@ struct astcenc_config
 	float tune_db_limit;

 	/**
-	 * @brief The amount of overshoot needed to early-out mode 0 fast path.
+	 * @brief The amount of MSE overshoot needed to early-out trials.
 	 *
-	 * We have a fast-path for mode 0 (1 partition, 1 plane) which uses only essential block modes
-	 * as an initial search. This can short-cut compression for simple blocks, but to avoid
-	 * short-cutting too much we force this to overshoot the MSE threshold needed to hit the
-	 * block-local db_limit e.g. 1.0 = no overshoot, 2.0 = need half the error to trigger.
-	 */
-	float tune_mode0_mse_overshoot;
-
-	/**
-	 * @brief The amount of overshoot needed to early-out refinement.
+	 * The first early-out is for 1 partition, 1 plane trials, where we try a minimal encode using
+	 * the high probability block modes. This can short-cut compression for simple blocks.
 	 *
-	 * The codec will refine block candidates iteratively to improve the encoding, based on the
-	 * @c tune_refinement_limit count. Earlier implementations will use all refinement iterations,
-	 * even if the target threshold is reached. This tuning parameter allows an early out, but with
-	 * an overshoot MSE threshold. Setting this to 1.0 will early-out as soon as the target is hit,
-	 * but does reduce image quality vs the default behavior of over-refinement.
+	 * The second early-out is for refinement trials, where we can exit refinement once quality is
+	 * reached.
 	 */
-	float tune_refinement_mse_overshoot;
+	float tune_mse_overshoot;

 	/**
 	 * @brief The threshold for skipping 3.1/4.1 trials (-2partitionlimitfactor).
--- a/3rdparty/astc-encoder/source/astcenc_color_quantize.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_color_quantize.cpp
@@ -1960,7 +1960,7 @@ uint8_t pack_color_endpoints(
 	switch (format)
 	{
 	case FMT_RGB:
-		if (quant_level <= 18)
+		if (quant_level <= QUANT_160)
 		{
 			if (try_quantize_rgb_delta_blue_contract(color0, color1, output, quant_level))
 			{
@@ -1973,7 +1973,7 @@ uint8_t pack_color_endpoints(
 				break;
 			}
 		}
-		if (try_quantize_rgb_blue_contract(color0, color1, output, quant_level))
+		if (quant_level < QUANT_256 && try_quantize_rgb_blue_contract(color0, color1, output, quant_level))
 		{
 			retval = FMT_RGB;
 			break;
@@ -1983,7 +1983,7 @@ uint8_t pack_color_endpoints(
 		break;

 	case FMT_RGBA:
-		if (quant_level <= 18)
+		if (quant_level <= QUANT_160)
 		{
 			if (try_quantize_rgba_delta_blue_contract(color0, color1, output, quant_level))
 			{
@@ -1996,7 +1996,7 @@ uint8_t pack_color_endpoints(
 				break;
 			}
 		}
-		if (try_quantize_rgba_blue_contract(color0, color1, output, quant_level))
+		if (quant_level < QUANT_256 && try_quantize_rgba_blue_contract(color0, color1, output, quant_level))
 		{
 			retval = FMT_RGBA;
 			break;
--- a/3rdparty/astc-encoder/source/astcenc_compress_symbolic.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_compress_symbolic.cpp
@@ -82,7 +82,7 @@ static bool realign_weights_undecimated(
 	const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];

 	unsigned int max_plane = bm.is_dual_plane;
-	int plane2_component = bm.is_dual_plane ? scb.plane2_component : -1;
+	int plane2_component = scb.plane2_component;
 	vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);

 	// Decode the color endpoints
@@ -206,7 +206,7 @@ static bool realign_weights_decimated(
 	assert(weight_count != bsd.texel_count);

 	unsigned int max_plane = bm.is_dual_plane;
-	int plane2_component = bm.is_dual_plane ? scb.plane2_component : -1;
+	int plane2_component = scb.plane2_component;
 	vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);

 	// Decode the color endpoints
@@ -1279,13 +1279,13 @@ void compress_block(
 	// compression and slightly reduces image quality.

 	float errorval_mult[2] {
-		1.0f / ctx.config.tune_mode0_mse_overshoot,
+		1.0f / ctx.config.tune_mse_overshoot,
 		1.0f
 	};

-	static const float errorval_overshoot = 1.0f / ctx.config.tune_refinement_mse_overshoot;
+	static const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;

-	// Only enable MODE0 fast path (trial 0) if 2D and more than 25 texels
+	// Only enable MODE0 fast path (trial 0) if 2D, and more than 25 texels
 	int start_trial = 1;
 	if ((bsd.texel_count >= TUNE_MIN_TEXELS_MODE0_FASTPATH) && (bsd.zdim == 1))
 	{
--- a/3rdparty/astc-encoder/source/astcenc_decompress_symbolic.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_decompress_symbolic.cpp
@@ -286,7 +286,7 @@ void decompress_symbolic_block(
 	unpack_weights(bsd, scb, di, is_dual_plane, plane1_weights, plane2_weights);

 	// Now that we have endpoint colors and weights, we can unpack texel colors
-	int plane2_component = is_dual_plane ? scb.plane2_component : -1;
+	int plane2_component = scb.plane2_component;
 	vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component);

 	for (int i = 0; i < partition_count; i++)
--- a/3rdparty/astc-encoder/source/astcenc_entry.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_entry.cpp
@@ -51,89 +51,84 @@ struct astcenc_preset_config
 	unsigned int tune_4partitioning_candidate_limit;
 	float tune_db_limit_a_base;
 	float tune_db_limit_b_base;
-	float tune_mode0_mse_overshoot;
-	float tune_refinement_mse_overshoot;
+	float tune_mse_overshoot;
 	float tune_2_partition_early_out_limit_factor;
 	float tune_3_partition_early_out_limit_factor;
 	float tune_2_plane_early_out_limit_correlation;
 };

-
 /**
- * @brief The static quality presets that are built-in for high bandwidth
- * presets (x < 25 texels per block).
+ * @brief The static presets for high bandwidth encodings (x < 25 texels per block).
 */
 static const std::array<astcenc_preset_config, 6> preset_configs_high {{
 	{
 		ASTCENC_PRE_FASTEST,
-		2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.85f
+		2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f
 	}, {
 		ASTCENC_PRE_FAST,
-		3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.90f
+		3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.90f
 	}, {
 		ASTCENC_PRE_MEDIUM,
-		4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 2.5f, 1.1f, 1.05f, 0.95f
+		4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 1.1f, 1.05f, 0.95f
 	}, {
 		ASTCENC_PRE_THOROUGH,
-		4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 10.0f, 1.35f, 1.15f, 0.97f
+		4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.35f, 1.15f, 0.97f
 	}, {
 		ASTCENC_PRE_VERYTHOROUGH,
-		4, 256, 128, 64, 98, 4, 6, 20, 14, 8, 200.0f, 200.0f, 10.0f, 10.0f, 1.6f, 1.4f, 0.98f
+		4, 256, 128, 64, 98, 4, 6, 20, 14, 8, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f
 	}, {
 		ASTCENC_PRE_EXHAUSTIVE,
-		4, 512, 512, 512, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 10.0f, 2.0f, 2.0f, 0.99f
+		4, 512, 512, 512, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f
 	}
 }};

 /**
- * @brief The static quality presets that are built-in for medium bandwidth
- * presets (25 <= x < 64 texels per block).
+ * @brief The static presets for medium bandwidth encodings (25 <= x < 64 texels per block).
 */
 static const std::array<astcenc_preset_config, 6> preset_configs_mid {{
 	{
 		ASTCENC_PRE_FASTEST,
-		2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.80f
+		2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.80f
 	}, {
 		ASTCENC_PRE_FAST,
-		3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.85f
+		3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f
 	}, {
 		ASTCENC_PRE_MEDIUM,
-		4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 3.0f, 1.1f, 1.05f, 0.90f
+		4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 1.1f, 1.05f, 0.90f
 	}, {
 		ASTCENC_PRE_THOROUGH,
-		4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 10.0f, 1.4f, 1.2f, 0.95f
+		4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.4f, 1.2f, 0.95f
 	}, {
 		ASTCENC_PRE_VERYTHOROUGH,
-		4, 256, 128, 64, 98, 4, 6, 12, 8, 3, 200.0f, 200.0f, 10.0f, 10.0f, 1.6f, 1.4f, 0.98f
+		4, 256, 128, 64, 98, 4, 6, 12, 8, 3, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f
 	}, {
 		ASTCENC_PRE_EXHAUSTIVE,
-		4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 10.0f, 2.0f, 2.0f, 0.99f
+		4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f
 	}
 }};

 /**
- * @brief The static quality presets that are built-in for low bandwidth
- * presets (64 <= x texels per block).
+ * @brief The static presets for low bandwidth encodings (64 <= x texels per block).
 */
 static const std::array<astcenc_preset_config, 6> preset_configs_low {{
 	{
 		ASTCENC_PRE_FASTEST,
-		2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.0f, 0.80f
+		2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.80f
 	}, {
 		ASTCENC_PRE_FAST,
-		2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.0f, 0.85f
+		2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.85f
 	}, {
 		ASTCENC_PRE_MEDIUM,
-		3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 3.5f, 1.1f, 1.05f, 0.90f
+		3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 1.1f, 1.05f, 0.90f
 	}, {
 		ASTCENC_PRE_THOROUGH,
-		4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 10.0f, 1.3f, 1.2f, 0.97f
+		4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.3f, 1.2f, 0.97f
 	}, {
 		ASTCENC_PRE_VERYTHOROUGH,
-		4, 256, 128, 64, 98, 4, 6, 9, 5, 2, 200.0f, 200.0f, 10.0f, 10.0f, 1.6f, 1.4f, 0.98f
+		4, 256, 128, 64, 98, 4, 6, 9, 5, 2, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f
 	}, {
 		ASTCENC_PRE_EXHAUSTIVE,
-		4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 10.0f, 2.0f, 2.0f, 0.99f
+		4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f
 	}
 }};

@@ -444,8 +439,7 @@ static astcenc_error validate_config(
 	config.tune_3partitioning_candidate_limit = astc::clamp(config.tune_3partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIIONING_CANDIDATES);
 	config.tune_4partitioning_candidate_limit = astc::clamp(config.tune_4partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIIONING_CANDIDATES);
 	config.tune_db_limit = astc::max(config.tune_db_limit, 0.0f);
-	config.tune_mode0_mse_overshoot = astc::max(config.tune_mode0_mse_overshoot, 1.0f);
-	config.tune_refinement_mse_overshoot = astc::max(config.tune_refinement_mse_overshoot, 1.0f);
+	config.tune_mse_overshoot = astc::max(config.tune_mse_overshoot, 1.0f);
 	config.tune_2_partition_early_out_limit_factor = astc::max(config.tune_2_partition_early_out_limit_factor, 0.0f);
 	config.tune_3_partition_early_out_limit_factor = astc::max(config.tune_3_partition_early_out_limit_factor, 0.0f);
 	config.tune_2_plane_early_out_limit_correlation = astc::max(config.tune_2_plane_early_out_limit_correlation, 0.0f);
@@ -568,8 +562,7 @@ astcenc_error astcenc_config_init(
 		config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels,
 		                                 (*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels);

-		config.tune_mode0_mse_overshoot = (*preset_configs)[start].tune_mode0_mse_overshoot;
-		config.tune_refinement_mse_overshoot = (*preset_configs)[start].tune_refinement_mse_overshoot;
+		config.tune_mse_overshoot = (*preset_configs)[start].tune_mse_overshoot;

 		config.tune_2_partition_early_out_limit_factor = (*preset_configs)[start].tune_2_partition_early_out_limit_factor;
 		config.tune_3_partition_early_out_limit_factor =(*preset_configs)[start].tune_3_partition_early_out_limit_factor;
@@ -611,8 +604,7 @@ astcenc_error astcenc_config_init(
 		config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels,
 		                                 LERP(tune_db_limit_b_base) - 19 * ltexels);

-		config.tune_mode0_mse_overshoot = LERP(tune_mode0_mse_overshoot);
-		config.tune_refinement_mse_overshoot = LERP(tune_refinement_mse_overshoot);
+		config.tune_mse_overshoot = LERP(tune_mse_overshoot);

 		config.tune_2_partition_early_out_limit_factor = LERP(tune_2_partition_early_out_limit_factor);
 		config.tune_3_partition_early_out_limit_factor = LERP(tune_3_partition_early_out_limit_factor);
--- a/3rdparty/astc-encoder/source/astcenc_find_best_partitioning.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_find_best_partitioning.cpp
@@ -362,6 +362,7 @@ static void count_partition_mismatch_bits(
 	unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS]
 ) {
 	unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1];
+	promise(active_count > 0);

 	if (partition_count == 2)
 	{
@@ -400,6 +401,7 @@ static unsigned int get_partition_ordering_by_mismatch_bits(
 	const unsigned int mismatch_count[BLOCK_MAX_PARTITIONINGS],
 	unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS]
 ) {
+	promise(partitioning_count > 0);
 	unsigned int mscount[256] { 0 };

 	// Create the histogram of mismatch counts
@@ -488,7 +490,7 @@ static unsigned int compute_kmeans_partition_ordering(
 /**
 * @brief Insert a partitioning into an order list of results, sorted by error.
 *
- * @param      max_values      The max number of entries in the best result arrays/
+ * @param      max_values      The max number of entries in the best result arrays.
 * @param      this_error      The error of the new entry.
 * @param      this_partition  The partition ID of the new entry.
 * @param[out] best_errors     The array of best error values.
@@ -501,6 +503,8 @@ static void insert_result(
 	float* best_errors,
 	unsigned int* best_partitions)
 {
+	promise(max_values > 0);
+
 	// Don't bother searching if the current worst error beats the new error
 	if (this_error >= best_errors[max_values - 1])
 	{
@@ -508,7 +512,7 @@ static void insert_result(
 	}

 	// Else insert into the list in error-order
-	for (unsigned int i = 0; i < max_values;  i++)
+	for (unsigned int i = 0; i < max_values; i++)
 	{
 		// Existing result is better - move on ...
 		if (this_error > best_errors[i])
--- a/3rdparty/astc-encoder/source/astcenc_ideal_endpoints_and_weights.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_ideal_endpoints_and_weights.cpp
@@ -692,6 +692,7 @@ float compute_error_of_weight_set_1plane(
 ) {
 	vfloatacc error_summav = vfloatacc::zero();
 	unsigned int texel_count = di.texel_count;
+	promise(texel_count > 0);

 	// Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized
 	if (di.max_texel_weight_count > 2)
@@ -757,6 +758,7 @@ float compute_error_of_weight_set_2planes(
 ) {
 	vfloatacc error_summav = vfloatacc::zero();
 	unsigned int texel_count = di.texel_count;
+	promise(texel_count > 0);

 	// Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized
 	if (di.max_texel_weight_count > 2)
@@ -861,8 +863,7 @@ void compute_ideal_weights_for_decimation(
 	// zero-initialized SIMD over-fetch region
 	if (is_direct)
 	{
-		unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
-		for (unsigned int i = 0; i < texel_count_simd; i += ASTCENC_SIMD_WIDTH)
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 		{
 			vfloat weight(ei.weights + i);
 			storea(weight, dec_weight_ideal_value + i);
@@ -970,7 +971,7 @@ void compute_ideal_weights_for_decimation(
 		vfloat step = (error_change1 * chd_scale) / error_change0;
 		step = clamp(-stepsize, stepsize, step);

-		// Update the weight; note this can store negative values.
+		// Update the weight; note this can store negative values
 		storea(weight_val + step, dec_weight_ideal_value + i);
 	}
 }
@@ -1215,7 +1216,7 @@ void recompute_ideal_colors_1plane(
 		// Only compute a partition mean if more than one partition
 		if (partition_count > 1)
 		{
-			rgba_sum = vfloat4(1e-17f);
+			rgba_sum = vfloat4::zero();
 			promise(texel_count > 0);
 			for (unsigned int j = 0; j < texel_count; j++)
 			{
@@ -1251,7 +1252,6 @@ void recompute_ideal_colors_1plane(
 		for (unsigned int j = 0; j < texel_count; j++)
 		{
 			unsigned int tix = texel_indexes[j];
-
 			vfloat4 rgba = blk.texel(tix);

 			float idx0 = undec_weight_ref[tix];
@@ -1284,9 +1284,6 @@ void recompute_ideal_colors_1plane(
 		vfloat4 right_sum  = vfloat4(right_sum_s) * color_weight;
 		vfloat4 lmrs_sum   = vfloat3(left_sum_s, middle_sum_s, right_sum_s) * ls_weight;

-		vfloat4 weight_weight_sum = vfloat4(weight_weight_sum_s) * color_weight;
-		float psum = right_sum_s * hadd_rgb_s(color_weight);
-
 		color_vec_x = color_vec_x * color_weight;
 		color_vec_y = color_vec_y * color_weight;

@@ -1349,26 +1346,32 @@ void recompute_ideal_colors_1plane(
 			}
 		}

-		// Calculations specific to mode #7, the HDR RGB-scale mode
-		vfloat4 rgbq_sum = color_vec_x + color_vec_y;
-		rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
-
-		vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
-		rgbo_vectors[i] = rgbovec;
-
-		// We can get a failure due to the use of a singular (non-invertible) matrix
-		// If it failed, compute rgbo_vectors[] with a different method ...
-		if (astc::isnan(dot_s(rgbovec, rgbovec)))
+		// Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR
+		if (blk.rgb_lns[0] || blk.alpha_lns[0])
 		{
-			vfloat4 v0 = ep.endpt0[i];
-			vfloat4 v1 = ep.endpt1[i];
+			vfloat4 weight_weight_sum = vfloat4(weight_weight_sum_s) * color_weight;
+			float psum = right_sum_s * hadd_rgb_s(color_weight);

-			float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
-			avgdif = astc::max(avgdif, 0.0f);
+			vfloat4 rgbq_sum = color_vec_x + color_vec_y;
+			rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));

-			vfloat4 avg = (v0 + v1) * 0.5f;
-			vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
-			rgbo_vectors[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
+			vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
+			rgbo_vectors[i] = rgbovec;
+
+			// We can get a failure due to the use of a singular (non-invertible) matrix
+			// If it failed, compute rgbo_vectors[] with a different method ...
+			if (astc::isnan(dot_s(rgbovec, rgbovec)))
+			{
+				vfloat4 v0 = ep.endpt0[i];
+				vfloat4 v1 = ep.endpt1[i];
+
+				float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
+				avgdif = astc::max(avgdif, 0.0f);
+
+				vfloat4 avg = (v0 + v1) * 0.5f;
+				vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
+				rgbo_vectors[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
+			}
 		}
 	}
 }
@@ -1516,7 +1519,7 @@ void recompute_ideal_colors_2planes(
 		color_vec_x += cwprod - cwiprod;

 		scale_vec += vfloat2(om_idx0, idx0) * (ls_weight * scale);
-		weight_weight_sum += (color_weight * color_idx);
+		weight_weight_sum += color_idx;
 	}

 	vfloat4 left1_sum   = vfloat4(left1_sum_s) * color_weight;
@@ -1528,8 +1531,6 @@ void recompute_ideal_colors_2planes(
 	vfloat4 middle2_sum = vfloat4(middle2_sum_s) * color_weight;
 	vfloat4 right2_sum  = vfloat4(right2_sum_s) * color_weight;

-	float psum = dot3_s(select(right1_sum, right2_sum, p2_mask), color_weight);
-
 	color_vec_x = color_vec_x * color_weight;
 	color_vec_y = color_vec_y * color_weight;

@@ -1630,26 +1631,32 @@ void recompute_ideal_colors_2planes(
 		ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask);
 	}

-	// Calculations specific to mode #7, the HDR RGB-scale mode
-	vfloat4 rgbq_sum = color_vec_x + color_vec_y;
-	rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
-
-	rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
-
-	// We can get a failure due to the use of a singular (non-invertible) matrix
-	// If it failed, compute rgbo_vectors[] with a different method ...
-	if (astc::isnan(dot_s(rgbo_vector, rgbo_vector)))
+	// Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR
+	if (blk.rgb_lns[0] || blk.alpha_lns[0])
 	{
-		vfloat4 v0 = ep.endpt0[0];
-		vfloat4 v1 = ep.endpt1[0];
+		weight_weight_sum = weight_weight_sum * color_weight;
+		float psum = dot3_s(select(right1_sum, right2_sum, p2_mask), color_weight);

-		float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
-		avgdif = astc::max(avgdif, 0.0f);
+		vfloat4 rgbq_sum = color_vec_x + color_vec_y;
+		rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));

-		vfloat4 avg = (v0 + v1) * 0.5f;
-		vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
+		rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);

-		rgbo_vector = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
+		// We can get a failure due to the use of a singular (non-invertible) matrix
+		// If it failed, compute rgbo_vector with a different method ...
+		if (astc::isnan(dot_s(rgbo_vector, rgbo_vector)))
+		{
+			vfloat4 v0 = ep.endpt0[0];
+			vfloat4 v1 = ep.endpt1[0];
+
+			float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
+			avgdif = astc::max(avgdif, 0.0f);
+
+			vfloat4 avg = (v0 + v1) * 0.5f;
+			vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
+
+			rgbo_vector = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
+		}
 	}
 }

--- a/3rdparty/astc-encoder/source/astcenc_integer_sequence.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_integer_sequence.cpp
@@ -24,6 +24,7 @@
 #include <array>

 /** @brief Unpacked quint triplets <low,middle,high> for each packed value */
+// TODO: Bitpack these into a uint16_t?
 static const uint8_t quints_of_integer[128][3] {
 	{0, 0, 0}, {1, 0, 0}, {2, 0, 0}, {3, 0, 0},
 	{4, 0, 0}, {0, 4, 0}, {4, 4, 0}, {4, 4, 4},
@@ -99,6 +100,7 @@ static const uint8_t integer_of_quints[5][5][5] {
 };

 /** @brief Unpacked trit quintuplets <low,...,high> for each packed value */
+// TODO: Bitpack these into a uint16_t?
 static const uint8_t trits_of_integer[256][5] {
 	{0, 0, 0, 0, 0}, {1, 0, 0, 0, 0}, {2, 0, 0, 0, 0}, {0, 0, 2, 0, 0},
 	{0, 1, 0, 0, 0}, {1, 1, 0, 0, 0}, {2, 1, 0, 0, 0}, {1, 0, 2, 0, 0},
@@ -334,44 +336,41 @@ static const uint8_t integer_of_trits[3][3][3][3][3] {
 */
 struct btq_count
 {
-	/** @brief The quantization level. */
-	uint8_t quant;
-
 	/** @brief The number of bits. */
-	uint8_t bits;
+	uint8_t bits:6;

 	/** @brief The number of trits. */
-	uint8_t trits;
+	uint8_t trits:1;

 	/** @brief The number of quints. */
-	uint8_t quints;
+	uint8_t quints:1;
 };

 /**
 * @brief The table of bits, trits, and quints needed for a quant encode.
 */
 static const std::array<btq_count, 21> btq_counts {{
-	{   QUANT_2, 1, 0, 0 },
-	{   QUANT_3, 0, 1, 0 },
-	{   QUANT_4, 2, 0, 0 },
-	{   QUANT_5, 0, 0, 1 },
-	{   QUANT_6, 1, 1, 0 },
-	{   QUANT_8, 3, 0, 0 },
-	{  QUANT_10, 1, 0, 1 },
-	{  QUANT_12, 2, 1, 0 },
-	{  QUANT_16, 4, 0, 0 },
-	{  QUANT_20, 2, 0, 1 },
-	{  QUANT_24, 3, 1, 0 },
-	{  QUANT_32, 5, 0, 0 },
-	{  QUANT_40, 3, 0, 1 },
-	{  QUANT_48, 4, 1, 0 },
-	{  QUANT_64, 6, 0, 0 },
-	{  QUANT_80, 4, 0, 1 },
-	{  QUANT_96, 5, 1, 0 },
-	{ QUANT_128, 7, 0, 0 },
-	{ QUANT_160, 5, 0, 1 },
-	{ QUANT_192, 6, 1, 0 },
-	{ QUANT_256, 8, 0, 0 }
+	{ 1, 0, 0 }, // QUANT_2
+	{ 0, 1, 0 }, // QUANT_3
+	{ 2, 0, 0 }, // QUANT_4
+	{ 0, 0, 1 }, // QUANT_5
+	{ 1, 1, 0 }, // QUANT_6
+	{ 3, 0, 0 }, // QUANT_8
+	{ 1, 0, 1 }, // QUANT_10
+	{ 2, 1, 0 }, // QUANT_12
+	{ 4, 0, 0 }, // QUANT_16
+	{ 2, 0, 1 }, // QUANT_20
+	{ 3, 1, 0 }, // QUANT_24
+	{ 5, 0, 0 }, // QUANT_32
+	{ 3, 0, 1 }, // QUANT_40
+	{ 4, 1, 0 }, // QUANT_48
+	{ 6, 0, 0 }, // QUANT_64
+	{ 4, 0, 1 }, // QUANT_80
+	{ 5, 1, 0 }, // QUANT_96
+	{ 7, 0, 0 }, // QUANT_128
+	{ 5, 0, 1 }, // QUANT_160
+	{ 6, 1, 0 }, // QUANT_192
+	{ 8, 0, 0 }  // QUANT_256
 }};

 /**
@@ -382,44 +381,38 @@ static const std::array<btq_count, 21> btq_counts {{
 */
 struct ise_size
 {
-	/** @brief The quantization level. */
-	uint8_t quant;
-
 	/** @brief The scaling parameter. */
-	uint8_t scale;
-
-	/** @brief The rounding parameter. */
-	uint8_t round;
+	uint8_t scale:6;

 	/** @brief The divisor parameter. */
-	uint8_t divisor;
+	uint8_t divisor:2;
 };

 /**
 * @brief The table of scale, round, and divisors needed for quant sizing.
 */
 static const std::array<ise_size, 21> ise_sizes {{
-	{   QUANT_2,  1, 0, 1 },
-	{   QUANT_3,  8, 4, 5 },
-	{   QUANT_4,  2, 0, 1 },
-	{   QUANT_5,  7, 2, 3 },
-	{   QUANT_6, 13, 4, 5 },
-	{   QUANT_8,  3, 0, 1 },
-	{  QUANT_10, 10, 2, 3 },
-	{  QUANT_12, 18, 4, 5 },
-	{  QUANT_16,  4, 0, 1 },
-	{  QUANT_20, 13, 2, 3 },
-	{  QUANT_24, 23, 4, 5 },
-	{  QUANT_32,  5, 0, 1 },
-	{  QUANT_40, 16, 2, 3 },
-	{  QUANT_48, 28, 4, 5 },
-	{  QUANT_64,  6, 0, 1 },
-	{  QUANT_80, 19, 2, 3 },
-	{  QUANT_96, 33, 4, 5 },
-	{ QUANT_128,  7, 0, 1 },
-	{ QUANT_160, 22, 2, 3 },
-	{ QUANT_192, 38, 4, 5 },
-	{ QUANT_256,  8, 0, 1 }
+	{  1, 0 }, // QUANT_2
+	{  8, 2 }, // QUANT_3
+	{  2, 0 }, // QUANT_4
+	{  7, 1 }, // QUANT_5
+	{ 13, 2 }, // QUANT_6
+	{  3, 0 }, // QUANT_8
+	{ 10, 1 }, // QUANT_10
+	{ 18, 2 }, // QUANT_12
+	{  4, 0 }, // QUANT_16
+	{ 13, 1 }, // QUANT_20
+	{ 23, 2 }, // QUANT_24
+	{  5, 0 }, // QUANT_32
+	{ 16, 1 }, // QUANT_40
+	{ 28, 2 }, // QUANT_48
+	{  6, 0 }, // QUANT_64
+	{ 19, 1 }, // QUANT_80
+	{ 33, 2 }, // QUANT_96
+	{  7, 0 }, // QUANT_128
+	{ 22, 1 }, // QUANT_160
+	{ 38, 2 }, // QUANT_192
+	{  8, 0 }  // QUANT_256
 }};

 /* See header for documentation. */
@@ -435,7 +428,8 @@ unsigned int get_ise_sequence_bitcount(
 	}

 	auto& entry = ise_sizes[quant_level];
-	return (entry.scale * character_count + entry.round) / entry.divisor;
+	unsigned int divisor = (entry.divisor << 1) + 1;
+	return (entry.scale * character_count + divisor - 1) / divisor;
 }

 /**
@@ -645,7 +639,6 @@ void encode_ise(
 	// Write out just bits
 	else
 	{
-		promise(character_count > 0);
 		for (unsigned int i = 0; i < character_count; i++)
 		{
 			write_bits(input_data[i], bits, bit_offset, output_data);
@@ -685,10 +678,10 @@ void decode_ise(

 		if (trits)
 		{
-			static const unsigned int bits_to_read[5]  { 2, 2, 1, 2, 1 };
-			static const unsigned int block_shift[5]   { 0, 2, 4, 5, 7 };
-			static const unsigned int next_lcounter[5] { 1, 2, 3, 4, 0 };
-			static const unsigned int hcounter_incr[5] { 0, 0, 0, 0, 1 };
+			static const uint8_t bits_to_read[5]  { 2, 2, 1, 2, 1 };
+			static const uint8_t block_shift[5]   { 0, 2, 4, 5, 7 };
+			static const uint8_t next_lcounter[5] { 1, 2, 3, 4, 0 };
+			static const uint8_t hcounter_incr[5] { 0, 0, 0, 0, 1 };
 			unsigned int tdata = read_bits(bits_to_read[lcounter], bit_offset, input_data);
 			bit_offset += bits_to_read[lcounter];
 			tq_blocks[hcounter] |= tdata << block_shift[lcounter];
@@ -698,10 +691,10 @@ void decode_ise(

 		if (quints)
 		{
-			static const unsigned int bits_to_read[3]  { 3, 2, 2 };
-			static const unsigned int block_shift[3]   { 0, 3, 5 };
-			static const unsigned int next_lcounter[3] { 1, 2, 0 };
-			static const unsigned int hcounter_incr[3] { 0, 0, 1 };
+			static const uint8_t bits_to_read[3]  { 3, 2, 2 };
+			static const uint8_t block_shift[3]   { 0, 3, 5 };
+			static const uint8_t next_lcounter[3] { 1, 2, 0 };
+			static const uint8_t hcounter_incr[3] { 0, 0, 1 };
 			unsigned int tdata = read_bits(bits_to_read[lcounter], bit_offset, input_data);
 			bit_offset += bits_to_read[lcounter];
 			tq_blocks[hcounter] |= tdata << block_shift[lcounter];
@@ -714,6 +707,7 @@ void decode_ise(
 	if (trits)
 	{
 		unsigned int trit_blocks = (character_count + 4) / 5;
+		promise(trit_blocks > 0);
 		for (unsigned int i = 0; i < trit_blocks; i++)
 		{
 			const uint8_t *tritptr = trits_of_integer[tq_blocks[i]];
@@ -728,6 +722,7 @@ void decode_ise(
 	if (quints)
 	{
 		unsigned int quint_blocks = (character_count + 2) / 3;
+		promise(quint_blocks > 0);
 		for (unsigned int i = 0; i < quint_blocks; i++)
 		{
 			const uint8_t *quintptr = quints_of_integer[tq_blocks[i]];
--- a/3rdparty/astc-encoder/source/astcenc_internal.h
+++ b/3rdparty/astc-encoder/source/astcenc_internal.h
@@ -1008,9 +1008,6 @@ struct dt_init_working_buffers
 */
 struct quant_and_transfer_table
 {
-	/** @brief The quantization level used. */
-	quant_method method;
-
 	/** @brief The unscrambled unquantized value. */
 	int8_t quant_to_unquant[32];

--- a/3rdparty/astc-encoder/source/astcenc_mathlib_softfloat.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_mathlib_softfloat.cpp
@@ -15,13 +15,13 @@
 // under the License.
 // ----------------------------------------------------------------------------

+#include "astcenc_mathlib.h"
+
 /**
 * @brief Soft-float library for IEEE-754.
 */
 #if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)

-#include "astcenc_mathlib.h"
-
 /*	sized soft-float types. These are mapped to the sized integer
    types of C99, instead of C's floating-point types; this is because
    the library needs to maintain exact, bit-level control on all
--- a/3rdparty/astc-encoder/source/astcenc_percentile_tables.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_percentile_tables.cpp
@@ -1166,11 +1166,11 @@ const float *get_2d_percentile_table(
 	unsigned int xdim,
 	unsigned int ydim
 ) {
-	float* unpacked_table = new float[2048];
+	float* unpacked_table = new float[WEIGHTS_MAX_BLOCK_MODES];
 	const packed_percentile_table *apt = get_packed_table(xdim, ydim);

 	// Set the default percentile
-	for (unsigned int i = 0; i < 2048; i++)
+	for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++)
 	{
 		unpacked_table[i] = 1.0f;
 	}
--- a/3rdparty/astc-encoder/source/astcenc_pick_best_endpoint_format.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_pick_best_endpoint_format.cpp
@@ -325,11 +325,7 @@ static void compute_color_error_for_every_integer_count_and_quant_level(
 ) {
 	int partition_size = pi.partition_texel_count[partition_index];

-	static const float baseline_quant_error[21] {
-		(65536.0f * 65536.0f / 18.0f),				// 2 values, 1 step
-		(65536.0f * 65536.0f / 18.0f) / (2 * 2),	// 3 values, 2 steps
-		(65536.0f * 65536.0f / 18.0f) / (3 * 3),	// 4 values, 3 steps
-		(65536.0f * 65536.0f / 18.0f) / (4 * 4),	// 5 values
+	static const float baseline_quant_error[21 - QUANT_6] {
 		(65536.0f * 65536.0f / 18.0f) / (5 * 5),
 		(65536.0f * 65536.0f / 18.0f) / (7 * 7),
 		(65536.0f * 65536.0f / 18.0f) / (9 * 9),
@@ -528,7 +524,7 @@ static void compute_color_error_for_every_integer_count_and_quant_level(
 			// The base_quant_error should depend on the scale-factor that would be used during
 			// actual encode of the color value

-			float base_quant_error = baseline_quant_error[i] * static_cast<float>(partition_size);
+			float base_quant_error = baseline_quant_error[i - QUANT_6] * static_cast<float>(partition_size);
 			float rgb_quantization_error = error_weight_rgbsum * base_quant_error * 2.0f;
 			float alpha_quantization_error = error_weight.lane<3>() * base_quant_error * 2.0f;
 			float rgba_quantization_error = rgb_quantization_error + alpha_quantization_error;
@@ -591,7 +587,7 @@ static void compute_color_error_for_every_integer_count_and_quant_level(
 				error_scale_oe_rgb = 1.0f;
 			}

-			float base_quant_error = baseline_quant_error[i];
+			float base_quant_error = baseline_quant_error[i - QUANT_6];
 			float quant_error_rgb  = base_quant_error_rgb * base_quant_error;
 			float quant_error_rgba = base_quant_error_rgba * base_quant_error;

@@ -1136,22 +1132,19 @@ unsigned int compute_ideal_endpoint_formats(
 	uint8_t (&best_ep_formats)[WEIGHTS_MAX_BLOCK_MODES][BLOCK_MAX_PARTITIONS] = tmpbuf.best_ep_formats;

 	// Ensure that the first iteration understep contains data that will never be picked
+	vfloat clear_error(ERROR_CALC_DEFAULT);
+	vint clear_quant(0);
+
 	unsigned int packed_start_block_mode = round_down_to_simd_multiple_vla(start_block_mode);
-	for (unsigned int i = packed_start_block_mode; i < start_block_mode; i++)
-	{
-		errors_of_best_combination[i] = ERROR_CALC_DEFAULT;
-		best_quant_levels[i] = QUANT_2;
-		best_quant_levels_mod[i] = QUANT_2;
-	}
+	storea(clear_error, errors_of_best_combination + packed_start_block_mode);
+	store_nbytes(clear_quant, best_quant_levels + packed_start_block_mode);
+	store_nbytes(clear_quant, best_quant_levels_mod + packed_start_block_mode);

 	// Ensure that last iteration overstep contains data that will never be picked
-	const unsigned int packed_end_block_mode = round_up_to_simd_multiple_vla(end_block_mode);
-	for (unsigned int i = end_block_mode; i < packed_end_block_mode; i++)
-	{
-		errors_of_best_combination[i] = ERROR_CALC_DEFAULT;
-		best_quant_levels[i] = QUANT_2;
-		best_quant_levels_mod[i] = QUANT_2;
-	}
+	unsigned int packed_end_block_mode = round_down_to_simd_multiple_vla(end_block_mode - 1);
+	storea(clear_error, errors_of_best_combination + packed_end_block_mode);
+	store_nbytes(clear_quant, best_quant_levels + packed_end_block_mode);
+	store_nbytes(clear_quant, best_quant_levels_mod + packed_end_block_mode);

 	// Track a scalar best to avoid expensive search at least once ...
 	float error_of_best_combination = ERROR_CALC_DEFAULT;
--- a/3rdparty/astc-encoder/source/astcenc_platform_isa_detection.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_platform_isa_detection.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2020-2021 Arm Limited
+// Copyright 2020-2022 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -47,7 +47,7 @@ static bool g_cpu_has_f16c { false };
 ============================================================================ */
 #if !defined(__clang__) && defined(_MSC_VER)
 #define WIN32_LEAN_AND_MEAN
-#include <Windows.h>
+#include <windows.h>
 #include <intrin.h>

 /**
--- a/3rdparty/astc-encoder/source/astcenc_symbolic_physical.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_symbolic_physical.cpp
@@ -371,12 +371,15 @@ void physical_to_symbolic(
 	const auto& di = bsd.get_decimation_info(bm.decimation_mode);

 	int weight_count = di.weight_count;
+	promise(weight_count > 0);
+
 	quant_method weight_quant_method = static_cast<quant_method>(bm.quant_mode);
 	int is_dual_plane = bm.is_dual_plane;

 	int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count;

 	int partition_count = read_bits(2, 11, pcb.data) + 1;
+	promise(partition_count > 0);

 	scb.block_mode = static_cast<uint16_t>(block_mode);
 	scb.partition_count = static_cast<uint8_t>(partition_count);
@@ -523,6 +526,7 @@ void physical_to_symbolic(
 	}

 	// Fetch component for second-plane in the case of dual plane of weights.
+	scb.plane2_component = -1;
 	if (is_dual_plane)
 	{
 		scb.plane2_component = static_cast<int8_t>(read_bits(2, below_weights_pos - 2, pcb.data));
--- a/3rdparty/astc-encoder/source/astcenc_vecmathlib.h
+++ b/3rdparty/astc-encoder/source/astcenc_vecmathlib.h
@@ -26,7 +26,7 @@
 * with that is available at compile time. The current vector width is
 * accessible for e.g. loop strides via the ASTCENC_SIMD_WIDTH constant.
 *
- * Explicit scalar types are acessible via the vint1, vfloat1, vmask1 types.
+ * Explicit scalar types are accessible via the vint1, vfloat1, vmask1 types.
 * These are provided primarily for prototyping and algorithm debug of VLA
 * implementations.
 *
@@ -402,7 +402,7 @@ static ASTCENC_SIMD_INLINE vint4 clz(vint4 a)
 	// the original integer value into a 2^N encoding we can recover easily.

 	// Convert to float without risk of rounding up by keeping only top 8 bits.
-	// This trick is is guranteed to keep top 8 bits and clear the 9th.
+	// This trick is guaranteed to keep top 8 bits and clear the 9th.
 	a = (~lsr<8>(a)) & a;
 	a = float_as_int(int_to_float(a));

--- a/3rdparty/astc-encoder/source/astcenc_vecmathlib_neon_4.h
+++ b/3rdparty/astc-encoder/source/astcenc_vecmathlib_neon_4.h
@@ -106,7 +106,7 @@ struct vfloat4
 	 */
 	template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
 	{
-		m = vld1q_lane_f32(&a, m, l);
+		m = vsetq_lane_f32(a, m, l);
 	}

 	/**
@@ -122,7 +122,7 @@ struct vfloat4
 	 */
 	static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
 	{
-		return vfloat4(vdupq_n_f32(*p));
+		return vfloat4(vld1q_dup_f32(p));
 	}

 	/**
@@ -202,9 +202,8 @@ struct vint4
 	 */
 	ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
 	{
-		uint32x2_t t8 {};
 		// Cast is safe - NEON loads are allowed to be unaligned
-		t8 = vld1_lane_u32(reinterpret_cast<const uint32_t*>(p), t8, 0);
+		uint32x2_t t8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
 		uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8)));
 		m = vreinterpretq_s32_u32(vmovl_u16(t16));
 	}
@@ -251,7 +250,7 @@ struct vint4
 	 */
 	template <int l> ASTCENC_SIMD_INLINE void set_lane(int a)
 	{
-		m = vld1q_lane_s32(&a, m, l);
+		m = vsetq_lane_s32(a, m, l);
 	}

 	/**
--- a/3rdparty/astc-encoder/source/astcenc_weight_quant_xfer_tables.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_weight_quant_xfer_tables.cpp
@@ -24,9 +24,8 @@
 #define _ 0 // Using _ to indicate an entry that will not be used.

 const quant_and_transfer_table quant_and_xfer_tables[12] {
-	// Quantization method 0, range 0..1
+	// QUANT_2, range 0..1
 	{
-		QUANT_2,
 		{0, 64},
 		{0, 1},
 		{0, 64},
@@ -34,9 +33,8 @@ const quant_and_transfer_table quant_and_xfer_tables[12] {
 		 _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
 		 0x4000}
 	},
-	// Quantization method 1, range 0..2
+	// QUANT_3, range 0..2
 	{
-		QUANT_3,
 		{0, 32, 64},
 		{0, 1, 2},
 		{0, 32, 64},
@@ -44,19 +42,17 @@ const quant_and_transfer_table quant_and_xfer_tables[12] {
 		 _,_,0x4000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
 		 _,_,_,_,0x4020}
 	},
-	// Quantization method 2, range 0..3
+	// QUANT_4, range 0..3
 	{
-		QUANT_4,
 		{0, 21, 43, 64},
 		{0, 1, 2, 3},
-		 {0, 21, 43, 64},
+		{0, 21, 43, 64},
 		{0x1500,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x2b00,_,_,_,_,
 		 _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x4015,_,_,_,_,_,_,_,_,_,_,_,_,
 		 _,_,_,_,_,_,_,_,0x402b}
 	},
-	// Quantization method 3, range 0..4
+	// QUANT_5, range 0..4
 	{
-		QUANT_5,
 		{0, 16, 32, 48, 64},
 		{0, 1, 2, 3, 4},
 		{0, 16, 32, 48, 64},
@@ -64,9 +60,8 @@ const quant_and_transfer_table quant_and_xfer_tables[12] {
 		 _,_,_,_,_,_,0x3010,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x4020,_,_,_,
 		 _,_,_,_,_,_,_,_,_,_,_,_,0x4030}
 	},
-	// Quantization method 4, range 0..5
+	// QUANT_6, range 0..5
 	{
-		QUANT_6,
 		{0, 12, 25, 39, 52, 64},
 		{0, 2, 4, 5, 3, 1},
 		{0, 64, 12, 52, 25, 39},
@@ -74,9 +69,8 @@ const quant_and_transfer_table quant_and_xfer_tables[12] {
 		 0x270c,_,_,_,_,_,_,_,_,_,_,_,_,_,0x3419,_,_,_,_,_,_,_,_,_,_,
 		 _,_,0x4027,_,_,_,_,_,_,_,_,_,_,_,0x4034}
 	},
-	// Quantization method 5, range 0..7
+	// QUANT_8, range 0..7
 	{
-		QUANT_8,
 		{0, 9, 18, 27, 37, 46, 55, 64},
 		{0, 1, 2, 3, 4, 5, 6, 7},
 		{0, 9, 18, 27, 37, 46, 55, 64},
@@ -84,9 +78,8 @@ const quant_and_transfer_table quant_and_xfer_tables[12] {
 		 _,_,_,_,_,_,0x2512,_,_,_,_,_,_,_,_,_,0x2e1b,_,_,_,_,_,_,_,_,
 		 0x3725,_,_,_,_,_,_,_,_,0x402e,_,_,_,_,_,_,_,_,0x4037}
 	},
-	// Quantization method 6, range 0..9
+	// QUANT_10, range 0..9
 	{
-		QUANT_10,
 		{0, 7, 14, 21, 28, 36, 43, 50, 57, 64},
 		{0, 2, 4, 6, 8, 9, 7, 5, 3, 1},
 		{0, 64, 7, 57, 14, 50, 21, 43, 28, 36},
@@ -95,9 +88,8 @@ const quant_and_transfer_table quant_and_xfer_tables[12] {
 		 _,0x3224,_,_,_,_,_,_,0x392b,_,_,_,_,_,_,0x4032,_,_,_,_,_,
 		 _,0x4039}
 	},
-	// Quantization method 7, range 0..11
+	// QUANT_12, range 0..11
 	{
-		QUANT_12,
 		{0, 5, 11, 17, 23, 28, 36, 41, 47, 53, 59, 64},
 		{0, 4, 8, 2, 6, 10, 11, 7, 3, 9, 5, 1},
 		{0, 64, 17, 47, 5, 59, 23, 41, 11, 53, 28, 36},
@@ -106,9 +98,8 @@ const quant_and_transfer_table quant_and_xfer_tables[12] {
 		 0x291c,_,_,_,_,0x2f24,_,_,_,_,_,0x3529,_,_,_,_,_,
 		 0x3b2f,_,_,_,_,_,0x4035,_,_,_,_,0x403b}
 	},
-	// Quantization method 8, range 0..15
+	// QUANT_16, range 0..15
 	{
-		QUANT_16,
 		{0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64},
 		{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
 		{0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64},
@@ -117,9 +108,8 @@ const quant_and_transfer_table quant_and_xfer_tables[12] {
 		 _,0x271d,_,_,_,0x2b23,_,_,_,0x2f27,_,_,_,0x342b,_,_,_,
 		 _,0x382f,_,_,_,0x3c34,_,_,_,0x4038,_,_,_,0x403c}
 	},
-	// Quantization method 9, range 0..19
+	// QUANT_20, range 0..19
 	{
-		QUANT_20,
 		{0, 3, 6, 9, 13, 16, 19, 23, 26, 29, 35, 38, 41, 45, 48, 51, 55, 58, 61, 64},
 		{0, 4, 8, 12, 16, 2, 6, 10, 14, 18, 19, 15, 11, 7, 3, 17, 13, 9, 5, 1},
 		{0, 64, 16, 48, 3, 61, 19, 45, 6, 58, 23, 41, 9, 55, 26, 38, 13, 51, 29, 35},
@@ -129,9 +119,8 @@ const quant_and_transfer_table quant_and_xfer_tables[12] {
 		 0x2d26,_,_,_,0x3029,_,_,0x332d,_,_,0x3730,_,_,_,
 		 0x3a33,_,_,0x3d37,_,_,0x403a,_,_,0x403d}
 	},
-	// Quantization method 10, range 0..23
+	// QUANT_24, range 0..23
 	{
-		QUANT_24,
 		{0, 2, 5, 8, 11, 13, 16, 19, 22, 24, 27, 30, 34, 37, 40, 42, 45, 48, 51, 53, 56, 59, 62, 64},
 		{0, 8, 16, 2, 10, 18, 4, 12, 20, 6, 14, 22, 23, 15, 7, 21, 13, 5, 19, 11, 3, 17, 9, 1},
 		{0, 64, 8, 56, 16, 48, 24, 40, 2, 62, 11, 53, 19, 45, 27, 37, 5, 59, 13, 51, 22, 42, 30, 34},
@@ -142,9 +131,8 @@ const quant_and_transfer_table quant_and_xfer_tables[12] {
 		 _,_,0x3530,_,0x3833,_,_,0x3b35,_,_,0x3e38,_,_,
 		 0x403b,_,0x403e}
 	},
-	// Quantization method 11, range 0..31
+	// QUANT_32, range 0..31
 	{
-		QUANT_32,
 		{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64},
 		{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
 		{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64},