From 8c3aabc3c1536a914a66a73c9865bc87ceade839 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=91=D1=80=D0=B0=D0=BD=D0=B8=D0=BC=D0=B8=D1=80=20=D0=9A?= =?UTF-8?q?=D0=B0=D1=80=D0=B0=D1=9F=D0=B8=D1=9B?= Date: Thu, 10 Nov 2022 17:27:01 -0800 Subject: [PATCH] Updated astc-encoder. --- 3rdparty/astc-encoder/include/astcenc.h | 47 ++++- .../source/astcenc_color_quantize.cpp | 90 +++----- .../source/astcenc_color_unquantize.cpp | 11 +- .../source/astcenc_compress_symbolic.cpp | 68 ++++-- .../astc-encoder/source/astcenc_entry.cpp | 107 +++++++--- .../source/astcenc_find_best_partitioning.cpp | 166 +++++++++------ .../astc-encoder/source/astcenc_internal.h | 142 ++++++------- .../source/astcenc_vecmathlib_common_4.h | 17 ++ .../source/astcenc_weight_align.cpp | 198 ++---------------- scripts/bimg.lua | 6 + scripts/bimg_encode.lua | 10 +- src/image.cpp | 64 ++++-- src/image_encode.cpp | 82 +++++--- 13 files changed, 496 insertions(+), 512 deletions(-) diff --git a/3rdparty/astc-encoder/include/astcenc.h b/3rdparty/astc-encoder/include/astcenc.h index 5d7af53..56f1ad8 100644 --- a/3rdparty/astc-encoder/include/astcenc.h +++ b/3rdparty/astc-encoder/include/astcenc.h @@ -241,6 +241,9 @@ static const float ASTCENC_PRE_MEDIUM = 60.0f; /** @brief The thorough quality search preset. */ static const float ASTCENC_PRE_THOROUGH = 98.0f; +/** @brief The very thorough quality search preset. */ +static const float ASTCENC_PRE_VERYTHOROUGH = 99.0f; + /** @brief The exhaustive, highest quality, search preset. */ static const float ASTCENC_PRE_EXHAUSTIVE = 100.0f; @@ -440,11 +443,25 @@ struct astcenc_config unsigned int tune_partition_count_limit; /** - * @brief The maximum number of partitions searched (-partitionindexlimit). + * @brief The maximum number of partitions searched (-2partitionindexlimit). * * Valid values are between 1 and 1024.
*/ - unsigned int tune_partition_index_limit; + unsigned int tune_2partition_index_limit; + + /** + * @brief The maximum number of partitions searched (-3partitionindexlimit). + * + * Valid values are between 1 and 1024. + */ + unsigned int tune_3partition_index_limit; + + /** + * @brief The maximum number of partitions searched (-4partitionindexlimit). + * + * Valid values are between 1 and 1024. + */ + unsigned int tune_4partition_index_limit; /** * @brief The maximum centile for block modes searched (-blockmodelimit). @@ -468,6 +485,27 @@ struct astcenc_config */ unsigned int tune_candidate_limit; + /** + * @brief The number of trial partitionings per search (-2partitioncandidatelimit). + * + * Valid values are between 1 and TUNE_MAX_PARTITIIONING_CANDIDATES. + */ + unsigned int tune_2partitioning_candidate_limit; + + /** + * @brief The number of trial partitionings per search (-3partitioncandidatelimit). + * + * Valid values are between 1 and TUNE_MAX_PARTITIIONING_CANDIDATES. + */ + unsigned int tune_3partitioning_candidate_limit; + + /** + * @brief The number of trial partitionings per search (-4partitioncandidatelimit). + * + * Valid values are between 1 and TUNE_MAX_PARTITIIONING_CANDIDATES. + */ + unsigned int tune_4partitioning_candidate_limit; + /** * @brief The dB threshold for stopping block search (-dblimit). * @@ -517,11 +555,6 @@ struct astcenc_config */ float tune_2_plane_early_out_limit_correlation; - /** - * @brief The threshold below which (inclusive) we stop testing low/high/low+high cutoffs. - */ - unsigned int tune_low_weight_count_limit; - #if defined(ASTCENC_DIAGNOSTICS) /** * @brief The path to save the diagnostic trace data to. 
diff --git a/3rdparty/astc-encoder/source/astcenc_color_quantize.cpp b/3rdparty/astc-encoder/source/astcenc_color_quantize.cpp index 176754f..278f334 100644 --- a/3rdparty/astc-encoder/source/astcenc_color_quantize.cpp +++ b/3rdparty/astc-encoder/source/astcenc_color_quantize.cpp @@ -334,13 +334,13 @@ static bool try_quantize_rgb_delta( int g0be = quant_color(quant_level, g0b); int b0be = quant_color(quant_level, b0b); - r0b = unquant_color(quant_level, r0be); - g0b = unquant_color(quant_level, g0be); - b0b = unquant_color(quant_level, b0be); + int r0bu = unquant_color(quant_level, r0be); + int g0bu = unquant_color(quant_level, g0be); + int b0bu = unquant_color(quant_level, b0be); - r0b |= r0a & 0x100; - g0b |= g0a & 0x100; - b0b |= b0a & 0x100; + r0b = r0bu | (r0a & 0x100); + g0b = g0bu | (g0a & 0x100); + b0b = b0bu | (b0a & 0x100); // Get hold of the second value int r1d = astc::flt2int_rtn(r1); @@ -386,36 +386,18 @@ static bool try_quantize_rgb_delta( return false; } - // Check that the sum of the encoded offsets is nonnegative, else encoding fails - r1du &= 0x7f; - g1du &= 0x7f; - b1du &= 0x7f; - - if (r1du & 0x40) - { - r1du -= 0x80; - } - - if (g1du & 0x40) - { - g1du -= 0x80; - } - - if (b1du & 0x40) - { - b1du -= 0x80; - } - - if (r1du + g1du + b1du < 0) + // If the sum of offsets triggers blue-contraction then encoding fails + vint4 ep0(r0bu, g0bu, b0bu, 0); + vint4 ep1(r1du, g1du, b1du, 0); + bit_transfer_signed(ep1, ep0); + if (hadd_rgb_s(ep1) < 0) { return false; } // Check that the offsets produce legitimate sums as well - r1du += r0b; - g1du += g0b; - b1du += b0b; - if (r1du < 0 || r1du > 0x1FF || g1du < 0 || g1du > 0x1FF || b1du < 0 || b1du > 0x1FF) + ep0 = ep0 + ep1; + if (any((ep0 < vint4(0)) | (ep0 > vint4(0xFF)))) { return false; } @@ -477,13 +459,13 @@ static bool try_quantize_rgb_delta_blue_contract( int g0be = quant_color(quant_level, g0b); int b0be = quant_color(quant_level, b0b); - r0b = unquant_color(quant_level, r0be); - g0b = 
unquant_color(quant_level, g0be); - b0b = unquant_color(quant_level, b0be); + int r0bu = unquant_color(quant_level, r0be); + int g0bu = unquant_color(quant_level, g0be); + int b0bu = unquant_color(quant_level, b0be); - r0b |= r0a & 0x100; - g0b |= g0a & 0x100; - b0b |= b0a & 0x100; + r0b = r0bu | (r0a & 0x100); + g0b = g0bu | (g0a & 0x100); + b0b = b0bu | (b0a & 0x100); // Get hold of the second value int r1d = astc::flt2int_rtn(r1); @@ -530,38 +512,18 @@ static bool try_quantize_rgb_delta_blue_contract( return false; } - // Check that the sum of the encoded offsets is negative, else encoding fails - // Note that this is inverse of the test for non-blue-contracted RGB. - r1du &= 0x7f; - g1du &= 0x7f; - b1du &= 0x7f; - - if (r1du & 0x40) - { - r1du -= 0x80; - } - - if (g1du & 0x40) - { - g1du -= 0x80; - } - - if (b1du & 0x40) - { - b1du -= 0x80; - } - - if (r1du + g1du + b1du >= 0) + // If the sum of offsets does not trigger blue-contraction then encoding fails + vint4 ep0(r0bu, g0bu, b0bu, 0); + vint4 ep1(r1du, g1du, b1du, 0); + bit_transfer_signed(ep1, ep0); + if (hadd_rgb_s(ep1) >= 0) { return false; } // Check that the offsets produce legitimate sums as well - r1du += r0b; - g1du += g0b; - b1du += b0b; - - if (r1du < 0 || r1du > 0x1FF || g1du < 0 || g1du > 0x1FF || b1du < 0 || b1du > 0x1FF) + ep0 = ep0 + ep1; + if (any((ep0 < vint4(0)) | (ep0 > vint4(0xFF)))) { return false; } diff --git a/3rdparty/astc-encoder/source/astcenc_color_unquantize.cpp b/3rdparty/astc-encoder/source/astcenc_color_unquantize.cpp index c0aeebd..203615c 100644 --- a/3rdparty/astc-encoder/source/astcenc_color_unquantize.cpp +++ b/3rdparty/astc-encoder/source/astcenc_color_unquantize.cpp @@ -97,15 +97,8 @@ static void rgba_delta_unpack( vint4 input0 = unquant_color(quant_level, input0q); vint4 input1 = unquant_color(quant_level, input1q); - // Perform bit-transfer - input0 = input0 | lsl<1>(input1 & 0x80); - input1 = input1 & 0x7F; - vmask4 mask = (input1 & 0x40) != vint4::zero(); - input1 
= select(input1, input1 - 0x80, mask); - - // Scale - input0 = asr<1>(input0); - input1 = asr<1>(input1); + // Apply bit transfer + bit_transfer_signed(input1, input0); // Apply blue-uncontraction if needed int rgb_sum = hadd_rgb_s(input1); diff --git a/3rdparty/astc-encoder/source/astcenc_compress_symbolic.cpp b/3rdparty/astc-encoder/source/astcenc_compress_symbolic.cpp index 1a58b0e..68bde08 100644 --- a/3rdparty/astc-encoder/source/astcenc_compress_symbolic.cpp +++ b/3rdparty/astc-encoder/source/astcenc_compress_symbolic.cpp @@ -424,11 +424,7 @@ static float compress_symbolic_block_for_partition_1plane( // For each mode, use the angular method to compute a shift compute_angular_endpoints_1plane( - config.tune_low_weight_count_limit, - only_always, bsd, - dec_weights_ideal, - max_weight_quant, - tmpbuf); + only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf); float* weight_low_value = tmpbuf.weight_low_value1; float* weight_high_value = tmpbuf.weight_high_value1; @@ -795,9 +791,7 @@ static float compress_symbolic_block_for_partition_2planes( float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask)); compute_angular_endpoints_2planes( - config.tune_low_weight_count_limit, - bsd, dec_weights_ideal, max_weight_quant, - tmpbuf); + bsd, dec_weights_ideal, max_weight_quant, tmpbuf); // For each mode (which specifies a decimation and a quantization): // * Compute number of bits needed for the quantized weights @@ -1130,12 +1124,13 @@ static float prepare_block_statistics( aa_var -= as * (as * rpt); - rg_cov *= astc::rsqrt(astc::max(rr_var * gg_var, 1e-30f)); - rb_cov *= astc::rsqrt(astc::max(rr_var * bb_var, 1e-30f)); - ra_cov *= astc::rsqrt(astc::max(rr_var * aa_var, 1e-30f)); - gb_cov *= astc::rsqrt(astc::max(gg_var * bb_var, 1e-30f)); - ga_cov *= astc::rsqrt(astc::max(gg_var * aa_var, 1e-30f)); - ba_cov *= astc::rsqrt(astc::max(bb_var * aa_var, 1e-30f)); + // These will give a NaN if a channel is constant - these are fixed up in the next step + 
rg_cov *= astc::rsqrt(rr_var * gg_var); + rb_cov *= astc::rsqrt(rr_var * bb_var); + ra_cov *= astc::rsqrt(rr_var * aa_var); + gb_cov *= astc::rsqrt(gg_var * bb_var); + ga_cov *= astc::rsqrt(gg_var * aa_var); + ba_cov *= astc::rsqrt(bb_var * aa_var); if (astc::isnan(rg_cov)) rg_cov = 1.0f; if (astc::isnan(rb_cov)) rb_cov = 1.0f; @@ -1144,7 +1139,7 @@ static float prepare_block_statistics( if (astc::isnan(ga_cov)) ga_cov = 1.0f; if (astc::isnan(ba_cov)) ba_cov = 1.0f; - float lowest_correlation = astc::min(fabsf(rg_cov), fabsf(rb_cov)); + float lowest_correlation = astc::min(fabsf(rg_cov), fabsf(rb_cov)); lowest_correlation = astc::min(lowest_correlation, fabsf(ra_cov)); lowest_correlation = astc::min(lowest_correlation, fabsf(gb_cov)); lowest_correlation = astc::min(lowest_correlation, fabsf(ga_cov)); @@ -1197,6 +1192,18 @@ void compress_block( bool block_skip_two_plane = false; int max_partitions = ctx.config.tune_partition_count_limit; + unsigned int requested_partition_indices[3] { + ctx.config.tune_2partition_index_limit, + ctx.config.tune_3partition_index_limit, + ctx.config.tune_4partition_index_limit + }; + + unsigned int requested_partition_trials[3] { + ctx.config.tune_2partitioning_candidate_limit, + ctx.config.tune_3partitioning_candidate_limit, + ctx.config.tune_4partitioning_candidate_limit + }; + #if defined(ASTCENC_DIAGNOSTICS) // Do this early in diagnostic builds so we can dump uniform metrics // for every block. Do it later in release builds to avoid redundant work! 
@@ -1366,13 +1373,19 @@ void compress_block( // Find best blocks for 2, 3 and 4 partitions for (int partition_count = 2; partition_count <= max_partitions; partition_count++) { - unsigned int partition_indices[2] { 0 }; + unsigned int partition_indices[TUNE_MAX_PARTITIIONING_CANDIDATES]; - find_best_partition_candidates(bsd, blk, partition_count, - ctx.config.tune_partition_index_limit, - partition_indices); + unsigned int requested_indices = requested_partition_indices[partition_count - 2]; - for (unsigned int i = 0; i < 2; i++) + unsigned int requested_trials = requested_partition_trials[partition_count - 2]; + requested_trials = astc::min(requested_trials, requested_indices); + + unsigned int actual_trials = find_best_partition_candidates( + bsd, blk, partition_count, requested_indices, partition_indices, requested_trials); + + float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2]; + + for (unsigned int i = 0; i < actual_trials; i++) { TRACE_NODE(node1, "pass"); trace_add_data("partition_count", partition_count); @@ -1387,6 +1400,20 @@ void compress_block( scb, tmpbuf, quant_limit); best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval); + + // If using N partitions doesn't improve much over using N-1 partitions then skip trying + // N+1. Error can dramatically improve if the data is correlated or non-correlated and + // aligns with a partitioning that suits that encoding, so for this inner loop check add + // a large error scale because the "other" trial could be a lot better. In total the + // error must be at least 2x worse than the best existing error to early-out. 
+ float best_error = best_errorvals_for_pcount[partition_count - 1]; + float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 2.0f; + if (best_error > (best_error_in_prev * best_error_scale)) + { + trace_add_data("skip", "tune_partition_early_out_limit_factor"); + goto END_OF_TESTS; + } + if (errorval < error_threshold) { trace_add_data("exit", "quality hit"); @@ -1396,7 +1423,6 @@ void compress_block( // If using N partitions doesn't improve much over using N-1 partitions then skip trying N+1 float best_error = best_errorvals_for_pcount[partition_count - 1]; - float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2]; float best_error_scale = exit_thresholds_for_pcount[partition_count - 1]; if (best_error > (best_error_in_prev * best_error_scale)) { diff --git a/3rdparty/astc-encoder/source/astcenc_entry.cpp b/3rdparty/astc-encoder/source/astcenc_entry.cpp index 32403c7..6491c4e 100644 --- a/3rdparty/astc-encoder/source/astcenc_entry.cpp +++ b/3rdparty/astc-encoder/source/astcenc_entry.cpp @@ -40,10 +40,15 @@ struct astcenc_preset_config { float quality; unsigned int tune_partition_count_limit; - unsigned int tune_partition_index_limit; + unsigned int tune_2partition_index_limit; + unsigned int tune_3partition_index_limit; + unsigned int tune_4partition_index_limit; unsigned int tune_block_mode_limit; unsigned int tune_refinement_limit; unsigned int tune_candidate_limit; + unsigned int tune_2partitioning_candidate_limit; + unsigned int tune_3partitioning_candidate_limit; + unsigned int tune_4partitioning_candidate_limit; float tune_db_limit_a_base; float tune_db_limit_b_base; float tune_mode0_mse_overshoot; @@ -51,7 +56,6 @@ struct astcenc_preset_config float tune_2_partition_early_out_limit_factor; float tune_3_partition_early_out_limit_factor; float tune_2_plane_early_out_limit_correlation; - unsigned int tune_low_weight_count_limit; }; @@ -59,22 +63,25 @@ struct astcenc_preset_config * @brief The static quality presets that 
are built-in for high bandwidth * presets (x < 25 texels per block). */ -static const std::array<astcenc_preset_config, 5> preset_configs_high {{ +static const std::array<astcenc_preset_config, 6> preset_configs_high {{ { ASTCENC_PRE_FASTEST, - 2, 10, 43, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 25 + 2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.85f }, { ASTCENC_PRE_FAST, - 3, 14, 55, 3, 3, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.1f, 0.65f, 20 + 3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.90f }, { ASTCENC_PRE_MEDIUM, - 4, 28, 76, 3, 3, 95.0f, 70.0f, 2.5f, 2.5f, 1.2f, 1.25f, 0.85f, 16 + 4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 2.5f, 1.1f, 1.05f, 0.95f }, { ASTCENC_PRE_THOROUGH, - 4, 76, 93, 4, 4, 105.0f, 77.0f, 10.0f, 10.0f, 2.5f, 1.25f, 0.95f, 12 + 4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 10.0f, 1.35f, 1.15f, 0.97f + }, { + ASTCENC_PRE_VERYTHOROUGH, + 4, 256, 128, 64, 98, 4, 6, 20, 14, 8, 200.0f, 200.0f, 10.0f, 10.0f, 1.6f, 1.4f, 0.98f }, { ASTCENC_PRE_EXHAUSTIVE, - 4, 1024, 100, 4, 4, 200.0f, 200.0f, 10.0f, 10.0f, 10.0f, 10.0f, 0.99f, 0 + 4, 512, 512, 512, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 10.0f, 2.0f, 2.0f, 0.99f } }}; @@ -82,46 +89,51 @@ static const std::array<astcenc_preset_config, 5> preset_configs_high {{ * @brief The static quality presets that are built-in for medium bandwidth * presets (25 <= x < 64 texels per block).
*/ -static const std::array<astcenc_preset_config, 5> preset_configs_mid {{ +static const std::array<astcenc_preset_config, 6> preset_configs_mid {{ { ASTCENC_PRE_FASTEST, - 2, 10, 43, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 20 + 2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.80f }, { ASTCENC_PRE_FAST, - 3, 15, 55, 3, 3, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.1f, 0.5f, 16 + 3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.85f }, { ASTCENC_PRE_MEDIUM, - 4, 30, 76, 3, 3, 95.0f, 70.0f, 3.0f, 3.0f, 1.2f, 1.25f, 0.75f, 14 + 4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 3.0f, 1.1f, 1.05f, 0.90f }, { ASTCENC_PRE_THOROUGH, - 4, 76, 93, 4, 4, 105.0f, 77.0f, 10.0f, 10.0f, 2.5f, 1.25f, 0.95f, 10 + 4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 10.0f, 1.4f, 1.2f, 0.95f + }, { + ASTCENC_PRE_VERYTHOROUGH, + 4, 256, 128, 64, 98, 4, 6, 12, 8, 3, 200.0f, 200.0f, 10.0f, 10.0f, 1.6f, 1.4f, 0.98f }, { ASTCENC_PRE_EXHAUSTIVE, - 4, 1024, 100, 4, 4, 200.0f, 200.0f, 10.0f, 10.0f, 10.0f, 10.0f, 0.99f, 0 + 4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 10.0f, 2.0f, 2.0f, 0.99f } }}; - /** * @brief The static quality presets that are built-in for low bandwidth * presets (64 <= x texels per block).
*/ -static const std::array<astcenc_preset_config, 5> preset_configs_low {{ +static const std::array<astcenc_preset_config, 6> preset_configs_low {{ { ASTCENC_PRE_FASTEST, - 2, 10, 40, 2, 2, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 20 + 2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.0f, 0.80f }, { ASTCENC_PRE_FAST, - 2, 15, 55, 3, 3, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.1f, 0.5f, 16 + 2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.0f, 0.85f }, { ASTCENC_PRE_MEDIUM, - 3, 30, 76, 3, 3, 95.0f, 70.0f, 3.5f, 3.5f, 1.2f, 1.25f, 0.65f, 12 + 3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 3.5f, 1.1f, 1.05f, 0.90f }, { ASTCENC_PRE_THOROUGH, - 4, 75, 92, 4, 4, 105.0f, 77.0f, 10.0f, 10.0f, 2.5f, 1.25f, 0.85f, 10 + 4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 10.0f, 1.3f, 1.2f, 0.97f + }, { + ASTCENC_PRE_VERYTHOROUGH, + 4, 256, 128, 64, 98, 4, 6, 9, 5, 2, 200.0f, 200.0f, 10.0f, 10.0f, 1.6f, 1.4f, 0.98f }, { ASTCENC_PRE_EXHAUSTIVE, - 4, 1024, 100, 4, 4, 200.0f, 200.0f, 10.0f, 10.0f, 10.0f, 10.0f, 0.99f, 0 + 4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 10.0f, 2.0f, 2.0f, 0.99f } }}; @@ -422,10 +434,15 @@ static astcenc_error validate_config( config.rgbm_m_scale = astc::max(config.rgbm_m_scale, 1.0f); config.tune_partition_count_limit = astc::clamp(config.tune_partition_count_limit, 1u, 4u); - config.tune_partition_index_limit = astc::clamp(config.tune_partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS); + config.tune_2partition_index_limit = astc::clamp(config.tune_2partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS); + config.tune_3partition_index_limit = astc::clamp(config.tune_3partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS); + config.tune_4partition_index_limit = astc::clamp(config.tune_4partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS); config.tune_block_mode_limit = astc::clamp(config.tune_block_mode_limit, 1u, 100u); config.tune_refinement_limit = astc::max(config.tune_refinement_limit, 1u); config.tune_candidate_limit =
astc::clamp(config.tune_candidate_limit, 1u, TUNE_MAX_TRIAL_CANDIDATES); + config.tune_2partitioning_candidate_limit = astc::clamp(config.tune_2partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIIONING_CANDIDATES); + config.tune_3partitioning_candidate_limit = astc::clamp(config.tune_3partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIIONING_CANDIDATES); + config.tune_4partitioning_candidate_limit = astc::clamp(config.tune_4partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIIONING_CANDIDATES); config.tune_db_limit = astc::max(config.tune_db_limit, 0.0f); config.tune_mode0_mse_overshoot = astc::max(config.tune_mode0_mse_overshoot, 1.0f); config.tune_refinement_mse_overshoot = astc::max(config.tune_refinement_mse_overshoot, 1.0f); @@ -464,9 +481,23 @@ astcenc_error astcenc_config_init( astcenc_config* configp ) { astcenc_error status; - astcenc_config& config = *configp; + + // Check basic library compatibility options here so they are checked early. Note, these checks + // are repeated in context_alloc for cases where callers use a manually defined config struct + status = validate_cpu_isa(); + if (status != ASTCENC_SUCCESS) + { + return status; + } + + status = validate_cpu_float(); + if (status != ASTCENC_SUCCESS) + { + return status; + } // Zero init all config fields; although most of will be over written + astcenc_config& config = *configp; std::memset(&config, 0, sizeof(config)); // Process the block size @@ -493,7 +524,7 @@ astcenc_error astcenc_config_init( return ASTCENC_ERR_BAD_QUALITY; } - static const std::array<astcenc_preset_config, 5>* preset_configs; + static const std::array<astcenc_preset_config, 6>* preset_configs; int texels_int = block_x * block_y * block_z; if (texels_int < 25) { @@ -525,11 +556,15 @@ astcenc_error astcenc_config_init( if (start == end) { config.tune_partition_count_limit = (*preset_configs)[start].tune_partition_count_limit; - config.tune_partition_index_limit = (*preset_configs)[start].tune_partition_index_limit; + config.tune_2partition_index_limit =
(*preset_configs)[start].tune_2partition_index_limit; + config.tune_3partition_index_limit = (*preset_configs)[start].tune_3partition_index_limit; + config.tune_4partition_index_limit = (*preset_configs)[start].tune_4partition_index_limit; config.tune_block_mode_limit = (*preset_configs)[start].tune_block_mode_limit; config.tune_refinement_limit = (*preset_configs)[start].tune_refinement_limit; - config.tune_candidate_limit = astc::min((*preset_configs)[start].tune_candidate_limit, - TUNE_MAX_TRIAL_CANDIDATES); + config.tune_candidate_limit = astc::min((*preset_configs)[start].tune_candidate_limit, TUNE_MAX_TRIAL_CANDIDATES); + config.tune_2partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_2partitioning_candidate_limit, TUNE_MAX_PARTITIIONING_CANDIDATES); + config.tune_3partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_3partitioning_candidate_limit, TUNE_MAX_PARTITIIONING_CANDIDATES); + config.tune_4partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_4partitioning_candidate_limit, TUNE_MAX_PARTITIIONING_CANDIDATES); config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels, (*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels); @@ -539,7 +574,6 @@ astcenc_error astcenc_config_init( config.tune_2_partition_early_out_limit_factor = (*preset_configs)[start].tune_2_partition_early_out_limit_factor; config.tune_3_partition_early_out_limit_factor =(*preset_configs)[start].tune_3_partition_early_out_limit_factor; config.tune_2_plane_early_out_limit_correlation = (*preset_configs)[start].tune_2_plane_early_out_limit_correlation; - config.tune_low_weight_count_limit = (*preset_configs)[start].tune_low_weight_count_limit; } // Start and end node are not the same - so interpolate between them else @@ -561,11 +595,19 @@ astcenc_error astcenc_config_init( #define LERPUI(param) static_cast<unsigned int>(LERPI(param)) config.tune_partition_count_limit = LERPI(tune_partition_count_limit); -
config.tune_partition_index_limit = LERPI(tune_partition_index_limit); + config.tune_2partition_index_limit = LERPI(tune_2partition_index_limit); + config.tune_3partition_index_limit = LERPI(tune_3partition_index_limit); + config.tune_4partition_index_limit = LERPI(tune_4partition_index_limit); config.tune_block_mode_limit = LERPI(tune_block_mode_limit); config.tune_refinement_limit = LERPI(tune_refinement_limit); config.tune_candidate_limit = astc::min(LERPUI(tune_candidate_limit), TUNE_MAX_TRIAL_CANDIDATES); + config.tune_2partitioning_candidate_limit = astc::min(LERPUI(tune_2partitioning_candidate_limit), + BLOCK_MAX_PARTITIONINGS); + config.tune_3partitioning_candidate_limit = astc::min(LERPUI(tune_3partitioning_candidate_limit), + BLOCK_MAX_PARTITIONINGS); + config.tune_4partitioning_candidate_limit = astc::min(LERPUI(tune_4partitioning_candidate_limit), + BLOCK_MAX_PARTITIONINGS); config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels, LERP(tune_db_limit_b_base) - 19 * ltexels); @@ -575,7 +617,6 @@ astcenc_error astcenc_config_init( config.tune_2_partition_early_out_limit_factor = LERP(tune_2_partition_early_out_limit_factor); config.tune_3_partition_early_out_limit_factor = LERP(tune_3_partition_early_out_limit_factor); config.tune_2_plane_early_out_limit_correlation = LERP(tune_2_plane_early_out_limit_correlation); - config.tune_low_weight_count_limit = LERPI(tune_low_weight_count_limit); #undef LERP #undef LERPI #undef LERPUI @@ -676,13 +717,13 @@ astcenc_error astcenc_context_alloc( astcenc_error status; const astcenc_config& config = *configp; - status = validate_cpu_float(); + status = validate_cpu_isa(); if (status != ASTCENC_SUCCESS) { return status; } - status = validate_cpu_isa(); + status = validate_cpu_float(); if (status != ASTCENC_SUCCESS) { return status; @@ -714,7 +755,7 @@ astcenc_error astcenc_context_alloc( status = validate_config(ctx->config); if (status != ASTCENC_SUCCESS) { - delete ctx; + delete ctxo; return status; 
} diff --git a/3rdparty/astc-encoder/source/astcenc_find_best_partitioning.cpp b/3rdparty/astc-encoder/source/astcenc_find_best_partitioning.cpp index 2ea3e43..c9e1835 100644 --- a/3rdparty/astc-encoder/source/astcenc_find_best_partitioning.cpp +++ b/3rdparty/astc-encoder/source/astcenc_find_best_partitioning.cpp @@ -485,13 +485,59 @@ static unsigned int compute_kmeans_partition_ordering( mismatch_counts, partition_ordering); } +/** + * @brief Insert a partitioning into an ordered list of results, sorted by error. + * + * @param max_values The max number of entries in the best result arrays. + * @param this_error The error of the new entry. + * @param this_partition The partition ID of the new entry. + * @param[out] best_errors The array of best error values. + * @param[out] best_partitions The array of best partition values. + */ +static void insert_result( + unsigned int max_values, + float this_error, + unsigned int this_partition, + float* best_errors, + unsigned int* best_partitions) +{ + // Don't bother searching if the current worst error beats the new error + if (this_error >= best_errors[max_values - 1]) + { + return; + } + + // Else insert into the list in error-order + for (unsigned int i = 0; i < max_values; i++) + { + // Existing result is better - move on ... + if (this_error > best_errors[i]) + { + continue; + } + + // Move existing results down one + for (unsigned int j = max_values - 1; j > i; j--) + { + best_errors[j] = best_errors[j - 1]; + best_partitions[j] = best_partitions[j - 1]; + } + + // Insert new result + best_errors[i] = this_error; + best_partitions[i] = this_partition; + break; + } +} + /* See header for documentation.
*/ -void find_best_partition_candidates( +unsigned int find_best_partition_candidates( const block_size_descriptor& bsd, const image_block& blk, unsigned int partition_count, unsigned int partition_search_limit, - unsigned int best_partitions[2] + unsigned int best_partitions[BLOCK_MAX_PARTITIONINGS], + unsigned int requested_candidates ) { // Constant used to estimate quantization error for a given partitioning; the optimal value for // this depends on bitrate. These values have been determined empirically. @@ -518,17 +564,23 @@ void find_best_partition_candidates( unsigned int partition_sequence[BLOCK_MAX_PARTITIONINGS]; unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence); partition_search_limit = astc::min(partition_search_limit, sequence_len); + requested_candidates = astc::min(partition_search_limit, requested_candidates); bool uses_alpha = !blk.is_constant_channel(3); // Partitioning errors assuming uncorrelated-chrominance endpoints - float uncor_best_error { ERROR_CALC_DEFAULT }; - unsigned int uncor_best_partition { 0 }; + float uncor_best_errors[TUNE_MAX_PARTITIIONING_CANDIDATES]; + unsigned int uncor_best_partitions[TUNE_MAX_PARTITIIONING_CANDIDATES]; // Partitioning errors assuming same-chrominance endpoints - // Store two so we can always return one different to uncorr - float samec_best_errors[2] { ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT }; - unsigned int samec_best_partitions[2] { 0, 0 }; + float samec_best_errors[TUNE_MAX_PARTITIIONING_CANDIDATES]; + unsigned int samec_best_partitions[TUNE_MAX_PARTITIIONING_CANDIDATES]; + + for (unsigned int i = 0; i < requested_candidates; i++) + { + uncor_best_errors[i] = ERROR_CALC_DEFAULT; + samec_best_errors[i] = ERROR_CALC_DEFAULT; + } if (uses_alpha) { @@ -602,25 +654,8 @@ void find_best_partition_candidates( samec_error += dot_s(samec_vector * samec_vector, error_weights); } - if (uncor_error < uncor_best_error) - { - uncor_best_error = uncor_error; - 
uncor_best_partition = partition; - } - - if (samec_error < samec_best_errors[0]) - { - samec_best_errors[1] = samec_best_errors[0]; - samec_best_partitions[1] = samec_best_partitions[0]; - - samec_best_errors[0] = samec_error; - samec_best_partitions[0] = partition; - } - else if (samec_error < samec_best_errors[1]) - { - samec_best_errors[1] = samec_error; - samec_best_partitions[1] = partition; - } + insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions); + insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions); } } else @@ -687,50 +722,55 @@ void find_best_partition_candidates( samec_error += dot3_s(samec_vector * samec_vector, error_weights); } - if (uncor_error < uncor_best_error) - { - uncor_best_error = uncor_error; - uncor_best_partition = partition; - } + insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions); + insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions); + } + } - if (samec_error < samec_best_errors[0]) - { - samec_best_errors[1] = samec_best_errors[0]; - samec_best_partitions[1] = samec_best_partitions[0]; + bool best_is_uncor = uncor_best_partitions[0] > samec_best_partitions[0]; - samec_best_errors[0] = samec_error; - samec_best_partitions[0] = partition; - } - else if (samec_error < samec_best_errors[1]) + unsigned int interleave[2 * TUNE_MAX_PARTITIIONING_CANDIDATES]; + for (unsigned int i = 0; i < requested_candidates; i++) + { + if (best_is_uncor) + { + interleave[2 * i] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index; + interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index; + } + else + { + interleave[2 * i] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index; + interleave[2 * i + 1] = 
bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index; + } + } + + uint64_t bitmasks[1024/64] { 0 }; + unsigned int emitted = 0; + + // Deduplicate the first "requested" entries + for (unsigned int i = 0; i < requested_candidates * 2; i++) + { + unsigned int partition = interleave[i]; + + unsigned int word = partition / 64; + unsigned int bit = partition % 64; + + bool written = bitmasks[word] & (1ull << bit); + + if (!written) + { + best_partitions[emitted] = partition; + bitmasks[word] |= 1ull << bit; + emitted++; + + if (emitted == requested_candidates) { - samec_best_errors[1] = samec_error; - samec_best_partitions[1] = partition; + break; } } } - // Same partition is best for both, so use this first unconditionally - if (uncor_best_partition == samec_best_partitions[0]) - { - best_partitions[0] = samec_best_partitions[0]; - best_partitions[1] = samec_best_partitions[1]; - } - // Uncor is best - else if (uncor_best_error <= samec_best_errors[0]) - { - best_partitions[0] = uncor_best_partition; - best_partitions[1] = samec_best_partitions[0]; - } - // Samec is best - else - { - best_partitions[0] = samec_best_partitions[0]; - best_partitions[1] = uncor_best_partition; - } - - // Convert these back into canonical partition IDs for the rest of the codec - best_partitions[0] = bsd.get_raw_partition_info(partition_count, best_partitions[0]).partition_index; - best_partitions[1] = bsd.get_raw_partition_info(partition_count, best_partitions[1]).partition_index; + return emitted; } #endif diff --git a/3rdparty/astc-encoder/source/astcenc_internal.h b/3rdparty/astc-encoder/source/astcenc_internal.h index 33b12eb..6ec395f 100644 --- a/3rdparty/astc-encoder/source/astcenc_internal.h +++ b/3rdparty/astc-encoder/source/astcenc_internal.h @@ -130,7 +130,14 @@ static constexpr unsigned int TUNE_MIN_TEXELS_MODE0_FASTPATH { 24 }; * * This can be dynamically reduced by the compression quality preset. 
*/ -static constexpr unsigned int TUNE_MAX_TRIAL_CANDIDATES { 4 }; +static constexpr unsigned int TUNE_MAX_TRIAL_CANDIDATES { 8 }; + +/** + * @brief The maximum number of candidate partitionings tested for each encoding mode. + * + * This can be dynamically reduced by the compression quality preset. + */ +static constexpr unsigned int TUNE_MAX_PARTITIIONING_CANDIDATES { 32 }; /** * @brief The maximum quant level using full angular endpoint search method. @@ -1345,11 +1352,11 @@ extern const int8_t quant_mode_table[10][128]; * Note that BISE can return strings that are not a whole number of bytes in length, and ASTC can * start storing strings in a block at arbitrary bit offsets in the encoded data. * - * @param quant_level The BISE alphabet size. - * @param character_count The number of characters in the string. - * @param input_data The unpacked string, one byte per character. - * @param[in,out] output_data The output packed string. - * @param bit_offset The starting offset in the output storage. + * @param quant_level The BISE alphabet size. + * @param character_count The number of characters in the string. + * @param input_data The unpacked string, one byte per character. + * @param[in,out] output_data The output packed string. + * @param bit_offset The starting offset in the output storage. */ void encode_ise( quant_method quant_level, @@ -1436,11 +1443,11 @@ void compute_avgs_and_dirs_3_comp( * This is a specialization of @c compute_avgs_and_dirs_3_comp where the omitted component is * always alpha, a common case during partition search. * - * @param pi The partition info for the current trial. - * @param blk The image block color data to be compressed. - * @param[out] pm The output partition metrics. - * - Only pi.partition_count array entries actually get initialized. - * - Direction vectors @c pm.dir are not normalized. + * @param pi The partition info for the current trial. + * @param blk The image block color data to be compressed. 
+ * @param[out] pm The output partition metrics. + * - Only pi.partition_count array entries actually get initialized. + * - Direction vectors @c pm.dir are not normalized. */ void compute_avgs_and_dirs_3_comp_rgb( const partition_info& pi, @@ -1471,11 +1478,11 @@ void compute_avgs_and_dirs_4_comp( * * This function computes the squared error when using these two representations. * - * @param pi The partition info for the current trial. - * @param blk The image block color data to be compressed. - * @param[in,out] plines Processed line inputs, and line length outputs. - * @param[out] uncor_error The cumulative error for using the uncorrelated line. - * @param[out] samec_error The cumulative error for using the same chroma line. + * @param pi The partition info for the current trial. + * @param blk The image block color data to be compressed. + * @param[in,out] plines Processed line inputs, and line length outputs. + * @param[out] uncor_error The cumulative error for using the uncorrelated line. + * @param[out] samec_error The cumulative error for using the same chroma line. */ void compute_error_squared_rgb( const partition_info& pi, @@ -1520,18 +1527,23 @@ void compute_error_squared_rgba( * candidates; one assuming data has uncorrelated chroma and one assuming the * data has correlated chroma. The best candidate is returned first in the list. * - * @param bsd The block size information. - * @param blk The image block color data to compress. - * @param partition_count The number of partitions in the block. - * @param partition_search_limit The number of candidate partition encodings to trial. - * @param[out] best_partitions The best partition candidates. + * @param bsd The block size information. + * @param blk The image block color data to compress. + * @param partition_count The number of partitions in the block. + * @param partition_search_limit The number of candidate partition encodings to trial. + * @param[out] best_partitions The best partition candidates. 
+ * @param requested_candidates The number of requsted partitionings. May return fewer if + * candidates are not avaiable. + * + * @return The actual number of candidates returned. */ -void find_best_partition_candidates( +unsigned int find_best_partition_candidates( const block_size_descriptor& bsd, const image_block& blk, unsigned int partition_count, unsigned int partition_search_limit, - unsigned int best_partitions[2]); + unsigned int best_partitions[TUNE_MAX_PARTITIIONING_CANDIDATES], + unsigned int requested_candidates); /* ============================================================================ Functionality for managing images and image related data. @@ -1545,10 +1557,10 @@ void find_best_partition_candidates( * * Results are written back into @c img->input_alpha_averages. * - * @param img The input image data, also holds output data. - * @param alpha_kernel_radius The kernel radius (in pixels) for alpha mods. - * @param swz Input data component swizzle. - * @param[out] ag The average variance arguments to init. + * @param img The input image data, also holds output data. + * @param alpha_kernel_radius The kernel radius (in pixels) for alpha mods. + * @param swz Input data component swizzle. + * @param[out] ag The average variance arguments to init. * * @return The number of tasks in the processing stage. */ @@ -1766,13 +1778,13 @@ float compute_error_of_weight_set_2planes( * The user requests a base color endpoint mode in @c format, but the quantizer may choose a * delta-based representation. It will report back the format variant it actually used. * - * @param color0 The input unquantized color0 endpoint for absolute endpoint pairs. - * @param color1 The input unquantized color1 endpoint for absolute endpoint pairs. - * @param rgbs_color The input unquantized RGBS variant endpoint for same chroma endpoints. - * @param rgbo_color The input unquantized RGBS variant endpoint for HDR endpoints.. - * @param format The desired base format. 
- * @param[out] output The output storage for the quantized colors/ - * @param quant_level The quantization level requested. + * @param color0 The input unquantized color0 endpoint for absolute endpoint pairs. + * @param color1 The input unquantized color1 endpoint for absolute endpoint pairs. + * @param rgbs_color The input unquantized RGBS variant endpoint for same chroma endpoints. + * @param rgbo_color The input unquantized RGBS variant endpoint for HDR endpoints. + * @param format The desired base format. + * @param[out] output The output storage for the quantized colors/ + * @param quant_level The quantization level requested. * * @return The actual endpoint mode used. */ @@ -1873,13 +1885,13 @@ unsigned int compute_ideal_endpoint_formats( * As we quantize and decimate weights the optimal endpoint colors may change slightly, so we must * recompute the ideal colors for a specific weight set. * - * @param blk The image block color data to compress. - * @param pi The partition info for the current trial. - * @param di The weight grid decimation table. + * @param blk The image block color data to compress. + * @param pi The partition info for the current trial. + * @param di The weight grid decimation table. * @param dec_weights_uquant The quantized weight set. - * @param[in,out] ep The color endpoints (modifed in place). - * @param[out] rgbs_vectors The RGB+scale vectors for LDR blocks. - * @param[out] rgbo_vectors The RGB+offset vectors for HDR blocks. + * @param[in,out] ep The color endpoints (modifed in place). + * @param[out] rgbs_vectors The RGB+scale vectors for LDR blocks. + * @param[out] rgbo_vectors The RGB+offset vectors for HDR blocks. */ void recompute_ideal_colors_1plane( const image_block& blk, @@ -1896,15 +1908,15 @@ void recompute_ideal_colors_1plane( * As we quantize and decimate weights the optimal endpoint colors may change slightly, so we must * recompute the ideal colors for a specific weight set. 
* - * @param blk The image block color data to compress. - * @param bsd The block_size descriptor. - * @param di The weight grid decimation table. + * @param blk The image block color data to compress. + * @param bsd The block_size descriptor. + * @param di The weight grid decimation table. * @param dec_weights_uquant_plane1 The quantized weight set for plane 1. * @param dec_weights_uquant_plane2 The quantized weight set for plane 2. - * @param[in,out] ep The color endpoints (modifed in place). - * @param[out] rgbs_vector The RGB+scale color for LDR blocks. - * @param[out] rgbo_vector The RGB+offset color for HDR blocks. - * @param plane2_component The component assigned to plane 2. + * @param[in,out] ep The color endpoints (modifed in place). + * @param[out] rgbs_vector The RGB+scale color for LDR blocks. + * @param[out] rgbo_vector The RGB+offset color for HDR blocks. + * @param plane2_component The component assigned to plane 2. */ void recompute_ideal_colors_2planes( const image_block& blk, @@ -1925,15 +1937,13 @@ void prepare_angular_tables(); /** * @brief Compute the angular endpoints for one plane for each block mode. * - * @param tune_low_weight_limit Weight count cutoff below which we use simpler searches. - * @param only_always Only consider block modes that are always enabled. - * @param bsd The block size descriptor for the current trial. - * @param dec_weight_ideal_value The ideal decimated unquantized weight values. - * @param max_weight_quant The maximum block mode weight quantization allowed. - * @param[out] tmpbuf Preallocated scratch buffers for the compressor. + * @param only_always Only consider block modes that are always enabled. + * @param bsd The block size descriptor for the current trial. + * @param dec_weight_ideal_value The ideal decimated unquantized weight values. + * @param max_weight_quant The maximum block mode weight quantization allowed. + * @param[out] tmpbuf Preallocated scratch buffers for the compressor. 
*/ void compute_angular_endpoints_1plane( - unsigned int tune_low_weight_limit, bool only_always, const block_size_descriptor& bsd, const float* dec_weight_ideal_value, @@ -1943,14 +1953,12 @@ void compute_angular_endpoints_1plane( /** * @brief Compute the angular endpoints for two planes for each block mode. * - * @param tune_low_weight_limit Weight count cutoff below which we use simpler searches. - * @param bsd The block size descriptor for the current trial. - * @param dec_weight_ideal_value The ideal decimated unquantized weight values. - * @param max_weight_quant The maximum block mode weight quantization allowed. - * @param[out] tmpbuf Preallocated scratch buffers for the compressor. + * @param bsd The block size descriptor for the current trial. + * @param dec_weight_ideal_value The ideal decimated unquantized weight values. + * @param max_weight_quant The maximum block mode weight quantization allowed. + * @param[out] tmpbuf Preallocated scratch buffers for the compressor. */ void compute_angular_endpoints_2planes( - unsigned int tune_low_weight_limit, const block_size_descriptor& bsd, const float* dec_weight_ideal_value, unsigned int max_weight_quant, @@ -2162,18 +2170,4 @@ void aligned_free(T* ptr) #endif } -static inline void dump_weights(const char* label, uint8_t* weights, int weight_count) -{ - printf("%s\n", label); - vint lane = vint::lane_id(); - for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) - { - vmask mask = lane < vint(weight_count); - vint val(weights + i); - val = select(vint::zero(), val, mask); - print(val); - lane += vint(ASTCENC_SIMD_WIDTH); - } -} - #endif diff --git a/3rdparty/astc-encoder/source/astcenc_vecmathlib_common_4.h b/3rdparty/astc-encoder/source/astcenc_vecmathlib_common_4.h index 2609c8f..86ee4fd 100644 --- a/3rdparty/astc-encoder/source/astcenc_vecmathlib_common_4.h +++ b/3rdparty/astc-encoder/source/astcenc_vecmathlib_common_4.h @@ -361,6 +361,23 @@ static inline int popcount(uint64_t v) #endif +/** + * @brief 
Apply signed bit transfer. + * + * @param input0 The first encoded endpoint. + * @param input1 The second encoded endpoint. + */ +static ASTCENC_SIMD_INLINE void bit_transfer_signed( + vint4& input0, + vint4& input1 +) { + input1 = lsr<1>(input1) | (input0 & 0x80); + input0 = lsr<1>(input0) & 0x3F; + + vmask4 mask = (input0 & 0x20) != vint4::zero(); + input0 = select(input0, input0 - 0x40, mask); +} + /** * @brief Debug function to print a vector of ints. */ diff --git a/3rdparty/astc-encoder/source/astcenc_weight_align.cpp b/3rdparty/astc-encoder/source/astcenc_weight_align.cpp index f066cb7..e40a318 100644 --- a/3rdparty/astc-encoder/source/astcenc_weight_align.cpp +++ b/3rdparty/astc-encoder/source/astcenc_weight_align.cpp @@ -333,156 +333,8 @@ static void compute_angular_endpoints_for_quant_levels( } } -/** - * @brief For a given step size compute the lowest and highest weight, variant for low weight count. - * - * Compute the lowest and highest weight that results from quantizing using the given stepsize and - * offset, and then compute the resulting error. The cut errors indicate the error that results from - * forcing samples that should have had one weight value one step up or down. - * - * @param weight_count The number of (decimated) weights. - * @param dec_weight_quant_uvalue The decimated and quantized weight values. - * @param max_angular_steps The maximum number of steps to be tested. - * @param max_quant_steps The maximum quantization level to be tested. - * @param offsets The angular offsets array. - * @param[out] lowest_weight Per angular step, the lowest weight. - * @param[out] weight_span Per angular step, the span between lowest and highest weight. - * @param[out] error Per angular step, the error. 
- */ -static void compute_lowest_and_highest_weight_lwc( - unsigned int weight_count, - const float* dec_weight_quant_uvalue, - unsigned int max_angular_steps, - unsigned int max_quant_steps, - const float* offsets, - float* lowest_weight, - int* weight_span, - float* error -) { - promise(weight_count > 0); - promise(max_angular_steps > 0); - - vfloat rcp_stepsize = vfloat::lane_id() + vfloat(1.0f); - - // Arrays are ANGULAR_STEPS long, so always safe to run full vectors - for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH) - { - vfloat minidx(128.0f); - vfloat maxidx(-128.0f); - vfloat errval = vfloat::zero(); - vfloat offset = loada(offsets + sp); - - for (unsigned int j = 0; j < weight_count; j++) - { - vfloat sval = load1(dec_weight_quant_uvalue + j) * rcp_stepsize - offset; - vfloat svalrte = round(sval); - vfloat diff = sval - svalrte; - errval += diff * diff; - - // Compute min and max quantized weight spans for each step - minidx = min(minidx, svalrte); - maxidx = max(maxidx, svalrte); - } - - // Write out min weight and weight span; clamp span to a usable range - vint span = float_to_int(maxidx - minidx + vfloat(1.0f)); - span = min(span, vint(max_quant_steps + 3)); - span = max(span, vint(2)); - storea(minidx, lowest_weight + sp); - storea(span, weight_span + sp); - - vfloat ssize = 1.0f / rcp_stepsize; - vfloat errscale = ssize * ssize; - storea(errval * errscale, error + sp); - - rcp_stepsize = rcp_stepsize + vfloat(ASTCENC_SIMD_WIDTH); - } -} - -/** - * @brief The main function for the angular algorithm, variant for low weight count. - * - * @param weight_count The number of (decimated) weights. - * @param dec_weight_ideal_value The ideal decimated unquantized weight values. - * @param max_quant_level The maximum quantization level to be tested. - * @param[out] low_value Per angular step, the lowest weight value. - * @param[out] high_value Per angular step, the highest weight value. 
- */ -static void compute_angular_endpoints_for_quant_levels_lwc( - unsigned int weight_count, - const float* dec_weight_ideal_value, - unsigned int max_quant_level, - float low_value[TUNE_MAX_ANGULAR_QUANT + 1], - float high_value[TUNE_MAX_ANGULAR_QUANT + 1] -) { - unsigned int max_quant_steps = steps_for_quant_level[max_quant_level]; - unsigned int max_angular_steps = steps_for_quant_level[max_quant_level]; - - alignas(ASTCENC_VECALIGN) float angular_offsets[ANGULAR_STEPS]; - alignas(ASTCENC_VECALIGN) float lowest_weight[ANGULAR_STEPS]; - alignas(ASTCENC_VECALIGN) int32_t weight_span[ANGULAR_STEPS]; - alignas(ASTCENC_VECALIGN) float error[ANGULAR_STEPS]; - - compute_angular_offsets(weight_count, dec_weight_ideal_value, - max_angular_steps, angular_offsets); - - - compute_lowest_and_highest_weight_lwc(weight_count, dec_weight_ideal_value, - max_angular_steps, max_quant_steps, - angular_offsets, lowest_weight, weight_span, error); - - // For each quantization level, find the best error terms. Use packed vectors so data-dependent - // branches can become selects. This involves some integer to float casts, but the values are - // small enough so they never round the wrong way. 
- vfloat4 best_results[36]; - - // Initialize the array to some safe defaults - promise(max_quant_steps > 0); - for (unsigned int i = 0; i < (max_quant_steps + 4); i++) - { - best_results[i] = vfloat4(ERROR_CALC_DEFAULT, -1.0f, 0.0f, 0.0f); - } - - promise(max_angular_steps > 0); - for (unsigned int i = 0; i < max_angular_steps; i++) - { - int idx_span = weight_span[i]; - - // Check best error against record N - vfloat4 current_best = best_results[idx_span]; - vfloat4 candidate = vfloat4(error[i], static_cast(i), 0.0f, 0.0f); - vmask4 mask = vfloat4(current_best.lane<0>()) > vfloat4(error[i]); - best_results[idx_span] = select(current_best, candidate, mask); - } - - for (unsigned int i = 0; i <= max_quant_level; i++) - { - unsigned int q = steps_for_quant_level[i]; - int bsi = static_cast(best_results[q].lane<1>()); - - // Did we find anything? -#if defined(ASTCENC_DIAGNOSTICS) - if ((bsi < 0) && print_once) - { - print_once = false; - printf("INFO: Unable to find low weight encoding within search error limit.\n\n"); - } -#endif - - bsi = astc::max(0, bsi); - - float lwi = lowest_weight[bsi]; - float hwi = lwi + static_cast(q) - 1.0f; - - float stepsize = 1.0f / (1.0f + static_cast(bsi)); - low_value[i] = (angular_offsets[bsi] + lwi) * stepsize; - high_value[i] = (angular_offsets[bsi] + hwi) * stepsize; - } -} - /* See header for documentation. 
*/ void compute_angular_endpoints_1plane( - unsigned int tune_low_weight_limit, bool only_always, const block_size_descriptor& bsd, const float* dec_weight_ideal_value, @@ -519,20 +371,10 @@ void compute_angular_endpoints_1plane( max_precision = max_weight_quant; } - if (weight_count < tune_low_weight_limit) - { - compute_angular_endpoints_for_quant_levels_lwc( - weight_count, - dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS, - max_precision, low_values[i], high_values[i]); - } - else - { - compute_angular_endpoints_for_quant_levels( - weight_count, - dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS, - max_precision, low_values[i], high_values[i]); - } + compute_angular_endpoints_for_quant_levels( + weight_count, + dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS, + max_precision, low_values[i], high_values[i]); } unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always @@ -561,7 +403,6 @@ void compute_angular_endpoints_1plane( /* See header for documentation. */ void compute_angular_endpoints_2planes( - unsigned int tune_low_weight_limit, const block_size_descriptor& bsd, const float* dec_weight_ideal_value, unsigned int max_weight_quant, @@ -599,30 +440,15 @@ void compute_angular_endpoints_2planes( max_precision = max_weight_quant; } - if (weight_count < tune_low_weight_limit) - { - compute_angular_endpoints_for_quant_levels_lwc( - weight_count, - dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS, - max_precision, low_values1[i], high_values1[i]); + compute_angular_endpoints_for_quant_levels( + weight_count, + dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS, + max_precision, low_values1[i], high_values1[i]); - compute_angular_endpoints_for_quant_levels_lwc( - weight_count, - dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET, - max_precision, low_values2[i], high_values2[i]); - } - else - { - compute_angular_endpoints_for_quant_levels( - weight_count, - dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS, - max_precision, 
low_values1[i], high_values1[i]); - - compute_angular_endpoints_for_quant_levels( - weight_count, - dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET, - max_precision, low_values2[i], high_values2[i]); - } + compute_angular_endpoints_for_quant_levels( + weight_count, + dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET, + max_precision, low_values2[i], high_values2[i]); } unsigned int start = bsd.block_mode_count_1plane_selected; diff --git a/scripts/bimg.lua b/scripts/bimg.lua index 3b2dc99..0dde12d 100644 --- a/scripts/bimg.lua +++ b/scripts/bimg.lua @@ -25,6 +25,12 @@ project "bimg" using_bx() + configuration {} + + removeflags { + "FloatFast", -- astc-encoder doesn't work with it. + } + configuration { "linux-*" } buildoptions { "-fPIC", diff --git a/scripts/bimg_encode.lua b/scripts/bimg_encode.lua index 1063523..6bc48d3 100644 --- a/scripts/bimg_encode.lua +++ b/scripts/bimg_encode.lua @@ -6,10 +6,6 @@ project "bimg_encode" kind "StaticLib" - removeflags { - "FloatFast", -- astc-encoder doesn't work with it. - } - includedirs { path.join(BIMG_DIR, "include"), path.join(BIMG_DIR, "3rdparty"), @@ -42,6 +38,12 @@ project "bimg_encode" using_bx() + configuration {} + + removeflags { + "FloatFast", -- astc-encoder doesn't work with it. 
+ } + configuration { "linux-*" } buildoptions { "-fPIC", diff --git a/src/image.cpp b/src/image.cpp index dc6e0be..63d4625 100644 --- a/src/image.cpp +++ b/src/image.cpp @@ -147,7 +147,7 @@ namespace bimg "ATCE", // ATCE "ATCI", // ATCI "ASTC4x4", // ASTC4x4 - "ASTC5x4", // ASTC5x4 + "ASTC5x4", // ASTC5x4 "ASTC5x5", // ASTC5x5 "ASTC6x5", // ASTC6x5 "ASTC6x6", // ASTC6x6 @@ -3851,7 +3851,7 @@ namespace bimg #define KTX_ATC_RGB_AMD 0x8C92 #define KTX_ATC_RGBA_EXPLICIT_ALPHA_AMD 0x8C93 #define KTX_ATC_RGBA_INTERPOLATED_ALPHA_AMD 0x87EE -#define KTX_COMPRESSED_RGBA_ASTC_4x4_KHR 0x93B0 +#define KTX_COMPRESSED_RGBA_ASTC_4x4_KHR 0x93B0 #define KTX_COMPRESSED_RGBA_ASTC_5x4_KHR 0x93B1 #define KTX_COMPRESSED_RGBA_ASTC_5x5_KHR 0x93B2 #define KTX_COMPRESSED_RGBA_ASTC_6x5_KHR 0x93B3 @@ -4918,25 +4918,32 @@ namespace bimg case TextureFormat::ASTC12x12: if (BX_ENABLED(BIMG_DECODE_ASTC) ) { - const unsigned int thread_count = 1; const bimg::ImageBlockInfo& astcBlockInfo = bimg::getBlockInfo(_srcFormat); - const float quality = ASTCENC_PRE_MEDIUM; - const astcenc_profile profile = ASTCENC_PRF_LDR; //Linear LDR color profile - astcenc_error status; - //Create and init config and context astcenc_config config{}; - const unsigned int astcFlags = ASTCENC_FLG_DECOMPRESS_ONLY; - status = astcenc_config_init(profile, astcBlockInfo.blockWidth, astcBlockInfo.blockHeight, 1, quality, astcFlags, &config); - if (status != ASTCENC_SUCCESS) { + + astcenc_error status = astcenc_config_init( + ASTCENC_PRF_LDR + , astcBlockInfo.blockWidth + , astcBlockInfo.blockHeight + , 1 + , ASTCENC_PRE_MEDIUM + , ASTCENC_FLG_DECOMPRESS_ONLY + , &config + ); + + if (status != ASTCENC_SUCCESS) + { BX_TRACE("astc error in config init %s", astcenc_get_error_string(status)); imageCheckerboard(_dst, _width, _height, 16, UINT32_C(0xff000000), UINT32_C(0xffffff00) ); break; } astcenc_context* context; - status = astcenc_context_alloc(&config, thread_count, &context); - if (status != ASTCENC_SUCCESS) { + status = 
astcenc_context_alloc(&config, 1, &context); + + if (status != ASTCENC_SUCCESS) + { BX_TRACE("astc error in context alloc %s", astcenc_get_error_string(status)); imageCheckerboard(_dst, _width, _height, 16, UINT32_C(0xff000000), UINT32_C(0xffffff00) ); break; @@ -4944,21 +4951,36 @@ namespace bimg //Put image data into an astcenc_image astcenc_image image{}; - image.dim_x = _width; - image.dim_y = _height; - image.dim_z = 1; + image.dim_x = _width; + image.dim_y = _height; + image.dim_z = 1; image.data_type = ASTCENC_TYPE_U8; - image.data = &_dst; + image.data = &_dst; + const uint32_t size = imageGetSize(NULL, uint16_t(_width), uint16_t(_height), 0, false, false, 1, _srcFormat); - static const astcenc_swizzle swizzle { //0123/rgba swizzle corresponds to ASTC_RGBA - ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A + static const astcenc_swizzle swizzle + { //0123/rgba swizzle corresponds to ASTC_RGBA + ASTCENC_SWZ_R, + ASTCENC_SWZ_G, + ASTCENC_SWZ_B, + ASTCENC_SWZ_A, }; - status = astcenc_decompress_image(context, static_cast(_src), size, &image, &swizzle, 0); - - if (status != ASTCENC_SUCCESS) { + + status = astcenc_decompress_image( + context + , (const uint8_t*)_src + , size + , &image + , &swizzle + , 0 + ); + + if (status != ASTCENC_SUCCESS) + { BX_TRACE("astc error in compress image %s", astcenc_get_error_string(status)); imageCheckerboard(_dst, _width, _height, 16, UINT32_C(0xff000000), UINT32_C(0xffffff00) ); + astcenc_context_free(context); break; } diff --git a/src/image_encode.cpp b/src/image_encode.cpp index 39f8ae1..e006940 100644 --- a/src/image_encode.cpp +++ b/src/image_encode.cpp @@ -52,7 +52,7 @@ namespace bimg ASTCENC_PRE_THOROUGH, // Highest ASTCENC_PRE_FAST, // Fastest }; - BX_STATIC_ASSERT(Quality::Count == BX_COUNTOF(s_astcQuality)); + BX_STATIC_ASSERT(Quality::Count == BX_COUNTOF(s_astcQuality) ); void imageEncodeFromRgba8(bx::AllocatorI* _allocator, void* _dst, const void* _src, uint32_t _width, uint32_t _height, uint32_t _depth, 
TextureFormat::Enum _format, Quality::Enum _quality, bx::Error* _err) { @@ -156,61 +156,83 @@ namespace bimg case TextureFormat::ASTC12x10: case TextureFormat::ASTC12x12: { - const unsigned int thread_count = 1; const bimg::ImageBlockInfo& astcBlockInfo = bimg::getBlockInfo(_format); - const float quality = s_astcQuality[_quality]; - const astcenc_profile profile = ASTCENC_PRF_LDR; //Linear LDR color profile - astcenc_error status; - //Create and init config and context astcenc_config config{}; - unsigned int astcFlags = ASTCENC_FLG_SELF_DECOMPRESS_ONLY; - if (Quality::NormalMapDefault <= _quality) { + + uint32_t astcFlags = ASTCENC_FLG_SELF_DECOMPRESS_ONLY; + + if (Quality::NormalMapDefault <= _quality) + { astcFlags |= ASTCENC_FLG_MAP_NORMAL; } - status = astcenc_config_init(profile, astcBlockInfo.blockWidth, astcBlockInfo.blockHeight, 1, quality, astcFlags, &config); - if (status != ASTCENC_SUCCESS) { - BX_TRACE("astc error in config init %s", astcenc_get_error_string(status)); + + astcenc_error status = astcenc_config_init( + ASTCENC_PRF_LDR + , astcBlockInfo.blockWidth + , astcBlockInfo.blockHeight + , 1 + , s_astcQuality[_quality] + , astcFlags + , &config + ); + + if (status != ASTCENC_SUCCESS) + { + BX_TRACE("astc error in config init %s", astcenc_get_error_string(status) ); BX_ERROR_SET(_err, BIMG_ERROR, "Unable to initialize astc config!"); break; } astcenc_context* context; - status = astcenc_context_alloc(&config, thread_count, &context); - if (status != ASTCENC_SUCCESS) { - BX_TRACE("astc error in context alloc %s", astcenc_get_error_string(status)); + status = astcenc_context_alloc(&config, 1, &context); + + if (status != ASTCENC_SUCCESS) + { + BX_TRACE("astc error in context alloc %s", astcenc_get_error_string(status) ); BX_ERROR_SET(_err, BIMG_ERROR, "Unable to alloc astc context!"); break; } - //Put image data into an astcenc_image astcenc_image image{}; - image.dim_x = _width; - image.dim_y = _height; - image.dim_z = 1; + image.dim_x = _width; + 
image.dim_y = _height; + image.dim_z = 1; image.data_type = ASTCENC_TYPE_U8; - image.data = reinterpret_cast(const_cast(&src)); + image.data = (void**)&src; - const size_t block_count_x = (_width + astcBlockInfo.blockWidth - 1) / astcBlockInfo.blockWidth; - const size_t block_count_y = (_height + astcBlockInfo.blockHeight - 1) / astcBlockInfo.blockHeight; - const size_t comp_len = block_count_x * block_count_y * 16; + const size_t blockCountX = (_width + astcBlockInfo.blockWidth - 1) / astcBlockInfo.blockWidth; + const size_t blockCountY = (_height + astcBlockInfo.blockHeight - 1) / astcBlockInfo.blockHeight; + const size_t compLen = blockCountX * blockCountY * 16; if (Quality::NormalMapDefault <= _quality) { - static const astcenc_swizzle swizzle { //0001/rrrg swizzle corresponds to ASTC_ENC_NORMAL_RA - ASTCENC_SWZ_R, ASTCENC_SWZ_R, ASTCENC_SWZ_R, ASTCENC_SWZ_G + static const astcenc_swizzle swizzle + { //0001/rrrg swizzle corresponds to ASTC_ENC_NORMAL_RA + ASTCENC_SWZ_R, + ASTCENC_SWZ_R, + ASTCENC_SWZ_R, + ASTCENC_SWZ_G, }; - status = astcenc_compress_image(context, &image, &swizzle, dst, comp_len, 0); + + status = astcenc_compress_image(context, &image, &swizzle, dst, compLen, 0); } else { - static const astcenc_swizzle swizzle { //0123/rgba swizzle corresponds to ASTC_RGBA - ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A + static const astcenc_swizzle swizzle + { //0123/rgba swizzle corresponds to ASTC_RGBA + ASTCENC_SWZ_R, + ASTCENC_SWZ_G, + ASTCENC_SWZ_B, + ASTCENC_SWZ_A, }; - status = astcenc_compress_image(context, &image, &swizzle, dst, comp_len, 0); + + status = astcenc_compress_image(context, &image, &swizzle, dst, compLen, 0); } - if (status != ASTCENC_SUCCESS) { - BX_TRACE("astc error in compress image %s", astcenc_get_error_string(status)); + + if (status != ASTCENC_SUCCESS) + { + BX_TRACE("astc error in compress image %s", astcenc_get_error_string(status) ); BX_ERROR_SET(_err, BIMG_ERROR, "Unable to compress astc image!"); 
astcenc_context_free(context); break;