From 1395f4e969fa0aac6158fb3caf0873eaed38d77f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=91=D1=80=D0=B0=D0=BD=D0=B8=D0=BC=D0=B8=D1=80=20=D0=9A?= =?UTF-8?q?=D0=B0=D1=80=D0=B0=D1=9F=D0=B8=D1=9B?= Date: Sun, 1 Jan 2023 19:16:52 -0800 Subject: [PATCH] Updated astc-encoder. --- 3rdparty/astc-encoder/include/astcenc.h | 22 +-- .../source/astcenc_color_quantize.cpp | 8 +- .../source/astcenc_compress_symbolic.cpp | 10 +- .../source/astcenc_decompress_symbolic.cpp | 2 +- .../astc-encoder/source/astcenc_entry.cpp | 58 ++++---- .../source/astcenc_find_best_partitioning.cpp | 8 +- .../astcenc_ideal_endpoints_and_weights.cpp | 95 +++++++------ .../source/astcenc_integer_sequence.cpp | 127 +++++++++--------- .../astc-encoder/source/astcenc_internal.h | 3 - .../source/astcenc_mathlib_softfloat.cpp | 4 +- .../source/astcenc_percentile_tables.cpp | 4 +- .../astcenc_pick_best_endpoint_format.cpp | 33 ++--- .../source/astcenc_platform_isa_detection.cpp | 4 +- .../source/astcenc_symbolic_physical.cpp | 4 + .../astc-encoder/source/astcenc_vecmathlib.h | 4 +- .../source/astcenc_vecmathlib_neon_4.h | 9 +- .../astcenc_weight_quant_xfer_tables.cpp | 38 ++---- 17 files changed, 201 insertions(+), 232 deletions(-) diff --git a/3rdparty/astc-encoder/include/astcenc.h b/3rdparty/astc-encoder/include/astcenc.h index 56f1ad8..add9491 100644 --- a/3rdparty/astc-encoder/include/astcenc.h +++ b/3rdparty/astc-encoder/include/astcenc.h @@ -514,25 +514,15 @@ struct astcenc_config float tune_db_limit; /** - * @brief The amount of overshoot needed to early-out mode 0 fast path. + * @brief The amount of MSE overshoot needed to early-out trials. * - * We have a fast-path for mode 0 (1 partition, 1 plane) which uses only essential block modes - * as an initial search. This can short-cut compression for simple blocks, but to avoid - * short-cutting too much we force this to overshoot the MSE threshold needed to hit the - * block-local db_limit e.g. 1.0 = no overshoot, 2.0 = need half the error to trigger. 
- */ - float tune_mode0_mse_overshoot; - - /** - * @brief The amount of overshoot needed to early-out refinement. + * The first early-out is for 1 partition, 1 plane trials, where we try a minimal encode using + * the high probability block modes. This can short-cut compression for simple blocks. * - * The codec will refine block candidates iteratively to improve the encoding, based on the - * @c tune_refinement_limit count. Earlier implementations will use all refinement iterations, - * even if the target threshold is reached. This tuning parameter allows an early out, but with - * an overshoot MSE threshold. Setting this to 1.0 will early-out as soon as the target is hit, - * but does reduce image quality vs the default behavior of over-refinement. + * The second early-out is for refinement trials, where we can exit refinement once quality is + * reached. */ - float tune_refinement_mse_overshoot; + float tune_mse_overshoot; /** * @brief The threshold for skipping 3.1/4.1 trials (-2partitionlimitfactor). 
diff --git a/3rdparty/astc-encoder/source/astcenc_color_quantize.cpp b/3rdparty/astc-encoder/source/astcenc_color_quantize.cpp index 1f5a4d9..edcfe4f 100644 --- a/3rdparty/astc-encoder/source/astcenc_color_quantize.cpp +++ b/3rdparty/astc-encoder/source/astcenc_color_quantize.cpp @@ -1960,7 +1960,7 @@ uint8_t pack_color_endpoints( switch (format) { case FMT_RGB: - if (quant_level <= 18) + if (quant_level <= QUANT_160) { if (try_quantize_rgb_delta_blue_contract(color0, color1, output, quant_level)) { @@ -1973,7 +1973,7 @@ uint8_t pack_color_endpoints( break; } } - if (try_quantize_rgb_blue_contract(color0, color1, output, quant_level)) + if (quant_level < QUANT_256 && try_quantize_rgb_blue_contract(color0, color1, output, quant_level)) { retval = FMT_RGB; break; @@ -1983,7 +1983,7 @@ uint8_t pack_color_endpoints( break; case FMT_RGBA: - if (quant_level <= 18) + if (quant_level <= QUANT_160) { if (try_quantize_rgba_delta_blue_contract(color0, color1, output, quant_level)) { @@ -1996,7 +1996,7 @@ uint8_t pack_color_endpoints( break; } } - if (try_quantize_rgba_blue_contract(color0, color1, output, quant_level)) + if (quant_level < QUANT_256 && try_quantize_rgba_blue_contract(color0, color1, output, quant_level)) { retval = FMT_RGBA; break; diff --git a/3rdparty/astc-encoder/source/astcenc_compress_symbolic.cpp b/3rdparty/astc-encoder/source/astcenc_compress_symbolic.cpp index 4dbe6f1..f89aea0 100644 --- a/3rdparty/astc-encoder/source/astcenc_compress_symbolic.cpp +++ b/3rdparty/astc-encoder/source/astcenc_compress_symbolic.cpp @@ -82,7 +82,7 @@ static bool realign_weights_undecimated( const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level]; unsigned int max_plane = bm.is_dual_plane; - int plane2_component = bm.is_dual_plane ? 
scb.plane2_component : -1; + int plane2_component = scb.plane2_component; vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component); // Decode the color endpoints @@ -206,7 +206,7 @@ static bool realign_weights_decimated( assert(weight_count != bsd.texel_count); unsigned int max_plane = bm.is_dual_plane; - int plane2_component = bm.is_dual_plane ? scb.plane2_component : -1; + int plane2_component = scb.plane2_component; vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component); // Decode the color endpoints @@ -1279,13 +1279,13 @@ void compress_block( // compression and slightly reduces image quality. float errorval_mult[2] { - 1.0f / ctx.config.tune_mode0_mse_overshoot, + 1.0f / ctx.config.tune_mse_overshoot, 1.0f }; - static const float errorval_overshoot = 1.0f / ctx.config.tune_refinement_mse_overshoot; + static const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot; - // Only enable MODE0 fast path (trial 0) if 2D and more than 25 texels + // Only enable MODE0 fast path (trial 0) if 2D, and more than 25 texels int start_trial = 1; if ((bsd.texel_count >= TUNE_MIN_TEXELS_MODE0_FASTPATH) && (bsd.zdim == 1)) { diff --git a/3rdparty/astc-encoder/source/astcenc_decompress_symbolic.cpp b/3rdparty/astc-encoder/source/astcenc_decompress_symbolic.cpp index 8be222b..e462b91 100644 --- a/3rdparty/astc-encoder/source/astcenc_decompress_symbolic.cpp +++ b/3rdparty/astc-encoder/source/astcenc_decompress_symbolic.cpp @@ -286,7 +286,7 @@ void decompress_symbolic_block( unpack_weights(bsd, scb, di, is_dual_plane, plane1_weights, plane2_weights); // Now that we have endpoint colors and weights, we can unpack texel colors - int plane2_component = is_dual_plane ? 
scb.plane2_component : -1; + int plane2_component = scb.plane2_component; vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component); for (int i = 0; i < partition_count; i++) diff --git a/3rdparty/astc-encoder/source/astcenc_entry.cpp b/3rdparty/astc-encoder/source/astcenc_entry.cpp index 95125b3..fcd6dae 100644 --- a/3rdparty/astc-encoder/source/astcenc_entry.cpp +++ b/3rdparty/astc-encoder/source/astcenc_entry.cpp @@ -51,89 +51,84 @@ struct astcenc_preset_config unsigned int tune_4partitioning_candidate_limit; float tune_db_limit_a_base; float tune_db_limit_b_base; - float tune_mode0_mse_overshoot; - float tune_refinement_mse_overshoot; + float tune_mse_overshoot; float tune_2_partition_early_out_limit_factor; float tune_3_partition_early_out_limit_factor; float tune_2_plane_early_out_limit_correlation; }; - /** - * @brief The static quality presets that are built-in for high bandwidth - * presets (x < 25 texels per block). + * @brief The static presets for high bandwidth encodings (x < 25 texels per block). 
*/ static const std::array preset_configs_high {{ { ASTCENC_PRE_FASTEST, - 2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.85f + 2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f }, { ASTCENC_PRE_FAST, - 3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.90f + 3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.90f }, { ASTCENC_PRE_MEDIUM, - 4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 2.5f, 1.1f, 1.05f, 0.95f + 4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 1.1f, 1.05f, 0.95f }, { ASTCENC_PRE_THOROUGH, - 4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 10.0f, 1.35f, 1.15f, 0.97f + 4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.35f, 1.15f, 0.97f }, { ASTCENC_PRE_VERYTHOROUGH, - 4, 256, 128, 64, 98, 4, 6, 20, 14, 8, 200.0f, 200.0f, 10.0f, 10.0f, 1.6f, 1.4f, 0.98f + 4, 256, 128, 64, 98, 4, 6, 20, 14, 8, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f }, { ASTCENC_PRE_EXHAUSTIVE, - 4, 512, 512, 512, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 10.0f, 2.0f, 2.0f, 0.99f + 4, 512, 512, 512, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f } }}; /** - * @brief The static quality presets that are built-in for medium bandwidth - * presets (25 <= x < 64 texels per block). + * @brief The static presets for medium bandwidth encodings (25 <= x < 64 texels per block). 
*/ static const std::array preset_configs_mid {{ { ASTCENC_PRE_FASTEST, - 2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.80f + 2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.80f }, { ASTCENC_PRE_FAST, - 3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.85f + 3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f }, { ASTCENC_PRE_MEDIUM, - 4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 3.0f, 1.1f, 1.05f, 0.90f + 4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 1.1f, 1.05f, 0.90f }, { ASTCENC_PRE_THOROUGH, - 4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 10.0f, 1.4f, 1.2f, 0.95f + 4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.4f, 1.2f, 0.95f }, { ASTCENC_PRE_VERYTHOROUGH, - 4, 256, 128, 64, 98, 4, 6, 12, 8, 3, 200.0f, 200.0f, 10.0f, 10.0f, 1.6f, 1.4f, 0.98f + 4, 256, 128, 64, 98, 4, 6, 12, 8, 3, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f }, { ASTCENC_PRE_EXHAUSTIVE, - 4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 10.0f, 2.0f, 2.0f, 0.99f + 4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f } }}; /** - * @brief The static quality presets that are built-in for low bandwidth - * presets (64 <= x texels per block). + * @brief The static presets for low bandwidth encodings (64 <= x texels per block). 
*/ static const std::array preset_configs_low {{ { ASTCENC_PRE_FASTEST, - 2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.0f, 0.80f + 2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.80f }, { ASTCENC_PRE_FAST, - 2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.0f, 0.85f + 2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.85f }, { ASTCENC_PRE_MEDIUM, - 3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 3.5f, 1.1f, 1.05f, 0.90f + 3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 1.1f, 1.05f, 0.90f }, { ASTCENC_PRE_THOROUGH, - 4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 10.0f, 1.3f, 1.2f, 0.97f + 4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.3f, 1.2f, 0.97f }, { ASTCENC_PRE_VERYTHOROUGH, - 4, 256, 128, 64, 98, 4, 6, 9, 5, 2, 200.0f, 200.0f, 10.0f, 10.0f, 1.6f, 1.4f, 0.98f + 4, 256, 128, 64, 98, 4, 6, 9, 5, 2, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f }, { ASTCENC_PRE_EXHAUSTIVE, - 4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 10.0f, 2.0f, 2.0f, 0.99f + 4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f } }}; @@ -444,8 +439,7 @@ static astcenc_error validate_config( config.tune_3partitioning_candidate_limit = astc::clamp(config.tune_3partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIIONING_CANDIDATES); config.tune_4partitioning_candidate_limit = astc::clamp(config.tune_4partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIIONING_CANDIDATES); config.tune_db_limit = astc::max(config.tune_db_limit, 0.0f); - config.tune_mode0_mse_overshoot = astc::max(config.tune_mode0_mse_overshoot, 1.0f); - config.tune_refinement_mse_overshoot = astc::max(config.tune_refinement_mse_overshoot, 1.0f); + config.tune_mse_overshoot = astc::max(config.tune_mse_overshoot, 1.0f); config.tune_2_partition_early_out_limit_factor = astc::max(config.tune_2_partition_early_out_limit_factor, 0.0f); config.tune_3_partition_early_out_limit_factor = 
astc::max(config.tune_3_partition_early_out_limit_factor, 0.0f); config.tune_2_plane_early_out_limit_correlation = astc::max(config.tune_2_plane_early_out_limit_correlation, 0.0f); @@ -568,8 +562,7 @@ astcenc_error astcenc_config_init( config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels, (*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels); - config.tune_mode0_mse_overshoot = (*preset_configs)[start].tune_mode0_mse_overshoot; - config.tune_refinement_mse_overshoot = (*preset_configs)[start].tune_refinement_mse_overshoot; + config.tune_mse_overshoot = (*preset_configs)[start].tune_mse_overshoot; config.tune_2_partition_early_out_limit_factor = (*preset_configs)[start].tune_2_partition_early_out_limit_factor; config.tune_3_partition_early_out_limit_factor =(*preset_configs)[start].tune_3_partition_early_out_limit_factor; @@ -611,8 +604,7 @@ astcenc_error astcenc_config_init( config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels, LERP(tune_db_limit_b_base) - 19 * ltexels); - config.tune_mode0_mse_overshoot = LERP(tune_mode0_mse_overshoot); - config.tune_refinement_mse_overshoot = LERP(tune_refinement_mse_overshoot); + config.tune_mse_overshoot = LERP(tune_mse_overshoot); config.tune_2_partition_early_out_limit_factor = LERP(tune_2_partition_early_out_limit_factor); config.tune_3_partition_early_out_limit_factor = LERP(tune_3_partition_early_out_limit_factor); diff --git a/3rdparty/astc-encoder/source/astcenc_find_best_partitioning.cpp b/3rdparty/astc-encoder/source/astcenc_find_best_partitioning.cpp index c59e093..71fe44f 100644 --- a/3rdparty/astc-encoder/source/astcenc_find_best_partitioning.cpp +++ b/3rdparty/astc-encoder/source/astcenc_find_best_partitioning.cpp @@ -362,6 +362,7 @@ static void count_partition_mismatch_bits( unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS] ) { unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1]; + promise(active_count > 0); if 
(partition_count == 2) { @@ -400,6 +401,7 @@ static unsigned int get_partition_ordering_by_mismatch_bits( const unsigned int mismatch_count[BLOCK_MAX_PARTITIONINGS], unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS] ) { + promise(partitioning_count > 0); unsigned int mscount[256] { 0 }; // Create the histogram of mismatch counts @@ -488,7 +490,7 @@ static unsigned int compute_kmeans_partition_ordering( /** * @brief Insert a partitioning into an order list of results, sorted by error. * - * @param max_values The max number of entries in the best result arrays/ + * @param max_values The max number of entries in the best result arrays. * @param this_error The error of the new entry. * @param this_partition The partition ID of the new entry. * @param[out] best_errors The array of best error values. @@ -501,6 +503,8 @@ static void insert_result( float* best_errors, unsigned int* best_partitions) { + promise(max_values > 0); + // Don't bother searching if the current worst error beats the new error if (this_error >= best_errors[max_values - 1]) { @@ -508,7 +512,7 @@ static void insert_result( } // Else insert into the list in error-order - for (unsigned int i = 0; i < max_values; i++) + for (unsigned int i = 0; i < max_values; i++) { // Existing result is better - move on ... 
if (this_error > best_errors[i]) diff --git a/3rdparty/astc-encoder/source/astcenc_ideal_endpoints_and_weights.cpp b/3rdparty/astc-encoder/source/astcenc_ideal_endpoints_and_weights.cpp index 3c18e87..f4da608 100644 --- a/3rdparty/astc-encoder/source/astcenc_ideal_endpoints_and_weights.cpp +++ b/3rdparty/astc-encoder/source/astcenc_ideal_endpoints_and_weights.cpp @@ -692,6 +692,7 @@ float compute_error_of_weight_set_1plane( ) { vfloatacc error_summav = vfloatacc::zero(); unsigned int texel_count = di.texel_count; + promise(texel_count > 0); // Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized if (di.max_texel_weight_count > 2) @@ -757,6 +758,7 @@ float compute_error_of_weight_set_2planes( ) { vfloatacc error_summav = vfloatacc::zero(); unsigned int texel_count = di.texel_count; + promise(texel_count > 0); // Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized if (di.max_texel_weight_count > 2) @@ -861,8 +863,7 @@ void compute_ideal_weights_for_decimation( // zero-initialized SIMD over-fetch region if (is_direct) { - unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count); - for (unsigned int i = 0; i < texel_count_simd; i += ASTCENC_SIMD_WIDTH) + for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) { vfloat weight(ei.weights + i); storea(weight, dec_weight_ideal_value + i); @@ -970,7 +971,7 @@ void compute_ideal_weights_for_decimation( vfloat step = (error_change1 * chd_scale) / error_change0; step = clamp(-stepsize, stepsize, step); - // Update the weight; note this can store negative values. 
+ // Update the weight; note this can store negative values storea(weight_val + step, dec_weight_ideal_value + i); } } @@ -1215,7 +1216,7 @@ void recompute_ideal_colors_1plane( // Only compute a partition mean if more than one partition if (partition_count > 1) { - rgba_sum = vfloat4(1e-17f); + rgba_sum = vfloat4::zero(); promise(texel_count > 0); for (unsigned int j = 0; j < texel_count; j++) { @@ -1251,7 +1252,6 @@ void recompute_ideal_colors_1plane( for (unsigned int j = 0; j < texel_count; j++) { unsigned int tix = texel_indexes[j]; - vfloat4 rgba = blk.texel(tix); float idx0 = undec_weight_ref[tix]; @@ -1284,9 +1284,6 @@ void recompute_ideal_colors_1plane( vfloat4 right_sum = vfloat4(right_sum_s) * color_weight; vfloat4 lmrs_sum = vfloat3(left_sum_s, middle_sum_s, right_sum_s) * ls_weight; - vfloat4 weight_weight_sum = vfloat4(weight_weight_sum_s) * color_weight; - float psum = right_sum_s * hadd_rgb_s(color_weight); - color_vec_x = color_vec_x * color_weight; color_vec_y = color_vec_y * color_weight; @@ -1349,26 +1346,32 @@ void recompute_ideal_colors_1plane( } } - // Calculations specific to mode #7, the HDR RGB-scale mode - vfloat4 rgbq_sum = color_vec_x + color_vec_y; - rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y)); - - vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum); - rgbo_vectors[i] = rgbovec; - - // We can get a failure due to the use of a singular (non-invertible) matrix - // If it failed, compute rgbo_vectors[] with a different method ... 
- if (astc::isnan(dot_s(rgbovec, rgbovec))) + // Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR + if (blk.rgb_lns[0] || blk.alpha_lns[0]) { - vfloat4 v0 = ep.endpt0[i]; - vfloat4 v1 = ep.endpt1[i]; + vfloat4 weight_weight_sum = vfloat4(weight_weight_sum_s) * color_weight; + float psum = right_sum_s * hadd_rgb_s(color_weight); - float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f); - avgdif = astc::max(avgdif, 0.0f); + vfloat4 rgbq_sum = color_vec_x + color_vec_y; + rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y)); - vfloat4 avg = (v0 + v1) * 0.5f; - vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f; - rgbo_vectors[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif); + vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum); + rgbo_vectors[i] = rgbovec; + + // We can get a failure due to the use of a singular (non-invertible) matrix + // If it failed, compute rgbo_vectors[] with a different method ... + if (astc::isnan(dot_s(rgbovec, rgbovec))) + { + vfloat4 v0 = ep.endpt0[i]; + vfloat4 v1 = ep.endpt1[i]; + + float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f); + avgdif = astc::max(avgdif, 0.0f); + + vfloat4 avg = (v0 + v1) * 0.5f; + vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f; + rgbo_vectors[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif); + } } } } @@ -1516,7 +1519,7 @@ void recompute_ideal_colors_2planes( color_vec_x += cwprod - cwiprod; scale_vec += vfloat2(om_idx0, idx0) * (ls_weight * scale); - weight_weight_sum += (color_weight * color_idx); + weight_weight_sum += color_idx; } vfloat4 left1_sum = vfloat4(left1_sum_s) * color_weight; @@ -1528,8 +1531,6 @@ void recompute_ideal_colors_2planes( vfloat4 middle2_sum = vfloat4(middle2_sum_s) * color_weight; vfloat4 right2_sum = vfloat4(right2_sum_s) * color_weight; - float psum = dot3_s(select(right1_sum, right2_sum, p2_mask), color_weight); - color_vec_x = color_vec_x * color_weight; color_vec_y = color_vec_y * color_weight; @@ 
-1630,26 +1631,32 @@ void recompute_ideal_colors_2planes( ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask); } - // Calculations specific to mode #7, the HDR RGB-scale mode - vfloat4 rgbq_sum = color_vec_x + color_vec_y; - rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y)); - - rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum); - - // We can get a failure due to the use of a singular (non-invertible) matrix - // If it failed, compute rgbo_vectors[] with a different method ... - if (astc::isnan(dot_s(rgbo_vector, rgbo_vector))) + // Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR + if (blk.rgb_lns[0] || blk.alpha_lns[0]) { - vfloat4 v0 = ep.endpt0[0]; - vfloat4 v1 = ep.endpt1[0]; + weight_weight_sum = weight_weight_sum * color_weight; + float psum = dot3_s(select(right1_sum, right2_sum, p2_mask), color_weight); - float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f); - avgdif = astc::max(avgdif, 0.0f); + vfloat4 rgbq_sum = color_vec_x + color_vec_y; + rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y)); - vfloat4 avg = (v0 + v1) * 0.5f; - vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f; + rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum); - rgbo_vector = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif); + // We can get a failure due to the use of a singular (non-invertible) matrix + // If it failed, compute rgbo_vectors[] with a different method ... 
+ if (astc::isnan(dot_s(rgbo_vector, rgbo_vector))) + { + vfloat4 v0 = ep.endpt0[0]; + vfloat4 v1 = ep.endpt1[0]; + + float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f); + avgdif = astc::max(avgdif, 0.0f); + + vfloat4 avg = (v0 + v1) * 0.5f; + vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f; + + rgbo_vector = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif); + } } } diff --git a/3rdparty/astc-encoder/source/astcenc_integer_sequence.cpp b/3rdparty/astc-encoder/source/astcenc_integer_sequence.cpp index 9675ab5..4167503 100644 --- a/3rdparty/astc-encoder/source/astcenc_integer_sequence.cpp +++ b/3rdparty/astc-encoder/source/astcenc_integer_sequence.cpp @@ -24,6 +24,7 @@ #include /** @brief Unpacked quint triplets for each packed value */ +// TODO: Bitpack these into a uint16_t? static const uint8_t quints_of_integer[128][3] { {0, 0, 0}, {1, 0, 0}, {2, 0, 0}, {3, 0, 0}, {4, 0, 0}, {0, 4, 0}, {4, 4, 0}, {4, 4, 4}, @@ -99,6 +100,7 @@ static const uint8_t integer_of_quints[5][5][5] { }; /** @brief Unpacked trit quintuplets for each packed value */ +// TODO: Bitpack these into a uint16_t? static const uint8_t trits_of_integer[256][5] { {0, 0, 0, 0, 0}, {1, 0, 0, 0, 0}, {2, 0, 0, 0, 0}, {0, 0, 2, 0, 0}, {0, 1, 0, 0, 0}, {1, 1, 0, 0, 0}, {2, 1, 0, 0, 0}, {1, 0, 2, 0, 0}, @@ -334,44 +336,41 @@ static const uint8_t integer_of_trits[3][3][3][3][3] { */ struct btq_count { - /** @brief The quantization level. */ - uint8_t quant; - /** @brief The number of bits. */ - uint8_t bits; + uint8_t bits:6; /** @brief The number of trits. */ - uint8_t trits; + uint8_t trits:1; /** @brief The number of quints. */ - uint8_t quints; + uint8_t quints:1; }; /** * @brief The table of bits, trits, and quints needed for a quant encode. 
*/ static const std::array btq_counts {{ - { QUANT_2, 1, 0, 0 }, - { QUANT_3, 0, 1, 0 }, - { QUANT_4, 2, 0, 0 }, - { QUANT_5, 0, 0, 1 }, - { QUANT_6, 1, 1, 0 }, - { QUANT_8, 3, 0, 0 }, - { QUANT_10, 1, 0, 1 }, - { QUANT_12, 2, 1, 0 }, - { QUANT_16, 4, 0, 0 }, - { QUANT_20, 2, 0, 1 }, - { QUANT_24, 3, 1, 0 }, - { QUANT_32, 5, 0, 0 }, - { QUANT_40, 3, 0, 1 }, - { QUANT_48, 4, 1, 0 }, - { QUANT_64, 6, 0, 0 }, - { QUANT_80, 4, 0, 1 }, - { QUANT_96, 5, 1, 0 }, - { QUANT_128, 7, 0, 0 }, - { QUANT_160, 5, 0, 1 }, - { QUANT_192, 6, 1, 0 }, - { QUANT_256, 8, 0, 0 } + { 1, 0, 0 }, // QUANT_2 + { 0, 1, 0 }, // QUANT_3 + { 2, 0, 0 }, // QUANT_4 + { 0, 0, 1 }, // QUANT_5 + { 1, 1, 0 }, // QUANT_6 + { 3, 0, 0 }, // QUANT_8 + { 1, 0, 1 }, // QUANT_10 + { 2, 1, 0 }, // QUANT_12 + { 4, 0, 0 }, // QUANT_16 + { 2, 0, 1 }, // QUANT_20 + { 3, 1, 0 }, // QUANT_24 + { 5, 0, 0 }, // QUANT_32 + { 3, 0, 1 }, // QUANT_40 + { 4, 1, 0 }, // QUANT_48 + { 6, 0, 0 }, // QUANT_64 + { 4, 0, 1 }, // QUANT_80 + { 5, 1, 0 }, // QUANT_96 + { 7, 0, 0 }, // QUANT_128 + { 5, 0, 1 }, // QUANT_160 + { 6, 1, 0 }, // QUANT_192 + { 8, 0, 0 } // QUANT_256 }}; /** @@ -382,44 +381,38 @@ static const std::array btq_counts {{ */ struct ise_size { - /** @brief The quantization level. */ - uint8_t quant; - /** @brief The scaling parameter. */ - uint8_t scale; - - /** @brief The rounding parameter. */ - uint8_t round; + uint8_t scale:6; /** @brief The divisor parameter. */ - uint8_t divisor; + uint8_t divisor:2; }; /** * @brief The table of scale, round, and divisors needed for quant sizing. 
*/ static const std::array ise_sizes {{ - { QUANT_2, 1, 0, 1 }, - { QUANT_3, 8, 4, 5 }, - { QUANT_4, 2, 0, 1 }, - { QUANT_5, 7, 2, 3 }, - { QUANT_6, 13, 4, 5 }, - { QUANT_8, 3, 0, 1 }, - { QUANT_10, 10, 2, 3 }, - { QUANT_12, 18, 4, 5 }, - { QUANT_16, 4, 0, 1 }, - { QUANT_20, 13, 2, 3 }, - { QUANT_24, 23, 4, 5 }, - { QUANT_32, 5, 0, 1 }, - { QUANT_40, 16, 2, 3 }, - { QUANT_48, 28, 4, 5 }, - { QUANT_64, 6, 0, 1 }, - { QUANT_80, 19, 2, 3 }, - { QUANT_96, 33, 4, 5 }, - { QUANT_128, 7, 0, 1 }, - { QUANT_160, 22, 2, 3 }, - { QUANT_192, 38, 4, 5 }, - { QUANT_256, 8, 0, 1 } + { 1, 0 }, // QUANT_2 + { 8, 2 }, // QUANT_3 + { 2, 0 }, // QUANT_4 + { 7, 1 }, // QUANT_5 + { 13, 2 }, // QUANT_6 + { 3, 0 }, // QUANT_8 + { 10, 1 }, // QUANT_10 + { 18, 2 }, // QUANT_12 + { 4, 0 }, // QUANT_16 + { 13, 1 }, // QUANT_20 + { 23, 2 }, // QUANT_24 + { 5, 0 }, // QUANT_32 + { 16, 1 }, // QUANT_40 + { 28, 2 }, // QUANT_48 + { 6, 0 }, // QUANT_64 + { 19, 1 }, // QUANT_80 + { 33, 2 }, // QUANT_96 + { 7, 0 }, // QUANT_128 + { 22, 1 }, // QUANT_160 + { 38, 2 }, // QUANT_192 + { 8, 0 } // QUANT_256 }}; /* See header for documentation. 
*/ @@ -435,7 +428,8 @@ unsigned int get_ise_sequence_bitcount( } auto& entry = ise_sizes[quant_level]; - return (entry.scale * character_count + entry.round) / entry.divisor; + unsigned int divisor = (entry.divisor << 1) + 1; + return (entry.scale * character_count + divisor - 1) / divisor; } /** @@ -645,7 +639,6 @@ void encode_ise( // Write out just bits else { - promise(character_count > 0); for (unsigned int i = 0; i < character_count; i++) { write_bits(input_data[i], bits, bit_offset, output_data); @@ -685,10 +678,10 @@ void decode_ise( if (trits) { - static const unsigned int bits_to_read[5] { 2, 2, 1, 2, 1 }; - static const unsigned int block_shift[5] { 0, 2, 4, 5, 7 }; - static const unsigned int next_lcounter[5] { 1, 2, 3, 4, 0 }; - static const unsigned int hcounter_incr[5] { 0, 0, 0, 0, 1 }; + static const uint8_t bits_to_read[5] { 2, 2, 1, 2, 1 }; + static const uint8_t block_shift[5] { 0, 2, 4, 5, 7 }; + static const uint8_t next_lcounter[5] { 1, 2, 3, 4, 0 }; + static const uint8_t hcounter_incr[5] { 0, 0, 0, 0, 1 }; unsigned int tdata = read_bits(bits_to_read[lcounter], bit_offset, input_data); bit_offset += bits_to_read[lcounter]; tq_blocks[hcounter] |= tdata << block_shift[lcounter]; @@ -698,10 +691,10 @@ void decode_ise( if (quints) { - static const unsigned int bits_to_read[3] { 3, 2, 2 }; - static const unsigned int block_shift[3] { 0, 3, 5 }; - static const unsigned int next_lcounter[3] { 1, 2, 0 }; - static const unsigned int hcounter_incr[3] { 0, 0, 1 }; + static const uint8_t bits_to_read[3] { 3, 2, 2 }; + static const uint8_t block_shift[3] { 0, 3, 5 }; + static const uint8_t next_lcounter[3] { 1, 2, 0 }; + static const uint8_t hcounter_incr[3] { 0, 0, 1 }; unsigned int tdata = read_bits(bits_to_read[lcounter], bit_offset, input_data); bit_offset += bits_to_read[lcounter]; tq_blocks[hcounter] |= tdata << block_shift[lcounter]; @@ -714,6 +707,7 @@ void decode_ise( if (trits) { unsigned int trit_blocks = (character_count + 4) / 5; + 
promise(trit_blocks > 0); for (unsigned int i = 0; i < trit_blocks; i++) { const uint8_t *tritptr = trits_of_integer[tq_blocks[i]]; @@ -728,6 +722,7 @@ void decode_ise( if (quints) { unsigned int quint_blocks = (character_count + 2) / 3; + promise(quint_blocks > 0); for (unsigned int i = 0; i < quint_blocks; i++) { const uint8_t *quintptr = quints_of_integer[tq_blocks[i]]; diff --git a/3rdparty/astc-encoder/source/astcenc_internal.h b/3rdparty/astc-encoder/source/astcenc_internal.h index 0fa8ec6..2cac014 100644 --- a/3rdparty/astc-encoder/source/astcenc_internal.h +++ b/3rdparty/astc-encoder/source/astcenc_internal.h @@ -1008,9 +1008,6 @@ struct dt_init_working_buffers */ struct quant_and_transfer_table { - /** @brief The quantization level used. */ - quant_method method; - /** @brief The unscrambled unquantized value. */ int8_t quant_to_unquant[32]; diff --git a/3rdparty/astc-encoder/source/astcenc_mathlib_softfloat.cpp b/3rdparty/astc-encoder/source/astcenc_mathlib_softfloat.cpp index 42db764..fa66036 100644 --- a/3rdparty/astc-encoder/source/astcenc_mathlib_softfloat.cpp +++ b/3rdparty/astc-encoder/source/astcenc_mathlib_softfloat.cpp @@ -15,13 +15,13 @@ // under the License. // ---------------------------------------------------------------------------- +#include "astcenc_mathlib.h" + /** * @brief Soft-float library for IEEE-754. */ #if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0) -#include "astcenc_mathlib.h" - /* sized soft-float types. 
These are mapped to the sized integer types of C99, instead of C's floating-point types; this is because the library needs to maintain exact, bit-level control on all diff --git a/3rdparty/astc-encoder/source/astcenc_percentile_tables.cpp b/3rdparty/astc-encoder/source/astcenc_percentile_tables.cpp index 3914ef2..448ddcc 100644 --- a/3rdparty/astc-encoder/source/astcenc_percentile_tables.cpp +++ b/3rdparty/astc-encoder/source/astcenc_percentile_tables.cpp @@ -1166,11 +1166,11 @@ const float *get_2d_percentile_table( unsigned int xdim, unsigned int ydim ) { - float* unpacked_table = new float[2048]; + float* unpacked_table = new float[WEIGHTS_MAX_BLOCK_MODES]; const packed_percentile_table *apt = get_packed_table(xdim, ydim); // Set the default percentile - for (unsigned int i = 0; i < 2048; i++) + for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++) { unpacked_table[i] = 1.0f; } diff --git a/3rdparty/astc-encoder/source/astcenc_pick_best_endpoint_format.cpp b/3rdparty/astc-encoder/source/astcenc_pick_best_endpoint_format.cpp index 5bf0b36..f25140d 100644 --- a/3rdparty/astc-encoder/source/astcenc_pick_best_endpoint_format.cpp +++ b/3rdparty/astc-encoder/source/astcenc_pick_best_endpoint_format.cpp @@ -325,11 +325,7 @@ static void compute_color_error_for_every_integer_count_and_quant_level( ) { int partition_size = pi.partition_texel_count[partition_index]; - static const float baseline_quant_error[21] { - (65536.0f * 65536.0f / 18.0f), // 2 values, 1 step - (65536.0f * 65536.0f / 18.0f) / (2 * 2), // 3 values, 2 steps - (65536.0f * 65536.0f / 18.0f) / (3 * 3), // 4 values, 3 steps - (65536.0f * 65536.0f / 18.0f) / (4 * 4), // 5 values + static const float baseline_quant_error[21 - QUANT_6] { (65536.0f * 65536.0f / 18.0f) / (5 * 5), (65536.0f * 65536.0f / 18.0f) / (7 * 7), (65536.0f * 65536.0f / 18.0f) / (9 * 9), @@ -528,7 +524,7 @@ static void compute_color_error_for_every_integer_count_and_quant_level( // The base_quant_error should depend on the 
scale-factor that would be used during // actual encode of the color value - float base_quant_error = baseline_quant_error[i] * static_cast<float>(partition_size); + float base_quant_error = baseline_quant_error[i - QUANT_6] * static_cast<float>(partition_size); float rgb_quantization_error = error_weight_rgbsum * base_quant_error * 2.0f; float alpha_quantization_error = error_weight.lane<3>() * base_quant_error * 2.0f; float rgba_quantization_error = rgb_quantization_error + alpha_quantization_error; @@ -591,7 +587,7 @@ static void compute_color_error_for_every_integer_count_and_quant_level( error_scale_oe_rgb = 1.0f; } - float base_quant_error = baseline_quant_error[i]; + float base_quant_error = baseline_quant_error[i - QUANT_6]; float quant_error_rgb = base_quant_error_rgb * base_quant_error; float quant_error_rgba = base_quant_error_rgba * base_quant_error; @@ -1136,22 +1132,19 @@ unsigned int compute_ideal_endpoint_formats( uint8_t (&best_ep_formats)[WEIGHTS_MAX_BLOCK_MODES][BLOCK_MAX_PARTITIONS] = tmpbuf.best_ep_formats; // Ensure that the first iteration understep contains data that will never be picked + vfloat clear_error(ERROR_CALC_DEFAULT); + vint clear_quant(0); + unsigned int packed_start_block_mode = round_down_to_simd_multiple_vla(start_block_mode); - for (unsigned int i = packed_start_block_mode; i < start_block_mode; i++) - { - errors_of_best_combination[i] = ERROR_CALC_DEFAULT; - best_quant_levels[i] = QUANT_2; - best_quant_levels_mod[i] = QUANT_2; - } + storea(clear_error, errors_of_best_combination + packed_start_block_mode); + store_nbytes(clear_quant, best_quant_levels + packed_start_block_mode); + store_nbytes(clear_quant, best_quant_levels_mod + packed_start_block_mode); // Ensure that last iteration overstep contains data that will never be picked - const unsigned int packed_end_block_mode = round_up_to_simd_multiple_vla(end_block_mode); - for (unsigned int i = end_block_mode; i < packed_end_block_mode; i++) - { - errors_of_best_combination[i] =
ERROR_CALC_DEFAULT; - best_quant_levels[i] = QUANT_2; - best_quant_levels_mod[i] = QUANT_2; - } + unsigned int packed_end_block_mode = round_down_to_simd_multiple_vla(end_block_mode - 1); + storea(clear_error, errors_of_best_combination + packed_end_block_mode); + store_nbytes(clear_quant, best_quant_levels + packed_end_block_mode); + store_nbytes(clear_quant, best_quant_levels_mod + packed_end_block_mode); // Track a scalar best to avoid expensive search at least once ... float error_of_best_combination = ERROR_CALC_DEFAULT; diff --git a/3rdparty/astc-encoder/source/astcenc_platform_isa_detection.cpp b/3rdparty/astc-encoder/source/astcenc_platform_isa_detection.cpp index 4158da3..8ed9843 100644 --- a/3rdparty/astc-encoder/source/astcenc_platform_isa_detection.cpp +++ b/3rdparty/astc-encoder/source/astcenc_platform_isa_detection.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2020-2021 Arm Limited +// Copyright 2020-2022 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. 
You may obtain a copy @@ -47,7 +47,7 @@ static bool g_cpu_has_f16c { false }; ============================================================================ */ #if !defined(__clang__) && defined(_MSC_VER) #define WIN32_LEAN_AND_MEAN -#include <Windows.h> +#include <windows.h> #include <intrin.h> /** diff --git a/3rdparty/astc-encoder/source/astcenc_symbolic_physical.cpp b/3rdparty/astc-encoder/source/astcenc_symbolic_physical.cpp index a19b907..80221a6 100644 --- a/3rdparty/astc-encoder/source/astcenc_symbolic_physical.cpp +++ b/3rdparty/astc-encoder/source/astcenc_symbolic_physical.cpp @@ -371,12 +371,15 @@ void physical_to_symbolic( const auto& di = bsd.get_decimation_info(bm.decimation_mode); int weight_count = di.weight_count; + promise(weight_count > 0); + quant_method weight_quant_method = static_cast<quant_method>(bm.quant_mode); int is_dual_plane = bm.is_dual_plane; int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count; int partition_count = read_bits(2, 11, pcb.data) + 1; + promise(partition_count > 0); scb.block_mode = static_cast<uint16_t>(block_mode); scb.partition_count = static_cast<uint8_t>(partition_count); @@ -523,6 +526,7 @@ void physical_to_symbolic( } // Fetch component for second-plane in the case of dual plane of weights. + scb.plane2_component = -1; if (is_dual_plane) { scb.plane2_component = static_cast<int8_t>(read_bits(2, below_weights_pos - 2, pcb.data)); diff --git a/3rdparty/astc-encoder/source/astcenc_vecmathlib.h b/3rdparty/astc-encoder/source/astcenc_vecmathlib.h index 6085c8e..d48f1d7 100644 --- a/3rdparty/astc-encoder/source/astcenc_vecmathlib.h +++ b/3rdparty/astc-encoder/source/astcenc_vecmathlib.h @@ -26,7 +26,7 @@ * with that is available at compile time. The current vector width is * accessible for e.g. loop strides via the ASTCENC_SIMD_WIDTH constant. * - * Explicit scalar types are acessible via the vint1, vfloat1, vmask1 types. + * Explicit scalar types are accessible via the vint1, vfloat1, vmask1 types.
* These are provided primarily for prototyping and algorithm debug of VLA * implementations. * @@ -402,7 +402,7 @@ static ASTCENC_SIMD_INLINE vint4 clz(vint4 a) // the original integer value into a 2^N encoding we can recover easily. // Convert to float without risk of rounding up by keeping only top 8 bits. - // This trick is is guranteed to keep top 8 bits and clear the 9th. + // This trick is guaranteed to keep top 8 bits and clear the 9th. a = (~lsr<8>(a)) & a; a = float_as_int(int_to_float(a)); diff --git a/3rdparty/astc-encoder/source/astcenc_vecmathlib_neon_4.h b/3rdparty/astc-encoder/source/astcenc_vecmathlib_neon_4.h index e668850..e742eae 100644 --- a/3rdparty/astc-encoder/source/astcenc_vecmathlib_neon_4.h +++ b/3rdparty/astc-encoder/source/astcenc_vecmathlib_neon_4.h @@ -106,7 +106,7 @@ struct vfloat4 */ template <int l> ASTCENC_SIMD_INLINE void set_lane(float a) { - m = vld1q_lane_f32(&a, m, l); + m = vsetq_lane_f32(a, m, l); } /** @@ -122,7 +122,7 @@ struct vfloat4 */ static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p) { - return vfloat4(vdupq_n_f32(*p)); + return vfloat4(vld1q_dup_f32(p)); } /** @@ -202,9 +202,8 @@ struct vint4 */ ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p) { - uint32x2_t t8 {}; // Cast is safe - NEON loads are allowed to be unaligned - t8 = vld1_lane_u32(reinterpret_cast<const uint32_t*>(p), t8, 0); + uint32x2_t t8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p)); uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8))); m = vreinterpretq_s32_u32(vmovl_u16(t16)); } @@ -251,7 +250,7 @@ struct vint4 */ template <int l> ASTCENC_SIMD_INLINE void set_lane(int a) { - m = vld1q_lane_s32(&a, m, l); + m = vsetq_lane_s32(a, m, l); } /** diff --git a/3rdparty/astc-encoder/source/astcenc_weight_quant_xfer_tables.cpp b/3rdparty/astc-encoder/source/astcenc_weight_quant_xfer_tables.cpp index 35bfa08..8fdf73a 100644 --- a/3rdparty/astc-encoder/source/astcenc_weight_quant_xfer_tables.cpp +++ b/3rdparty/astc-encoder/source/astcenc_weight_quant_xfer_tables.cpp @@ -24,9
+24,8 @@ #define _ 0 // Using _ to indicate an entry that will not be used. const quant_and_transfer_table quant_and_xfer_tables[12] { - // Quantization method 0, range 0..1 + // QUANT_2, range 0..1 { - QUANT_2, {0, 64}, {0, 1}, {0, 64}, @@ -34,9 +33,8 @@ const quant_and_transfer_table quant_and_xfer_tables[12] { _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_, 0x4000} }, - // Quantization method 1, range 0..2 + // QUANT_3, range 0..2 { - QUANT_3, {0, 32, 64}, {0, 1, 2}, {0, 32, 64}, @@ -44,19 +42,17 @@ const quant_and_transfer_table quant_and_xfer_tables[12] { _,_,0x4000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_, _,_,_,_,0x4020} }, - // Quantization method 2, range 0..3 + // QUANT_4, range 0..3 { - QUANT_4, {0, 21, 43, 64}, {0, 1, 2, 3}, - {0, 21, 43, 64}, + {0, 21, 43, 64}, {0x1500,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x2b00,_,_,_,_, _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x4015,_,_,_,_,_,_,_,_,_,_,_,_, _,_,_,_,_,_,_,_,0x402b} }, - // Quantization method 3, range 0..4 + // QUANT_5, range 0..4 { - QUANT_5, {0, 16, 32, 48, 64}, {0, 1, 2, 3, 4}, {0, 16, 32, 48, 64}, @@ -64,9 +60,8 @@ const quant_and_transfer_table quant_and_xfer_tables[12] { _,_,_,_,_,_,0x3010,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x4020,_,_,_, _,_,_,_,_,_,_,_,_,_,_,_,0x4030} }, - // Quantization method 4, range 0..5 + // QUANT_6, range 0..5 { - QUANT_6, {0, 12, 25, 39, 52, 64}, {0, 2, 4, 5, 3, 1}, {0, 64, 12, 52, 25, 39}, @@ -74,9 +69,8 @@ const quant_and_transfer_table quant_and_xfer_tables[12] { 0x270c,_,_,_,_,_,_,_,_,_,_,_,_,_,0x3419,_,_,_,_,_,_,_,_,_,_, _,_,0x4027,_,_,_,_,_,_,_,_,_,_,_,0x4034} }, - // Quantization method 5, range 0..7 + // QUANT_8, range 0..7 { - QUANT_8, {0, 9, 18, 27, 37, 46, 55, 64}, {0, 1, 2, 3, 4, 5, 6, 7}, {0, 9, 18, 27, 37, 46, 55, 64}, @@ -84,9 +78,8 @@ const quant_and_transfer_table quant_and_xfer_tables[12] { _,_,_,_,_,_,0x2512,_,_,_,_,_,_,_,_,_,0x2e1b,_,_,_,_,_,_,_,_, 0x3725,_,_,_,_,_,_,_,_,0x402e,_,_,_,_,_,_,_,_,0x4037} }, - // Quantization
method 6, range 0..9 + // QUANT_10, range 0..9 { - QUANT_10, {0, 7, 14, 21, 28, 36, 43, 50, 57, 64}, {0, 2, 4, 6, 8, 9, 7, 5, 3, 1}, {0, 64, 7, 57, 14, 50, 21, 43, 28, 36}, @@ -95,9 +88,8 @@ const quant_and_transfer_table quant_and_xfer_tables[12] { _,0x3224,_,_,_,_,_,_,0x392b,_,_,_,_,_,_,0x4032,_,_,_,_,_, _,0x4039} }, - // Quantization method 7, range 0..11 + // QUANT_12, range 0..11 { - QUANT_12, {0, 5, 11, 17, 23, 28, 36, 41, 47, 53, 59, 64}, {0, 4, 8, 2, 6, 10, 11, 7, 3, 9, 5, 1}, {0, 64, 17, 47, 5, 59, 23, 41, 11, 53, 28, 36}, @@ -106,9 +98,8 @@ const quant_and_transfer_table quant_and_xfer_tables[12] { 0x291c,_,_,_,_,0x2f24,_,_,_,_,_,0x3529,_,_,_,_,_, 0x3b2f,_,_,_,_,_,0x4035,_,_,_,_,0x403b} }, - // Quantization method 8, range 0..15 + // QUANT_16, range 0..15 { - QUANT_16, {0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, {0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64}, @@ -117,9 +108,8 @@ const quant_and_transfer_table quant_and_xfer_tables[12] { _,0x271d,_,_,_,0x2b23,_,_,_,0x2f27,_,_,_,0x342b,_,_,_, _,0x382f,_,_,_,0x3c34,_,_,_,0x4038,_,_,_,0x403c} }, - // Quantization method 9, range 0..19 + // QUANT_20, range 0..19 { - QUANT_20, {0, 3, 6, 9, 13, 16, 19, 23, 26, 29, 35, 38, 41, 45, 48, 51, 55, 58, 61, 64}, {0, 4, 8, 12, 16, 2, 6, 10, 14, 18, 19, 15, 11, 7, 3, 17, 13, 9, 5, 1}, {0, 64, 16, 48, 3, 61, 19, 45, 6, 58, 23, 41, 9, 55, 26, 38, 13, 51, 29, 35}, @@ -129,9 +119,8 @@ const quant_and_transfer_table quant_and_xfer_tables[12] { 0x2d26,_,_,_,0x3029,_,_,0x332d,_,_,0x3730,_,_,_, 0x3a33,_,_,0x3d37,_,_,0x403a,_,_,0x403d} }, - // Quantization method 10, range 0..23 + // QUANT_24, range 0..23 { - QUANT_24, {0, 2, 5, 8, 11, 13, 16, 19, 22, 24, 27, 30, 34, 37, 40, 42, 45, 48, 51, 53, 56, 59, 62, 64}, {0, 8, 16, 2, 10, 18, 4, 12, 20, 6, 14, 22, 23, 15, 7, 21, 13, 5, 19, 11, 3, 17, 9, 1}, {0, 64, 8, 56, 16, 48, 24, 40, 2, 62, 11, 53, 19, 45, 27, 37, 5, 59, 13, 51, 22, 42, 30, 34}, @@ 
-142,9 +131,8 @@ const quant_and_transfer_table quant_and_xfer_tables[12] { _,_,0x3530,_,0x3833,_,_,0x3b35,_,_,0x3e38,_,_, 0x403b,_,0x403e} }, - // Quantization method 11, range 0..31 + // QUANT_32, range 0..31 { - QUANT_32, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64},