mirror of
https://github.com/bkaradzic/bimg.git
synced 2026-02-17 20:52:38 +01:00
Updated astc-encoder.
This commit is contained in:
47
3rdparty/astc-encoder/include/astcenc.h
vendored
47
3rdparty/astc-encoder/include/astcenc.h
vendored
@@ -241,6 +241,9 @@ static const float ASTCENC_PRE_MEDIUM = 60.0f;
|
||||
/** @brief The thorough quality search preset. */
|
||||
static const float ASTCENC_PRE_THOROUGH = 98.0f;
|
||||
|
||||
/** @brief The very thorough quality search preset. */
|
||||
static const float ASTCENC_PRE_VERYTHOROUGH = 99.0f;
|
||||
|
||||
/** @brief The exhaustive, highest quality, search preset. */
|
||||
static const float ASTCENC_PRE_EXHAUSTIVE = 100.0f;
|
||||
|
||||
@@ -440,11 +443,25 @@ struct astcenc_config
|
||||
unsigned int tune_partition_count_limit;
|
||||
|
||||
/**
|
||||
* @brief The maximum number of partitions searched (-partitionindexlimit).
|
||||
* @brief The maximum number of partitions searched (-2partitionindexlimit).
|
||||
*
|
||||
* Valid values are between 1 and 1024.
|
||||
*/
|
||||
unsigned int tune_partition_index_limit;
|
||||
unsigned int tune_2partition_index_limit;
|
||||
|
||||
/**
|
||||
* @brief The maximum number of partitions searched (-3partitionindexlimit).
|
||||
*
|
||||
* Valid values are between 1 and 1024.
|
||||
*/
|
||||
unsigned int tune_3partition_index_limit;
|
||||
|
||||
/**
|
||||
* @brief The maximum number of partitions searched (-4partitionindexlimit).
|
||||
*
|
||||
* Valid values are between 1 and 1024.
|
||||
*/
|
||||
unsigned int tune_4partition_index_limit;
|
||||
|
||||
/**
|
||||
* @brief The maximum centile for block modes searched (-blockmodelimit).
|
||||
@@ -468,6 +485,27 @@ struct astcenc_config
|
||||
*/
|
||||
unsigned int tune_candidate_limit;
|
||||
|
||||
/**
|
||||
* @brief The number of trial partitionings per search (-2partitioncandidatelimit).
|
||||
*
|
||||
* Valid values are between 1 and TUNE_MAX_PARTITIIONING_CANDIDATES.
|
||||
*/
|
||||
unsigned int tune_2partitioning_candidate_limit;
|
||||
|
||||
/**
|
||||
* @brief The number of trial partitionings per search (-3partitioncandidatelimit).
|
||||
*
|
||||
* Valid values are between 1 and TUNE_MAX_PARTITIIONING_CANDIDATES.
|
||||
*/
|
||||
unsigned int tune_3partitioning_candidate_limit;
|
||||
|
||||
/**
|
||||
* @brief The number of trial partitionings per search (-4partitioncandidatelimit).
|
||||
*
|
||||
* Valid values are between 1 and TUNE_MAX_PARTITIIONING_CANDIDATES.
|
||||
*/
|
||||
unsigned int tune_4partitioning_candidate_limit;
|
||||
|
||||
/**
|
||||
* @brief The dB threshold for stopping block search (-dblimit).
|
||||
*
|
||||
@@ -517,11 +555,6 @@ struct astcenc_config
|
||||
*/
|
||||
float tune_2_plane_early_out_limit_correlation;
|
||||
|
||||
/**
|
||||
* @brief The threshold below which (inclusive) we stop testing low/high/low+high cutoffs.
|
||||
*/
|
||||
unsigned int tune_low_weight_count_limit;
|
||||
|
||||
#if defined(ASTCENC_DIAGNOSTICS)
|
||||
/**
|
||||
* @brief The path to save the diagnostic trace data to.
|
||||
|
||||
@@ -334,13 +334,13 @@ static bool try_quantize_rgb_delta(
|
||||
int g0be = quant_color(quant_level, g0b);
|
||||
int b0be = quant_color(quant_level, b0b);
|
||||
|
||||
r0b = unquant_color(quant_level, r0be);
|
||||
g0b = unquant_color(quant_level, g0be);
|
||||
b0b = unquant_color(quant_level, b0be);
|
||||
int r0bu = unquant_color(quant_level, r0be);
|
||||
int g0bu = unquant_color(quant_level, g0be);
|
||||
int b0bu = unquant_color(quant_level, b0be);
|
||||
|
||||
r0b |= r0a & 0x100;
|
||||
g0b |= g0a & 0x100;
|
||||
b0b |= b0a & 0x100;
|
||||
r0b = r0bu | (r0a & 0x100);
|
||||
g0b = g0bu | (g0a & 0x100);
|
||||
b0b = b0bu | (b0a & 0x100);
|
||||
|
||||
// Get hold of the second value
|
||||
int r1d = astc::flt2int_rtn(r1);
|
||||
@@ -386,36 +386,18 @@ static bool try_quantize_rgb_delta(
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check that the sum of the encoded offsets is nonnegative, else encoding fails
|
||||
r1du &= 0x7f;
|
||||
g1du &= 0x7f;
|
||||
b1du &= 0x7f;
|
||||
|
||||
if (r1du & 0x40)
|
||||
{
|
||||
r1du -= 0x80;
|
||||
}
|
||||
|
||||
if (g1du & 0x40)
|
||||
{
|
||||
g1du -= 0x80;
|
||||
}
|
||||
|
||||
if (b1du & 0x40)
|
||||
{
|
||||
b1du -= 0x80;
|
||||
}
|
||||
|
||||
if (r1du + g1du + b1du < 0)
|
||||
// If the sum of offsets triggers blue-contraction then encoding fails
|
||||
vint4 ep0(r0bu, g0bu, b0bu, 0);
|
||||
vint4 ep1(r1du, g1du, b1du, 0);
|
||||
bit_transfer_signed(ep1, ep0);
|
||||
if (hadd_rgb_s(ep1) < 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check that the offsets produce legitimate sums as well
|
||||
r1du += r0b;
|
||||
g1du += g0b;
|
||||
b1du += b0b;
|
||||
if (r1du < 0 || r1du > 0x1FF || g1du < 0 || g1du > 0x1FF || b1du < 0 || b1du > 0x1FF)
|
||||
ep0 = ep0 + ep1;
|
||||
if (any((ep0 < vint4(0)) | (ep0 > vint4(0xFF))))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
@@ -477,13 +459,13 @@ static bool try_quantize_rgb_delta_blue_contract(
|
||||
int g0be = quant_color(quant_level, g0b);
|
||||
int b0be = quant_color(quant_level, b0b);
|
||||
|
||||
r0b = unquant_color(quant_level, r0be);
|
||||
g0b = unquant_color(quant_level, g0be);
|
||||
b0b = unquant_color(quant_level, b0be);
|
||||
int r0bu = unquant_color(quant_level, r0be);
|
||||
int g0bu = unquant_color(quant_level, g0be);
|
||||
int b0bu = unquant_color(quant_level, b0be);
|
||||
|
||||
r0b |= r0a & 0x100;
|
||||
g0b |= g0a & 0x100;
|
||||
b0b |= b0a & 0x100;
|
||||
r0b = r0bu | (r0a & 0x100);
|
||||
g0b = g0bu | (g0a & 0x100);
|
||||
b0b = b0bu | (b0a & 0x100);
|
||||
|
||||
// Get hold of the second value
|
||||
int r1d = astc::flt2int_rtn(r1);
|
||||
@@ -530,38 +512,18 @@ static bool try_quantize_rgb_delta_blue_contract(
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check that the sum of the encoded offsets is negative, else encoding fails
|
||||
// Note that this is inverse of the test for non-blue-contracted RGB.
|
||||
r1du &= 0x7f;
|
||||
g1du &= 0x7f;
|
||||
b1du &= 0x7f;
|
||||
|
||||
if (r1du & 0x40)
|
||||
{
|
||||
r1du -= 0x80;
|
||||
}
|
||||
|
||||
if (g1du & 0x40)
|
||||
{
|
||||
g1du -= 0x80;
|
||||
}
|
||||
|
||||
if (b1du & 0x40)
|
||||
{
|
||||
b1du -= 0x80;
|
||||
}
|
||||
|
||||
if (r1du + g1du + b1du >= 0)
|
||||
// If the sum of offsets does not trigger blue-contraction then encoding fails
|
||||
vint4 ep0(r0bu, g0bu, b0bu, 0);
|
||||
vint4 ep1(r1du, g1du, b1du, 0);
|
||||
bit_transfer_signed(ep1, ep0);
|
||||
if (hadd_rgb_s(ep1) >= 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check that the offsets produce legitimate sums as well
|
||||
r1du += r0b;
|
||||
g1du += g0b;
|
||||
b1du += b0b;
|
||||
|
||||
if (r1du < 0 || r1du > 0x1FF || g1du < 0 || g1du > 0x1FF || b1du < 0 || b1du > 0x1FF)
|
||||
ep0 = ep0 + ep1;
|
||||
if (any((ep0 < vint4(0)) | (ep0 > vint4(0xFF))))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -97,15 +97,8 @@ static void rgba_delta_unpack(
|
||||
vint4 input0 = unquant_color(quant_level, input0q);
|
||||
vint4 input1 = unquant_color(quant_level, input1q);
|
||||
|
||||
// Perform bit-transfer
|
||||
input0 = input0 | lsl<1>(input1 & 0x80);
|
||||
input1 = input1 & 0x7F;
|
||||
vmask4 mask = (input1 & 0x40) != vint4::zero();
|
||||
input1 = select(input1, input1 - 0x80, mask);
|
||||
|
||||
// Scale
|
||||
input0 = asr<1>(input0);
|
||||
input1 = asr<1>(input1);
|
||||
// Apply bit transfer
|
||||
bit_transfer_signed(input1, input0);
|
||||
|
||||
// Apply blue-uncontraction if needed
|
||||
int rgb_sum = hadd_rgb_s(input1);
|
||||
|
||||
@@ -424,11 +424,7 @@ static float compress_symbolic_block_for_partition_1plane(
|
||||
|
||||
// For each mode, use the angular method to compute a shift
|
||||
compute_angular_endpoints_1plane(
|
||||
config.tune_low_weight_count_limit,
|
||||
only_always, bsd,
|
||||
dec_weights_ideal,
|
||||
max_weight_quant,
|
||||
tmpbuf);
|
||||
only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
|
||||
|
||||
float* weight_low_value = tmpbuf.weight_low_value1;
|
||||
float* weight_high_value = tmpbuf.weight_high_value1;
|
||||
@@ -795,9 +791,7 @@ static float compress_symbolic_block_for_partition_2planes(
|
||||
float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask));
|
||||
|
||||
compute_angular_endpoints_2planes(
|
||||
config.tune_low_weight_count_limit,
|
||||
bsd, dec_weights_ideal, max_weight_quant,
|
||||
tmpbuf);
|
||||
bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
|
||||
|
||||
// For each mode (which specifies a decimation and a quantization):
|
||||
// * Compute number of bits needed for the quantized weights
|
||||
@@ -1130,12 +1124,13 @@ static float prepare_block_statistics(
|
||||
|
||||
aa_var -= as * (as * rpt);
|
||||
|
||||
rg_cov *= astc::rsqrt(astc::max(rr_var * gg_var, 1e-30f));
|
||||
rb_cov *= astc::rsqrt(astc::max(rr_var * bb_var, 1e-30f));
|
||||
ra_cov *= astc::rsqrt(astc::max(rr_var * aa_var, 1e-30f));
|
||||
gb_cov *= astc::rsqrt(astc::max(gg_var * bb_var, 1e-30f));
|
||||
ga_cov *= astc::rsqrt(astc::max(gg_var * aa_var, 1e-30f));
|
||||
ba_cov *= astc::rsqrt(astc::max(bb_var * aa_var, 1e-30f));
|
||||
// These will give a NaN if a channel is constant - these are fixed up in the next step
|
||||
rg_cov *= astc::rsqrt(rr_var * gg_var);
|
||||
rb_cov *= astc::rsqrt(rr_var * bb_var);
|
||||
ra_cov *= astc::rsqrt(rr_var * aa_var);
|
||||
gb_cov *= astc::rsqrt(gg_var * bb_var);
|
||||
ga_cov *= astc::rsqrt(gg_var * aa_var);
|
||||
ba_cov *= astc::rsqrt(bb_var * aa_var);
|
||||
|
||||
if (astc::isnan(rg_cov)) rg_cov = 1.0f;
|
||||
if (astc::isnan(rb_cov)) rb_cov = 1.0f;
|
||||
@@ -1144,7 +1139,7 @@ static float prepare_block_statistics(
|
||||
if (astc::isnan(ga_cov)) ga_cov = 1.0f;
|
||||
if (astc::isnan(ba_cov)) ba_cov = 1.0f;
|
||||
|
||||
float lowest_correlation = astc::min(fabsf(rg_cov), fabsf(rb_cov));
|
||||
float lowest_correlation = astc::min(fabsf(rg_cov), fabsf(rb_cov));
|
||||
lowest_correlation = astc::min(lowest_correlation, fabsf(ra_cov));
|
||||
lowest_correlation = astc::min(lowest_correlation, fabsf(gb_cov));
|
||||
lowest_correlation = astc::min(lowest_correlation, fabsf(ga_cov));
|
||||
@@ -1197,6 +1192,18 @@ void compress_block(
|
||||
bool block_skip_two_plane = false;
|
||||
int max_partitions = ctx.config.tune_partition_count_limit;
|
||||
|
||||
unsigned int requested_partition_indices[3] {
|
||||
ctx.config.tune_2partition_index_limit,
|
||||
ctx.config.tune_3partition_index_limit,
|
||||
ctx.config.tune_4partition_index_limit
|
||||
};
|
||||
|
||||
unsigned int requested_partition_trials[3] {
|
||||
ctx.config.tune_2partitioning_candidate_limit,
|
||||
ctx.config.tune_3partitioning_candidate_limit,
|
||||
ctx.config.tune_4partitioning_candidate_limit
|
||||
};
|
||||
|
||||
#if defined(ASTCENC_DIAGNOSTICS)
|
||||
// Do this early in diagnostic builds so we can dump uniform metrics
|
||||
// for every block. Do it later in release builds to avoid redundant work!
|
||||
@@ -1366,13 +1373,19 @@ void compress_block(
|
||||
// Find best blocks for 2, 3 and 4 partitions
|
||||
for (int partition_count = 2; partition_count <= max_partitions; partition_count++)
|
||||
{
|
||||
unsigned int partition_indices[2] { 0 };
|
||||
unsigned int partition_indices[TUNE_MAX_PARTITIIONING_CANDIDATES];
|
||||
|
||||
find_best_partition_candidates(bsd, blk, partition_count,
|
||||
ctx.config.tune_partition_index_limit,
|
||||
partition_indices);
|
||||
unsigned int requested_indices = requested_partition_indices[partition_count - 2];
|
||||
|
||||
for (unsigned int i = 0; i < 2; i++)
|
||||
unsigned int requested_trials = requested_partition_trials[partition_count - 2];
|
||||
requested_trials = astc::min(requested_trials, requested_indices);
|
||||
|
||||
unsigned int actual_trials = find_best_partition_candidates(
|
||||
bsd, blk, partition_count, requested_indices, partition_indices, requested_trials);
|
||||
|
||||
float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];
|
||||
|
||||
for (unsigned int i = 0; i < actual_trials; i++)
|
||||
{
|
||||
TRACE_NODE(node1, "pass");
|
||||
trace_add_data("partition_count", partition_count);
|
||||
@@ -1387,6 +1400,20 @@ void compress_block(
|
||||
scb, tmpbuf, quant_limit);
|
||||
|
||||
best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval);
|
||||
|
||||
// If using N partitions doesn't improve much over using N-1 partitions then skip trying
|
||||
// N+1. Error can dramatically improve if the data is correlated or non-correlated and
|
||||
// aligns with a partitioning that suits that encoding, so for this inner loop check add
|
||||
// a large error scale because the "other" trial could be a lot better. In total the
|
||||
// error must be at least 2x worse than the best existing error to early-out.
|
||||
float best_error = best_errorvals_for_pcount[partition_count - 1];
|
||||
float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 2.0f;
|
||||
if (best_error > (best_error_in_prev * best_error_scale))
|
||||
{
|
||||
trace_add_data("skip", "tune_partition_early_out_limit_factor");
|
||||
goto END_OF_TESTS;
|
||||
}
|
||||
|
||||
if (errorval < error_threshold)
|
||||
{
|
||||
trace_add_data("exit", "quality hit");
|
||||
@@ -1396,7 +1423,6 @@ void compress_block(
|
||||
|
||||
// If using N partitions doesn't improve much over using N-1 partitions then skip trying N+1
|
||||
float best_error = best_errorvals_for_pcount[partition_count - 1];
|
||||
float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];
|
||||
float best_error_scale = exit_thresholds_for_pcount[partition_count - 1];
|
||||
if (best_error > (best_error_in_prev * best_error_scale))
|
||||
{
|
||||
|
||||
107
3rdparty/astc-encoder/source/astcenc_entry.cpp
vendored
107
3rdparty/astc-encoder/source/astcenc_entry.cpp
vendored
@@ -40,10 +40,15 @@ struct astcenc_preset_config
|
||||
{
|
||||
float quality;
|
||||
unsigned int tune_partition_count_limit;
|
||||
unsigned int tune_partition_index_limit;
|
||||
unsigned int tune_2partition_index_limit;
|
||||
unsigned int tune_3partition_index_limit;
|
||||
unsigned int tune_4partition_index_limit;
|
||||
unsigned int tune_block_mode_limit;
|
||||
unsigned int tune_refinement_limit;
|
||||
unsigned int tune_candidate_limit;
|
||||
unsigned int tune_2partitioning_candidate_limit;
|
||||
unsigned int tune_3partitioning_candidate_limit;
|
||||
unsigned int tune_4partitioning_candidate_limit;
|
||||
float tune_db_limit_a_base;
|
||||
float tune_db_limit_b_base;
|
||||
float tune_mode0_mse_overshoot;
|
||||
@@ -51,7 +56,6 @@ struct astcenc_preset_config
|
||||
float tune_2_partition_early_out_limit_factor;
|
||||
float tune_3_partition_early_out_limit_factor;
|
||||
float tune_2_plane_early_out_limit_correlation;
|
||||
unsigned int tune_low_weight_count_limit;
|
||||
};
|
||||
|
||||
|
||||
@@ -59,22 +63,25 @@ struct astcenc_preset_config
|
||||
* @brief The static quality presets that are built-in for high bandwidth
|
||||
* presets (x < 25 texels per block).
|
||||
*/
|
||||
static const std::array<astcenc_preset_config, 5> preset_configs_high {{
|
||||
static const std::array<astcenc_preset_config, 6> preset_configs_high {{
|
||||
{
|
||||
ASTCENC_PRE_FASTEST,
|
||||
2, 10, 43, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 25
|
||||
2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.85f
|
||||
}, {
|
||||
ASTCENC_PRE_FAST,
|
||||
3, 14, 55, 3, 3, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.1f, 0.65f, 20
|
||||
3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.90f
|
||||
}, {
|
||||
ASTCENC_PRE_MEDIUM,
|
||||
4, 28, 76, 3, 3, 95.0f, 70.0f, 2.5f, 2.5f, 1.2f, 1.25f, 0.85f, 16
|
||||
4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 2.5f, 1.1f, 1.05f, 0.95f
|
||||
}, {
|
||||
ASTCENC_PRE_THOROUGH,
|
||||
4, 76, 93, 4, 4, 105.0f, 77.0f, 10.0f, 10.0f, 2.5f, 1.25f, 0.95f, 12
|
||||
4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 10.0f, 1.35f, 1.15f, 0.97f
|
||||
}, {
|
||||
ASTCENC_PRE_VERYTHOROUGH,
|
||||
4, 256, 128, 64, 98, 4, 6, 20, 14, 8, 200.0f, 200.0f, 10.0f, 10.0f, 1.6f, 1.4f, 0.98f
|
||||
}, {
|
||||
ASTCENC_PRE_EXHAUSTIVE,
|
||||
4, 1024, 100, 4, 4, 200.0f, 200.0f, 10.0f, 10.0f, 10.0f, 10.0f, 0.99f, 0
|
||||
4, 512, 512, 512, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 10.0f, 2.0f, 2.0f, 0.99f
|
||||
}
|
||||
}};
|
||||
|
||||
@@ -82,46 +89,51 @@ static const std::array<astcenc_preset_config, 5> preset_configs_high {{
|
||||
* @brief The static quality presets that are built-in for medium bandwidth
|
||||
* presets (25 <= x < 64 texels per block).
|
||||
*/
|
||||
static const std::array<astcenc_preset_config, 5> preset_configs_mid {{
|
||||
static const std::array<astcenc_preset_config, 6> preset_configs_mid {{
|
||||
{
|
||||
ASTCENC_PRE_FASTEST,
|
||||
2, 10, 43, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 20
|
||||
2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.80f
|
||||
}, {
|
||||
ASTCENC_PRE_FAST,
|
||||
3, 15, 55, 3, 3, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.1f, 0.5f, 16
|
||||
3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.85f
|
||||
}, {
|
||||
ASTCENC_PRE_MEDIUM,
|
||||
4, 30, 76, 3, 3, 95.0f, 70.0f, 3.0f, 3.0f, 1.2f, 1.25f, 0.75f, 14
|
||||
4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 3.0f, 1.1f, 1.05f, 0.90f
|
||||
}, {
|
||||
ASTCENC_PRE_THOROUGH,
|
||||
4, 76, 93, 4, 4, 105.0f, 77.0f, 10.0f, 10.0f, 2.5f, 1.25f, 0.95f, 10
|
||||
4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 10.0f, 1.4f, 1.2f, 0.95f
|
||||
}, {
|
||||
ASTCENC_PRE_VERYTHOROUGH,
|
||||
4, 256, 128, 64, 98, 4, 6, 12, 8, 3, 200.0f, 200.0f, 10.0f, 10.0f, 1.6f, 1.4f, 0.98f
|
||||
}, {
|
||||
ASTCENC_PRE_EXHAUSTIVE,
|
||||
4, 1024, 100, 4, 4, 200.0f, 200.0f, 10.0f, 10.0f, 10.0f, 10.0f, 0.99f, 0
|
||||
4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 10.0f, 2.0f, 2.0f, 0.99f
|
||||
}
|
||||
}};
|
||||
|
||||
|
||||
/**
|
||||
* @brief The static quality presets that are built-in for low bandwidth
|
||||
* presets (64 <= x texels per block).
|
||||
*/
|
||||
static const std::array<astcenc_preset_config, 5> preset_configs_low {{
|
||||
static const std::array<astcenc_preset_config, 6> preset_configs_low {{
|
||||
{
|
||||
ASTCENC_PRE_FASTEST,
|
||||
2, 10, 40, 2, 2, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 20
|
||||
2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.0f, 0.80f
|
||||
}, {
|
||||
ASTCENC_PRE_FAST,
|
||||
2, 15, 55, 3, 3, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.1f, 0.5f, 16
|
||||
2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.0f, 0.85f
|
||||
}, {
|
||||
ASTCENC_PRE_MEDIUM,
|
||||
3, 30, 76, 3, 3, 95.0f, 70.0f, 3.5f, 3.5f, 1.2f, 1.25f, 0.65f, 12
|
||||
3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 3.5f, 1.1f, 1.05f, 0.90f
|
||||
}, {
|
||||
ASTCENC_PRE_THOROUGH,
|
||||
4, 75, 92, 4, 4, 105.0f, 77.0f, 10.0f, 10.0f, 2.5f, 1.25f, 0.85f, 10
|
||||
4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 10.0f, 1.3f, 1.2f, 0.97f
|
||||
}, {
|
||||
ASTCENC_PRE_VERYTHOROUGH,
|
||||
4, 256, 128, 64, 98, 4, 6, 9, 5, 2, 200.0f, 200.0f, 10.0f, 10.0f, 1.6f, 1.4f, 0.98f
|
||||
}, {
|
||||
ASTCENC_PRE_EXHAUSTIVE,
|
||||
4, 1024, 100, 4, 4, 200.0f, 200.0f, 10.0f, 10.0f, 10.0f, 10.0f, 0.99f, 0
|
||||
4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 10.0f, 2.0f, 2.0f, 0.99f
|
||||
}
|
||||
}};
|
||||
|
||||
@@ -422,10 +434,15 @@ static astcenc_error validate_config(
|
||||
config.rgbm_m_scale = astc::max(config.rgbm_m_scale, 1.0f);
|
||||
|
||||
config.tune_partition_count_limit = astc::clamp(config.tune_partition_count_limit, 1u, 4u);
|
||||
config.tune_partition_index_limit = astc::clamp(config.tune_partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
|
||||
config.tune_2partition_index_limit = astc::clamp(config.tune_2partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
|
||||
config.tune_3partition_index_limit = astc::clamp(config.tune_3partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
|
||||
config.tune_4partition_index_limit = astc::clamp(config.tune_4partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
|
||||
config.tune_block_mode_limit = astc::clamp(config.tune_block_mode_limit, 1u, 100u);
|
||||
config.tune_refinement_limit = astc::max(config.tune_refinement_limit, 1u);
|
||||
config.tune_candidate_limit = astc::clamp(config.tune_candidate_limit, 1u, TUNE_MAX_TRIAL_CANDIDATES);
|
||||
config.tune_2partitioning_candidate_limit = astc::clamp(config.tune_2partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIIONING_CANDIDATES);
|
||||
config.tune_3partitioning_candidate_limit = astc::clamp(config.tune_3partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIIONING_CANDIDATES);
|
||||
config.tune_4partitioning_candidate_limit = astc::clamp(config.tune_4partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIIONING_CANDIDATES);
|
||||
config.tune_db_limit = astc::max(config.tune_db_limit, 0.0f);
|
||||
config.tune_mode0_mse_overshoot = astc::max(config.tune_mode0_mse_overshoot, 1.0f);
|
||||
config.tune_refinement_mse_overshoot = astc::max(config.tune_refinement_mse_overshoot, 1.0f);
|
||||
@@ -464,9 +481,23 @@ astcenc_error astcenc_config_init(
|
||||
astcenc_config* configp
|
||||
) {
|
||||
astcenc_error status;
|
||||
astcenc_config& config = *configp;
|
||||
|
||||
// Check basic library compatibility options here so they are checked early. Note, these checks
|
||||
// are repeated in context_alloc for cases where callers use a manually defined config struct
|
||||
status = validate_cpu_isa();
|
||||
if (status != ASTCENC_SUCCESS)
|
||||
{
|
||||
return status;
|
||||
}
|
||||
|
||||
status = validate_cpu_float();
|
||||
if (status != ASTCENC_SUCCESS)
|
||||
{
|
||||
return status;
|
||||
}
|
||||
|
||||
// Zero init all config fields, although most of them will be overwritten
|
||||
astcenc_config& config = *configp;
|
||||
std::memset(&config, 0, sizeof(config));
|
||||
|
||||
// Process the block size
|
||||
@@ -493,7 +524,7 @@ astcenc_error astcenc_config_init(
|
||||
return ASTCENC_ERR_BAD_QUALITY;
|
||||
}
|
||||
|
||||
static const std::array<astcenc_preset_config, 5>* preset_configs;
|
||||
static const std::array<astcenc_preset_config, 6>* preset_configs;
|
||||
int texels_int = block_x * block_y * block_z;
|
||||
if (texels_int < 25)
|
||||
{
|
||||
@@ -525,11 +556,15 @@ astcenc_error astcenc_config_init(
|
||||
if (start == end)
|
||||
{
|
||||
config.tune_partition_count_limit = (*preset_configs)[start].tune_partition_count_limit;
|
||||
config.tune_partition_index_limit = (*preset_configs)[start].tune_partition_index_limit;
|
||||
config.tune_2partition_index_limit = (*preset_configs)[start].tune_2partition_index_limit;
|
||||
config.tune_3partition_index_limit = (*preset_configs)[start].tune_3partition_index_limit;
|
||||
config.tune_4partition_index_limit = (*preset_configs)[start].tune_4partition_index_limit;
|
||||
config.tune_block_mode_limit = (*preset_configs)[start].tune_block_mode_limit;
|
||||
config.tune_refinement_limit = (*preset_configs)[start].tune_refinement_limit;
|
||||
config.tune_candidate_limit = astc::min((*preset_configs)[start].tune_candidate_limit,
|
||||
TUNE_MAX_TRIAL_CANDIDATES);
|
||||
config.tune_candidate_limit = astc::min((*preset_configs)[start].tune_candidate_limit, TUNE_MAX_TRIAL_CANDIDATES);
|
||||
config.tune_2partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_2partitioning_candidate_limit, TUNE_MAX_PARTITIIONING_CANDIDATES);
|
||||
config.tune_3partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_3partitioning_candidate_limit, TUNE_MAX_PARTITIIONING_CANDIDATES);
|
||||
config.tune_4partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_4partitioning_candidate_limit, TUNE_MAX_PARTITIIONING_CANDIDATES);
|
||||
config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels,
|
||||
(*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels);
|
||||
|
||||
@@ -539,7 +574,6 @@ astcenc_error astcenc_config_init(
|
||||
config.tune_2_partition_early_out_limit_factor = (*preset_configs)[start].tune_2_partition_early_out_limit_factor;
|
||||
config.tune_3_partition_early_out_limit_factor =(*preset_configs)[start].tune_3_partition_early_out_limit_factor;
|
||||
config.tune_2_plane_early_out_limit_correlation = (*preset_configs)[start].tune_2_plane_early_out_limit_correlation;
|
||||
config.tune_low_weight_count_limit = (*preset_configs)[start].tune_low_weight_count_limit;
|
||||
}
|
||||
// Start and end node are not the same - so interpolate between them
|
||||
else
|
||||
@@ -561,11 +595,19 @@ astcenc_error astcenc_config_init(
|
||||
#define LERPUI(param) static_cast<unsigned int>(LERPI(param))
|
||||
|
||||
config.tune_partition_count_limit = LERPI(tune_partition_count_limit);
|
||||
config.tune_partition_index_limit = LERPI(tune_partition_index_limit);
|
||||
config.tune_2partition_index_limit = LERPI(tune_2partition_index_limit);
|
||||
config.tune_3partition_index_limit = LERPI(tune_3partition_index_limit);
|
||||
config.tune_4partition_index_limit = LERPI(tune_4partition_index_limit);
|
||||
config.tune_block_mode_limit = LERPI(tune_block_mode_limit);
|
||||
config.tune_refinement_limit = LERPI(tune_refinement_limit);
|
||||
config.tune_candidate_limit = astc::min(LERPUI(tune_candidate_limit),
|
||||
TUNE_MAX_TRIAL_CANDIDATES);
|
||||
config.tune_2partitioning_candidate_limit = astc::min(LERPUI(tune_2partitioning_candidate_limit),
|
||||
BLOCK_MAX_PARTITIONINGS);
|
||||
config.tune_3partitioning_candidate_limit = astc::min(LERPUI(tune_3partitioning_candidate_limit),
|
||||
BLOCK_MAX_PARTITIONINGS);
|
||||
config.tune_4partitioning_candidate_limit = astc::min(LERPUI(tune_4partitioning_candidate_limit),
|
||||
BLOCK_MAX_PARTITIONINGS);
|
||||
config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels,
|
||||
LERP(tune_db_limit_b_base) - 19 * ltexels);
|
||||
|
||||
@@ -575,7 +617,6 @@ astcenc_error astcenc_config_init(
|
||||
config.tune_2_partition_early_out_limit_factor = LERP(tune_2_partition_early_out_limit_factor);
|
||||
config.tune_3_partition_early_out_limit_factor = LERP(tune_3_partition_early_out_limit_factor);
|
||||
config.tune_2_plane_early_out_limit_correlation = LERP(tune_2_plane_early_out_limit_correlation);
|
||||
config.tune_low_weight_count_limit = LERPI(tune_low_weight_count_limit);
|
||||
#undef LERP
|
||||
#undef LERPI
|
||||
#undef LERPUI
|
||||
@@ -676,13 +717,13 @@ astcenc_error astcenc_context_alloc(
|
||||
astcenc_error status;
|
||||
const astcenc_config& config = *configp;
|
||||
|
||||
status = validate_cpu_float();
|
||||
status = validate_cpu_isa();
|
||||
if (status != ASTCENC_SUCCESS)
|
||||
{
|
||||
return status;
|
||||
}
|
||||
|
||||
status = validate_cpu_isa();
|
||||
status = validate_cpu_float();
|
||||
if (status != ASTCENC_SUCCESS)
|
||||
{
|
||||
return status;
|
||||
@@ -714,7 +755,7 @@ astcenc_error astcenc_context_alloc(
|
||||
status = validate_config(ctx->config);
|
||||
if (status != ASTCENC_SUCCESS)
|
||||
{
|
||||
delete ctx;
|
||||
delete ctxo;
|
||||
return status;
|
||||
}
|
||||
|
||||
|
||||
@@ -485,13 +485,59 @@ static unsigned int compute_kmeans_partition_ordering(
|
||||
mismatch_counts, partition_ordering);
|
||||
}
|
||||
|
||||
/**
 * @brief Insert a partitioning into an ordered list of results, sorted by error.
 *
 * The result arrays hold up to @c max_values entries in ascending error order. A new entry is
 * only inserted if its error is strictly lower than the current worst (last) entry; on insertion
 * the entries after the insertion point move down one slot and the old worst entry is dropped.
 * A new entry whose error equals an existing entry is placed before that entry.
 *
 * The caller must pre-fill @c best_errors (for example with a large sentinel error) before the
 * first call, as the existing contents are read for comparison.
 *
 * @param         max_values        The max number of entries in the best result arrays.
 * @param         this_error        The error of the new entry.
 * @param         this_partition    The partition ID of the new entry.
 * @param[in,out] best_errors       The array of best error values.
 * @param[in,out] best_partitions   The array of best partition values.
 */
static void insert_result(
	unsigned int max_values,
	float this_error,
	unsigned int this_partition,
	float* best_errors,
	unsigned int* best_partitions)
{
	// An empty list cannot store anything; returning here also stops the unsigned
	// "max_values - 1" below from wrapping around and indexing far out of bounds
	if (max_values == 0)
	{
		return;
	}

	// Don't bother searching if the new error does not beat the current worst error
	if (this_error >= best_errors[max_values - 1])
	{
		return;
	}

	// Else insert into the list in error-order
	for (unsigned int i = 0; i < max_values; i++)
	{
		// Existing result is better - move on ...
		if (this_error > best_errors[i])
		{
			continue;
		}

		// Move existing results down one; the previous worst entry falls off the end
		for (unsigned int j = max_values - 1; j > i; j--)
		{
			best_errors[j] = best_errors[j - 1];
			best_partitions[j] = best_partitions[j - 1];
		}

		// Insert new result
		best_errors[i] = this_error;
		best_partitions[i] = this_partition;
		break;
	}
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void find_best_partition_candidates(
|
||||
unsigned int find_best_partition_candidates(
|
||||
const block_size_descriptor& bsd,
|
||||
const image_block& blk,
|
||||
unsigned int partition_count,
|
||||
unsigned int partition_search_limit,
|
||||
unsigned int best_partitions[2]
|
||||
unsigned int best_partitions[BLOCK_MAX_PARTITIONINGS],
|
||||
unsigned int requested_candidates
|
||||
) {
|
||||
// Constant used to estimate quantization error for a given partitioning; the optimal value for
|
||||
// this depends on bitrate. These values have been determined empirically.
|
||||
@@ -518,17 +564,23 @@ void find_best_partition_candidates(
|
||||
unsigned int partition_sequence[BLOCK_MAX_PARTITIONINGS];
|
||||
unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence);
|
||||
partition_search_limit = astc::min(partition_search_limit, sequence_len);
|
||||
requested_candidates = astc::min(partition_search_limit, requested_candidates);
|
||||
|
||||
bool uses_alpha = !blk.is_constant_channel(3);
|
||||
|
||||
// Partitioning errors assuming uncorrelated-chrominance endpoints
|
||||
float uncor_best_error { ERROR_CALC_DEFAULT };
|
||||
unsigned int uncor_best_partition { 0 };
|
||||
float uncor_best_errors[TUNE_MAX_PARTITIIONING_CANDIDATES];
|
||||
unsigned int uncor_best_partitions[TUNE_MAX_PARTITIIONING_CANDIDATES];
|
||||
|
||||
// Partitioning errors assuming same-chrominance endpoints
|
||||
// Store two so we can always return one different to uncorr
|
||||
float samec_best_errors[2] { ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT };
|
||||
unsigned int samec_best_partitions[2] { 0, 0 };
|
||||
float samec_best_errors[TUNE_MAX_PARTITIIONING_CANDIDATES];
|
||||
unsigned int samec_best_partitions[TUNE_MAX_PARTITIIONING_CANDIDATES];
|
||||
|
||||
for (unsigned int i = 0; i < requested_candidates; i++)
|
||||
{
|
||||
uncor_best_errors[i] = ERROR_CALC_DEFAULT;
|
||||
samec_best_errors[i] = ERROR_CALC_DEFAULT;
|
||||
}
|
||||
|
||||
if (uses_alpha)
|
||||
{
|
||||
@@ -602,25 +654,8 @@ void find_best_partition_candidates(
|
||||
samec_error += dot_s(samec_vector * samec_vector, error_weights);
|
||||
}
|
||||
|
||||
if (uncor_error < uncor_best_error)
|
||||
{
|
||||
uncor_best_error = uncor_error;
|
||||
uncor_best_partition = partition;
|
||||
}
|
||||
|
||||
if (samec_error < samec_best_errors[0])
|
||||
{
|
||||
samec_best_errors[1] = samec_best_errors[0];
|
||||
samec_best_partitions[1] = samec_best_partitions[0];
|
||||
|
||||
samec_best_errors[0] = samec_error;
|
||||
samec_best_partitions[0] = partition;
|
||||
}
|
||||
else if (samec_error < samec_best_errors[1])
|
||||
{
|
||||
samec_best_errors[1] = samec_error;
|
||||
samec_best_partitions[1] = partition;
|
||||
}
|
||||
insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
|
||||
insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
|
||||
}
|
||||
}
|
||||
else
|
||||
@@ -687,50 +722,55 @@ void find_best_partition_candidates(
|
||||
samec_error += dot3_s(samec_vector * samec_vector, error_weights);
|
||||
}
|
||||
|
||||
if (uncor_error < uncor_best_error)
|
||||
{
|
||||
uncor_best_error = uncor_error;
|
||||
uncor_best_partition = partition;
|
||||
}
|
||||
insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
|
||||
insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
|
||||
}
|
||||
}
|
||||
|
||||
if (samec_error < samec_best_errors[0])
|
||||
{
|
||||
samec_best_errors[1] = samec_best_errors[0];
|
||||
samec_best_partitions[1] = samec_best_partitions[0];
|
||||
bool best_is_uncor = uncor_best_partitions[0] > samec_best_partitions[0];
|
||||
|
||||
samec_best_errors[0] = samec_error;
|
||||
samec_best_partitions[0] = partition;
|
||||
}
|
||||
else if (samec_error < samec_best_errors[1])
|
||||
unsigned int interleave[2 * TUNE_MAX_PARTITIIONING_CANDIDATES];
|
||||
for (unsigned int i = 0; i < requested_candidates; i++)
|
||||
{
|
||||
if (best_is_uncor)
|
||||
{
|
||||
interleave[2 * i] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;
|
||||
interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;
|
||||
}
|
||||
else
|
||||
{
|
||||
interleave[2 * i] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;
|
||||
interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t bitmasks[1024/64] { 0 };
|
||||
unsigned int emitted = 0;
|
||||
|
||||
// Deduplicate the first "requested" entries
|
||||
for (unsigned int i = 0; i < requested_candidates * 2; i++)
|
||||
{
|
||||
unsigned int partition = interleave[i];
|
||||
|
||||
unsigned int word = partition / 64;
|
||||
unsigned int bit = partition % 64;
|
||||
|
||||
bool written = bitmasks[word] & (1ull << bit);
|
||||
|
||||
if (!written)
|
||||
{
|
||||
best_partitions[emitted] = partition;
|
||||
bitmasks[word] |= 1ull << bit;
|
||||
emitted++;
|
||||
|
||||
if (emitted == requested_candidates)
|
||||
{
|
||||
samec_best_errors[1] = samec_error;
|
||||
samec_best_partitions[1] = partition;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Same partition is best for both, so use this first unconditionally
|
||||
if (uncor_best_partition == samec_best_partitions[0])
|
||||
{
|
||||
best_partitions[0] = samec_best_partitions[0];
|
||||
best_partitions[1] = samec_best_partitions[1];
|
||||
}
|
||||
// Uncor is best
|
||||
else if (uncor_best_error <= samec_best_errors[0])
|
||||
{
|
||||
best_partitions[0] = uncor_best_partition;
|
||||
best_partitions[1] = samec_best_partitions[0];
|
||||
}
|
||||
// Samec is best
|
||||
else
|
||||
{
|
||||
best_partitions[0] = samec_best_partitions[0];
|
||||
best_partitions[1] = uncor_best_partition;
|
||||
}
|
||||
|
||||
// Convert these back into canonical partition IDs for the rest of the codec
|
||||
best_partitions[0] = bsd.get_raw_partition_info(partition_count, best_partitions[0]).partition_index;
|
||||
best_partitions[1] = bsd.get_raw_partition_info(partition_count, best_partitions[1]).partition_index;
|
||||
return emitted;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
142
3rdparty/astc-encoder/source/astcenc_internal.h
vendored
142
3rdparty/astc-encoder/source/astcenc_internal.h
vendored
@@ -130,7 +130,14 @@ static constexpr unsigned int TUNE_MIN_TEXELS_MODE0_FASTPATH { 24 };
|
||||
*
|
||||
* This can be dynamically reduced by the compression quality preset.
|
||||
*/
|
||||
static constexpr unsigned int TUNE_MAX_TRIAL_CANDIDATES { 4 };
|
||||
static constexpr unsigned int TUNE_MAX_TRIAL_CANDIDATES { 8 };
|
||||
|
||||
/**
|
||||
* @brief The maximum number of candidate partitionings tested for each encoding mode.
|
||||
*
|
||||
* This can be dynamically reduced by the compression quality preset.
|
||||
*/
|
||||
static constexpr unsigned int TUNE_MAX_PARTITIIONING_CANDIDATES { 32 };
|
||||
|
||||
/**
|
||||
* @brief The maximum quant level using full angular endpoint search method.
|
||||
@@ -1345,11 +1352,11 @@ extern const int8_t quant_mode_table[10][128];
|
||||
* Note that BISE can return strings that are not a whole number of bytes in length, and ASTC can
|
||||
* start storing strings in a block at arbitrary bit offsets in the encoded data.
|
||||
*
|
||||
* @param quant_level The BISE alphabet size.
|
||||
* @param character_count The number of characters in the string.
|
||||
* @param input_data The unpacked string, one byte per character.
|
||||
* @param[in,out] output_data The output packed string.
|
||||
* @param bit_offset The starting offset in the output storage.
|
||||
* @param quant_level The BISE alphabet size.
|
||||
* @param character_count The number of characters in the string.
|
||||
* @param input_data The unpacked string, one byte per character.
|
||||
* @param[in,out] output_data The output packed string.
|
||||
* @param bit_offset The starting offset in the output storage.
|
||||
*/
|
||||
void encode_ise(
|
||||
quant_method quant_level,
|
||||
@@ -1436,11 +1443,11 @@ void compute_avgs_and_dirs_3_comp(
|
||||
* This is a specialization of @c compute_avgs_and_dirs_3_comp where the omitted component is
|
||||
* always alpha, a common case during partition search.
|
||||
*
|
||||
* @param pi The partition info for the current trial.
|
||||
* @param blk The image block color data to be compressed.
|
||||
* @param[out] pm The output partition metrics.
|
||||
* - Only pi.partition_count array entries actually get initialized.
|
||||
* - Direction vectors @c pm.dir are not normalized.
|
||||
* @param pi The partition info for the current trial.
|
||||
* @param blk The image block color data to be compressed.
|
||||
* @param[out] pm The output partition metrics.
|
||||
* - Only pi.partition_count array entries actually get initialized.
|
||||
* - Direction vectors @c pm.dir are not normalized.
|
||||
*/
|
||||
void compute_avgs_and_dirs_3_comp_rgb(
|
||||
const partition_info& pi,
|
||||
@@ -1471,11 +1478,11 @@ void compute_avgs_and_dirs_4_comp(
|
||||
*
|
||||
* This function computes the squared error when using these two representations.
|
||||
*
|
||||
* @param pi The partition info for the current trial.
|
||||
* @param blk The image block color data to be compressed.
|
||||
* @param[in,out] plines Processed line inputs, and line length outputs.
|
||||
* @param[out] uncor_error The cumulative error for using the uncorrelated line.
|
||||
* @param[out] samec_error The cumulative error for using the same chroma line.
|
||||
* @param pi The partition info for the current trial.
|
||||
* @param blk The image block color data to be compressed.
|
||||
* @param[in,out] plines Processed line inputs, and line length outputs.
|
||||
* @param[out] uncor_error The cumulative error for using the uncorrelated line.
|
||||
* @param[out] samec_error The cumulative error for using the same chroma line.
|
||||
*/
|
||||
void compute_error_squared_rgb(
|
||||
const partition_info& pi,
|
||||
@@ -1520,18 +1527,23 @@ void compute_error_squared_rgba(
|
||||
* candidates; one assuming data has uncorrelated chroma and one assuming the
|
||||
* data has correlated chroma. The best candidate is returned first in the list.
|
||||
*
|
||||
* @param bsd The block size information.
|
||||
* @param blk The image block color data to compress.
|
||||
* @param partition_count The number of partitions in the block.
|
||||
* @param partition_search_limit The number of candidate partition encodings to trial.
|
||||
* @param[out] best_partitions The best partition candidates.
|
||||
* @param bsd The block size information.
|
||||
* @param blk The image block color data to compress.
|
||||
* @param partition_count The number of partitions in the block.
|
||||
* @param partition_search_limit The number of candidate partition encodings to trial.
|
||||
* @param[out] best_partitions The best partition candidates.
|
||||
* @param requested_candidates The number of requested partitionings. May return fewer if
|
||||
* candidates are not available.
|
||||
*
|
||||
* @return The actual number of candidates returned.
|
||||
*/
|
||||
void find_best_partition_candidates(
|
||||
unsigned int find_best_partition_candidates(
|
||||
const block_size_descriptor& bsd,
|
||||
const image_block& blk,
|
||||
unsigned int partition_count,
|
||||
unsigned int partition_search_limit,
|
||||
unsigned int best_partitions[2]);
|
||||
unsigned int best_partitions[TUNE_MAX_PARTITIIONING_CANDIDATES],
|
||||
unsigned int requested_candidates);
|
||||
|
||||
/* ============================================================================
|
||||
Functionality for managing images and image related data.
|
||||
@@ -1545,10 +1557,10 @@ void find_best_partition_candidates(
|
||||
*
|
||||
* Results are written back into @c img->input_alpha_averages.
|
||||
*
|
||||
* @param img The input image data, also holds output data.
|
||||
* @param alpha_kernel_radius The kernel radius (in pixels) for alpha mods.
|
||||
* @param swz Input data component swizzle.
|
||||
* @param[out] ag The average variance arguments to init.
|
||||
* @param img The input image data, also holds output data.
|
||||
* @param alpha_kernel_radius The kernel radius (in pixels) for alpha mods.
|
||||
* @param swz Input data component swizzle.
|
||||
* @param[out] ag The average variance arguments to init.
|
||||
*
|
||||
* @return The number of tasks in the processing stage.
|
||||
*/
|
||||
@@ -1766,13 +1778,13 @@ float compute_error_of_weight_set_2planes(
|
||||
* The user requests a base color endpoint mode in @c format, but the quantizer may choose a
|
||||
* delta-based representation. It will report back the format variant it actually used.
|
||||
*
|
||||
* @param color0 The input unquantized color0 endpoint for absolute endpoint pairs.
|
||||
* @param color1 The input unquantized color1 endpoint for absolute endpoint pairs.
|
||||
* @param rgbs_color The input unquantized RGBS variant endpoint for same chroma endpoints.
|
||||
* @param rgbo_color The input unquantized RGBO variant endpoint for HDR endpoints.
|
||||
* @param format The desired base format.
|
||||
* @param[out] output The output storage for the quantized colors.
|
||||
* @param quant_level The quantization level requested.
|
||||
* @param color0 The input unquantized color0 endpoint for absolute endpoint pairs.
|
||||
* @param color1 The input unquantized color1 endpoint for absolute endpoint pairs.
|
||||
* @param rgbs_color The input unquantized RGBS variant endpoint for same chroma endpoints.
|
||||
* @param rgbo_color The input unquantized RGBO variant endpoint for HDR endpoints.
|
||||
* @param format The desired base format.
|
||||
* @param[out] output The output storage for the quantized colors.
|
||||
* @param quant_level The quantization level requested.
|
||||
*
|
||||
* @return The actual endpoint mode used.
|
||||
*/
|
||||
@@ -1873,13 +1885,13 @@ unsigned int compute_ideal_endpoint_formats(
|
||||
* As we quantize and decimate weights the optimal endpoint colors may change slightly, so we must
|
||||
* recompute the ideal colors for a specific weight set.
|
||||
*
|
||||
* @param blk The image block color data to compress.
|
||||
* @param pi The partition info for the current trial.
|
||||
* @param di The weight grid decimation table.
|
||||
* @param blk The image block color data to compress.
|
||||
* @param pi The partition info for the current trial.
|
||||
* @param di The weight grid decimation table.
|
||||
* @param dec_weights_uquant The quantized weight set.
|
||||
* @param[in,out] ep The color endpoints (modified in place).
|
||||
* @param[out] rgbs_vectors The RGB+scale vectors for LDR blocks.
|
||||
* @param[out] rgbo_vectors The RGB+offset vectors for HDR blocks.
|
||||
* @param[in,out] ep The color endpoints (modified in place).
|
||||
* @param[out] rgbs_vectors The RGB+scale vectors for LDR blocks.
|
||||
* @param[out] rgbo_vectors The RGB+offset vectors for HDR blocks.
|
||||
*/
|
||||
void recompute_ideal_colors_1plane(
|
||||
const image_block& blk,
|
||||
@@ -1896,15 +1908,15 @@ void recompute_ideal_colors_1plane(
|
||||
* As we quantize and decimate weights the optimal endpoint colors may change slightly, so we must
|
||||
* recompute the ideal colors for a specific weight set.
|
||||
*
|
||||
* @param blk The image block color data to compress.
|
||||
* @param bsd The block_size descriptor.
|
||||
* @param di The weight grid decimation table.
|
||||
* @param blk The image block color data to compress.
|
||||
* @param bsd The block_size descriptor.
|
||||
* @param di The weight grid decimation table.
|
||||
* @param dec_weights_uquant_plane1 The quantized weight set for plane 1.
|
||||
* @param dec_weights_uquant_plane2 The quantized weight set for plane 2.
|
||||
* @param[in,out] ep The color endpoints (modified in place).
|
||||
* @param[out] rgbs_vector The RGB+scale color for LDR blocks.
|
||||
* @param[out] rgbo_vector The RGB+offset color for HDR blocks.
|
||||
* @param plane2_component The component assigned to plane 2.
|
||||
* @param[in,out] ep The color endpoints (modified in place).
|
||||
* @param[out] rgbs_vector The RGB+scale color for LDR blocks.
|
||||
* @param[out] rgbo_vector The RGB+offset color for HDR blocks.
|
||||
* @param plane2_component The component assigned to plane 2.
|
||||
*/
|
||||
void recompute_ideal_colors_2planes(
|
||||
const image_block& blk,
|
||||
@@ -1925,15 +1937,13 @@ void prepare_angular_tables();
|
||||
/**
|
||||
* @brief Compute the angular endpoints for one plane for each block mode.
|
||||
*
|
||||
* @param tune_low_weight_limit Weight count cutoff below which we use simpler searches.
|
||||
* @param only_always Only consider block modes that are always enabled.
|
||||
* @param bsd The block size descriptor for the current trial.
|
||||
* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
|
||||
* @param max_weight_quant The maximum block mode weight quantization allowed.
|
||||
* @param[out] tmpbuf Preallocated scratch buffers for the compressor.
|
||||
* @param only_always Only consider block modes that are always enabled.
|
||||
* @param bsd The block size descriptor for the current trial.
|
||||
* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
|
||||
* @param max_weight_quant The maximum block mode weight quantization allowed.
|
||||
* @param[out] tmpbuf Preallocated scratch buffers for the compressor.
|
||||
*/
|
||||
void compute_angular_endpoints_1plane(
|
||||
unsigned int tune_low_weight_limit,
|
||||
bool only_always,
|
||||
const block_size_descriptor& bsd,
|
||||
const float* dec_weight_ideal_value,
|
||||
@@ -1943,14 +1953,12 @@ void compute_angular_endpoints_1plane(
|
||||
/**
|
||||
* @brief Compute the angular endpoints for two planes for each block mode.
|
||||
*
|
||||
* @param tune_low_weight_limit Weight count cutoff below which we use simpler searches.
|
||||
* @param bsd The block size descriptor for the current trial.
|
||||
* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
|
||||
* @param max_weight_quant The maximum block mode weight quantization allowed.
|
||||
* @param[out] tmpbuf Preallocated scratch buffers for the compressor.
|
||||
* @param bsd The block size descriptor for the current trial.
|
||||
* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
|
||||
* @param max_weight_quant The maximum block mode weight quantization allowed.
|
||||
* @param[out] tmpbuf Preallocated scratch buffers for the compressor.
|
||||
*/
|
||||
void compute_angular_endpoints_2planes(
|
||||
unsigned int tune_low_weight_limit,
|
||||
const block_size_descriptor& bsd,
|
||||
const float* dec_weight_ideal_value,
|
||||
unsigned int max_weight_quant,
|
||||
@@ -2162,18 +2170,4 @@ void aligned_free(T* ptr)
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void dump_weights(const char* label, uint8_t* weights, int weight_count)
|
||||
{
|
||||
printf("%s\n", label);
|
||||
vint lane = vint::lane_id();
|
||||
for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
vmask mask = lane < vint(weight_count);
|
||||
vint val(weights + i);
|
||||
val = select(vint::zero(), val, mask);
|
||||
print(val);
|
||||
lane += vint(ASTCENC_SIMD_WIDTH);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -361,6 +361,23 @@ static inline int popcount(uint64_t v)
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief Apply signed bit transfer.
|
||||
*
|
||||
* @param input0 The first encoded endpoint.
|
||||
* @param input1 The second encoded endpoint.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE void bit_transfer_signed(
|
||||
vint4& input0,
|
||||
vint4& input1
|
||||
) {
|
||||
input1 = lsr<1>(input1) | (input0 & 0x80);
|
||||
input0 = lsr<1>(input0) & 0x3F;
|
||||
|
||||
vmask4 mask = (input0 & 0x20) != vint4::zero();
|
||||
input0 = select(input0, input0 - 0x40, mask);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Debug function to print a vector of ints.
|
||||
*/
|
||||
|
||||
@@ -333,156 +333,8 @@ static void compute_angular_endpoints_for_quant_levels(
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief For a given step size compute the lowest and highest weight, variant for low weight count.
|
||||
*
|
||||
* Compute the lowest and highest weight that results from quantizing using the given stepsize and
|
||||
* offset, and then compute the resulting error. The cut errors indicate the error that results from
|
||||
* forcing samples that should have had one weight value one step up or down.
|
||||
*
|
||||
* @param weight_count The number of (decimated) weights.
|
||||
* @param dec_weight_quant_uvalue The decimated and quantized weight values.
|
||||
* @param max_angular_steps The maximum number of steps to be tested.
|
||||
* @param max_quant_steps The maximum quantization level to be tested.
|
||||
* @param offsets The angular offsets array.
|
||||
* @param[out] lowest_weight Per angular step, the lowest weight.
|
||||
* @param[out] weight_span Per angular step, the span between lowest and highest weight.
|
||||
* @param[out] error Per angular step, the error.
|
||||
*/
|
||||
static void compute_lowest_and_highest_weight_lwc(
|
||||
unsigned int weight_count,
|
||||
const float* dec_weight_quant_uvalue,
|
||||
unsigned int max_angular_steps,
|
||||
unsigned int max_quant_steps,
|
||||
const float* offsets,
|
||||
float* lowest_weight,
|
||||
int* weight_span,
|
||||
float* error
|
||||
) {
|
||||
promise(weight_count > 0);
|
||||
promise(max_angular_steps > 0);
|
||||
|
||||
vfloat rcp_stepsize = vfloat::lane_id() + vfloat(1.0f);
|
||||
|
||||
// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
|
||||
for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
vfloat minidx(128.0f);
|
||||
vfloat maxidx(-128.0f);
|
||||
vfloat errval = vfloat::zero();
|
||||
vfloat offset = loada(offsets + sp);
|
||||
|
||||
for (unsigned int j = 0; j < weight_count; j++)
|
||||
{
|
||||
vfloat sval = load1(dec_weight_quant_uvalue + j) * rcp_stepsize - offset;
|
||||
vfloat svalrte = round(sval);
|
||||
vfloat diff = sval - svalrte;
|
||||
errval += diff * diff;
|
||||
|
||||
// Compute min and max quantized weight spans for each step
|
||||
minidx = min(minidx, svalrte);
|
||||
maxidx = max(maxidx, svalrte);
|
||||
}
|
||||
|
||||
// Write out min weight and weight span; clamp span to a usable range
|
||||
vint span = float_to_int(maxidx - minidx + vfloat(1.0f));
|
||||
span = min(span, vint(max_quant_steps + 3));
|
||||
span = max(span, vint(2));
|
||||
storea(minidx, lowest_weight + sp);
|
||||
storea(span, weight_span + sp);
|
||||
|
||||
vfloat ssize = 1.0f / rcp_stepsize;
|
||||
vfloat errscale = ssize * ssize;
|
||||
storea(errval * errscale, error + sp);
|
||||
|
||||
rcp_stepsize = rcp_stepsize + vfloat(ASTCENC_SIMD_WIDTH);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief The main function for the angular algorithm, variant for low weight count.
|
||||
*
|
||||
* @param weight_count The number of (decimated) weights.
|
||||
* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
|
||||
* @param max_quant_level The maximum quantization level to be tested.
|
||||
* @param[out] low_value Per angular step, the lowest weight value.
|
||||
* @param[out] high_value Per angular step, the highest weight value.
|
||||
*/
|
||||
static void compute_angular_endpoints_for_quant_levels_lwc(
|
||||
unsigned int weight_count,
|
||||
const float* dec_weight_ideal_value,
|
||||
unsigned int max_quant_level,
|
||||
float low_value[TUNE_MAX_ANGULAR_QUANT + 1],
|
||||
float high_value[TUNE_MAX_ANGULAR_QUANT + 1]
|
||||
) {
|
||||
unsigned int max_quant_steps = steps_for_quant_level[max_quant_level];
|
||||
unsigned int max_angular_steps = steps_for_quant_level[max_quant_level];
|
||||
|
||||
alignas(ASTCENC_VECALIGN) float angular_offsets[ANGULAR_STEPS];
|
||||
alignas(ASTCENC_VECALIGN) float lowest_weight[ANGULAR_STEPS];
|
||||
alignas(ASTCENC_VECALIGN) int32_t weight_span[ANGULAR_STEPS];
|
||||
alignas(ASTCENC_VECALIGN) float error[ANGULAR_STEPS];
|
||||
|
||||
compute_angular_offsets(weight_count, dec_weight_ideal_value,
|
||||
max_angular_steps, angular_offsets);
|
||||
|
||||
|
||||
compute_lowest_and_highest_weight_lwc(weight_count, dec_weight_ideal_value,
|
||||
max_angular_steps, max_quant_steps,
|
||||
angular_offsets, lowest_weight, weight_span, error);
|
||||
|
||||
// For each quantization level, find the best error terms. Use packed vectors so data-dependent
|
||||
// branches can become selects. This involves some integer to float casts, but the values are
|
||||
// small enough so they never round the wrong way.
|
||||
vfloat4 best_results[36];
|
||||
|
||||
// Initialize the array to some safe defaults
|
||||
promise(max_quant_steps > 0);
|
||||
for (unsigned int i = 0; i < (max_quant_steps + 4); i++)
|
||||
{
|
||||
best_results[i] = vfloat4(ERROR_CALC_DEFAULT, -1.0f, 0.0f, 0.0f);
|
||||
}
|
||||
|
||||
promise(max_angular_steps > 0);
|
||||
for (unsigned int i = 0; i < max_angular_steps; i++)
|
||||
{
|
||||
int idx_span = weight_span[i];
|
||||
|
||||
// Check best error against record N
|
||||
vfloat4 current_best = best_results[idx_span];
|
||||
vfloat4 candidate = vfloat4(error[i], static_cast<float>(i), 0.0f, 0.0f);
|
||||
vmask4 mask = vfloat4(current_best.lane<0>()) > vfloat4(error[i]);
|
||||
best_results[idx_span] = select(current_best, candidate, mask);
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i <= max_quant_level; i++)
|
||||
{
|
||||
unsigned int q = steps_for_quant_level[i];
|
||||
int bsi = static_cast<int>(best_results[q].lane<1>());
|
||||
|
||||
// Did we find anything?
|
||||
#if defined(ASTCENC_DIAGNOSTICS)
|
||||
if ((bsi < 0) && print_once)
|
||||
{
|
||||
print_once = false;
|
||||
printf("INFO: Unable to find low weight encoding within search error limit.\n\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
bsi = astc::max(0, bsi);
|
||||
|
||||
float lwi = lowest_weight[bsi];
|
||||
float hwi = lwi + static_cast<float>(q) - 1.0f;
|
||||
|
||||
float stepsize = 1.0f / (1.0f + static_cast<float>(bsi));
|
||||
low_value[i] = (angular_offsets[bsi] + lwi) * stepsize;
|
||||
high_value[i] = (angular_offsets[bsi] + hwi) * stepsize;
|
||||
}
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void compute_angular_endpoints_1plane(
|
||||
unsigned int tune_low_weight_limit,
|
||||
bool only_always,
|
||||
const block_size_descriptor& bsd,
|
||||
const float* dec_weight_ideal_value,
|
||||
@@ -519,20 +371,10 @@ void compute_angular_endpoints_1plane(
|
||||
max_precision = max_weight_quant;
|
||||
}
|
||||
|
||||
if (weight_count < tune_low_weight_limit)
|
||||
{
|
||||
compute_angular_endpoints_for_quant_levels_lwc(
|
||||
weight_count,
|
||||
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
|
||||
max_precision, low_values[i], high_values[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
compute_angular_endpoints_for_quant_levels(
|
||||
weight_count,
|
||||
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
|
||||
max_precision, low_values[i], high_values[i]);
|
||||
}
|
||||
compute_angular_endpoints_for_quant_levels(
|
||||
weight_count,
|
||||
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
|
||||
max_precision, low_values[i], high_values[i]);
|
||||
}
|
||||
|
||||
unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
|
||||
@@ -561,7 +403,6 @@ void compute_angular_endpoints_1plane(
|
||||
|
||||
/* See header for documentation. */
|
||||
void compute_angular_endpoints_2planes(
|
||||
unsigned int tune_low_weight_limit,
|
||||
const block_size_descriptor& bsd,
|
||||
const float* dec_weight_ideal_value,
|
||||
unsigned int max_weight_quant,
|
||||
@@ -599,30 +440,15 @@ void compute_angular_endpoints_2planes(
|
||||
max_precision = max_weight_quant;
|
||||
}
|
||||
|
||||
if (weight_count < tune_low_weight_limit)
|
||||
{
|
||||
compute_angular_endpoints_for_quant_levels_lwc(
|
||||
weight_count,
|
||||
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
|
||||
max_precision, low_values1[i], high_values1[i]);
|
||||
compute_angular_endpoints_for_quant_levels(
|
||||
weight_count,
|
||||
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
|
||||
max_precision, low_values1[i], high_values1[i]);
|
||||
|
||||
compute_angular_endpoints_for_quant_levels_lwc(
|
||||
weight_count,
|
||||
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET,
|
||||
max_precision, low_values2[i], high_values2[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
compute_angular_endpoints_for_quant_levels(
|
||||
weight_count,
|
||||
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
|
||||
max_precision, low_values1[i], high_values1[i]);
|
||||
|
||||
compute_angular_endpoints_for_quant_levels(
|
||||
weight_count,
|
||||
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET,
|
||||
max_precision, low_values2[i], high_values2[i]);
|
||||
}
|
||||
compute_angular_endpoints_for_quant_levels(
|
||||
weight_count,
|
||||
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET,
|
||||
max_precision, low_values2[i], high_values2[i]);
|
||||
}
|
||||
|
||||
unsigned int start = bsd.block_mode_count_1plane_selected;
|
||||
|
||||
Reference in New Issue
Block a user