Updated astc-encoder.

This commit is contained in:
Бранимир Караџић
2022-11-10 17:27:01 -08:00
parent 225dad7441
commit 8c3aabc3c1
13 changed files with 496 additions and 512 deletions

View File

@@ -241,6 +241,9 @@ static const float ASTCENC_PRE_MEDIUM = 60.0f;
/** @brief The thorough quality search preset. */
static const float ASTCENC_PRE_THOROUGH = 98.0f;
/** @brief The very thorough quality search preset. */
static const float ASTCENC_PRE_VERYTHOROUGH = 99.0f;
/** @brief The exhaustive, highest quality, search preset. */
static const float ASTCENC_PRE_EXHAUSTIVE = 100.0f;
@@ -440,11 +443,25 @@ struct astcenc_config
unsigned int tune_partition_count_limit;
/**
* @brief The maximum number of partitions searched (-partitionindexlimit).
* @brief The maximum number of partitions searched (-2partitionindexlimit).
*
* Valid values are between 1 and 1024.
*/
unsigned int tune_partition_index_limit;
unsigned int tune_2partition_index_limit;
/**
* @brief The maximum number of partitions searched (-3partitionindexlimit).
*
* Valid values are between 1 and 1024.
*/
unsigned int tune_3partition_index_limit;
/**
* @brief The maximum number of partitions searched (-4partitionindexlimit).
*
* Valid values are between 1 and 1024.
*/
unsigned int tune_4partition_index_limit;
/**
* @brief The maximum centile for block modes searched (-blockmodelimit).
@@ -468,6 +485,27 @@ struct astcenc_config
*/
unsigned int tune_candidate_limit;
/**
* @brief The number of trial partitionings per search (-2partitioncandidatelimit).
*
* Valid values are between 1 and TUNE_MAX_PARTITIIONING_CANDIDATES.
*/
unsigned int tune_2partitioning_candidate_limit;
/**
* @brief The number of trial partitionings per search (-3partitioncandidatelimit).
*
* Valid values are between 1 and TUNE_MAX_PARTITIIONING_CANDIDATES.
*/
unsigned int tune_3partitioning_candidate_limit;
/**
* @brief The number of trial partitionings per search (-4partitioncandidatelimit).
*
* Valid values are between 1 and TUNE_MAX_PARTITIIONING_CANDIDATES.
*/
unsigned int tune_4partitioning_candidate_limit;
/**
* @brief The dB threshold for stopping block search (-dblimit).
*
@@ -517,11 +555,6 @@ struct astcenc_config
*/
float tune_2_plane_early_out_limit_correlation;
/**
* @brief The threshold below which (inclusive) we stop testing low/high/low+high cutoffs.
*/
unsigned int tune_low_weight_count_limit;
#if defined(ASTCENC_DIAGNOSTICS)
/**
* @brief The path to save the diagnostic trace data to.

View File

@@ -334,13 +334,13 @@ static bool try_quantize_rgb_delta(
int g0be = quant_color(quant_level, g0b);
int b0be = quant_color(quant_level, b0b);
r0b = unquant_color(quant_level, r0be);
g0b = unquant_color(quant_level, g0be);
b0b = unquant_color(quant_level, b0be);
int r0bu = unquant_color(quant_level, r0be);
int g0bu = unquant_color(quant_level, g0be);
int b0bu = unquant_color(quant_level, b0be);
r0b |= r0a & 0x100;
g0b |= g0a & 0x100;
b0b |= b0a & 0x100;
r0b = r0bu | (r0a & 0x100);
g0b = g0bu | (g0a & 0x100);
b0b = b0bu | (b0a & 0x100);
// Get hold of the second value
int r1d = astc::flt2int_rtn(r1);
@@ -386,36 +386,18 @@ static bool try_quantize_rgb_delta(
return false;
}
// Check that the sum of the encoded offsets is nonnegative, else encoding fails
r1du &= 0x7f;
g1du &= 0x7f;
b1du &= 0x7f;
if (r1du & 0x40)
{
r1du -= 0x80;
}
if (g1du & 0x40)
{
g1du -= 0x80;
}
if (b1du & 0x40)
{
b1du -= 0x80;
}
if (r1du + g1du + b1du < 0)
// If the sum of offsets triggers blue-contraction then encoding fails
vint4 ep0(r0bu, g0bu, b0bu, 0);
vint4 ep1(r1du, g1du, b1du, 0);
bit_transfer_signed(ep1, ep0);
if (hadd_rgb_s(ep1) < 0)
{
return false;
}
// Check that the offsets produce legitimate sums as well
r1du += r0b;
g1du += g0b;
b1du += b0b;
if (r1du < 0 || r1du > 0x1FF || g1du < 0 || g1du > 0x1FF || b1du < 0 || b1du > 0x1FF)
ep0 = ep0 + ep1;
if (any((ep0 < vint4(0)) | (ep0 > vint4(0xFF))))
{
return false;
}
@@ -477,13 +459,13 @@ static bool try_quantize_rgb_delta_blue_contract(
int g0be = quant_color(quant_level, g0b);
int b0be = quant_color(quant_level, b0b);
r0b = unquant_color(quant_level, r0be);
g0b = unquant_color(quant_level, g0be);
b0b = unquant_color(quant_level, b0be);
int r0bu = unquant_color(quant_level, r0be);
int g0bu = unquant_color(quant_level, g0be);
int b0bu = unquant_color(quant_level, b0be);
r0b |= r0a & 0x100;
g0b |= g0a & 0x100;
b0b |= b0a & 0x100;
r0b = r0bu | (r0a & 0x100);
g0b = g0bu | (g0a & 0x100);
b0b = b0bu | (b0a & 0x100);
// Get hold of the second value
int r1d = astc::flt2int_rtn(r1);
@@ -530,38 +512,18 @@ static bool try_quantize_rgb_delta_blue_contract(
return false;
}
// Check that the sum of the encoded offsets is negative, else encoding fails
// Note that this is inverse of the test for non-blue-contracted RGB.
r1du &= 0x7f;
g1du &= 0x7f;
b1du &= 0x7f;
if (r1du & 0x40)
{
r1du -= 0x80;
}
if (g1du & 0x40)
{
g1du -= 0x80;
}
if (b1du & 0x40)
{
b1du -= 0x80;
}
if (r1du + g1du + b1du >= 0)
// If the sum of offsets does not trigger blue-contraction then encoding fails
vint4 ep0(r0bu, g0bu, b0bu, 0);
vint4 ep1(r1du, g1du, b1du, 0);
bit_transfer_signed(ep1, ep0);
if (hadd_rgb_s(ep1) >= 0)
{
return false;
}
// Check that the offsets produce legitimate sums as well
r1du += r0b;
g1du += g0b;
b1du += b0b;
if (r1du < 0 || r1du > 0x1FF || g1du < 0 || g1du > 0x1FF || b1du < 0 || b1du > 0x1FF)
ep0 = ep0 + ep1;
if (any((ep0 < vint4(0)) | (ep0 > vint4(0xFF))))
{
return false;
}

View File

@@ -97,15 +97,8 @@ static void rgba_delta_unpack(
vint4 input0 = unquant_color(quant_level, input0q);
vint4 input1 = unquant_color(quant_level, input1q);
// Perform bit-transfer
input0 = input0 | lsl<1>(input1 & 0x80);
input1 = input1 & 0x7F;
vmask4 mask = (input1 & 0x40) != vint4::zero();
input1 = select(input1, input1 - 0x80, mask);
// Scale
input0 = asr<1>(input0);
input1 = asr<1>(input1);
// Apply bit transfer
bit_transfer_signed(input1, input0);
// Apply blue-uncontraction if needed
int rgb_sum = hadd_rgb_s(input1);

View File

@@ -424,11 +424,7 @@ static float compress_symbolic_block_for_partition_1plane(
// For each mode, use the angular method to compute a shift
compute_angular_endpoints_1plane(
config.tune_low_weight_count_limit,
only_always, bsd,
dec_weights_ideal,
max_weight_quant,
tmpbuf);
only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
float* weight_low_value = tmpbuf.weight_low_value1;
float* weight_high_value = tmpbuf.weight_high_value1;
@@ -795,9 +791,7 @@ static float compress_symbolic_block_for_partition_2planes(
float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask));
compute_angular_endpoints_2planes(
config.tune_low_weight_count_limit,
bsd, dec_weights_ideal, max_weight_quant,
tmpbuf);
bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
// For each mode (which specifies a decimation and a quantization):
// * Compute number of bits needed for the quantized weights
@@ -1130,12 +1124,13 @@ static float prepare_block_statistics(
aa_var -= as * (as * rpt);
rg_cov *= astc::rsqrt(astc::max(rr_var * gg_var, 1e-30f));
rb_cov *= astc::rsqrt(astc::max(rr_var * bb_var, 1e-30f));
ra_cov *= astc::rsqrt(astc::max(rr_var * aa_var, 1e-30f));
gb_cov *= astc::rsqrt(astc::max(gg_var * bb_var, 1e-30f));
ga_cov *= astc::rsqrt(astc::max(gg_var * aa_var, 1e-30f));
ba_cov *= astc::rsqrt(astc::max(bb_var * aa_var, 1e-30f));
// These will give a NaN if a channel is constant - these are fixed up in the next step
rg_cov *= astc::rsqrt(rr_var * gg_var);
rb_cov *= astc::rsqrt(rr_var * bb_var);
ra_cov *= astc::rsqrt(rr_var * aa_var);
gb_cov *= astc::rsqrt(gg_var * bb_var);
ga_cov *= astc::rsqrt(gg_var * aa_var);
ba_cov *= astc::rsqrt(bb_var * aa_var);
if (astc::isnan(rg_cov)) rg_cov = 1.0f;
if (astc::isnan(rb_cov)) rb_cov = 1.0f;
@@ -1144,7 +1139,7 @@ static float prepare_block_statistics(
if (astc::isnan(ga_cov)) ga_cov = 1.0f;
if (astc::isnan(ba_cov)) ba_cov = 1.0f;
float lowest_correlation = astc::min(fabsf(rg_cov), fabsf(rb_cov));
float lowest_correlation = astc::min(fabsf(rg_cov), fabsf(rb_cov));
lowest_correlation = astc::min(lowest_correlation, fabsf(ra_cov));
lowest_correlation = astc::min(lowest_correlation, fabsf(gb_cov));
lowest_correlation = astc::min(lowest_correlation, fabsf(ga_cov));
@@ -1197,6 +1192,18 @@ void compress_block(
bool block_skip_two_plane = false;
int max_partitions = ctx.config.tune_partition_count_limit;
unsigned int requested_partition_indices[3] {
ctx.config.tune_2partition_index_limit,
ctx.config.tune_3partition_index_limit,
ctx.config.tune_4partition_index_limit
};
unsigned int requested_partition_trials[3] {
ctx.config.tune_2partitioning_candidate_limit,
ctx.config.tune_3partitioning_candidate_limit,
ctx.config.tune_4partitioning_candidate_limit
};
#if defined(ASTCENC_DIAGNOSTICS)
// Do this early in diagnostic builds so we can dump uniform metrics
// for every block. Do it later in release builds to avoid redundant work!
@@ -1366,13 +1373,19 @@ void compress_block(
// Find best blocks for 2, 3 and 4 partitions
for (int partition_count = 2; partition_count <= max_partitions; partition_count++)
{
unsigned int partition_indices[2] { 0 };
unsigned int partition_indices[TUNE_MAX_PARTITIIONING_CANDIDATES];
find_best_partition_candidates(bsd, blk, partition_count,
ctx.config.tune_partition_index_limit,
partition_indices);
unsigned int requested_indices = requested_partition_indices[partition_count - 2];
for (unsigned int i = 0; i < 2; i++)
unsigned int requested_trials = requested_partition_trials[partition_count - 2];
requested_trials = astc::min(requested_trials, requested_indices);
unsigned int actual_trials = find_best_partition_candidates(
bsd, blk, partition_count, requested_indices, partition_indices, requested_trials);
float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];
for (unsigned int i = 0; i < actual_trials; i++)
{
TRACE_NODE(node1, "pass");
trace_add_data("partition_count", partition_count);
@@ -1387,6 +1400,20 @@ void compress_block(
scb, tmpbuf, quant_limit);
best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval);
// If using N partitions doesn't improve much over using N-1 partitions then skip trying
// N+1. Error can dramatically improve if the data is correlated or non-correlated and
// aligns with a partitioning that suits that encoding, so for this inner loop check add
// a large error scale because the "other" trial could be a lot better. In total the
// error must be at least 2x worse than the best existing error to early-out.
float best_error = best_errorvals_for_pcount[partition_count - 1];
float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 2.0f;
if (best_error > (best_error_in_prev * best_error_scale))
{
trace_add_data("skip", "tune_partition_early_out_limit_factor");
goto END_OF_TESTS;
}
if (errorval < error_threshold)
{
trace_add_data("exit", "quality hit");
@@ -1396,7 +1423,6 @@ void compress_block(
// If using N partitions doesn't improve much over using N-1 partitions then skip trying N+1
float best_error = best_errorvals_for_pcount[partition_count - 1];
float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];
float best_error_scale = exit_thresholds_for_pcount[partition_count - 1];
if (best_error > (best_error_in_prev * best_error_scale))
{

View File

@@ -40,10 +40,15 @@ struct astcenc_preset_config
{
float quality;
unsigned int tune_partition_count_limit;
unsigned int tune_partition_index_limit;
unsigned int tune_2partition_index_limit;
unsigned int tune_3partition_index_limit;
unsigned int tune_4partition_index_limit;
unsigned int tune_block_mode_limit;
unsigned int tune_refinement_limit;
unsigned int tune_candidate_limit;
unsigned int tune_2partitioning_candidate_limit;
unsigned int tune_3partitioning_candidate_limit;
unsigned int tune_4partitioning_candidate_limit;
float tune_db_limit_a_base;
float tune_db_limit_b_base;
float tune_mode0_mse_overshoot;
@@ -51,7 +56,6 @@ struct astcenc_preset_config
float tune_2_partition_early_out_limit_factor;
float tune_3_partition_early_out_limit_factor;
float tune_2_plane_early_out_limit_correlation;
unsigned int tune_low_weight_count_limit;
};
@@ -59,22 +63,25 @@ struct astcenc_preset_config
* @brief The static quality presets that are built-in for high bandwidth
* presets (x < 25 texels per block).
*/
static const std::array<astcenc_preset_config, 5> preset_configs_high {{
static const std::array<astcenc_preset_config, 6> preset_configs_high {{
{
ASTCENC_PRE_FASTEST,
2, 10, 43, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 25
2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.85f
}, {
ASTCENC_PRE_FAST,
3, 14, 55, 3, 3, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.1f, 0.65f, 20
3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.90f
}, {
ASTCENC_PRE_MEDIUM,
4, 28, 76, 3, 3, 95.0f, 70.0f, 2.5f, 2.5f, 1.2f, 1.25f, 0.85f, 16
4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 2.5f, 1.1f, 1.05f, 0.95f
}, {
ASTCENC_PRE_THOROUGH,
4, 76, 93, 4, 4, 105.0f, 77.0f, 10.0f, 10.0f, 2.5f, 1.25f, 0.95f, 12
4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 10.0f, 1.35f, 1.15f, 0.97f
}, {
ASTCENC_PRE_VERYTHOROUGH,
4, 256, 128, 64, 98, 4, 6, 20, 14, 8, 200.0f, 200.0f, 10.0f, 10.0f, 1.6f, 1.4f, 0.98f
}, {
ASTCENC_PRE_EXHAUSTIVE,
4, 1024, 100, 4, 4, 200.0f, 200.0f, 10.0f, 10.0f, 10.0f, 10.0f, 0.99f, 0
4, 512, 512, 512, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 10.0f, 2.0f, 2.0f, 0.99f
}
}};
@@ -82,46 +89,51 @@ static const std::array<astcenc_preset_config, 5> preset_configs_high {{
* @brief The static quality presets that are built-in for medium bandwidth
* presets (25 <= x < 64 texels per block).
*/
static const std::array<astcenc_preset_config, 5> preset_configs_mid {{
static const std::array<astcenc_preset_config, 6> preset_configs_mid {{
{
ASTCENC_PRE_FASTEST,
2, 10, 43, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 20
2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.80f
}, {
ASTCENC_PRE_FAST,
3, 15, 55, 3, 3, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.1f, 0.5f, 16
3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.85f
}, {
ASTCENC_PRE_MEDIUM,
4, 30, 76, 3, 3, 95.0f, 70.0f, 3.0f, 3.0f, 1.2f, 1.25f, 0.75f, 14
4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 3.0f, 1.1f, 1.05f, 0.90f
}, {
ASTCENC_PRE_THOROUGH,
4, 76, 93, 4, 4, 105.0f, 77.0f, 10.0f, 10.0f, 2.5f, 1.25f, 0.95f, 10
4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 10.0f, 1.4f, 1.2f, 0.95f
}, {
ASTCENC_PRE_VERYTHOROUGH,
4, 256, 128, 64, 98, 4, 6, 12, 8, 3, 200.0f, 200.0f, 10.0f, 10.0f, 1.6f, 1.4f, 0.98f
}, {
ASTCENC_PRE_EXHAUSTIVE,
4, 1024, 100, 4, 4, 200.0f, 200.0f, 10.0f, 10.0f, 10.0f, 10.0f, 0.99f, 0
4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 10.0f, 2.0f, 2.0f, 0.99f
}
}};
/**
* @brief The static quality presets that are built-in for low bandwidth
* presets (64 <= x texels per block).
*/
static const std::array<astcenc_preset_config, 5> preset_configs_low {{
static const std::array<astcenc_preset_config, 6> preset_configs_low {{
{
ASTCENC_PRE_FASTEST,
2, 10, 40, 2, 2, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 20
2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.0f, 0.80f
}, {
ASTCENC_PRE_FAST,
2, 15, 55, 3, 3, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.1f, 0.5f, 16
2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.0f, 0.85f
}, {
ASTCENC_PRE_MEDIUM,
3, 30, 76, 3, 3, 95.0f, 70.0f, 3.5f, 3.5f, 1.2f, 1.25f, 0.65f, 12
3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 3.5f, 1.1f, 1.05f, 0.90f
}, {
ASTCENC_PRE_THOROUGH,
4, 75, 92, 4, 4, 105.0f, 77.0f, 10.0f, 10.0f, 2.5f, 1.25f, 0.85f, 10
4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 10.0f, 1.3f, 1.2f, 0.97f
}, {
ASTCENC_PRE_VERYTHOROUGH,
4, 256, 128, 64, 98, 4, 6, 9, 5, 2, 200.0f, 200.0f, 10.0f, 10.0f, 1.6f, 1.4f, 0.98f
}, {
ASTCENC_PRE_EXHAUSTIVE,
4, 1024, 100, 4, 4, 200.0f, 200.0f, 10.0f, 10.0f, 10.0f, 10.0f, 0.99f, 0
4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 10.0f, 2.0f, 2.0f, 0.99f
}
}};
@@ -422,10 +434,15 @@ static astcenc_error validate_config(
config.rgbm_m_scale = astc::max(config.rgbm_m_scale, 1.0f);
config.tune_partition_count_limit = astc::clamp(config.tune_partition_count_limit, 1u, 4u);
config.tune_partition_index_limit = astc::clamp(config.tune_partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
config.tune_2partition_index_limit = astc::clamp(config.tune_2partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
config.tune_3partition_index_limit = astc::clamp(config.tune_3partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
config.tune_4partition_index_limit = astc::clamp(config.tune_4partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
config.tune_block_mode_limit = astc::clamp(config.tune_block_mode_limit, 1u, 100u);
config.tune_refinement_limit = astc::max(config.tune_refinement_limit, 1u);
config.tune_candidate_limit = astc::clamp(config.tune_candidate_limit, 1u, TUNE_MAX_TRIAL_CANDIDATES);
config.tune_2partitioning_candidate_limit = astc::clamp(config.tune_2partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIIONING_CANDIDATES);
config.tune_3partitioning_candidate_limit = astc::clamp(config.tune_3partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIIONING_CANDIDATES);
config.tune_4partitioning_candidate_limit = astc::clamp(config.tune_4partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIIONING_CANDIDATES);
config.tune_db_limit = astc::max(config.tune_db_limit, 0.0f);
config.tune_mode0_mse_overshoot = astc::max(config.tune_mode0_mse_overshoot, 1.0f);
config.tune_refinement_mse_overshoot = astc::max(config.tune_refinement_mse_overshoot, 1.0f);
@@ -464,9 +481,23 @@ astcenc_error astcenc_config_init(
astcenc_config* configp
) {
astcenc_error status;
astcenc_config& config = *configp;
// Check basic library compatibility options here so they are checked early. Note, these checks
// are repeated in context_alloc for cases where callers use a manually defined config struct
status = validate_cpu_isa();
if (status != ASTCENC_SUCCESS)
{
return status;
}
status = validate_cpu_float();
if (status != ASTCENC_SUCCESS)
{
return status;
}
// Zero init all config fields, although most of them will be overwritten
astcenc_config& config = *configp;
std::memset(&config, 0, sizeof(config));
// Process the block size
@@ -493,7 +524,7 @@ astcenc_error astcenc_config_init(
return ASTCENC_ERR_BAD_QUALITY;
}
static const std::array<astcenc_preset_config, 5>* preset_configs;
static const std::array<astcenc_preset_config, 6>* preset_configs;
int texels_int = block_x * block_y * block_z;
if (texels_int < 25)
{
@@ -525,11 +556,15 @@ astcenc_error astcenc_config_init(
if (start == end)
{
config.tune_partition_count_limit = (*preset_configs)[start].tune_partition_count_limit;
config.tune_partition_index_limit = (*preset_configs)[start].tune_partition_index_limit;
config.tune_2partition_index_limit = (*preset_configs)[start].tune_2partition_index_limit;
config.tune_3partition_index_limit = (*preset_configs)[start].tune_3partition_index_limit;
config.tune_4partition_index_limit = (*preset_configs)[start].tune_4partition_index_limit;
config.tune_block_mode_limit = (*preset_configs)[start].tune_block_mode_limit;
config.tune_refinement_limit = (*preset_configs)[start].tune_refinement_limit;
config.tune_candidate_limit = astc::min((*preset_configs)[start].tune_candidate_limit,
TUNE_MAX_TRIAL_CANDIDATES);
config.tune_candidate_limit = astc::min((*preset_configs)[start].tune_candidate_limit, TUNE_MAX_TRIAL_CANDIDATES);
config.tune_2partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_2partitioning_candidate_limit, TUNE_MAX_PARTITIIONING_CANDIDATES);
config.tune_3partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_3partitioning_candidate_limit, TUNE_MAX_PARTITIIONING_CANDIDATES);
config.tune_4partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_4partitioning_candidate_limit, TUNE_MAX_PARTITIIONING_CANDIDATES);
config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels,
(*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels);
@@ -539,7 +574,6 @@ astcenc_error astcenc_config_init(
config.tune_2_partition_early_out_limit_factor = (*preset_configs)[start].tune_2_partition_early_out_limit_factor;
config.tune_3_partition_early_out_limit_factor =(*preset_configs)[start].tune_3_partition_early_out_limit_factor;
config.tune_2_plane_early_out_limit_correlation = (*preset_configs)[start].tune_2_plane_early_out_limit_correlation;
config.tune_low_weight_count_limit = (*preset_configs)[start].tune_low_weight_count_limit;
}
// Start and end node are not the same - so interpolate between them
else
@@ -561,11 +595,19 @@ astcenc_error astcenc_config_init(
#define LERPUI(param) static_cast<unsigned int>(LERPI(param))
config.tune_partition_count_limit = LERPI(tune_partition_count_limit);
config.tune_partition_index_limit = LERPI(tune_partition_index_limit);
config.tune_2partition_index_limit = LERPI(tune_2partition_index_limit);
config.tune_3partition_index_limit = LERPI(tune_3partition_index_limit);
config.tune_4partition_index_limit = LERPI(tune_4partition_index_limit);
config.tune_block_mode_limit = LERPI(tune_block_mode_limit);
config.tune_refinement_limit = LERPI(tune_refinement_limit);
config.tune_candidate_limit = astc::min(LERPUI(tune_candidate_limit),
TUNE_MAX_TRIAL_CANDIDATES);
config.tune_2partitioning_candidate_limit = astc::min(LERPUI(tune_2partitioning_candidate_limit),
BLOCK_MAX_PARTITIONINGS);
config.tune_3partitioning_candidate_limit = astc::min(LERPUI(tune_3partitioning_candidate_limit),
BLOCK_MAX_PARTITIONINGS);
config.tune_4partitioning_candidate_limit = astc::min(LERPUI(tune_4partitioning_candidate_limit),
BLOCK_MAX_PARTITIONINGS);
config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels,
LERP(tune_db_limit_b_base) - 19 * ltexels);
@@ -575,7 +617,6 @@ astcenc_error astcenc_config_init(
config.tune_2_partition_early_out_limit_factor = LERP(tune_2_partition_early_out_limit_factor);
config.tune_3_partition_early_out_limit_factor = LERP(tune_3_partition_early_out_limit_factor);
config.tune_2_plane_early_out_limit_correlation = LERP(tune_2_plane_early_out_limit_correlation);
config.tune_low_weight_count_limit = LERPI(tune_low_weight_count_limit);
#undef LERP
#undef LERPI
#undef LERPUI
@@ -676,13 +717,13 @@ astcenc_error astcenc_context_alloc(
astcenc_error status;
const astcenc_config& config = *configp;
status = validate_cpu_float();
status = validate_cpu_isa();
if (status != ASTCENC_SUCCESS)
{
return status;
}
status = validate_cpu_isa();
status = validate_cpu_float();
if (status != ASTCENC_SUCCESS)
{
return status;
@@ -714,7 +755,7 @@ astcenc_error astcenc_context_alloc(
status = validate_config(ctx->config);
if (status != ASTCENC_SUCCESS)
{
delete ctx;
delete ctxo;
return status;
}

View File

@@ -485,13 +485,59 @@ static unsigned int compute_kmeans_partition_ordering(
mismatch_counts, partition_ordering);
}
/**
 * @brief Insert a partitioning into an ordered list of results, sorted by error.
 *
 * The list is kept in ascending error order. A new entry is discarded if its error is greater
 * than or equal to the current worst (last) entry. Otherwise it is inserted ahead of the first
 * entry whose error is not lower than the new error, and the previous worst entry is dropped off
 * the end of the list.
 *
 * @param         max_values        The max number of entries in the best result arrays. A value
 *                                  of zero is treated as an empty list and nothing is stored.
 * @param         this_error        The error of the new entry.
 * @param         this_partition    The partition ID of the new entry.
 * @param[in,out] best_errors       The array of best error values.
 * @param[in,out] best_partitions   The array of best partition values.
 */
static void insert_result(
	unsigned int max_values,
	float this_error,
	unsigned int this_partition,
	float* best_errors,
	unsigned int* best_partitions)
{
	// An empty list cannot hold a result; without this guard the unsigned "max_values - 1" below
	// would wrap around and index far outside the arrays
	if (max_values == 0)
	{
		return;
	}

	unsigned int last = max_values - 1;

	// Don't bother searching if the current worst error beats the new error
	if (this_error >= best_errors[last])
	{
		return;
	}

	// Find the insertion slot, skipping existing results that are strictly better. The early-out
	// above means the final slot always accepts the entry, so the search can stop there.
	unsigned int slot = 0;
	while (slot < last && this_error > best_errors[slot])
	{
		slot++;
	}

	// Move existing results down one, dropping the current worst entry
	for (unsigned int j = last; j > slot; j--)
	{
		best_errors[j] = best_errors[j - 1];
		best_partitions[j] = best_partitions[j - 1];
	}

	// Insert new result
	best_errors[slot] = this_error;
	best_partitions[slot] = this_partition;
}
/* See header for documentation. */
void find_best_partition_candidates(
unsigned int find_best_partition_candidates(
const block_size_descriptor& bsd,
const image_block& blk,
unsigned int partition_count,
unsigned int partition_search_limit,
unsigned int best_partitions[2]
unsigned int best_partitions[BLOCK_MAX_PARTITIONINGS],
unsigned int requested_candidates
) {
// Constant used to estimate quantization error for a given partitioning; the optimal value for
// this depends on bitrate. These values have been determined empirically.
@@ -518,17 +564,23 @@ void find_best_partition_candidates(
unsigned int partition_sequence[BLOCK_MAX_PARTITIONINGS];
unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence);
partition_search_limit = astc::min(partition_search_limit, sequence_len);
requested_candidates = astc::min(partition_search_limit, requested_candidates);
bool uses_alpha = !blk.is_constant_channel(3);
// Partitioning errors assuming uncorrelated-chrominance endpoints
float uncor_best_error { ERROR_CALC_DEFAULT };
unsigned int uncor_best_partition { 0 };
float uncor_best_errors[TUNE_MAX_PARTITIIONING_CANDIDATES];
unsigned int uncor_best_partitions[TUNE_MAX_PARTITIIONING_CANDIDATES];
// Partitioning errors assuming same-chrominance endpoints
// Store two so we can always return one different to uncorr
float samec_best_errors[2] { ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT };
unsigned int samec_best_partitions[2] { 0, 0 };
float samec_best_errors[TUNE_MAX_PARTITIIONING_CANDIDATES];
unsigned int samec_best_partitions[TUNE_MAX_PARTITIIONING_CANDIDATES];
for (unsigned int i = 0; i < requested_candidates; i++)
{
uncor_best_errors[i] = ERROR_CALC_DEFAULT;
samec_best_errors[i] = ERROR_CALC_DEFAULT;
}
if (uses_alpha)
{
@@ -602,25 +654,8 @@ void find_best_partition_candidates(
samec_error += dot_s(samec_vector * samec_vector, error_weights);
}
if (uncor_error < uncor_best_error)
{
uncor_best_error = uncor_error;
uncor_best_partition = partition;
}
if (samec_error < samec_best_errors[0])
{
samec_best_errors[1] = samec_best_errors[0];
samec_best_partitions[1] = samec_best_partitions[0];
samec_best_errors[0] = samec_error;
samec_best_partitions[0] = partition;
}
else if (samec_error < samec_best_errors[1])
{
samec_best_errors[1] = samec_error;
samec_best_partitions[1] = partition;
}
insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
}
}
else
@@ -687,50 +722,55 @@ void find_best_partition_candidates(
samec_error += dot3_s(samec_vector * samec_vector, error_weights);
}
if (uncor_error < uncor_best_error)
{
uncor_best_error = uncor_error;
uncor_best_partition = partition;
}
insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
}
}
if (samec_error < samec_best_errors[0])
{
samec_best_errors[1] = samec_best_errors[0];
samec_best_partitions[1] = samec_best_partitions[0];
bool best_is_uncor = uncor_best_partitions[0] > samec_best_partitions[0];
samec_best_errors[0] = samec_error;
samec_best_partitions[0] = partition;
}
else if (samec_error < samec_best_errors[1])
unsigned int interleave[2 * TUNE_MAX_PARTITIIONING_CANDIDATES];
for (unsigned int i = 0; i < requested_candidates; i++)
{
if (best_is_uncor)
{
interleave[2 * i] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;
interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;
}
else
{
interleave[2 * i] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;
interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;
}
}
uint64_t bitmasks[1024/64] { 0 };
unsigned int emitted = 0;
// Deduplicate the first "requested" entries
for (unsigned int i = 0; i < requested_candidates * 2; i++)
{
unsigned int partition = interleave[i];
unsigned int word = partition / 64;
unsigned int bit = partition % 64;
bool written = bitmasks[word] & (1ull << bit);
if (!written)
{
best_partitions[emitted] = partition;
bitmasks[word] |= 1ull << bit;
emitted++;
if (emitted == requested_candidates)
{
samec_best_errors[1] = samec_error;
samec_best_partitions[1] = partition;
break;
}
}
}
// Same partition is best for both, so use this first unconditionally
if (uncor_best_partition == samec_best_partitions[0])
{
best_partitions[0] = samec_best_partitions[0];
best_partitions[1] = samec_best_partitions[1];
}
// Uncor is best
else if (uncor_best_error <= samec_best_errors[0])
{
best_partitions[0] = uncor_best_partition;
best_partitions[1] = samec_best_partitions[0];
}
// Samec is best
else
{
best_partitions[0] = samec_best_partitions[0];
best_partitions[1] = uncor_best_partition;
}
// Convert these back into canonical partition IDs for the rest of the codec
best_partitions[0] = bsd.get_raw_partition_info(partition_count, best_partitions[0]).partition_index;
best_partitions[1] = bsd.get_raw_partition_info(partition_count, best_partitions[1]).partition_index;
return emitted;
}
#endif

View File

@@ -130,7 +130,14 @@ static constexpr unsigned int TUNE_MIN_TEXELS_MODE0_FASTPATH { 24 };
*
* This can be dynamically reduced by the compression quality preset.
*/
static constexpr unsigned int TUNE_MAX_TRIAL_CANDIDATES { 4 };
static constexpr unsigned int TUNE_MAX_TRIAL_CANDIDATES { 8 };
/**
* @brief The maximum number of candidate partitionings tested for each encoding mode.
*
* This can be dynamically reduced by the compression quality preset.
*/
static constexpr unsigned int TUNE_MAX_PARTITIIONING_CANDIDATES { 32 };
/**
* @brief The maximum quant level using full angular endpoint search method.
@@ -1345,11 +1352,11 @@ extern const int8_t quant_mode_table[10][128];
* Note that BISE can return strings that are not a whole number of bytes in length, and ASTC can
* start storing strings in a block at arbitrary bit offsets in the encoded data.
*
* @param quant_level The BISE alphabet size.
* @param character_count The number of characters in the string.
* @param input_data The unpacked string, one byte per character.
* @param[in,out] output_data The output packed string.
* @param bit_offset The starting offset in the output storage.
* @param quant_level The BISE alphabet size.
* @param character_count The number of characters in the string.
* @param input_data The unpacked string, one byte per character.
* @param[in,out] output_data The output packed string.
* @param bit_offset The starting offset in the output storage.
*/
void encode_ise(
quant_method quant_level,
@@ -1436,11 +1443,11 @@ void compute_avgs_and_dirs_3_comp(
* This is a specialization of @c compute_avgs_and_dirs_3_comp where the omitted component is
* always alpha, a common case during partition search.
*
* @param pi The partition info for the current trial.
* @param blk The image block color data to be compressed.
* @param[out] pm The output partition metrics.
* - Only pi.partition_count array entries actually get initialized.
* - Direction vectors @c pm.dir are not normalized.
* @param pi The partition info for the current trial.
* @param blk The image block color data to be compressed.
* @param[out] pm The output partition metrics.
* - Only pi.partition_count array entries actually get initialized.
* - Direction vectors @c pm.dir are not normalized.
*/
void compute_avgs_and_dirs_3_comp_rgb(
const partition_info& pi,
@@ -1471,11 +1478,11 @@ void compute_avgs_and_dirs_4_comp(
*
* This function computes the squared error when using these two representations.
*
* @param pi The partition info for the current trial.
* @param blk The image block color data to be compressed.
* @param[in,out] plines Processed line inputs, and line length outputs.
* @param[out] uncor_error The cumulative error for using the uncorrelated line.
* @param[out] samec_error The cumulative error for using the same chroma line.
* @param pi The partition info for the current trial.
* @param blk The image block color data to be compressed.
* @param[in,out] plines Processed line inputs, and line length outputs.
* @param[out] uncor_error The cumulative error for using the uncorrelated line.
* @param[out] samec_error The cumulative error for using the same chroma line.
*/
void compute_error_squared_rgb(
const partition_info& pi,
@@ -1520,18 +1527,23 @@ void compute_error_squared_rgba(
* candidates; one assuming data has uncorrelated chroma and one assuming the
* data has correlated chroma. The best candidate is returned first in the list.
*
* @param bsd The block size information.
* @param blk The image block color data to compress.
* @param partition_count The number of partitions in the block.
* @param partition_search_limit The number of candidate partition encodings to trial.
* @param[out] best_partitions The best partition candidates.
* @param bsd The block size information.
* @param blk The image block color data to compress.
* @param partition_count The number of partitions in the block.
* @param partition_search_limit The number of candidate partition encodings to trial.
* @param[out] best_partitions The best partition candidates.
* @param requested_candidates The number of requested partitionings. May return fewer if
* candidates are not available.
*
* @return The actual number of candidates returned.
*/
void find_best_partition_candidates(
unsigned int find_best_partition_candidates(
const block_size_descriptor& bsd,
const image_block& blk,
unsigned int partition_count,
unsigned int partition_search_limit,
unsigned int best_partitions[2]);
unsigned int best_partitions[TUNE_MAX_PARTITIIONING_CANDIDATES],
unsigned int requested_candidates);
/* ============================================================================
Functionality for managing images and image related data.
@@ -1545,10 +1557,10 @@ void find_best_partition_candidates(
*
* Results are written back into @c img->input_alpha_averages.
*
* @param img The input image data, also holds output data.
* @param alpha_kernel_radius The kernel radius (in pixels) for alpha mods.
* @param swz Input data component swizzle.
* @param[out] ag The average variance arguments to init.
* @param img The input image data, also holds output data.
* @param alpha_kernel_radius The kernel radius (in pixels) for alpha mods.
* @param swz Input data component swizzle.
* @param[out] ag The average variance arguments to init.
*
* @return The number of tasks in the processing stage.
*/
@@ -1766,13 +1778,13 @@ float compute_error_of_weight_set_2planes(
* The user requests a base color endpoint mode in @c format, but the quantizer may choose a
* delta-based representation. It will report back the format variant it actually used.
*
* @param color0 The input unquantized color0 endpoint for absolute endpoint pairs.
* @param color1 The input unquantized color1 endpoint for absolute endpoint pairs.
* @param rgbs_color The input unquantized RGBS variant endpoint for same chroma endpoints.
* @param rgbo_color The input unquantized RGBO variant endpoint for HDR endpoints.
* @param format The desired base format.
* @param[out] output The output storage for the quantized colors.
* @param quant_level The quantization level requested.
* @param color0 The input unquantized color0 endpoint for absolute endpoint pairs.
* @param color1 The input unquantized color1 endpoint for absolute endpoint pairs.
* @param rgbs_color The input unquantized RGBS variant endpoint for same chroma endpoints.
* @param rgbo_color The input unquantized RGBO variant endpoint for HDR endpoints.
* @param format The desired base format.
* @param[out] output The output storage for the quantized colors.
* @param quant_level The quantization level requested.
*
* @return The actual endpoint mode used.
*/
@@ -1873,13 +1885,13 @@ unsigned int compute_ideal_endpoint_formats(
* As we quantize and decimate weights the optimal endpoint colors may change slightly, so we must
* recompute the ideal colors for a specific weight set.
*
* @param blk The image block color data to compress.
* @param pi The partition info for the current trial.
* @param di The weight grid decimation table.
* @param blk The image block color data to compress.
* @param pi The partition info for the current trial.
* @param di The weight grid decimation table.
* @param dec_weights_uquant The quantized weight set.
* @param[in,out] ep The color endpoints (modified in place).
* @param[out] rgbs_vectors The RGB+scale vectors for LDR blocks.
* @param[out] rgbo_vectors The RGB+offset vectors for HDR blocks.
* @param[in,out] ep The color endpoints (modified in place).
* @param[out] rgbs_vectors The RGB+scale vectors for LDR blocks.
* @param[out] rgbo_vectors The RGB+offset vectors for HDR blocks.
*/
void recompute_ideal_colors_1plane(
const image_block& blk,
@@ -1896,15 +1908,15 @@ void recompute_ideal_colors_1plane(
* As we quantize and decimate weights the optimal endpoint colors may change slightly, so we must
* recompute the ideal colors for a specific weight set.
*
* @param blk The image block color data to compress.
* @param bsd The block_size descriptor.
* @param di The weight grid decimation table.
* @param blk The image block color data to compress.
* @param bsd The block_size descriptor.
* @param di The weight grid decimation table.
* @param dec_weights_uquant_plane1 The quantized weight set for plane 1.
* @param dec_weights_uquant_plane2 The quantized weight set for plane 2.
* @param[in,out] ep The color endpoints (modified in place).
* @param[out] rgbs_vector The RGB+scale color for LDR blocks.
* @param[out] rgbo_vector The RGB+offset color for HDR blocks.
* @param plane2_component The component assigned to plane 2.
* @param[in,out] ep The color endpoints (modified in place).
* @param[out] rgbs_vector The RGB+scale color for LDR blocks.
* @param[out] rgbo_vector The RGB+offset color for HDR blocks.
* @param plane2_component The component assigned to plane 2.
*/
void recompute_ideal_colors_2planes(
const image_block& blk,
@@ -1925,15 +1937,13 @@ void prepare_angular_tables();
/**
* @brief Compute the angular endpoints for one plane for each block mode.
*
* @param tune_low_weight_limit Weight count cutoff below which we use simpler searches.
* @param only_always Only consider block modes that are always enabled.
* @param bsd The block size descriptor for the current trial.
* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
* @param max_weight_quant The maximum block mode weight quantization allowed.
* @param[out] tmpbuf Preallocated scratch buffers for the compressor.
* @param only_always Only consider block modes that are always enabled.
* @param bsd The block size descriptor for the current trial.
* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
* @param max_weight_quant The maximum block mode weight quantization allowed.
* @param[out] tmpbuf Preallocated scratch buffers for the compressor.
*/
void compute_angular_endpoints_1plane(
unsigned int tune_low_weight_limit,
bool only_always,
const block_size_descriptor& bsd,
const float* dec_weight_ideal_value,
@@ -1943,14 +1953,12 @@ void compute_angular_endpoints_1plane(
/**
* @brief Compute the angular endpoints for two planes for each block mode.
*
* @param tune_low_weight_limit Weight count cutoff below which we use simpler searches.
* @param bsd The block size descriptor for the current trial.
* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
* @param max_weight_quant The maximum block mode weight quantization allowed.
* @param[out] tmpbuf Preallocated scratch buffers for the compressor.
* @param bsd The block size descriptor for the current trial.
* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
* @param max_weight_quant The maximum block mode weight quantization allowed.
* @param[out] tmpbuf Preallocated scratch buffers for the compressor.
*/
void compute_angular_endpoints_2planes(
unsigned int tune_low_weight_limit,
const block_size_descriptor& bsd,
const float* dec_weight_ideal_value,
unsigned int max_weight_quant,
@@ -2162,18 +2170,4 @@ void aligned_free(T* ptr)
#endif
}
static inline void dump_weights(const char* label, uint8_t* weights, int weight_count)
{
printf("%s\n", label);
vint lane = vint::lane_id();
for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
{
vmask mask = lane < vint(weight_count);
vint val(weights + i);
val = select(vint::zero(), val, mask);
print(val);
lane += vint(ASTCENC_SIMD_WIDTH);
}
}
#endif

View File

@@ -361,6 +361,23 @@ static inline int popcount(uint64_t v)
#endif
/**
 * @brief Apply signed bit transfer to a pair of encoded endpoint vectors.
 *
 * Bit 7 of each @c input0 lane is moved into bit 7 of the matching @c input1 lane, after
 * @c input1 has been shifted right by one. The @c input0 lane is then shifted right by one,
 * masked to six bits, and has 0x40 subtracted when bit 5 is set, i.e. the six bit value is
 * treated as signed.
 *
 * @param[in,out] input0   The first encoded endpoint; replaced by the signed six bit value.
 * @param[in,out] input1   The second encoded endpoint; replaced by the value with the
 *                         transferred bit.
 */
static ASTCENC_SIMD_INLINE void bit_transfer_signed(
	vint4& input0,
	vint4& input1
) {
	// Capture the bit being transferred before input0 is rewritten
	const vint4 transferred_bit = input0 & 0x80;
	input1 = lsr<1>(input1) | transferred_bit;

	const vint4 six_bit = lsr<1>(input0) & 0x3F;

	// Bit 5 is the sign bit of the six bit value
	const vmask4 is_negative = (six_bit & 0x20) != vint4::zero();
	input0 = select(six_bit, six_bit - 0x40, is_negative);
}
/**
* @brief Debug function to print a vector of ints.
*/

View File

@@ -333,156 +333,8 @@ static void compute_angular_endpoints_for_quant_levels(
}
}
/**
* @brief For a given step size compute the lowest and highest weight, variant for low weight count.
*
* Compute the lowest and highest weight that results from quantizing using the given stepsize and
* offset, and then compute the resulting error. The cut errors indicate the error that results from
* forcing samples that should have had one weight value one step up or down.
*
* Angular steps are processed @c ASTCENC_SIMD_WIDTH at a time, one step per vector lane. The step
* at index @c k is evaluated with a reciprocal step size of @c k+1.
*
* NOTE(review): unlike what the description above suggests, this variant only produces the
* accumulated squared rounding error per step; no separate cut errors are computed here.
*
* @param weight_count The number of (decimated) weights.
* @param dec_weight_quant_uvalue The decimated and quantized weight values.
*                                NOTE(review): the caller visible in this file passes the ideal
*                                (unquantized) weights for this argument - confirm the wording.
* @param max_angular_steps The maximum number of steps to be tested.
* @param max_quant_steps The maximum quantization level to be tested; used here to clamp the
*                        reported span to at most @c max_quant_steps+3.
* @param offsets The angular offsets array.
* @param[out] lowest_weight Per angular step, the lowest weight.
* @param[out] weight_span Per angular step, the span between lowest and highest weight.
* @param[out] error Per angular step, the error.
*/
static void compute_lowest_and_highest_weight_lwc(
unsigned int weight_count,
const float* dec_weight_quant_uvalue,
unsigned int max_angular_steps,
unsigned int max_quant_steps,
const float* offsets,
float* lowest_weight,
int* weight_span,
float* error
) {
promise(weight_count > 0);
promise(max_angular_steps > 0);
// Lane k starts at reciprocal step size k+1; advanced by the SIMD width every iteration
vfloat rcp_stepsize = vfloat::lane_id() + vfloat(1.0f);
// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
{
// Sentinel start values so the first weight always replaces both the min and the max
vfloat minidx(128.0f);
vfloat maxidx(-128.0f);
vfloat errval = vfloat::zero();
vfloat offset = loada(offsets + sp);
for (unsigned int j = 0; j < weight_count; j++)
{
// Scale the weight into step units, remove the offset, and round to the nearest step
vfloat sval = load1(dec_weight_quant_uvalue + j) * rcp_stepsize - offset;
vfloat svalrte = round(sval);
vfloat diff = sval - svalrte;
// Accumulate the squared rounding error, still in step units
errval += diff * diff;
// Compute min and max quantized weight spans for each step
minidx = min(minidx, svalrte);
maxidx = max(maxidx, svalrte);
}
// Write out min weight and weight span; clamp span to a usable range
vint span = float_to_int(maxidx - minidx + vfloat(1.0f));
span = min(span, vint(max_quant_steps + 3));
span = max(span, vint(2));
storea(minidx, lowest_weight + sp);
storea(span, weight_span + sp);
// Convert the error from step units back to weight units by scaling with stepsize squared
vfloat ssize = 1.0f / rcp_stepsize;
vfloat errscale = ssize * ssize;
storea(errval * errscale, error + sp);
rcp_stepsize = rcp_stepsize + vfloat(ASTCENC_SIMD_WIDTH);
}
}
/**
 * @brief The main function for the angular algorithm, variant for low weight count.
 *
 * Computes the angular offsets and the per-step weight range and error for the given weights,
 * keeps the lowest error angular step for each weight span, and then converts the winner for
 * every quantization level up to and including @c max_quant_level into a low/high value pair.
 *
 * @param      weight_count             The number of (decimated) weights.
 * @param      dec_weight_ideal_value   The ideal decimated unquantized weight values.
 * @param      max_quant_level          The maximum quantization level to be tested.
 * @param[out] low_value                Per quantization level, the lowest weight value.
 * @param[out] high_value               Per quantization level, the highest weight value.
 */
static void compute_angular_endpoints_for_quant_levels_lwc(
	unsigned int weight_count,
	const float* dec_weight_ideal_value,
	unsigned int max_quant_level,
	float low_value[TUNE_MAX_ANGULAR_QUANT + 1],
	float high_value[TUNE_MAX_ANGULAR_QUANT + 1]
) {
	// This variant tests as many angular steps as the largest quantization level has steps
	const unsigned int quant_step_limit = steps_for_quant_level[max_quant_level];
	const unsigned int angular_step_limit = steps_for_quant_level[max_quant_level];

	alignas(ASTCENC_VECALIGN) float angular_offsets[ANGULAR_STEPS];
	alignas(ASTCENC_VECALIGN) float lowest_weight[ANGULAR_STEPS];
	alignas(ASTCENC_VECALIGN) int32_t weight_span[ANGULAR_STEPS];
	alignas(ASTCENC_VECALIGN) float error[ANGULAR_STEPS];

	compute_angular_offsets(
		weight_count, dec_weight_ideal_value,
		angular_step_limit, angular_offsets);

	compute_lowest_and_highest_weight_lwc(
		weight_count, dec_weight_ideal_value,
		angular_step_limit, quant_step_limit,
		angular_offsets, lowest_weight, weight_span, error);

	// One record per weight span: lane 0 holds the best error, lane 1 the angular step index
	// that produced it. Records are kept as packed vectors so the update below is a select
	// rather than a data-dependent branch; the int to float casts involved are exact for the
	// small values used here.
	vfloat4 best_results[36];

	// Lane 1 starts at -1 so that "nothing found" can be detected later
	promise(quant_step_limit > 0);
	const unsigned int record_count = quant_step_limit + 4;
	for (unsigned int slot = 0; slot < record_count; slot++)
	{
		best_results[slot] = vfloat4(ERROR_CALC_DEFAULT, -1.0f, 0.0f, 0.0f);
	}

	promise(angular_step_limit > 0);
	for (unsigned int step = 0; step < angular_step_limit; step++)
	{
		const float step_error = error[step];
		vfloat4& record = best_results[weight_span[step]];

		// Replace the record for this span only if this step has a strictly lower error
		const vfloat4 candidate(step_error, static_cast<float>(step), 0.0f, 0.0f);
		const vmask4 improves = vfloat4(record.lane<0>()) > vfloat4(step_error);
		record = select(record, candidate, improves);
	}

	for (unsigned int level = 0; level <= max_quant_level; level++)
	{
		const unsigned int steps = steps_for_quant_level[level];
		int best_step = static_cast<int>(best_results[steps].lane<1>());

		// Did we find anything?
#if defined(ASTCENC_DIAGNOSTICS)
		if ((best_step < 0) && print_once)
		{
			print_once = false;
			printf("INFO: Unable to find low weight encoding within search error limit.\n\n");
		}
#endif

		// Fall back to the first angular step when no candidate was recorded for this span
		best_step = astc::max(0, best_step);

		const float offset = angular_offsets[best_step];
		const float lowest = lowest_weight[best_step];
		const float highest = lowest + static_cast<float>(steps) - 1.0f;
		const float stepsize = 1.0f / (1.0f + static_cast<float>(best_step));

		low_value[level] = (offset + lowest) * stepsize;
		high_value[level] = (offset + highest) * stepsize;
	}
}
/* See header for documentation. */
void compute_angular_endpoints_1plane(
unsigned int tune_low_weight_limit,
bool only_always,
const block_size_descriptor& bsd,
const float* dec_weight_ideal_value,
@@ -519,20 +371,10 @@ void compute_angular_endpoints_1plane(
max_precision = max_weight_quant;
}
if (weight_count < tune_low_weight_limit)
{
compute_angular_endpoints_for_quant_levels_lwc(
weight_count,
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
max_precision, low_values[i], high_values[i]);
}
else
{
compute_angular_endpoints_for_quant_levels(
weight_count,
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
max_precision, low_values[i], high_values[i]);
}
compute_angular_endpoints_for_quant_levels(
weight_count,
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
max_precision, low_values[i], high_values[i]);
}
unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
@@ -561,7 +403,6 @@ void compute_angular_endpoints_1plane(
/* See header for documentation. */
void compute_angular_endpoints_2planes(
unsigned int tune_low_weight_limit,
const block_size_descriptor& bsd,
const float* dec_weight_ideal_value,
unsigned int max_weight_quant,
@@ -599,30 +440,15 @@ void compute_angular_endpoints_2planes(
max_precision = max_weight_quant;
}
if (weight_count < tune_low_weight_limit)
{
compute_angular_endpoints_for_quant_levels_lwc(
weight_count,
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
max_precision, low_values1[i], high_values1[i]);
compute_angular_endpoints_for_quant_levels(
weight_count,
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
max_precision, low_values1[i], high_values1[i]);
compute_angular_endpoints_for_quant_levels_lwc(
weight_count,
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET,
max_precision, low_values2[i], high_values2[i]);
}
else
{
compute_angular_endpoints_for_quant_levels(
weight_count,
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
max_precision, low_values1[i], high_values1[i]);
compute_angular_endpoints_for_quant_levels(
weight_count,
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET,
max_precision, low_values2[i], high_values2[i]);
}
compute_angular_endpoints_for_quant_levels(
weight_count,
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET,
max_precision, low_values2[i], high_values2[i]);
}
unsigned int start = bsd.block_mode_count_1plane_selected;

View File

@@ -25,6 +25,12 @@ project "bimg"
using_bx()
configuration {}
removeflags {
"FloatFast", -- astc-encoder doesn't work with it.
}
configuration { "linux-*" }
buildoptions {
"-fPIC",

View File

@@ -6,10 +6,6 @@
project "bimg_encode"
kind "StaticLib"
removeflags {
"FloatFast", -- astc-encoder doesn't work with it.
}
includedirs {
path.join(BIMG_DIR, "include"),
path.join(BIMG_DIR, "3rdparty"),
@@ -42,6 +38,12 @@ project "bimg_encode"
using_bx()
configuration {}
removeflags {
"FloatFast", -- astc-encoder doesn't work with it.
}
configuration { "linux-*" }
buildoptions {
"-fPIC",

View File

@@ -147,7 +147,7 @@ namespace bimg
"ATCE", // ATCE
"ATCI", // ATCI
"ASTC4x4", // ASTC4x4
"ASTC5x4", // ASTC5x4
"ASTC5x4", // ASTC5x4
"ASTC5x5", // ASTC5x5
"ASTC6x5", // ASTC6x5
"ASTC6x6", // ASTC6x6
@@ -3851,7 +3851,7 @@ namespace bimg
#define KTX_ATC_RGB_AMD 0x8C92
#define KTX_ATC_RGBA_EXPLICIT_ALPHA_AMD 0x8C93
#define KTX_ATC_RGBA_INTERPOLATED_ALPHA_AMD 0x87EE
#define KTX_COMPRESSED_RGBA_ASTC_4x4_KHR 0x93B0
#define KTX_COMPRESSED_RGBA_ASTC_4x4_KHR 0x93B0
#define KTX_COMPRESSED_RGBA_ASTC_5x4_KHR 0x93B1
#define KTX_COMPRESSED_RGBA_ASTC_5x5_KHR 0x93B2
#define KTX_COMPRESSED_RGBA_ASTC_6x5_KHR 0x93B3
@@ -4918,25 +4918,32 @@ namespace bimg
case TextureFormat::ASTC12x12:
if (BX_ENABLED(BIMG_DECODE_ASTC) )
{
const unsigned int thread_count = 1;
const bimg::ImageBlockInfo& astcBlockInfo = bimg::getBlockInfo(_srcFormat);
const float quality = ASTCENC_PRE_MEDIUM;
const astcenc_profile profile = ASTCENC_PRF_LDR; //Linear LDR color profile
astcenc_error status;
//Create and init config and context
astcenc_config config{};
const unsigned int astcFlags = ASTCENC_FLG_DECOMPRESS_ONLY;
status = astcenc_config_init(profile, astcBlockInfo.blockWidth, astcBlockInfo.blockHeight, 1, quality, astcFlags, &config);
if (status != ASTCENC_SUCCESS) {
astcenc_error status = astcenc_config_init(
ASTCENC_PRF_LDR
, astcBlockInfo.blockWidth
, astcBlockInfo.blockHeight
, 1
, ASTCENC_PRE_MEDIUM
, ASTCENC_FLG_DECOMPRESS_ONLY
, &config
);
if (status != ASTCENC_SUCCESS)
{
BX_TRACE("astc error in config init %s", astcenc_get_error_string(status));
imageCheckerboard(_dst, _width, _height, 16, UINT32_C(0xff000000), UINT32_C(0xffffff00) );
break;
}
astcenc_context* context;
status = astcenc_context_alloc(&config, thread_count, &context);
if (status != ASTCENC_SUCCESS) {
status = astcenc_context_alloc(&config, 1, &context);
if (status != ASTCENC_SUCCESS)
{
BX_TRACE("astc error in context alloc %s", astcenc_get_error_string(status));
imageCheckerboard(_dst, _width, _height, 16, UINT32_C(0xff000000), UINT32_C(0xffffff00) );
break;
@@ -4944,21 +4951,36 @@ namespace bimg
//Put image data into an astcenc_image
astcenc_image image{};
image.dim_x = _width;
image.dim_y = _height;
image.dim_z = 1;
image.dim_x = _width;
image.dim_y = _height;
image.dim_z = 1;
image.data_type = ASTCENC_TYPE_U8;
image.data = &_dst;
image.data = &_dst;
const uint32_t size = imageGetSize(NULL, uint16_t(_width), uint16_t(_height), 0, false, false, 1, _srcFormat);
static const astcenc_swizzle swizzle { //0123/rgba swizzle corresponds to ASTC_RGBA
ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A
static const astcenc_swizzle swizzle
{ //0123/rgba swizzle corresponds to ASTC_RGBA
ASTCENC_SWZ_R,
ASTCENC_SWZ_G,
ASTCENC_SWZ_B,
ASTCENC_SWZ_A,
};
status = astcenc_decompress_image(context, static_cast<const uint8_t*>(_src), size, &image, &swizzle, 0);
if (status != ASTCENC_SUCCESS) {
status = astcenc_decompress_image(
context
, (const uint8_t*)_src
, size
, &image
, &swizzle
, 0
);
if (status != ASTCENC_SUCCESS)
{
BX_TRACE("astc error in compress image %s", astcenc_get_error_string(status));
imageCheckerboard(_dst, _width, _height, 16, UINT32_C(0xff000000), UINT32_C(0xffffff00) );
astcenc_context_free(context);
break;
}

View File

@@ -52,7 +52,7 @@ namespace bimg
ASTCENC_PRE_THOROUGH, // Highest
ASTCENC_PRE_FAST, // Fastest
};
BX_STATIC_ASSERT(Quality::Count == BX_COUNTOF(s_astcQuality));
BX_STATIC_ASSERT(Quality::Count == BX_COUNTOF(s_astcQuality) );
void imageEncodeFromRgba8(bx::AllocatorI* _allocator, void* _dst, const void* _src, uint32_t _width, uint32_t _height, uint32_t _depth, TextureFormat::Enum _format, Quality::Enum _quality, bx::Error* _err)
{
@@ -156,61 +156,83 @@ namespace bimg
case TextureFormat::ASTC12x10:
case TextureFormat::ASTC12x12:
{
const unsigned int thread_count = 1;
const bimg::ImageBlockInfo& astcBlockInfo = bimg::getBlockInfo(_format);
const float quality = s_astcQuality[_quality];
const astcenc_profile profile = ASTCENC_PRF_LDR; //Linear LDR color profile
astcenc_error status;
//Create and init config and context
astcenc_config config{};
unsigned int astcFlags = ASTCENC_FLG_SELF_DECOMPRESS_ONLY;
if (Quality::NormalMapDefault <= _quality) {
uint32_t astcFlags = ASTCENC_FLG_SELF_DECOMPRESS_ONLY;
if (Quality::NormalMapDefault <= _quality)
{
astcFlags |= ASTCENC_FLG_MAP_NORMAL;
}
status = astcenc_config_init(profile, astcBlockInfo.blockWidth, astcBlockInfo.blockHeight, 1, quality, astcFlags, &config);
if (status != ASTCENC_SUCCESS) {
BX_TRACE("astc error in config init %s", astcenc_get_error_string(status));
astcenc_error status = astcenc_config_init(
ASTCENC_PRF_LDR
, astcBlockInfo.blockWidth
, astcBlockInfo.blockHeight
, 1
, s_astcQuality[_quality]
, astcFlags
, &config
);
if (status != ASTCENC_SUCCESS)
{
BX_TRACE("astc error in config init %s", astcenc_get_error_string(status) );
BX_ERROR_SET(_err, BIMG_ERROR, "Unable to initialize astc config!");
break;
}
astcenc_context* context;
status = astcenc_context_alloc(&config, thread_count, &context);
if (status != ASTCENC_SUCCESS) {
BX_TRACE("astc error in context alloc %s", astcenc_get_error_string(status));
status = astcenc_context_alloc(&config, 1, &context);
if (status != ASTCENC_SUCCESS)
{
BX_TRACE("astc error in context alloc %s", astcenc_get_error_string(status) );
BX_ERROR_SET(_err, BIMG_ERROR, "Unable to alloc astc context!");
break;
}
//Put image data into an astcenc_image
astcenc_image image{};
image.dim_x = _width;
image.dim_y = _height;
image.dim_z = 1;
image.dim_x = _width;
image.dim_y = _height;
image.dim_z = 1;
image.data_type = ASTCENC_TYPE_U8;
image.data = reinterpret_cast<void**>(const_cast<uint8_t**>(&src));
image.data = (void**)&src;
const size_t block_count_x = (_width + astcBlockInfo.blockWidth - 1) / astcBlockInfo.blockWidth;
const size_t block_count_y = (_height + astcBlockInfo.blockHeight - 1) / astcBlockInfo.blockHeight;
const size_t comp_len = block_count_x * block_count_y * 16;
const size_t blockCountX = (_width + astcBlockInfo.blockWidth - 1) / astcBlockInfo.blockWidth;
const size_t blockCountY = (_height + astcBlockInfo.blockHeight - 1) / astcBlockInfo.blockHeight;
const size_t compLen = blockCountX * blockCountY * 16;
if (Quality::NormalMapDefault <= _quality)
{
static const astcenc_swizzle swizzle { //0001/rrrg swizzle corresponds to ASTC_ENC_NORMAL_RA
ASTCENC_SWZ_R, ASTCENC_SWZ_R, ASTCENC_SWZ_R, ASTCENC_SWZ_G
static const astcenc_swizzle swizzle
{ //0001/rrrg swizzle corresponds to ASTC_ENC_NORMAL_RA
ASTCENC_SWZ_R,
ASTCENC_SWZ_R,
ASTCENC_SWZ_R,
ASTCENC_SWZ_G,
};
status = astcenc_compress_image(context, &image, &swizzle, dst, comp_len, 0);
status = astcenc_compress_image(context, &image, &swizzle, dst, compLen, 0);
}
else
{
static const astcenc_swizzle swizzle { //0123/rgba swizzle corresponds to ASTC_RGBA
ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A
static const astcenc_swizzle swizzle
{ //0123/rgba swizzle corresponds to ASTC_RGBA
ASTCENC_SWZ_R,
ASTCENC_SWZ_G,
ASTCENC_SWZ_B,
ASTCENC_SWZ_A,
};
status = astcenc_compress_image(context, &image, &swizzle, dst, comp_len, 0);
status = astcenc_compress_image(context, &image, &swizzle, dst, compLen, 0);
}
if (status != ASTCENC_SUCCESS) {
BX_TRACE("astc error in compress image %s", astcenc_get_error_string(status));
if (status != ASTCENC_SUCCESS)
{
BX_TRACE("astc error in compress image %s", astcenc_get_error_string(status) );
BX_ERROR_SET(_err, BIMG_ERROR, "Unable to compress astc image!");
astcenc_context_free(context);
break;