Updated astc-encoder.

This commit is contained in:
Бранимир Караџић
2024-05-24 16:41:07 -07:00
parent e9fa0ceff2
commit 98a40e8533
19 changed files with 373 additions and 160 deletions

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2020-2023 Arm Limited // Copyright 2020-2024 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@@ -215,6 +215,8 @@ enum astcenc_error {
ASTCENC_ERR_BAD_CONTEXT, ASTCENC_ERR_BAD_CONTEXT,
/** @brief The call failed due to unimplemented functionality. */ /** @brief The call failed due to unimplemented functionality. */
ASTCENC_ERR_NOT_IMPLEMENTED, ASTCENC_ERR_NOT_IMPLEMENTED,
/** @brief The call failed due to an out-of-spec decode mode flag set. */
ASTCENC_ERR_BAD_DECODE_MODE,
#if defined(ASTCENC_DIAGNOSTICS) #if defined(ASTCENC_DIAGNOSTICS)
/** @brief The call failed due to an issue with diagnostic tracing. */ /** @brief The call failed due to an issue with diagnostic tracing. */
ASTCENC_ERR_DTRACE_FAILURE, ASTCENC_ERR_DTRACE_FAILURE,
@@ -302,6 +304,11 @@ enum astcenc_type
ASTCENC_TYPE_F32 = 2 ASTCENC_TYPE_F32 = 2
}; };
/**
* @brief Function pointer type for compression progress reporting callback.
*/
extern "C" typedef void (*astcenc_progress_callback)(float);
/** /**
* @brief Enable normal map compression. * @brief Enable normal map compression.
* *
@@ -312,6 +319,19 @@ enum astcenc_type
*/ */
static const unsigned int ASTCENC_FLG_MAP_NORMAL = 1 << 0; static const unsigned int ASTCENC_FLG_MAP_NORMAL = 1 << 0;
/**
* @brief Enable compression heuristics that assume use of decode_unorm8 decode mode.
*
* The decode_unorm8 decode mode rounds differently to the decode_fp16 decode mode, so enabling this
* flag during compression will allow the compressor to use the correct rounding when selecting
* encodings. This will improve the compressed image quality if your application is using the
* decode_unorm8 decode mode, but will reduce image quality if using decode_fp16.
*
* Note that LDR_SRGB images will always use decode_unorm8 for the RGB channels, irrespective of
* this setting.
*/
static const unsigned int ASTCENC_FLG_USE_DECODE_UNORM8 = 1 << 1;
/** /**
* @brief Enable alpha weighting. * @brief Enable alpha weighting.
* *
@@ -378,6 +398,7 @@ static const unsigned int ASTCENC_ALL_FLAGS =
ASTCENC_FLG_MAP_RGBM | ASTCENC_FLG_MAP_RGBM |
ASTCENC_FLG_USE_ALPHA_WEIGHT | ASTCENC_FLG_USE_ALPHA_WEIGHT |
ASTCENC_FLG_USE_PERCEPTUAL | ASTCENC_FLG_USE_PERCEPTUAL |
ASTCENC_FLG_USE_DECODE_UNORM8 |
ASTCENC_FLG_DECOMPRESS_ONLY | ASTCENC_FLG_DECOMPRESS_ONLY |
ASTCENC_FLG_SELF_DECOMPRESS_ONLY; ASTCENC_FLG_SELF_DECOMPRESS_ONLY;
@@ -550,6 +571,16 @@ struct astcenc_config
*/ */
float tune_search_mode0_enable; float tune_search_mode0_enable;
/**
* @brief The progress callback, can be @c nullptr.
*
* If this is specified the codec will periodically report progress for
* compression as a percentage between 0 and 100. The callback is called from one
* of the compressor threads, so doing significant work in the callback will
* reduce compression performance.
*/
astcenc_progress_callback progress_callback;
#if defined(ASTCENC_DIAGNOSTICS) #if defined(ASTCENC_DIAGNOSTICS)
/** /**
* @brief The path to save the diagnostic trace data to. * @brief The path to save the diagnostic trace data to.

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited // Copyright 2011-2023 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@@ -894,32 +894,55 @@ void unpack_color_endpoints(
} }
} }
vint4 ldr_scale(257); // Handle endpoint errors and expansion
vint4 hdr_scale(1);
vint4 output_scale = ldr_scale;
// An LDR profile image // Linear LDR 8-bit endpoints are expanded to 16-bit by replication
if ((decode_mode == ASTCENC_PRF_LDR) || if (decode_mode == ASTCENC_PRF_LDR)
(decode_mode == ASTCENC_PRF_LDR_SRGB))
{ {
// Also matches HDR alpha, as cannot have HDR alpha without HDR RGB // Error color - HDR endpoint in an LDR encoding
if (rgb_hdr == true) if (rgb_hdr || alpha_hdr)
{ {
output0 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00); output0 = vint4(0xFF, 0x00, 0xFF, 0xFF);
output1 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00); output1 = vint4(0xFF, 0x00, 0xFF, 0xFF);
output_scale = hdr_scale;
rgb_hdr = false; rgb_hdr = false;
alpha_hdr = false; alpha_hdr = false;
} }
output0 = output0 * 257;
output1 = output1 * 257;
} }
// An HDR profile image // sRGB LDR 8-bit endpoints are expanded to 16 bit by:
// - RGB = shift left by 8 bits and OR with 0x80
// - A = replication
else if (decode_mode == ASTCENC_PRF_LDR_SRGB)
{
// Error color - HDR endpoint in an LDR encoding
if (rgb_hdr || alpha_hdr)
{
output0 = vint4(0xFF, 0x00, 0xFF, 0xFF);
output1 = vint4(0xFF, 0x00, 0xFF, 0xFF);
rgb_hdr = false;
alpha_hdr = false;
}
vmask4 mask(true, true, true, false);
vint4 output0rgb = lsl<8>(output0) | vint4(0x80);
vint4 output0a = output0 * 257;
output0 = select(output0a, output0rgb, mask);
vint4 output1rgb = lsl<8>(output1) | vint4(0x80);
vint4 output1a = output1 * 257;
output1 = select(output1a, output1rgb, mask);
}
// An HDR profile decode, but may be using linear LDR endpoints
// Linear LDR 8-bit endpoints are expanded to 16-bit by replication
// HDR endpoints are already 16-bit
else else
{ {
vmask4 hdr_lanes(rgb_hdr, rgb_hdr, rgb_hdr, alpha_hdr); vmask4 hdr_lanes(rgb_hdr, rgb_hdr, rgb_hdr, alpha_hdr);
output_scale = select(ldr_scale, hdr_scale, hdr_lanes); vint4 output_scale = select(vint4(257), vint4(1), hdr_lanes);
output0 = output0 * output_scale;
output1 = output1 * output_scale;
} }
output0 = output0 * output_scale;
output1 = output1 * output_scale;
} }

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited // Copyright 2011-2024 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@@ -247,7 +247,7 @@ static bool realign_weights_decimated(
} }
// Create an unquantized weight grid for this decimation level // Create an unquantized weight grid for this decimation level
alignas(ASTCENC_VECALIGN) float uq_weightsf[BLOCK_MAX_WEIGHTS]; ASTCENC_ALIGNAS float uq_weightsf[BLOCK_MAX_WEIGHTS];
for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH) for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH)
{ {
vint unquant_value(dec_weights_uquant + we_idx); vint unquant_value(dec_weights_uquant + we_idx);
@@ -467,7 +467,7 @@ static float compress_symbolic_block_for_partition_1plane(
qwt_bitcounts[i] = static_cast<int8_t>(bitcount); qwt_bitcounts[i] = static_cast<int8_t>(bitcount);
alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS]; ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
// Generate the optimized set of weights for the weight mode // Generate the optimized set of weights for the weight mode
compute_quantized_weights_for_decimation( compute_quantized_weights_for_decimation(
@@ -830,7 +830,7 @@ static float compress_symbolic_block_for_partition_2planes(
unsigned int decimation_mode = bm.decimation_mode; unsigned int decimation_mode = bm.decimation_mode;
const auto& di = bsd.get_decimation_info(decimation_mode); const auto& di = bsd.get_decimation_info(decimation_mode);
alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS]; ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
// Generate the optimized set of weights for the mode // Generate the optimized set of weights for the mode
compute_quantized_weights_for_decimation( compute_quantized_weights_for_decimation(

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited // Copyright 2011-2024 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@@ -27,15 +27,15 @@
/** /**
* @brief Compute the integer linear interpolation of two color endpoints. * @brief Compute the integer linear interpolation of two color endpoints.
* *
* @param decode_mode The ASTC profile (linear or sRGB) * @param u8_mask The mask for lanes using decode_unorm8 rather than decode_f16.
* @param color0 The endpoint0 color. * @param color0 The endpoint0 color.
* @param color1 The endpoint1 color. * @param color1 The endpoint1 color.
* @param weights The interpolation weight (between 0 and 64). * @param weights The interpolation weight (between 0 and 64).
* *
* @return The interpolated color. * @return The interpolated color.
*/ */
static vint4 lerp_color_int( static vint4 lerp_color_int(
astcenc_profile decode_mode, vmask4 u8_mask,
vint4 color0, vint4 color0,
vint4 color1, vint4 color1,
vint4 weights vint4 weights
@@ -43,24 +43,18 @@ static vint4 lerp_color_int(
vint4 weight1 = weights; vint4 weight1 = weights;
vint4 weight0 = vint4(64) - weight1; vint4 weight0 = vint4(64) - weight1;
if (decode_mode == ASTCENC_PRF_LDR_SRGB)
{
color0 = asr<8>(color0);
color1 = asr<8>(color1);
}
vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32); vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32);
color = asr<6>(color); color = asr<6>(color);
if (decode_mode == ASTCENC_PRF_LDR_SRGB) // For decode_unorm8 values force the codec to bit replicate. This allows the
{ // rest of the codec to assume the full 0xFFFF range for everything and ignore
color = color * vint4(257); // the decode_mode setting
} vint4 color_u8 = asr<8>(color) * vint4(257);
color = select(color, color_u8, u8_mask);
return color; return color;
} }
/** /**
* @brief Convert integer color value into a float value for the decoder. * @brief Convert integer color value into a float value for the decoder.
* *
@@ -229,12 +223,13 @@ void decompress_symbolic_block(
{ {
vint4 colori(scb.constant_color); vint4 colori(scb.constant_color);
// For sRGB decoding a real decoder would just use the top 8 bits for color conversion. // Determine the UNORM8 rounding on the decode
// We don't color convert, so rescale the top 8 bits into the full 16 bit dynamic range. vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
if (decode_mode == ASTCENC_PRF_LDR_SRGB)
{ // The real decoder would just use the top 8 bits, but we rescale
colori = asr<8>(colori) * 257; // in to a 16-bit value that rounds correctly.
} vint4 colori_u8 = asr<8>(colori) * 257;
colori = select(colori, colori_u8, u8_mask);
vint4 colorf16 = unorm16_to_sf16(colori); vint4 colorf16 = unorm16_to_sf16(colori);
color = float16_to_float(colorf16); color = float16_to_float(colorf16);
@@ -289,6 +284,8 @@ void decompress_symbolic_block(
int plane2_component = scb.plane2_component; int plane2_component = scb.plane2_component;
vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component); vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component);
vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
for (int i = 0; i < partition_count; i++) for (int i = 0; i < partition_count; i++)
{ {
// Decode the color endpoints for this partition // Decode the color endpoints for this partition
@@ -310,7 +307,7 @@ void decompress_symbolic_block(
{ {
int tix = pi.texels_of_partition[i][j]; int tix = pi.texels_of_partition[i][j];
vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask); vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask);
vint4 color = lerp_color_int(decode_mode, ep0, ep1, weight); vint4 color = lerp_color_int(u8_mask, ep0, ep1, weight);
vfloat4 colorf = decode_texel(color, lns_mask); vfloat4 colorf = decode_texel(color, lns_mask);
blk.data_r[tix] = colorf.lane<0>(); blk.data_r[tix] = colorf.lane<0>();
@@ -365,12 +362,14 @@ float compute_symbolic_block_difference_2plane(
rgb_lns, a_lns, rgb_lns, a_lns,
ep0, ep1); ep0, ep1);
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
// Unpack and compute error for each texel in the partition // Unpack and compute error for each texel in the partition
unsigned int texel_count = bsd.texel_count; unsigned int texel_count = bsd.texel_count;
for (unsigned int i = 0; i < texel_count; i++) for (unsigned int i = 0; i < texel_count; i++)
{ {
vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask); vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask);
vint4 colori = lerp_color_int(config.profile, ep0, ep1, weight); vint4 colori = lerp_color_int(u8_mask, ep0, ep1, weight);
vfloat4 color = int_to_float(colori); vfloat4 color = int_to_float(colori);
vfloat4 oldColor = blk.texel(i); vfloat4 oldColor = blk.texel(i);
@@ -444,6 +443,8 @@ float compute_symbolic_block_difference_1plane(
int plane1_weights[BLOCK_MAX_TEXELS]; int plane1_weights[BLOCK_MAX_TEXELS];
unpack_weights(bsd, scb, di, false, plane1_weights, nullptr); unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
vfloat4 summa = vfloat4::zero(); vfloat4 summa = vfloat4::zero();
for (unsigned int i = 0; i < partition_count; i++) for (unsigned int i = 0; i < partition_count; i++)
{ {
@@ -464,7 +465,7 @@ float compute_symbolic_block_difference_1plane(
for (unsigned int j = 0; j < texel_count; j++) for (unsigned int j = 0; j < texel_count; j++)
{ {
unsigned int tix = pi.texels_of_partition[i][j]; unsigned int tix = pi.texels_of_partition[i][j];
vint4 colori = lerp_color_int(config.profile, ep0, ep1, vint4 colori = lerp_color_int(u8_mask, ep0, ep1,
vint4(plane1_weights[tix])); vint4(plane1_weights[tix]));
vfloat4 color = int_to_float(colori); vfloat4 color = int_to_float(colori);
@@ -532,7 +533,7 @@ float compute_symbolic_block_difference_1plane_1partition(
const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode); const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
// Unquantize and undecimate the weights // Unquantize and undecimate the weights
alignas(ASTCENC_VECALIGN) int plane1_weights[BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS int plane1_weights[BLOCK_MAX_TEXELS];
unpack_weights(bsd, scb, di, false, plane1_weights, nullptr); unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
// Decode the color endpoints for this partition // Decode the color endpoints for this partition
@@ -547,19 +548,12 @@ float compute_symbolic_block_difference_1plane_1partition(
rgb_lns, a_lns, rgb_lns, a_lns,
ep0, ep1); ep0, ep1);
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
// Pre-shift sRGB so things round correctly
if (config.profile == ASTCENC_PRF_LDR_SRGB)
{
ep0 = asr<8>(ep0);
ep1 = asr<8>(ep1);
}
// Unpack and compute error for each texel in the partition // Unpack and compute error for each texel in the partition
vfloatacc summav = vfloatacc::zero(); vfloatacc summav = vfloatacc::zero();
vint lane_id = vint::lane_id(); vint lane_id = vint::lane_id();
vint srgb_scale(config.profile == ASTCENC_PRF_LDR_SRGB ? 257 : 1);
unsigned int texel_count = bsd.texel_count; unsigned int texel_count = bsd.texel_count;
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
@@ -578,11 +572,25 @@ float compute_symbolic_block_difference_1plane_1partition(
vint ep0_b = vint(ep0.lane<2>()) * weight0; vint ep0_b = vint(ep0.lane<2>()) * weight0;
vint ep0_a = vint(ep0.lane<3>()) * weight0; vint ep0_a = vint(ep0.lane<3>()) * weight0;
// Shift so things round correctly // Combine contributions
vint colori_r = asr<6>(ep0_r + ep1_r + vint(32)) * srgb_scale; vint colori_r = asr<6>(ep0_r + ep1_r + vint(32));
vint colori_g = asr<6>(ep0_g + ep1_g + vint(32)) * srgb_scale; vint colori_g = asr<6>(ep0_g + ep1_g + vint(32));
vint colori_b = asr<6>(ep0_b + ep1_b + vint(32)) * srgb_scale; vint colori_b = asr<6>(ep0_b + ep1_b + vint(32));
vint colori_a = asr<6>(ep0_a + ep1_a + vint(32)) * srgb_scale; vint colori_a = asr<6>(ep0_a + ep1_a + vint(32));
// If using a U8 decode mode bit replicate top 8 bits
// so rest of codec can assume 0xFFFF max range everywhere
vint colori_r8 = asr<8>(colori_r) * vint(257);
colori_r = select(colori_r, colori_r8, vmask(u8_mask.lane<0>()));
vint colori_g8 = asr<8>(colori_g) * vint(257);
colori_g = select(colori_g, colori_g8, vmask(u8_mask.lane<1>()));
vint colori_b8 = asr<8>(colori_b) * vint(257);
colori_b = select(colori_b, colori_b8, vmask(u8_mask.lane<2>()));
vint colori_a8 = asr<8>(colori_a) * vint(257);
colori_a = select(colori_a, colori_a8, vmask(u8_mask.lane<3>()));
// Compute color diff // Compute color diff
vfloat color_r = int_to_float(colori_r); vfloat color_r = int_to_float(colori_r);

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited // Copyright 2011-2024 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@@ -217,11 +217,13 @@ static astcenc_error validate_block_size(
/** /**
* @brief Validate flags. * @brief Validate flags.
* *
* @param flags The flags to check. * @param profile The profile to check.
* @param flags The flags to check.
* *
* @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
*/ */
static astcenc_error validate_flags( static astcenc_error validate_flags(
astcenc_profile profile,
unsigned int flags unsigned int flags
) { ) {
// Flags field must not contain any unknown flag bits // Flags field must not contain any unknown flag bits
@@ -239,6 +241,14 @@ static astcenc_error validate_flags(
return ASTCENC_ERR_BAD_FLAGS; return ASTCENC_ERR_BAD_FLAGS;
} }
// Decode_unorm8 must only be used with an LDR profile
bool is_unorm8 = flags & ASTCENC_FLG_USE_DECODE_UNORM8;
bool is_hdr = (profile == ASTCENC_PRF_HDR) || (profile == ASTCENC_PRF_HDR_RGB_LDR_A);
if (is_unorm8 && is_hdr)
{
return ASTCENC_ERR_BAD_DECODE_MODE;
}
return ASTCENC_SUCCESS; return ASTCENC_SUCCESS;
} }
@@ -364,7 +374,7 @@ static astcenc_error validate_config(
return status; return status;
} }
status = validate_flags(config.flags); status = validate_flags(config.profile, config.flags);
if (status != ASTCENC_SUCCESS) if (status != ASTCENC_SUCCESS)
{ {
return status; return status;
@@ -591,7 +601,7 @@ astcenc_error astcenc_config_init(
} }
// Flags field must not contain any unknown flag bits // Flags field must not contain any unknown flag bits
status = validate_flags(flags); status = validate_flags(profile, flags);
if (status != ASTCENC_SUCCESS) if (status != ASTCENC_SUCCESS)
{ {
return status; return status;
@@ -689,6 +699,12 @@ astcenc_error astcenc_context_alloc(
} }
ctx->bsd = aligned_malloc<block_size_descriptor>(sizeof(block_size_descriptor), ASTCENC_VECALIGN); ctx->bsd = aligned_malloc<block_size_descriptor>(sizeof(block_size_descriptor), ASTCENC_VECALIGN);
if (!ctx->bsd)
{
delete ctxo;
return ASTCENC_ERR_OUT_OF_MEM;
}
bool can_omit_modes = static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY); bool can_omit_modes = static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY);
init_block_size_descriptor(config.block_x, config.block_y, config.block_z, init_block_size_descriptor(config.block_x, config.block_y, config.block_z,
can_omit_modes, can_omit_modes,
@@ -698,7 +714,7 @@ astcenc_error astcenc_context_alloc(
#if !defined(ASTCENC_DECOMPRESS_ONLY) #if !defined(ASTCENC_DECOMPRESS_ONLY)
// Do setup only needed by compression // Do setup only needed by compression
if (!(status & ASTCENC_FLG_DECOMPRESS_ONLY)) if (!(ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY))
{ {
// Turn a dB limit into a per-texel error for faster use later // Turn a dB limit into a per-texel error for faster use later
if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB)) if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB))
@@ -712,7 +728,7 @@ astcenc_error astcenc_context_alloc(
size_t worksize = sizeof(compression_working_buffers) * thread_count; size_t worksize = sizeof(compression_working_buffers) * thread_count;
ctx->working_buffers = aligned_malloc<compression_working_buffers>(worksize, ASTCENC_VECALIGN); ctx->working_buffers = aligned_malloc<compression_working_buffers>(worksize, ASTCENC_VECALIGN);
static_assert((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0, static_assert((ASTCENC_VECALIGN == 0) || ((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0),
"compression_working_buffers size must be multiple of vector alignment"); "compression_working_buffers size must be multiple of vector alignment");
if (!ctx->working_buffers) if (!ctx->working_buffers)
{ {
@@ -802,6 +818,8 @@ static void compress_image(
int row_blocks = xblocks; int row_blocks = xblocks;
int plane_blocks = xblocks * yblocks; int plane_blocks = xblocks * yblocks;
blk.decode_unorm8 = ctxo.context.config.flags & ASTCENC_FLG_USE_DECODE_UNORM8;
// Populate the block channel weights // Populate the block channel weights
blk.channel_weight = vfloat4(ctx.config.cw_r_weight, blk.channel_weight = vfloat4(ctx.config.cw_r_weight,
ctx.config.cw_g_weight, ctx.config.cw_g_weight,
@@ -812,7 +830,7 @@ static void compress_image(
auto& temp_buffers = ctx.working_buffers[thread_index]; auto& temp_buffers = ctx.working_buffers[thread_index];
// Only the first thread actually runs the initializer // Only the first thread actually runs the initializer
ctxo.manage_compress.init(block_count); ctxo.manage_compress.init(block_count, ctx.config.progress_callback);
// Determine if we can use an optimized load function // Determine if we can use an optimized load function
bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) || bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) ||
@@ -1137,6 +1155,7 @@ astcenc_error astcenc_decompress_image(
unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x; unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x;
unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y; unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y;
unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z; unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z;
unsigned int block_count = zblocks * yblocks * xblocks;
int row_blocks = xblocks; int row_blocks = xblocks;
int plane_blocks = xblocks * yblocks; int plane_blocks = xblocks * yblocks;
@@ -1148,9 +1167,12 @@ astcenc_error astcenc_decompress_image(
return ASTCENC_ERR_OUT_OF_MEM; return ASTCENC_ERR_OUT_OF_MEM;
} }
image_block blk; image_block blk {};
blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z); blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
// Decode mode inferred from the output data type
blk.decode_unorm8 = image_out.data_type == ASTCENC_TYPE_U8;
// If context thread count is one then implicitly reset // If context thread count is one then implicitly reset
if (ctx->thread_count == 1) if (ctx->thread_count == 1)
{ {
@@ -1158,7 +1180,7 @@ astcenc_error astcenc_decompress_image(
} }
// Only the first thread actually runs the initializer // Only the first thread actually runs the initializer
ctxo->manage_decompress.init(zblocks * yblocks * xblocks); ctxo->manage_decompress.init(block_count, nullptr);
// All threads run this processing loop until there is no work remaining // All threads run this processing loop until there is no work remaining
while (true) while (true)
@@ -1356,6 +1378,8 @@ const char* astcenc_get_error_string(
return "ASTCENC_ERR_BAD_CONTEXT"; return "ASTCENC_ERR_BAD_CONTEXT";
case ASTCENC_ERR_NOT_IMPLEMENTED: case ASTCENC_ERR_NOT_IMPLEMENTED:
return "ASTCENC_ERR_NOT_IMPLEMENTED"; return "ASTCENC_ERR_NOT_IMPLEMENTED";
case ASTCENC_ERR_BAD_DECODE_MODE:
return "ASTCENC_ERR_BAD_DECODE_MODE";
#if defined(ASTCENC_DIAGNOSTICS) #if defined(ASTCENC_DIAGNOSTICS)
case ASTCENC_ERR_DTRACE_FAILURE: case ASTCENC_ERR_DTRACE_FAILURE:
return "ASTCENC_ERR_DTRACE_FAILURE"; return "ASTCENC_ERR_DTRACE_FAILURE";

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited // Copyright 2011-2024 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@@ -873,7 +873,7 @@ void compute_ideal_weights_for_decimation(
} }
// Otherwise compute an estimate and perform single refinement iteration // Otherwise compute an estimate and perform single refinement iteration
alignas(ASTCENC_VECALIGN) float infilled_weights[BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS float infilled_weights[BLOCK_MAX_TEXELS];
// Compute an initial average for each decimated weight // Compute an initial average for each decimated weight
bool constant_wes = ei.is_constant_weight_error_scale; bool constant_wes = ei.is_constant_weight_error_scale;
@@ -1171,7 +1171,7 @@ void recompute_ideal_colors_1plane(
promise(total_texel_count > 0); promise(total_texel_count > 0);
promise(partition_count > 0); promise(partition_count > 0);
alignas(ASTCENC_VECALIGN) float dec_weight[BLOCK_MAX_WEIGHTS]; ASTCENC_ALIGNAS float dec_weight[BLOCK_MAX_WEIGHTS];
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
{ {
vint unquant_value(dec_weights_uquant + i); vint unquant_value(dec_weights_uquant + i);
@@ -1179,7 +1179,7 @@ void recompute_ideal_colors_1plane(
storea(unquant_valuef, dec_weight + i); storea(unquant_valuef, dec_weight + i);
} }
alignas(ASTCENC_VECALIGN) float undec_weight[BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS float undec_weight[BLOCK_MAX_TEXELS];
float* undec_weight_ref; float* undec_weight_ref;
if (di.max_texel_weight_count == 1) if (di.max_texel_weight_count == 1)
{ {
@@ -1394,8 +1394,8 @@ void recompute_ideal_colors_2planes(
promise(total_texel_count > 0); promise(total_texel_count > 0);
promise(weight_count > 0); promise(weight_count > 0);
alignas(ASTCENC_VECALIGN) float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE]; ASTCENC_ALIGNAS float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE];
alignas(ASTCENC_VECALIGN) float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE]; ASTCENC_ALIGNAS float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE];
assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE); assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE);
@@ -1410,8 +1410,8 @@ void recompute_ideal_colors_2planes(
storea(unquant_value2f, dec_weight_plane2 + i); storea(unquant_value2f, dec_weight_plane2 + i);
} }
alignas(ASTCENC_VECALIGN) float undec_weight_plane1[BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS float undec_weight_plane1[BLOCK_MAX_TEXELS];
alignas(ASTCENC_VECALIGN) float undec_weight_plane2[BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS float undec_weight_plane2[BLOCK_MAX_TEXELS];
float* undec_weight_plane1_ref; float* undec_weight_plane1_ref;
float* undec_weight_plane2_ref; float* undec_weight_plane2_ref;

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited // Copyright 2011-2024 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@@ -109,7 +109,7 @@ static vfloat4 swz_texel(
vfloat4 data, vfloat4 data,
const astcenc_swizzle& swz const astcenc_swizzle& swz
) { ) {
alignas(16) float datas[6]; ASTCENC_ALIGNAS float datas[6];
storea(data, datas); storea(data, datas);
datas[ASTCENC_SWZ_0] = 0.0f; datas[ASTCENC_SWZ_0] = 0.0f;

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited // Copyright 2011-2024 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@@ -464,10 +464,10 @@ static inline void write_bits(
} }
/** /**
* @brief Read up to 8 bits at an arbitrary bit offset. * @brief Read up to 16 bits from two bytes.
* *
* The stored value is at most 8 bits, but can be stored at an offset of between 0 and 7 bits so may * This function reads a packed N-bit field from two bytes in memory. The stored value must exist
* span two separate bytes in memory. * within the two bytes, but can start at an arbitrary bit offset and span the two bytes in memory.
* *
* @param bitcount The number of bits to read. * @param bitcount The number of bits to read.
* @param bitoffset The bit offset to read from, between 0 and 7. * @param bitoffset The bit offset to read from, between 0 and 7.

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited // Copyright 2011-2024 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@@ -29,6 +29,7 @@
#include <cstdio> #include <cstdio>
#endif #endif
#include <cstdlib> #include <cstdlib>
#include <limits>
#include "astcenc.h" #include "astcenc.h"
#include "astcenc_mathlib.h" #include "astcenc_mathlib.h"
@@ -325,10 +326,10 @@ struct partition_info
uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS]; uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS];
/** @brief The partition of each texel in the block. */ /** @brief The partition of each texel in the block. */
uint8_t partition_of_texel[BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS uint8_t partition_of_texel[BLOCK_MAX_TEXELS];
/** @brief The list of texels in each partition. */ /** @brief The list of texels in each partition. */
uint8_t texels_of_partition[BLOCK_MAX_PARTITIONS][BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS uint8_t texels_of_partition[BLOCK_MAX_PARTITIONS][BLOCK_MAX_TEXELS];
}; };
/** /**
@@ -366,40 +367,40 @@ struct decimation_info
* @brief The number of weights that contribute to each texel. * @brief The number of weights that contribute to each texel.
* Value is between 1 and 4. * Value is between 1 and 4.
*/ */
uint8_t texel_weight_count[BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS uint8_t texel_weight_count[BLOCK_MAX_TEXELS];
/** /**
* @brief The weight index of the N weights that are interpolated for each texel. * @brief The weight index of the N weights that are interpolated for each texel.
* Stored transposed to improve vectorization. * Stored transposed to improve vectorization.
*/ */
uint8_t texel_weights_tr[4][BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS uint8_t texel_weights_tr[4][BLOCK_MAX_TEXELS];
/** /**
* @brief The bilinear contribution of the N weights that are interpolated for each texel. * @brief The bilinear contribution of the N weights that are interpolated for each texel.
* Value is between 0 and 16, stored transposed to improve vectorization. * Value is between 0 and 16, stored transposed to improve vectorization.
*/ */
uint8_t texel_weight_contribs_int_tr[4][BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS uint8_t texel_weight_contribs_int_tr[4][BLOCK_MAX_TEXELS];
/** /**
* @brief The bilinear contribution of the N weights that are interpolated for each texel. * @brief The bilinear contribution of the N weights that are interpolated for each texel.
* Value is between 0 and 1, stored transposed to improve vectorization. * Value is between 0 and 1, stored transposed to improve vectorization.
*/ */
alignas(ASTCENC_VECALIGN) float texel_weight_contribs_float_tr[4][BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS float texel_weight_contribs_float_tr[4][BLOCK_MAX_TEXELS];
/** @brief The number of texels that each stored weight contributes to. */ /** @brief The number of texels that each stored weight contributes to. */
uint8_t weight_texel_count[BLOCK_MAX_WEIGHTS]; ASTCENC_ALIGNAS uint8_t weight_texel_count[BLOCK_MAX_WEIGHTS];
/** /**
* @brief The list of texels that use a specific weight index. * @brief The list of texels that use a specific weight index.
* Stored transposed to improve vectorization. * Stored transposed to improve vectorization.
*/ */
uint8_t weight_texels_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS]; ASTCENC_ALIGNAS uint8_t weight_texels_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
/** /**
* @brief The bilinear contribution to the N texels that use each weight. * @brief The bilinear contribution to the N texels that use each weight.
* Value is between 0 and 1, stored transposed to improve vectorization. * Value is between 0 and 1, stored transposed to improve vectorization.
*/ */
alignas(ASTCENC_VECALIGN) float weights_texel_contribs_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS]; ASTCENC_ALIGNAS float weights_texel_contribs_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
/** /**
* @brief The bilinear contribution to the Nth texel that uses each weight. * @brief The bilinear contribution to the Nth texel that uses each weight.
@@ -579,7 +580,7 @@ struct block_size_descriptor
decimation_mode decimation_modes[WEIGHTS_MAX_DECIMATION_MODES]; decimation_mode decimation_modes[WEIGHTS_MAX_DECIMATION_MODES];
/** @brief The active decimation tables, stored in low indices. */ /** @brief The active decimation tables, stored in low indices. */
alignas(ASTCENC_VECALIGN) decimation_info decimation_tables[WEIGHTS_MAX_DECIMATION_MODES]; ASTCENC_ALIGNAS decimation_info decimation_tables[WEIGHTS_MAX_DECIMATION_MODES];
/** @brief The packed block mode array index, or @c BLOCK_BAD_BLOCK_MODE if not active. */ /** @brief The packed block mode array index, or @c BLOCK_BAD_BLOCK_MODE if not active. */
uint16_t block_mode_packed_index[WEIGHTS_MAX_BLOCK_MODES]; uint16_t block_mode_packed_index[WEIGHTS_MAX_BLOCK_MODES];
@@ -731,7 +732,11 @@ struct block_size_descriptor
* *
* The @c data_[rgba] fields store the image data in an encoded SoA float form designed for easy * The @c data_[rgba] fields store the image data in an encoded SoA float form designed for easy
* vectorization. Input data is converted to float and stored as values between 0 and 65535. LDR * vectorization. Input data is converted to float and stored as values between 0 and 65535. LDR
* data is stored as direct UNORM data, HDR data is stored as LNS data. * data is stored as direct UNORM data, HDR data is stored as LNS data. They are allocated SIMD
* elements over-size to allow vectorized stores of unaligned and partial SIMD lanes (e.g. in a
* 6x6x6 block the final row write will read elements 210-217 (vec8) or 214-217 (vec4), which is
* two elements above the last real data element). The overspill values are never written to memory,
* and would be benign, but the padding avoids hitting undefined behavior.
* *
* The @c rgb_lns and @c alpha_lns fields that assigned a per-texel use of HDR are only used during * The @c rgb_lns and @c alpha_lns fields that assigned a per-texel use of HDR are only used during
* decompression. The current compressor will always use HDR endpoint formats when in HDR mode. * decompression. The current compressor will always use HDR endpoint formats when in HDR mode.
@@ -739,16 +744,16 @@ struct block_size_descriptor
struct image_block struct image_block
{ {
/** @brief The input (compress) or output (decompress) data for the red color component. */ /** @brief The input (compress) or output (decompress) data for the red color component. */
alignas(ASTCENC_VECALIGN) float data_r[BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS float data_r[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];
/** @brief The input (compress) or output (decompress) data for the green color component. */ /** @brief The input (compress) or output (decompress) data for the green color component. */
alignas(ASTCENC_VECALIGN) float data_g[BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS float data_g[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];
/** @brief The input (compress) or output (decompress) data for the blue color component. */ /** @brief The input (compress) or output (decompress) data for the blue color component. */
alignas(ASTCENC_VECALIGN) float data_b[BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS float data_b[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];
/** @brief The input (compress) or output (decompress) data for the alpha color component. */ /** @brief The input (compress) or output (decompress) data for the alpha color component. */
alignas(ASTCENC_VECALIGN) float data_a[BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS float data_a[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];
/** @brief The number of texels in the block. */ /** @brief The number of texels in the block. */
uint8_t texel_count; uint8_t texel_count;
@@ -771,6 +776,9 @@ struct image_block
/** @brief Is this grayscale block where R == G == B for all texels? */ /** @brief Is this grayscale block where R == G == B for all texels? */
bool grayscale; bool grayscale;
/** @brief Is the eventual decode using decode_unorm8 rounding? */
bool decode_unorm8;
/** @brief Set to 1 if a texel is using HDR RGB endpoints (decompression only). */ /** @brief Set to 1 if a texel is using HDR RGB endpoints (decompression only). */
uint8_t rgb_lns[BLOCK_MAX_TEXELS]; uint8_t rgb_lns[BLOCK_MAX_TEXELS];
@@ -897,10 +905,10 @@ struct endpoints_and_weights
endpoints ep; endpoints ep;
/** @brief The ideal weight for each texel; may be undecimated or decimated. */ /** @brief The ideal weight for each texel; may be undecimated or decimated. */
alignas(ASTCENC_VECALIGN) float weights[BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS float weights[BLOCK_MAX_TEXELS];
/** @brief The ideal weight error scaling for each texel; may be undecimated or decimated. */ /** @brief The ideal weight error scaling for each texel; may be undecimated or decimated. */
alignas(ASTCENC_VECALIGN) float weight_error_scale[BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS float weight_error_scale[BLOCK_MAX_TEXELS];
}; };
/** /**
@@ -930,7 +938,7 @@ struct encoding_choice_errors
/** /**
* @brief Preallocated working buffers, allocated per thread during context creation. * @brief Preallocated working buffers, allocated per thread during context creation.
*/ */
struct alignas(ASTCENC_VECALIGN) compression_working_buffers struct ASTCENC_ALIGNAS compression_working_buffers
{ {
/** @brief Ideal endpoints and weights for plane 1. */ /** @brief Ideal endpoints and weights for plane 1. */
endpoints_and_weights ei1; endpoints_and_weights ei1;
@@ -946,17 +954,17 @@ struct alignas(ASTCENC_VECALIGN) compression_working_buffers
* *
* For two planes, second plane starts at @c WEIGHTS_PLANE2_OFFSET offsets. * For two planes, second plane starts at @c WEIGHTS_PLANE2_OFFSET offsets.
*/ */
alignas(ASTCENC_VECALIGN) float dec_weights_ideal[WEIGHTS_MAX_DECIMATION_MODES * BLOCK_MAX_WEIGHTS]; ASTCENC_ALIGNAS float dec_weights_ideal[WEIGHTS_MAX_DECIMATION_MODES * BLOCK_MAX_WEIGHTS];
/** /**
* @brief Decimated quantized weight values in the unquantized 0-64 range. * @brief Decimated quantized weight values in the unquantized 0-64 range.
* *
* For two planes, second plane starts at @c WEIGHTS_PLANE2_OFFSET offsets. * For two planes, second plane starts at @c WEIGHTS_PLANE2_OFFSET offsets.
*/ */
uint8_t dec_weights_uquant[WEIGHTS_MAX_BLOCK_MODES * BLOCK_MAX_WEIGHTS]; ASTCENC_ALIGNAS uint8_t dec_weights_uquant[WEIGHTS_MAX_BLOCK_MODES * BLOCK_MAX_WEIGHTS];
/** @brief Error of the best encoding combination for each block mode. */ /** @brief Error of the best encoding combination for each block mode. */
alignas(ASTCENC_VECALIGN) float errors_of_best_combination[WEIGHTS_MAX_BLOCK_MODES]; ASTCENC_ALIGNAS float errors_of_best_combination[WEIGHTS_MAX_BLOCK_MODES];
/** @brief The best color quant for each block mode. */ /** @brief The best color quant for each block mode. */
uint8_t best_quant_levels[WEIGHTS_MAX_BLOCK_MODES]; uint8_t best_quant_levels[WEIGHTS_MAX_BLOCK_MODES];
@@ -1107,7 +1115,7 @@ struct symbolic_compressed_block
* *
* If dual plane, the second plane starts at @c weights[WEIGHTS_PLANE2_OFFSET]. * If dual plane, the second plane starts at @c weights[WEIGHTS_PLANE2_OFFSET].
*/ */
uint8_t weights[BLOCK_MAX_WEIGHTS]; ASTCENC_ALIGNAS uint8_t weights[BLOCK_MAX_WEIGHTS];
/** /**
* @brief Get the weight quantization used by this block mode. * @brief Get the weight quantization used by this block mode.
@@ -1563,6 +1571,33 @@ unsigned int find_best_partition_candidates(
Functionality for managing images and image related data. Functionality for managing images and image related data.
============================================================================ */ ============================================================================ */
/**
 * @brief Build the per-lane mask of components that decompress into a UNORM8 value.
 *
 * The block's @c decode_unorm8 setting takes priority over the color profile:
 * when it is set every lane is selected regardless of @c decode_mode.
 *
 * @param decode_mode   The color profile; only @c ASTCENC_PRF_LDR_SRGB is tested for.
 * @param blk           The image block providing the @c decode_unorm8 setting.
 *
 * @return The component mask vector.
 */
static inline vmask4 get_u8_component_mask(
	astcenc_profile decode_mode,
	const image_block& blk
) {
	// Decode mode writing to a unorm8 output value: select all four lanes
	if (blk.decode_unorm8)
	{
		return vmask4(true);
	}

	// SRGB writing to a unorm8 RGB value: first three lanes set, fourth clear
	if (decode_mode == ASTCENC_PRF_LDR_SRGB)
	{
		return vmask4(true, true, true, false);
	}

	// Neither condition holds: no lanes selected
	return vmask4(false);
}
/** /**
* @brief Setup computation of regional averages in an image. * @brief Setup computation of regional averages in an image.
* *
@@ -1816,7 +1851,7 @@ uint8_t pack_color_endpoints(
* *
* Endpoints must be unscrambled and converted into the 0-255 range before calling this function. * Endpoints must be unscrambled and converted into the 0-255 range before calling this function.
* *
* @param decode_mode The decode mode (LDR, HDR). * @param decode_mode The decode mode (LDR, HDR, etc).
* @param format The color endpoint mode used. * @param format The color endpoint mode used.
* @param input The raw array of encoded input integers. The length of this array * @param input The raw array of encoded input integers. The length of this array
* depends on @c format; it can be safely assumed to be large enough. * depends on @c format; it can be safely assumed to be large enough.
@@ -2142,10 +2177,11 @@ Platform-specific functions.
/** /**
* @brief Allocate an aligned memory buffer. * @brief Allocate an aligned memory buffer.
* *
* Allocated memory must be freed by aligned_free; * Allocated memory must be freed by aligned_free.
* *
* @param size The desired buffer size. * @param size The desired buffer size.
* @param align The desired buffer alignment; must be 2^N. * @param align The desired buffer alignment; must be 2^N, may be increased
* by the implementation to a minimum allowable alignment.
* *
* @return The memory buffer pointer or nullptr on allocation failure. * @return The memory buffer pointer or nullptr on allocation failure.
*/ */
@@ -2155,10 +2191,14 @@ T* aligned_malloc(size_t size, size_t align)
void* ptr; void* ptr;
int error = 0; int error = 0;
// Don't allow this to under-align a type
size_t min_align = astc::max(alignof(T), sizeof(void*));
size_t real_align = astc::max(min_align, align);
#if defined(_WIN32) #if defined(_WIN32)
ptr = _aligned_malloc(size, align); ptr = _aligned_malloc(size, real_align);
#else #else
error = posix_memalign(&ptr, align, size); error = posix_memalign(&ptr, real_align, size);
#endif #endif
if (error || (!ptr)) if (error || (!ptr))

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited // Copyright 2011-2024 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@@ -118,6 +118,18 @@ private:
/** @brief Number of tasks that need to be processed. */ /** @brief Number of tasks that need to be processed. */
unsigned int m_task_count; unsigned int m_task_count;
/** @brief Progress callback (optional). */
astcenc_progress_callback m_callback;
/** @brief Lock used for callback synchronization. */
std::mutex m_callback_lock;
/** @brief Minimum progress before making a callback. */
float m_callback_min_diff;
/** @brief Last progress callback value. */
float m_callback_last_value;
public: public:
/** @brief Create a new ParallelManager. */ /** @brief Create a new ParallelManager. */
ParallelManager() ParallelManager()
@@ -138,6 +150,9 @@ public:
m_start_count = 0; m_start_count = 0;
m_done_count = 0; m_done_count = 0;
m_task_count = 0; m_task_count = 0;
m_callback = nullptr;
m_callback_last_value = 0.0f;
m_callback_min_diff = 1.0f;
} }
/** /**
@@ -166,14 +181,20 @@ public:
* initialization. Other threads will block and wait for it to complete. * initialization. Other threads will block and wait for it to complete.
* *
* @param task_count Total number of tasks needing processing. * @param task_count Total number of tasks needing processing.
* @param callback Function pointer for progress status callbacks.
*/ */
void init(unsigned int task_count) void init(unsigned int task_count, astcenc_progress_callback callback)
{ {
std::lock_guard<std::mutex> lck(m_lock); std::lock_guard<std::mutex> lck(m_lock);
if (!m_init_done) if (!m_init_done)
{ {
m_callback = callback;
m_task_count = task_count; m_task_count = task_count;
m_init_done = true; m_init_done = true;
// Report every 1% or 4096 blocks, whichever is larger, to avoid callback overhead
float min_diff = (4096.0f / static_cast<float>(task_count)) * 100.0f;
m_callback_min_diff = astc::max(min_diff, 1.0f);
} }
} }
@@ -212,12 +233,49 @@ public:
{ {
// Note: m_done_count cannot use an atomic without the mutex; this has a race between the // Note: m_done_count cannot use an atomic without the mutex; this has a race between the
// update here and the wait() for other threads // update here and the wait() for other threads
std::unique_lock<std::mutex> lck(m_lock); unsigned int local_count;
this->m_done_count += count; float local_last_value;
if (m_done_count == m_task_count)
{ {
lck.unlock(); std::unique_lock<std::mutex> lck(m_lock);
m_complete.notify_all(); m_done_count += count;
local_count = m_done_count;
local_last_value = m_callback_last_value;
if (m_done_count == m_task_count)
{
// Ensure the progress bar hits 100%
if (m_callback)
{
std::unique_lock<std::mutex> cblck(m_callback_lock);
m_callback(100.0f);
m_callback_last_value = 100.0f;
}
lck.unlock();
m_complete.notify_all();
}
}
// Process progress callback if we have one
if (m_callback)
{
// Initial lockless test - have we progressed enough to emit?
float num = static_cast<float>(local_count);
float den = static_cast<float>(m_task_count);
float this_value = (num / den) * 100.0f;
bool report_test = (this_value - local_last_value) > m_callback_min_diff;
// Recheck under lock, because another thread might report first
if (report_test)
{
std::unique_lock<std::mutex> cblck(m_callback_lock);
bool report_retest = (this_value - m_callback_last_value) > m_callback_min_diff;
if (report_retest)
{
m_callback(this_value);
m_callback_last_value = this_value;
}
}
} }
} }

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited // Copyright 2011-2024 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@@ -73,10 +73,22 @@
#endif #endif
#endif #endif
// Force vector-sized SIMD alignment
#if ASTCENC_AVX #if ASTCENC_AVX
#define ASTCENC_VECALIGN 32 #define ASTCENC_VECALIGN 32
#else #elif ASTCENC_SSE || ASTCENC_NEON
#define ASTCENC_VECALIGN 16 #define ASTCENC_VECALIGN 16
// Use default alignment for non-SIMD builds
#else
#define ASTCENC_VECALIGN 0
#endif
// C++11 states that alignas(0) should be ignored but GCC doesn't do
// this on some versions, so workaround and avoid emitting alignas(0)
#if ASTCENC_VECALIGN > 0
#define ASTCENC_ALIGNAS alignas(ASTCENC_VECALIGN)
#else
#define ASTCENC_ALIGNAS
#endif #endif
#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 || ASTCENC_POPCNT != 0 #if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 || ASTCENC_POPCNT != 0

View File

@@ -15,13 +15,13 @@
// under the License. // under the License.
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
#include "astcenc_mathlib.h"
/** /**
* @brief Soft-float library for IEEE-754. * @brief Soft-float library for IEEE-754.
*/ */
#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0) #if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)
#include "astcenc_mathlib.h"
/* sized soft-float types. These are mapped to the sized integer /* sized soft-float types. These are mapped to the sized integer
types of C99, instead of C's floating-point types; this is because types of C99, instead of C's floating-point types; this is because
the library needs to maintain exact, bit-level control on all the library needs to maintain exact, bit-level control on all

View File

@@ -330,12 +330,14 @@ void physical_to_symbolic(
return; return;
} }
// Low values span 3 bytes so need two read_bits calls
int vx_low_s = read_bits(8, 12, pcb) | (read_bits(5, 12 + 8, pcb) << 8); int vx_low_s = read_bits(8, 12, pcb) | (read_bits(5, 12 + 8, pcb) << 8);
int vx_high_s = read_bits(8, 25, pcb) | (read_bits(5, 25 + 8, pcb) << 8); int vx_high_s = read_bits(13, 25, pcb);
int vx_low_t = read_bits(8, 38, pcb) | (read_bits(5, 38 + 8, pcb) << 8); int vx_low_t = read_bits(8, 38, pcb) | (read_bits(5, 38 + 8, pcb) << 8);
int vx_high_t = read_bits(8, 51, pcb) | (read_bits(5, 51 + 8, pcb) << 8); int vx_high_t = read_bits(13, 51, pcb);
int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF && vx_low_t == 0x1FFF && vx_high_t == 0x1FFF; int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF &&
vx_low_t == 0x1FFF && vx_high_t == 0x1FFF;
if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t) && !all_ones) if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t) && !all_ones)
{ {
@@ -350,12 +352,14 @@ void physical_to_symbolic(
int vx_high_s = read_bits(9, 19, pcb); int vx_high_s = read_bits(9, 19, pcb);
int vx_low_t = read_bits(9, 28, pcb); int vx_low_t = read_bits(9, 28, pcb);
int vx_high_t = read_bits(9, 37, pcb); int vx_high_t = read_bits(9, 37, pcb);
int vx_low_p = read_bits(9, 46, pcb); int vx_low_r = read_bits(9, 46, pcb);
int vx_high_p = read_bits(9, 55, pcb); int vx_high_r = read_bits(9, 55, pcb);
int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF && vx_low_t == 0x1FF && vx_high_t == 0x1FF && vx_low_p == 0x1FF && vx_high_p == 0x1FF; int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF &&
vx_low_t == 0x1FF && vx_high_t == 0x1FF &&
vx_low_r == 0x1FF && vx_high_r == 0x1FF;
if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t || vx_low_p >= vx_high_p) && !all_ones) if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t || vx_low_r >= vx_high_r) && !all_ones)
{ {
scb.block_type = SYM_BTYPE_ERROR; scb.block_type = SYM_BTYPE_ERROR;
return; return;
@@ -470,8 +474,7 @@ void physical_to_symbolic(
bitpos += 2; bitpos += 2;
} }
} }
scb.partition_index = static_cast<uint16_t>(read_bits(6, 13, pcb) | scb.partition_index = static_cast<uint16_t>(read_bits(10, 13, pcb));
(read_bits(PARTITION_INDEX_BITS - 6, 19, pcb) << 6));
} }
for (int i = 0; i < partition_count; i++) for (int i = 0; i < partition_count; i++)

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2019-2022 Arm Limited // Copyright 2019-2024 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@@ -1170,7 +1170,7 @@ ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint8 data, vmask8 ma
*/ */
ASTCENC_SIMD_INLINE void print(vint8 a) ASTCENC_SIMD_INLINE void print(vint8 a)
{ {
alignas(ASTCENC_VECALIGN) int v[8]; alignas(32) int v[8];
storea(a, v); storea(a, v);
printf("v8_i32:\n %8d %8d %8d %8d %8d %8d %8d %8d\n", printf("v8_i32:\n %8d %8d %8d %8d %8d %8d %8d %8d\n",
v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]); v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
@@ -1181,7 +1181,7 @@ ASTCENC_SIMD_INLINE void print(vint8 a)
*/ */
ASTCENC_SIMD_INLINE void printx(vint8 a) ASTCENC_SIMD_INLINE void printx(vint8 a)
{ {
alignas(ASTCENC_VECALIGN) int v[8]; alignas(32) int v[8];
storea(a, v); storea(a, v);
printf("v8_i32:\n %08x %08x %08x %08x %08x %08x %08x %08x\n", printf("v8_i32:\n %08x %08x %08x %08x %08x %08x %08x %08x\n",
v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]); v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
@@ -1192,7 +1192,7 @@ ASTCENC_SIMD_INLINE void printx(vint8 a)
*/ */
ASTCENC_SIMD_INLINE void print(vfloat8 a) ASTCENC_SIMD_INLINE void print(vfloat8 a)
{ {
alignas(ASTCENC_VECALIGN) float v[8]; alignas(32) float v[8];
storea(a, v); storea(a, v);
printf("v8_f32:\n %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n", printf("v8_f32:\n %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n",
static_cast<double>(v[0]), static_cast<double>(v[1]), static_cast<double>(v[0]), static_cast<double>(v[1]),

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2020-2021 Arm Limited // Copyright 2020-2024 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@@ -383,7 +383,7 @@ static ASTCENC_SIMD_INLINE void bit_transfer_signed(
*/ */
ASTCENC_SIMD_INLINE void print(vint4 a) ASTCENC_SIMD_INLINE void print(vint4 a)
{ {
alignas(16) int v[4]; ASTCENC_ALIGNAS int v[4];
storea(a, v); storea(a, v);
printf("v4_i32:\n %8d %8d %8d %8d\n", printf("v4_i32:\n %8d %8d %8d %8d\n",
v[0], v[1], v[2], v[3]); v[0], v[1], v[2], v[3]);
@@ -394,7 +394,7 @@ ASTCENC_SIMD_INLINE void print(vint4 a)
*/ */
ASTCENC_SIMD_INLINE void printx(vint4 a) ASTCENC_SIMD_INLINE void printx(vint4 a)
{ {
alignas(16) int v[4]; ASTCENC_ALIGNAS int v[4];
storea(a, v); storea(a, v);
printf("v4_i32:\n %08x %08x %08x %08x\n", printf("v4_i32:\n %08x %08x %08x %08x\n",
v[0], v[1], v[2], v[3]); v[0], v[1], v[2], v[3]);
@@ -405,7 +405,7 @@ ASTCENC_SIMD_INLINE void printx(vint4 a)
*/ */
ASTCENC_SIMD_INLINE void print(vfloat4 a) ASTCENC_SIMD_INLINE void print(vfloat4 a)
{ {
alignas(16) float v[4]; ASTCENC_ALIGNAS float v[4];
storea(a, v); storea(a, v);
printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n", printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n",
static_cast<double>(v[0]), static_cast<double>(v[1]), static_cast<double>(v[0]), static_cast<double>(v[1]),

View File

@@ -359,9 +359,9 @@ struct vmask4
/** /**
* @brief Get the scalar from a single lane. * @brief Get the scalar from a single lane.
*/ */
template <int32_t l> ASTCENC_SIMD_INLINE uint32_t lane() const template <int32_t l> ASTCENC_SIMD_INLINE bool lane() const
{ {
return vgetq_lane_u32(m, l); return vgetq_lane_u32(m, l) != 0;
} }
/** /**

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2019-2022 Arm Limited // Copyright 2019-2024 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@@ -351,6 +351,13 @@ struct vmask4
m[3] = d == false ? 0 : -1; m[3] = d == false ? 0 : -1;
} }
/**
 * @brief Get the scalar value of a single lane.
 *
 * @tparam l   The index of the lane to read; used to index @c m directly.
 *
 * @return @c true if the stored value for lane @c l is non-zero, @c false otherwise.
 */
template <int l> ASTCENC_SIMD_INLINE bool lane() const
{
	// The result is a comparison, so expose it as bool rather than float
	return m[l] != 0;
}
/** /**
* @brief The vector ... * @brief The vector ...
@@ -549,10 +556,16 @@ ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
*/ */
template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a) template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
{ {
return vint4(a.m[0] << s, // Cast to unsigned to avoid shift in/out of sign bit undefined behavior
a.m[1] << s, unsigned int as0 = static_cast<unsigned int>(a.m[0]) << s;
a.m[2] << s, unsigned int as1 = static_cast<unsigned int>(a.m[1]) << s;
a.m[3] << s); unsigned int as2 = static_cast<unsigned int>(a.m[2]) << s;
unsigned int as3 = static_cast<unsigned int>(a.m[3]) << s;
return vint4(static_cast<int>(as0),
static_cast<int>(as1),
static_cast<int>(as2),
static_cast<int>(as3));
} }
/** /**
@@ -560,6 +573,7 @@ template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
*/ */
template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a) template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
{ {
// Cast to unsigned to avoid shift in/out of sign bit undefined behavior
unsigned int as0 = static_cast<unsigned int>(a.m[0]) >> s; unsigned int as0 = static_cast<unsigned int>(a.m[0]) >> s;
unsigned int as1 = static_cast<unsigned int>(a.m[1]) >> s; unsigned int as1 = static_cast<unsigned int>(a.m[1]) >> s;
unsigned int as2 = static_cast<unsigned int>(a.m[2]) >> s; unsigned int as2 = static_cast<unsigned int>(a.m[2]) >> s;

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2019-2022 Arm Limited // Copyright 2019-2023 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@@ -379,9 +379,9 @@ struct vmask4
/** /**
* @brief Get the scalar value of a single lane. * @brief Get the scalar value of a single lane.
*/ */
template <int l> ASTCENC_SIMD_INLINE float lane() const template <int l> ASTCENC_SIMD_INLINE bool lane() const
{ {
return _mm_cvtss_f32(_mm_shuffle_ps(m, m, l)); return _mm_cvtss_f32(_mm_shuffle_ps(m, m, l)) != 0.0f;
} }
/** /**

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited // Copyright 2011-2024 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@@ -60,8 +60,8 @@ static const uint8_t steps_for_quant_level[12] {
2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32
}; };
alignas(ASTCENC_VECALIGN) static float sin_table[SINCOS_STEPS][ANGULAR_STEPS]; ASTCENC_ALIGNAS static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) static float cos_table[SINCOS_STEPS][ANGULAR_STEPS]; ASTCENC_ALIGNAS static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
#if defined(ASTCENC_DIAGNOSTICS) #if defined(ASTCENC_DIAGNOSTICS)
static bool print_once { true }; static bool print_once { true };
@@ -99,7 +99,7 @@ static void compute_angular_offsets(
promise(weight_count > 0); promise(weight_count > 0);
promise(max_angular_steps > 0); promise(max_angular_steps > 0);
alignas(ASTCENC_VECALIGN) int isamplev[BLOCK_MAX_WEIGHTS]; ASTCENC_ALIGNAS int isamplev[BLOCK_MAX_WEIGHTS];
// Precompute isample; arrays are always allocated 64 elements long // Precompute isample; arrays are always allocated 64 elements long
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
@@ -242,16 +242,16 @@ static void compute_angular_endpoints_for_quant_levels(
unsigned int max_quant_steps = steps_for_quant_level[max_quant_level]; unsigned int max_quant_steps = steps_for_quant_level[max_quant_level];
unsigned int max_angular_steps = steps_for_quant_level[max_quant_level]; unsigned int max_angular_steps = steps_for_quant_level[max_quant_level];
alignas(ASTCENC_VECALIGN) float angular_offsets[ANGULAR_STEPS]; ASTCENC_ALIGNAS float angular_offsets[ANGULAR_STEPS];
compute_angular_offsets(weight_count, dec_weight_ideal_value, compute_angular_offsets(weight_count, dec_weight_ideal_value,
max_angular_steps, angular_offsets); max_angular_steps, angular_offsets);
alignas(ASTCENC_VECALIGN) float lowest_weight[ANGULAR_STEPS]; ASTCENC_ALIGNAS float lowest_weight[ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) int32_t weight_span[ANGULAR_STEPS]; ASTCENC_ALIGNAS int32_t weight_span[ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) float error[ANGULAR_STEPS]; ASTCENC_ALIGNAS float error[ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) float cut_low_weight_error[ANGULAR_STEPS]; ASTCENC_ALIGNAS float cut_low_weight_error[ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) float cut_high_weight_error[ANGULAR_STEPS]; ASTCENC_ALIGNAS float cut_high_weight_error[ANGULAR_STEPS];
compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value, compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value,
max_angular_steps, max_quant_steps, max_angular_steps, max_quant_steps,