mirror of
https://github.com/bkaradzic/bimg.git
synced 2026-02-17 20:52:38 +01:00
Updated astc-encoder.
This commit is contained in:
33
3rdparty/astc-encoder/include/astcenc.h
vendored
33
3rdparty/astc-encoder/include/astcenc.h
vendored
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2020-2023 Arm Limited
|
||||
// Copyright 2020-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -215,6 +215,8 @@ enum astcenc_error {
|
||||
ASTCENC_ERR_BAD_CONTEXT,
|
||||
/** @brief The call failed due to unimplemented functionality. */
|
||||
ASTCENC_ERR_NOT_IMPLEMENTED,
|
||||
/** @brief The call failed due to an out-of-spec decode mode flag set. */
|
||||
ASTCENC_ERR_BAD_DECODE_MODE,
|
||||
#if defined(ASTCENC_DIAGNOSTICS)
|
||||
/** @brief The call failed due to an issue with diagnostic tracing. */
|
||||
ASTCENC_ERR_DTRACE_FAILURE,
|
||||
@@ -302,6 +304,11 @@ enum astcenc_type
|
||||
ASTCENC_TYPE_F32 = 2
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Function pointer type for compression progress reporting callback.
|
||||
*/
|
||||
extern "C" typedef void (*astcenc_progress_callback)(float);
|
||||
|
||||
/**
|
||||
* @brief Enable normal map compression.
|
||||
*
|
||||
@@ -312,6 +319,19 @@ enum astcenc_type
|
||||
*/
|
||||
static const unsigned int ASTCENC_FLG_MAP_NORMAL = 1 << 0;
|
||||
|
||||
/**
|
||||
* @brief Enable compression heuristics that assume use of decode_unorm8 decode mode.
|
||||
*
|
||||
* The decode_unorm8 decode mode rounds differently to the decode_fp16 decode mode, so enabling this
|
||||
* flag during compression will allow the compressor to use the correct rounding when selecting
|
||||
* encodings. This will improve the compressed image quality if your application is using the
|
||||
* decode_unorm8 decode mode, but will reduce image quality if using decode_fp16.
|
||||
*
|
||||
* Note that LDR_SRGB images will always use decode_unorm8 for the RGB channels, irrespective of
|
||||
* this setting.
|
||||
*/
|
||||
static const unsigned int ASTCENC_FLG_USE_DECODE_UNORM8 = 1 << 1;
|
||||
|
||||
/**
|
||||
* @brief Enable alpha weighting.
|
||||
*
|
||||
@@ -378,6 +398,7 @@ static const unsigned int ASTCENC_ALL_FLAGS =
|
||||
ASTCENC_FLG_MAP_RGBM |
|
||||
ASTCENC_FLG_USE_ALPHA_WEIGHT |
|
||||
ASTCENC_FLG_USE_PERCEPTUAL |
|
||||
ASTCENC_FLG_USE_DECODE_UNORM8 |
|
||||
ASTCENC_FLG_DECOMPRESS_ONLY |
|
||||
ASTCENC_FLG_SELF_DECOMPRESS_ONLY;
|
||||
|
||||
@@ -550,6 +571,16 @@ struct astcenc_config
|
||||
*/
|
||||
float tune_search_mode0_enable;
|
||||
|
||||
/**
|
||||
* @brief The progress callback, can be @c nullptr.
|
||||
*
|
||||
* If this is specified the codec will periodically report progress for
|
||||
* compression as a percentage between 0 and 100. The callback is called from one
|
||||
* of the compressor threads, so doing significant work in the callback will
|
||||
* reduce compression performance.
|
||||
*/
|
||||
astcenc_progress_callback progress_callback;
|
||||
|
||||
#if defined(ASTCENC_DIAGNOSTICS)
|
||||
/**
|
||||
* @brief The path to save the diagnostic trace data to.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2021 Arm Limited
|
||||
// Copyright 2011-2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -894,32 +894,55 @@ void unpack_color_endpoints(
|
||||
}
|
||||
}
|
||||
|
||||
vint4 ldr_scale(257);
|
||||
vint4 hdr_scale(1);
|
||||
vint4 output_scale = ldr_scale;
|
||||
// Handle endpoint errors and expansion
|
||||
|
||||
// An LDR profile image
|
||||
if ((decode_mode == ASTCENC_PRF_LDR) ||
|
||||
(decode_mode == ASTCENC_PRF_LDR_SRGB))
|
||||
// Linear LDR 8-bit endpoints are expanded to 16-bit by replication
|
||||
if (decode_mode == ASTCENC_PRF_LDR)
|
||||
{
|
||||
// Also matches HDR alpha, as cannot have HDR alpha without HDR RGB
|
||||
if (rgb_hdr == true)
|
||||
// Error color - HDR endpoint in an LDR encoding
|
||||
if (rgb_hdr || alpha_hdr)
|
||||
{
|
||||
output0 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00);
|
||||
output1 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00);
|
||||
output_scale = hdr_scale;
|
||||
|
||||
output0 = vint4(0xFF, 0x00, 0xFF, 0xFF);
|
||||
output1 = vint4(0xFF, 0x00, 0xFF, 0xFF);
|
||||
rgb_hdr = false;
|
||||
alpha_hdr = false;
|
||||
}
|
||||
|
||||
output0 = output0 * 257;
|
||||
output1 = output1 * 257;
|
||||
}
|
||||
// An HDR profile image
|
||||
// sRGB LDR 8-bit endpoints are expanded to 16 bit by:
|
||||
// - RGB = shift left by 8 bits and OR with 0x80
|
||||
// - A = replication
|
||||
else if (decode_mode == ASTCENC_PRF_LDR_SRGB)
|
||||
{
|
||||
// Error color - HDR endpoint in an LDR encoding
|
||||
if (rgb_hdr || alpha_hdr)
|
||||
{
|
||||
output0 = vint4(0xFF, 0x00, 0xFF, 0xFF);
|
||||
output1 = vint4(0xFF, 0x00, 0xFF, 0xFF);
|
||||
rgb_hdr = false;
|
||||
alpha_hdr = false;
|
||||
}
|
||||
|
||||
vmask4 mask(true, true, true, false);
|
||||
|
||||
vint4 output0rgb = lsl<8>(output0) | vint4(0x80);
|
||||
vint4 output0a = output0 * 257;
|
||||
output0 = select(output0a, output0rgb, mask);
|
||||
|
||||
vint4 output1rgb = lsl<8>(output1) | vint4(0x80);
|
||||
vint4 output1a = output1 * 257;
|
||||
output1 = select(output1a, output1rgb, mask);
|
||||
}
|
||||
// An HDR profile decode, but may be using linear LDR endpoints
|
||||
// Linear LDR 8-bit endpoints are expanded to 16-bit by replication
|
||||
// HDR endpoints are already 16-bit
|
||||
else
|
||||
{
|
||||
vmask4 hdr_lanes(rgb_hdr, rgb_hdr, rgb_hdr, alpha_hdr);
|
||||
output_scale = select(ldr_scale, hdr_scale, hdr_lanes);
|
||||
vint4 output_scale = select(vint4(257), vint4(1), hdr_lanes);
|
||||
output0 = output0 * output_scale;
|
||||
output1 = output1 * output_scale;
|
||||
}
|
||||
|
||||
output0 = output0 * output_scale;
|
||||
output1 = output1 * output_scale;
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2023 Arm Limited
|
||||
// Copyright 2011-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -247,7 +247,7 @@ static bool realign_weights_decimated(
|
||||
}
|
||||
|
||||
// Create an unquantized weight grid for this decimation level
|
||||
alignas(ASTCENC_VECALIGN) float uq_weightsf[BLOCK_MAX_WEIGHTS];
|
||||
ASTCENC_ALIGNAS float uq_weightsf[BLOCK_MAX_WEIGHTS];
|
||||
for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
vint unquant_value(dec_weights_uquant + we_idx);
|
||||
@@ -467,7 +467,7 @@ static float compress_symbolic_block_for_partition_1plane(
|
||||
|
||||
qwt_bitcounts[i] = static_cast<int8_t>(bitcount);
|
||||
|
||||
alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
|
||||
ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
|
||||
|
||||
// Generate the optimized set of weights for the weight mode
|
||||
compute_quantized_weights_for_decimation(
|
||||
@@ -830,7 +830,7 @@ static float compress_symbolic_block_for_partition_2planes(
|
||||
unsigned int decimation_mode = bm.decimation_mode;
|
||||
const auto& di = bsd.get_decimation_info(decimation_mode);
|
||||
|
||||
alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
|
||||
ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
|
||||
|
||||
// Generate the optimized set of weights for the mode
|
||||
compute_quantized_weights_for_decimation(
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2023 Arm Limited
|
||||
// Copyright 2011-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -27,15 +27,15 @@
|
||||
/**
|
||||
* @brief Compute the integer linear interpolation of two color endpoints.
|
||||
*
|
||||
* @param decode_mode The ASTC profile (linear or sRGB)
|
||||
* @param u8_mask The mask for lanes using decode_unorm8 rather than decode_f16.
|
||||
* @param color0 The endpoint0 color.
|
||||
* @param color1 The endpoint1 color.
|
||||
* @param weights The interpolation weight (between 0 and 64).
|
||||
* @param weights The interpolation weight (between 0 and 64).
|
||||
*
|
||||
* @return The interpolated color.
|
||||
*/
|
||||
static vint4 lerp_color_int(
|
||||
astcenc_profile decode_mode,
|
||||
vmask4 u8_mask,
|
||||
vint4 color0,
|
||||
vint4 color1,
|
||||
vint4 weights
|
||||
@@ -43,24 +43,18 @@ static vint4 lerp_color_int(
|
||||
vint4 weight1 = weights;
|
||||
vint4 weight0 = vint4(64) - weight1;
|
||||
|
||||
if (decode_mode == ASTCENC_PRF_LDR_SRGB)
|
||||
{
|
||||
color0 = asr<8>(color0);
|
||||
color1 = asr<8>(color1);
|
||||
}
|
||||
|
||||
vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32);
|
||||
color = asr<6>(color);
|
||||
|
||||
if (decode_mode == ASTCENC_PRF_LDR_SRGB)
|
||||
{
|
||||
color = color * vint4(257);
|
||||
}
|
||||
// For decode_unorm8 values force the codec to bit replicate. This allows the
|
||||
// rest of the codec to assume the full 0xFFFF range for everything and ignore
|
||||
// the decode_mode setting
|
||||
vint4 color_u8 = asr<8>(color) * vint4(257);
|
||||
color = select(color, color_u8, u8_mask);
|
||||
|
||||
return color;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @brief Convert integer color value into a float value for the decoder.
|
||||
*
|
||||
@@ -229,12 +223,13 @@ void decompress_symbolic_block(
|
||||
{
|
||||
vint4 colori(scb.constant_color);
|
||||
|
||||
// For sRGB decoding a real decoder would just use the top 8 bits for color conversion.
|
||||
// We don't color convert, so rescale the top 8 bits into the full 16 bit dynamic range.
|
||||
if (decode_mode == ASTCENC_PRF_LDR_SRGB)
|
||||
{
|
||||
colori = asr<8>(colori) * 257;
|
||||
}
|
||||
// Determine the UNORM8 rounding on the decode
|
||||
vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
|
||||
|
||||
// The real decoder would just use the top 8 bits, but we rescale
|
||||
// in to a 16-bit value that rounds correctly.
|
||||
vint4 colori_u8 = asr<8>(colori) * 257;
|
||||
colori = select(colori, colori_u8, u8_mask);
|
||||
|
||||
vint4 colorf16 = unorm16_to_sf16(colori);
|
||||
color = float16_to_float(colorf16);
|
||||
@@ -289,6 +284,8 @@ void decompress_symbolic_block(
|
||||
int plane2_component = scb.plane2_component;
|
||||
vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component);
|
||||
|
||||
vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
|
||||
|
||||
for (int i = 0; i < partition_count; i++)
|
||||
{
|
||||
// Decode the color endpoints for this partition
|
||||
@@ -310,7 +307,7 @@ void decompress_symbolic_block(
|
||||
{
|
||||
int tix = pi.texels_of_partition[i][j];
|
||||
vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask);
|
||||
vint4 color = lerp_color_int(decode_mode, ep0, ep1, weight);
|
||||
vint4 color = lerp_color_int(u8_mask, ep0, ep1, weight);
|
||||
vfloat4 colorf = decode_texel(color, lns_mask);
|
||||
|
||||
blk.data_r[tix] = colorf.lane<0>();
|
||||
@@ -365,12 +362,14 @@ float compute_symbolic_block_difference_2plane(
|
||||
rgb_lns, a_lns,
|
||||
ep0, ep1);
|
||||
|
||||
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
|
||||
|
||||
// Unpack and compute error for each texel in the partition
|
||||
unsigned int texel_count = bsd.texel_count;
|
||||
for (unsigned int i = 0; i < texel_count; i++)
|
||||
{
|
||||
vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask);
|
||||
vint4 colori = lerp_color_int(config.profile, ep0, ep1, weight);
|
||||
vint4 colori = lerp_color_int(u8_mask, ep0, ep1, weight);
|
||||
|
||||
vfloat4 color = int_to_float(colori);
|
||||
vfloat4 oldColor = blk.texel(i);
|
||||
@@ -444,6 +443,8 @@ float compute_symbolic_block_difference_1plane(
|
||||
int plane1_weights[BLOCK_MAX_TEXELS];
|
||||
unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
|
||||
|
||||
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
|
||||
|
||||
vfloat4 summa = vfloat4::zero();
|
||||
for (unsigned int i = 0; i < partition_count; i++)
|
||||
{
|
||||
@@ -464,7 +465,7 @@ float compute_symbolic_block_difference_1plane(
|
||||
for (unsigned int j = 0; j < texel_count; j++)
|
||||
{
|
||||
unsigned int tix = pi.texels_of_partition[i][j];
|
||||
vint4 colori = lerp_color_int(config.profile, ep0, ep1,
|
||||
vint4 colori = lerp_color_int(u8_mask, ep0, ep1,
|
||||
vint4(plane1_weights[tix]));
|
||||
|
||||
vfloat4 color = int_to_float(colori);
|
||||
@@ -532,7 +533,7 @@ float compute_symbolic_block_difference_1plane_1partition(
|
||||
const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
|
||||
|
||||
// Unquantize and undecimate the weights
|
||||
alignas(ASTCENC_VECALIGN) int plane1_weights[BLOCK_MAX_TEXELS];
|
||||
ASTCENC_ALIGNAS int plane1_weights[BLOCK_MAX_TEXELS];
|
||||
unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
|
||||
|
||||
// Decode the color endpoints for this partition
|
||||
@@ -547,19 +548,12 @@ float compute_symbolic_block_difference_1plane_1partition(
|
||||
rgb_lns, a_lns,
|
||||
ep0, ep1);
|
||||
|
||||
|
||||
// Pre-shift sRGB so things round correctly
|
||||
if (config.profile == ASTCENC_PRF_LDR_SRGB)
|
||||
{
|
||||
ep0 = asr<8>(ep0);
|
||||
ep1 = asr<8>(ep1);
|
||||
}
|
||||
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
|
||||
|
||||
// Unpack and compute error for each texel in the partition
|
||||
vfloatacc summav = vfloatacc::zero();
|
||||
|
||||
vint lane_id = vint::lane_id();
|
||||
vint srgb_scale(config.profile == ASTCENC_PRF_LDR_SRGB ? 257 : 1);
|
||||
|
||||
unsigned int texel_count = bsd.texel_count;
|
||||
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
|
||||
@@ -578,11 +572,25 @@ float compute_symbolic_block_difference_1plane_1partition(
|
||||
vint ep0_b = vint(ep0.lane<2>()) * weight0;
|
||||
vint ep0_a = vint(ep0.lane<3>()) * weight0;
|
||||
|
||||
// Shift so things round correctly
|
||||
vint colori_r = asr<6>(ep0_r + ep1_r + vint(32)) * srgb_scale;
|
||||
vint colori_g = asr<6>(ep0_g + ep1_g + vint(32)) * srgb_scale;
|
||||
vint colori_b = asr<6>(ep0_b + ep1_b + vint(32)) * srgb_scale;
|
||||
vint colori_a = asr<6>(ep0_a + ep1_a + vint(32)) * srgb_scale;
|
||||
// Combine contributions
|
||||
vint colori_r = asr<6>(ep0_r + ep1_r + vint(32));
|
||||
vint colori_g = asr<6>(ep0_g + ep1_g + vint(32));
|
||||
vint colori_b = asr<6>(ep0_b + ep1_b + vint(32));
|
||||
vint colori_a = asr<6>(ep0_a + ep1_a + vint(32));
|
||||
|
||||
// If using a U8 decode mode bit replicate top 8 bits
|
||||
// so rest of codec can assume 0xFFFF max range everywhere
|
||||
vint colori_r8 = asr<8>(colori_r) * vint(257);
|
||||
colori_r = select(colori_r, colori_r8, vmask(u8_mask.lane<0>()));
|
||||
|
||||
vint colori_g8 = asr<8>(colori_g) * vint(257);
|
||||
colori_g = select(colori_g, colori_g8, vmask(u8_mask.lane<1>()));
|
||||
|
||||
vint colori_b8 = asr<8>(colori_b) * vint(257);
|
||||
colori_b = select(colori_b, colori_b8, vmask(u8_mask.lane<2>()));
|
||||
|
||||
vint colori_a8 = asr<8>(colori_a) * vint(257);
|
||||
colori_a = select(colori_a, colori_a8, vmask(u8_mask.lane<3>()));
|
||||
|
||||
// Compute color diff
|
||||
vfloat color_r = int_to_float(colori_r);
|
||||
|
||||
42
3rdparty/astc-encoder/source/astcenc_entry.cpp
vendored
42
3rdparty/astc-encoder/source/astcenc_entry.cpp
vendored
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2023 Arm Limited
|
||||
// Copyright 2011-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -217,11 +217,13 @@ static astcenc_error validate_block_size(
|
||||
/**
|
||||
* @brief Validate flags.
|
||||
*
|
||||
* @param flags The flags to check.
|
||||
* @param profile The profile to check.
|
||||
* @param flags The flags to check.
|
||||
*
|
||||
* @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
|
||||
*/
|
||||
static astcenc_error validate_flags(
|
||||
astcenc_profile profile,
|
||||
unsigned int flags
|
||||
) {
|
||||
// Flags field must not contain any unknown flag bits
|
||||
@@ -239,6 +241,14 @@ static astcenc_error validate_flags(
|
||||
return ASTCENC_ERR_BAD_FLAGS;
|
||||
}
|
||||
|
||||
// Decode_unorm8 must only be used with an LDR profile
|
||||
bool is_unorm8 = flags & ASTCENC_FLG_USE_DECODE_UNORM8;
|
||||
bool is_hdr = (profile == ASTCENC_PRF_HDR) || (profile == ASTCENC_PRF_HDR_RGB_LDR_A);
|
||||
if (is_unorm8 && is_hdr)
|
||||
{
|
||||
return ASTCENC_ERR_BAD_DECODE_MODE;
|
||||
}
|
||||
|
||||
return ASTCENC_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -364,7 +374,7 @@ static astcenc_error validate_config(
|
||||
return status;
|
||||
}
|
||||
|
||||
status = validate_flags(config.flags);
|
||||
status = validate_flags(config.profile, config.flags);
|
||||
if (status != ASTCENC_SUCCESS)
|
||||
{
|
||||
return status;
|
||||
@@ -591,7 +601,7 @@ astcenc_error astcenc_config_init(
|
||||
}
|
||||
|
||||
// Flags field must not contain any unknown flag bits
|
||||
status = validate_flags(flags);
|
||||
status = validate_flags(profile, flags);
|
||||
if (status != ASTCENC_SUCCESS)
|
||||
{
|
||||
return status;
|
||||
@@ -689,6 +699,12 @@ astcenc_error astcenc_context_alloc(
|
||||
}
|
||||
|
||||
ctx->bsd = aligned_malloc<block_size_descriptor>(sizeof(block_size_descriptor), ASTCENC_VECALIGN);
|
||||
if (!ctx->bsd)
|
||||
{
|
||||
delete ctxo;
|
||||
return ASTCENC_ERR_OUT_OF_MEM;
|
||||
}
|
||||
|
||||
bool can_omit_modes = static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY);
|
||||
init_block_size_descriptor(config.block_x, config.block_y, config.block_z,
|
||||
can_omit_modes,
|
||||
@@ -698,7 +714,7 @@ astcenc_error astcenc_context_alloc(
|
||||
|
||||
#if !defined(ASTCENC_DECOMPRESS_ONLY)
|
||||
// Do setup only needed by compression
|
||||
if (!(status & ASTCENC_FLG_DECOMPRESS_ONLY))
|
||||
if (!(ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY))
|
||||
{
|
||||
// Turn a dB limit into a per-texel error for faster use later
|
||||
if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB))
|
||||
@@ -712,7 +728,7 @@ astcenc_error astcenc_context_alloc(
|
||||
|
||||
size_t worksize = sizeof(compression_working_buffers) * thread_count;
|
||||
ctx->working_buffers = aligned_malloc<compression_working_buffers>(worksize, ASTCENC_VECALIGN);
|
||||
static_assert((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0,
|
||||
static_assert((ASTCENC_VECALIGN == 0) || ((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0),
|
||||
"compression_working_buffers size must be multiple of vector alignment");
|
||||
if (!ctx->working_buffers)
|
||||
{
|
||||
@@ -802,6 +818,8 @@ static void compress_image(
|
||||
int row_blocks = xblocks;
|
||||
int plane_blocks = xblocks * yblocks;
|
||||
|
||||
blk.decode_unorm8 = ctxo.context.config.flags & ASTCENC_FLG_USE_DECODE_UNORM8;
|
||||
|
||||
// Populate the block channel weights
|
||||
blk.channel_weight = vfloat4(ctx.config.cw_r_weight,
|
||||
ctx.config.cw_g_weight,
|
||||
@@ -812,7 +830,7 @@ static void compress_image(
|
||||
auto& temp_buffers = ctx.working_buffers[thread_index];
|
||||
|
||||
// Only the first thread actually runs the initializer
|
||||
ctxo.manage_compress.init(block_count);
|
||||
ctxo.manage_compress.init(block_count, ctx.config.progress_callback);
|
||||
|
||||
// Determine if we can use an optimized load function
|
||||
bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) ||
|
||||
@@ -1137,6 +1155,7 @@ astcenc_error astcenc_decompress_image(
|
||||
unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x;
|
||||
unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y;
|
||||
unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z;
|
||||
unsigned int block_count = zblocks * yblocks * xblocks;
|
||||
|
||||
int row_blocks = xblocks;
|
||||
int plane_blocks = xblocks * yblocks;
|
||||
@@ -1148,9 +1167,12 @@ astcenc_error astcenc_decompress_image(
|
||||
return ASTCENC_ERR_OUT_OF_MEM;
|
||||
}
|
||||
|
||||
image_block blk;
|
||||
image_block blk {};
|
||||
blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
|
||||
|
||||
// Decode mode inferred from the output data type
|
||||
blk.decode_unorm8 = image_out.data_type == ASTCENC_TYPE_U8;
|
||||
|
||||
// If context thread count is one then implicitly reset
|
||||
if (ctx->thread_count == 1)
|
||||
{
|
||||
@@ -1158,7 +1180,7 @@ astcenc_error astcenc_decompress_image(
|
||||
}
|
||||
|
||||
// Only the first thread actually runs the initializer
|
||||
ctxo->manage_decompress.init(zblocks * yblocks * xblocks);
|
||||
ctxo->manage_decompress.init(block_count, nullptr);
|
||||
|
||||
// All threads run this processing loop until there is no work remaining
|
||||
while (true)
|
||||
@@ -1356,6 +1378,8 @@ const char* astcenc_get_error_string(
|
||||
return "ASTCENC_ERR_BAD_CONTEXT";
|
||||
case ASTCENC_ERR_NOT_IMPLEMENTED:
|
||||
return "ASTCENC_ERR_NOT_IMPLEMENTED";
|
||||
case ASTCENC_ERR_BAD_DECODE_MODE:
|
||||
return "ASTCENC_ERR_BAD_DECODE_MODE";
|
||||
#if defined(ASTCENC_DIAGNOSTICS)
|
||||
case ASTCENC_ERR_DTRACE_FAILURE:
|
||||
return "ASTCENC_ERR_DTRACE_FAILURE";
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2023 Arm Limited
|
||||
// Copyright 2011-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -873,7 +873,7 @@ void compute_ideal_weights_for_decimation(
|
||||
}
|
||||
|
||||
// Otherwise compute an estimate and perform single refinement iteration
|
||||
alignas(ASTCENC_VECALIGN) float infilled_weights[BLOCK_MAX_TEXELS];
|
||||
ASTCENC_ALIGNAS float infilled_weights[BLOCK_MAX_TEXELS];
|
||||
|
||||
// Compute an initial average for each decimated weight
|
||||
bool constant_wes = ei.is_constant_weight_error_scale;
|
||||
@@ -1171,7 +1171,7 @@ void recompute_ideal_colors_1plane(
|
||||
promise(total_texel_count > 0);
|
||||
promise(partition_count > 0);
|
||||
|
||||
alignas(ASTCENC_VECALIGN) float dec_weight[BLOCK_MAX_WEIGHTS];
|
||||
ASTCENC_ALIGNAS float dec_weight[BLOCK_MAX_WEIGHTS];
|
||||
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
vint unquant_value(dec_weights_uquant + i);
|
||||
@@ -1179,7 +1179,7 @@ void recompute_ideal_colors_1plane(
|
||||
storea(unquant_valuef, dec_weight + i);
|
||||
}
|
||||
|
||||
alignas(ASTCENC_VECALIGN) float undec_weight[BLOCK_MAX_TEXELS];
|
||||
ASTCENC_ALIGNAS float undec_weight[BLOCK_MAX_TEXELS];
|
||||
float* undec_weight_ref;
|
||||
if (di.max_texel_weight_count == 1)
|
||||
{
|
||||
@@ -1394,8 +1394,8 @@ void recompute_ideal_colors_2planes(
|
||||
promise(total_texel_count > 0);
|
||||
promise(weight_count > 0);
|
||||
|
||||
alignas(ASTCENC_VECALIGN) float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE];
|
||||
alignas(ASTCENC_VECALIGN) float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE];
|
||||
ASTCENC_ALIGNAS float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE];
|
||||
ASTCENC_ALIGNAS float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE];
|
||||
|
||||
assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE);
|
||||
|
||||
@@ -1410,8 +1410,8 @@ void recompute_ideal_colors_2planes(
|
||||
storea(unquant_value2f, dec_weight_plane2 + i);
|
||||
}
|
||||
|
||||
alignas(ASTCENC_VECALIGN) float undec_weight_plane1[BLOCK_MAX_TEXELS];
|
||||
alignas(ASTCENC_VECALIGN) float undec_weight_plane2[BLOCK_MAX_TEXELS];
|
||||
ASTCENC_ALIGNAS float undec_weight_plane1[BLOCK_MAX_TEXELS];
|
||||
ASTCENC_ALIGNAS float undec_weight_plane2[BLOCK_MAX_TEXELS];
|
||||
|
||||
float* undec_weight_plane1_ref;
|
||||
float* undec_weight_plane2_ref;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2022 Arm Limited
|
||||
// Copyright 2011-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -109,7 +109,7 @@ static vfloat4 swz_texel(
|
||||
vfloat4 data,
|
||||
const astcenc_swizzle& swz
|
||||
) {
|
||||
alignas(16) float datas[6];
|
||||
ASTCENC_ALIGNAS float datas[6];
|
||||
|
||||
storea(data, datas);
|
||||
datas[ASTCENC_SWZ_0] = 0.0f;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2021 Arm Limited
|
||||
// Copyright 2011-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -464,10 +464,10 @@ static inline void write_bits(
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Read up to 8 bits at an arbitrary bit offset.
|
||||
* @brief Read up to 16 bits from two bytes.
|
||||
*
|
||||
* The stored value is at most 8 bits, but can be stored at an offset of between 0 and 7 bits so may
|
||||
* span two separate bytes in memory.
|
||||
* This function reads a packed N-bit field from two bytes in memory. The stored value must exist
|
||||
* within the two bytes, but can start at an arbitrary bit offset and span the two bytes in memory.
|
||||
*
|
||||
* @param bitcount The number of bits to read.
|
||||
* @param bitoffset The bit offset to read from, between 0 and 7.
|
||||
|
||||
96
3rdparty/astc-encoder/source/astcenc_internal.h
vendored
96
3rdparty/astc-encoder/source/astcenc_internal.h
vendored
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2023 Arm Limited
|
||||
// Copyright 2011-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -29,6 +29,7 @@
|
||||
#include <cstdio>
|
||||
#endif
|
||||
#include <cstdlib>
|
||||
#include <limits>
|
||||
|
||||
#include "astcenc.h"
|
||||
#include "astcenc_mathlib.h"
|
||||
@@ -325,10 +326,10 @@ struct partition_info
|
||||
uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS];
|
||||
|
||||
/** @brief The partition of each texel in the block. */
|
||||
uint8_t partition_of_texel[BLOCK_MAX_TEXELS];
|
||||
ASTCENC_ALIGNAS uint8_t partition_of_texel[BLOCK_MAX_TEXELS];
|
||||
|
||||
/** @brief The list of texels in each partition. */
|
||||
uint8_t texels_of_partition[BLOCK_MAX_PARTITIONS][BLOCK_MAX_TEXELS];
|
||||
ASTCENC_ALIGNAS uint8_t texels_of_partition[BLOCK_MAX_PARTITIONS][BLOCK_MAX_TEXELS];
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -366,40 +367,40 @@ struct decimation_info
|
||||
* @brief The number of weights that contribute to each texel.
|
||||
* Value is between 1 and 4.
|
||||
*/
|
||||
uint8_t texel_weight_count[BLOCK_MAX_TEXELS];
|
||||
ASTCENC_ALIGNAS uint8_t texel_weight_count[BLOCK_MAX_TEXELS];
|
||||
|
||||
/**
|
||||
* @brief The weight index of the N weights that are interpolated for each texel.
|
||||
* Stored transposed to improve vectorization.
|
||||
*/
|
||||
uint8_t texel_weights_tr[4][BLOCK_MAX_TEXELS];
|
||||
ASTCENC_ALIGNAS uint8_t texel_weights_tr[4][BLOCK_MAX_TEXELS];
|
||||
|
||||
/**
|
||||
* @brief The bilinear contribution of the N weights that are interpolated for each texel.
|
||||
* Value is between 0 and 16, stored transposed to improve vectorization.
|
||||
*/
|
||||
uint8_t texel_weight_contribs_int_tr[4][BLOCK_MAX_TEXELS];
|
||||
ASTCENC_ALIGNAS uint8_t texel_weight_contribs_int_tr[4][BLOCK_MAX_TEXELS];
|
||||
|
||||
/**
|
||||
* @brief The bilinear contribution of the N weights that are interpolated for each texel.
|
||||
* Value is between 0 and 1, stored transposed to improve vectorization.
|
||||
*/
|
||||
alignas(ASTCENC_VECALIGN) float texel_weight_contribs_float_tr[4][BLOCK_MAX_TEXELS];
|
||||
ASTCENC_ALIGNAS float texel_weight_contribs_float_tr[4][BLOCK_MAX_TEXELS];
|
||||
|
||||
/** @brief The number of texels that each stored weight contributes to. */
|
||||
uint8_t weight_texel_count[BLOCK_MAX_WEIGHTS];
|
||||
ASTCENC_ALIGNAS uint8_t weight_texel_count[BLOCK_MAX_WEIGHTS];
|
||||
|
||||
/**
|
||||
* @brief The list of texels that use a specific weight index.
|
||||
* Stored transposed to improve vectorization.
|
||||
*/
|
||||
uint8_t weight_texels_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
|
||||
ASTCENC_ALIGNAS uint8_t weight_texels_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
|
||||
|
||||
/**
|
||||
* @brief The bilinear contribution to the N texels that use each weight.
|
||||
* Value is between 0 and 1, stored transposed to improve vectorization.
|
||||
*/
|
||||
alignas(ASTCENC_VECALIGN) float weights_texel_contribs_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
|
||||
ASTCENC_ALIGNAS float weights_texel_contribs_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
|
||||
|
||||
/**
|
||||
* @brief The bilinear contribution to the Nth texel that uses each weight.
|
||||
@@ -579,7 +580,7 @@ struct block_size_descriptor
|
||||
decimation_mode decimation_modes[WEIGHTS_MAX_DECIMATION_MODES];
|
||||
|
||||
/** @brief The active decimation tables, stored in low indices. */
|
||||
alignas(ASTCENC_VECALIGN) decimation_info decimation_tables[WEIGHTS_MAX_DECIMATION_MODES];
|
||||
ASTCENC_ALIGNAS decimation_info decimation_tables[WEIGHTS_MAX_DECIMATION_MODES];
|
||||
|
||||
/** @brief The packed block mode array index, or @c BLOCK_BAD_BLOCK_MODE if not active. */
|
||||
uint16_t block_mode_packed_index[WEIGHTS_MAX_BLOCK_MODES];
|
||||
@@ -731,7 +732,11 @@ struct block_size_descriptor
|
||||
*
|
||||
* The @c data_[rgba] fields store the image data in an encoded SoA float form designed for easy
|
||||
* vectorization. Input data is converted to float and stored as values between 0 and 65535. LDR
|
||||
* data is stored as direct UNORM data, HDR data is stored as LNS data.
|
||||
* data is stored as direct UNORM data, HDR data is stored as LNS data. They are allocated SIMD
|
||||
* elements over-size to allow vectorized stores of unaligned and partial SIMD lanes (e.g. in a
|
||||
* 6x6x6 block the final row write will read elements 210-217 (vec8) or 214-217 (vec4), which is
|
||||
* two elements above the last real data element). The overspill values are never written to memory,
|
||||
* and would be benign, but the padding avoids hitting undefined behavior.
|
||||
*
|
||||
* The @c rgb_lns and @c alpha_lns fields that assign a per-texel use of HDR are only used during
|
||||
* decompression. The current compressor will always use HDR endpoint formats when in HDR mode.
|
||||
@@ -739,16 +744,16 @@ struct block_size_descriptor
|
||||
struct image_block
|
||||
{
|
||||
/** @brief The input (compress) or output (decompress) data for the red color component. */
|
||||
alignas(ASTCENC_VECALIGN) float data_r[BLOCK_MAX_TEXELS];
|
||||
ASTCENC_ALIGNAS float data_r[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];
|
||||
|
||||
/** @brief The input (compress) or output (decompress) data for the green color component. */
|
||||
alignas(ASTCENC_VECALIGN) float data_g[BLOCK_MAX_TEXELS];
|
||||
ASTCENC_ALIGNAS float data_g[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];
|
||||
|
||||
/** @brief The input (compress) or output (decompress) data for the blue color component. */
|
||||
alignas(ASTCENC_VECALIGN) float data_b[BLOCK_MAX_TEXELS];
|
||||
ASTCENC_ALIGNAS float data_b[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];
|
||||
|
||||
/** @brief The input (compress) or output (decompress) data for the alpha color component. */
|
||||
alignas(ASTCENC_VECALIGN) float data_a[BLOCK_MAX_TEXELS];
|
||||
ASTCENC_ALIGNAS float data_a[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];
|
||||
|
||||
/** @brief The number of texels in the block. */
|
||||
uint8_t texel_count;
|
||||
@@ -771,6 +776,9 @@ struct image_block
|
||||
/** @brief Is this grayscale block where R == G == B for all texels? */
|
||||
bool grayscale;
|
||||
|
||||
/** @brief Is the eventual decode using decode_unorm8 rounding? */
|
||||
bool decode_unorm8;
|
||||
|
||||
/** @brief Set to 1 if a texel is using HDR RGB endpoints (decompression only). */
|
||||
uint8_t rgb_lns[BLOCK_MAX_TEXELS];
|
||||
|
||||
@@ -897,10 +905,10 @@ struct endpoints_and_weights
|
||||
endpoints ep;
|
||||
|
||||
/** @brief The ideal weight for each texel; may be undecimated or decimated. */
|
||||
alignas(ASTCENC_VECALIGN) float weights[BLOCK_MAX_TEXELS];
|
||||
ASTCENC_ALIGNAS float weights[BLOCK_MAX_TEXELS];
|
||||
|
||||
/** @brief The ideal weight error scaling for each texel; may be undecimated or decimated. */
|
||||
alignas(ASTCENC_VECALIGN) float weight_error_scale[BLOCK_MAX_TEXELS];
|
||||
ASTCENC_ALIGNAS float weight_error_scale[BLOCK_MAX_TEXELS];
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -930,7 +938,7 @@ struct encoding_choice_errors
|
||||
/**
|
||||
* @brief Preallocated working buffers, allocated per thread during context creation.
|
||||
*/
|
||||
struct alignas(ASTCENC_VECALIGN) compression_working_buffers
|
||||
struct ASTCENC_ALIGNAS compression_working_buffers
|
||||
{
|
||||
/** @brief Ideal endpoints and weights for plane 1. */
|
||||
endpoints_and_weights ei1;
|
||||
@@ -946,17 +954,17 @@ struct alignas(ASTCENC_VECALIGN) compression_working_buffers
|
||||
*
|
||||
* For two planes, second plane starts at @c WEIGHTS_PLANE2_OFFSET offsets.
|
||||
*/
|
||||
alignas(ASTCENC_VECALIGN) float dec_weights_ideal[WEIGHTS_MAX_DECIMATION_MODES * BLOCK_MAX_WEIGHTS];
|
||||
ASTCENC_ALIGNAS float dec_weights_ideal[WEIGHTS_MAX_DECIMATION_MODES * BLOCK_MAX_WEIGHTS];
|
||||
|
||||
/**
|
||||
* @brief Decimated quantized weight values in the unquantized 0-64 range.
|
||||
*
|
||||
* For two planes, second plane starts at @c WEIGHTS_PLANE2_OFFSET offsets.
|
||||
*/
|
||||
uint8_t dec_weights_uquant[WEIGHTS_MAX_BLOCK_MODES * BLOCK_MAX_WEIGHTS];
|
||||
ASTCENC_ALIGNAS uint8_t dec_weights_uquant[WEIGHTS_MAX_BLOCK_MODES * BLOCK_MAX_WEIGHTS];
|
||||
|
||||
/** @brief Error of the best encoding combination for each block mode. */
|
||||
alignas(ASTCENC_VECALIGN) float errors_of_best_combination[WEIGHTS_MAX_BLOCK_MODES];
|
||||
ASTCENC_ALIGNAS float errors_of_best_combination[WEIGHTS_MAX_BLOCK_MODES];
|
||||
|
||||
/** @brief The best color quant for each block mode. */
|
||||
uint8_t best_quant_levels[WEIGHTS_MAX_BLOCK_MODES];
|
||||
@@ -1107,7 +1115,7 @@ struct symbolic_compressed_block
|
||||
*
|
||||
* If dual plane, the second plane starts at @c weights[WEIGHTS_PLANE2_OFFSET].
|
||||
*/
|
||||
uint8_t weights[BLOCK_MAX_WEIGHTS];
|
||||
ASTCENC_ALIGNAS uint8_t weights[BLOCK_MAX_WEIGHTS];
|
||||
|
||||
/**
|
||||
* @brief Get the weight quantization used by this block mode.
|
||||
@@ -1563,6 +1571,33 @@ unsigned int find_best_partition_candidates(
|
||||
Functionality for managing images and image related data.
|
||||
============================================================================ */
|
||||
|
||||
/**
|
||||
* @brief Get a vector mask indicating lanes decompressing into a UNORM8 value.
|
||||
*
|
||||
* @param decode_mode The color profile for LDR_SRGB settings.
|
||||
* @param blk The image block for output image bitness settings.
|
||||
*
|
||||
* @return The component mask vector.
|
||||
*/
|
||||
static inline vmask4 get_u8_component_mask(
|
||||
astcenc_profile decode_mode,
|
||||
const image_block& blk
|
||||
) {
|
||||
vmask4 u8_mask(false);
|
||||
// Decode mode writing to a unorm8 output value
|
||||
if (blk.decode_unorm8)
|
||||
{
|
||||
u8_mask = vmask4(true);
|
||||
}
|
||||
// SRGB writing to a unorm8 RGB value
|
||||
else if (decode_mode == ASTCENC_PRF_LDR_SRGB)
|
||||
{
|
||||
u8_mask = vmask4(true, true, true, false);
|
||||
}
|
||||
|
||||
return u8_mask;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Setup computation of regional averages in an image.
|
||||
*
|
||||
@@ -1816,7 +1851,7 @@ uint8_t pack_color_endpoints(
|
||||
*
|
||||
* Endpoints must be unscrambled and converted into the 0-255 range before calling this function.
|
||||
*
|
||||
* @param decode_mode The decode mode (LDR, HDR).
|
||||
* @param decode_mode The decode mode (LDR, HDR, etc).
|
||||
* @param format The color endpoint mode used.
|
||||
* @param input The raw array of encoded input integers. The length of this array
|
||||
* depends on @c format; it can be safely assumed to be large enough.
|
||||
@@ -2142,10 +2177,11 @@ Platform-specific functions.
|
||||
/**
|
||||
* @brief Allocate an aligned memory buffer.
|
||||
*
|
||||
* Allocated memory must be freed by aligned_free;
|
||||
* Allocated memory must be freed by aligned_free.
|
||||
*
|
||||
* @param size The desired buffer size.
|
||||
* @param align The desired buffer alignment; must be 2^N.
|
||||
* @param align The desired buffer alignment; must be 2^N, may be increased
|
||||
* by the implementation to a minimum allowable alignment.
|
||||
*
|
||||
* @return The memory buffer pointer or nullptr on allocation failure.
|
||||
*/
|
||||
@@ -2155,10 +2191,14 @@ T* aligned_malloc(size_t size, size_t align)
|
||||
void* ptr;
|
||||
int error = 0;
|
||||
|
||||
// Don't allow this to under-align a type
|
||||
size_t min_align = astc::max(alignof(T), sizeof(void*));
|
||||
size_t real_align = astc::max(min_align, align);
|
||||
|
||||
#if defined(_WIN32)
|
||||
ptr = _aligned_malloc(size, align);
|
||||
ptr = _aligned_malloc(size, real_align);
|
||||
#else
|
||||
error = posix_memalign(&ptr, align, size);
|
||||
error = posix_memalign(&ptr, real_align, size);
|
||||
#endif
|
||||
|
||||
if (error || (!ptr))
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2022 Arm Limited
|
||||
// Copyright 2011-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -118,6 +118,18 @@ private:
|
||||
/** @brief Number of tasks that need to be processed. */
|
||||
unsigned int m_task_count;
|
||||
|
||||
/** @brief Progress callback (optional). */
|
||||
astcenc_progress_callback m_callback;
|
||||
|
||||
/** @brief Lock used for callback synchronization. */
|
||||
std::mutex m_callback_lock;
|
||||
|
||||
/** @brief Minimum progress before making a callback. */
|
||||
float m_callback_min_diff;
|
||||
|
||||
/** @brief Last progress callback value. */
|
||||
float m_callback_last_value;
|
||||
|
||||
public:
|
||||
/** @brief Create a new ParallelManager. */
|
||||
ParallelManager()
|
||||
@@ -138,6 +150,9 @@ public:
|
||||
m_start_count = 0;
|
||||
m_done_count = 0;
|
||||
m_task_count = 0;
|
||||
m_callback = nullptr;
|
||||
m_callback_last_value = 0.0f;
|
||||
m_callback_min_diff = 1.0f;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -166,14 +181,20 @@ public:
|
||||
* initialization. Other threads will block and wait for it to complete.
|
||||
*
|
||||
* @param task_count Total number of tasks needing processing.
|
||||
* @param callback Function pointer for progress status callbacks.
|
||||
*/
|
||||
void init(unsigned int task_count)
|
||||
void init(unsigned int task_count, astcenc_progress_callback callback)
|
||||
{
|
||||
std::lock_guard<std::mutex> lck(m_lock);
|
||||
if (!m_init_done)
|
||||
{
|
||||
m_callback = callback;
|
||||
m_task_count = task_count;
|
||||
m_init_done = true;
|
||||
|
||||
// Report every 1% or 4096 blocks, whichever is larger, to avoid callback overhead
|
||||
float min_diff = (4096.0f / static_cast<float>(task_count)) * 100.0f;
|
||||
m_callback_min_diff = astc::max(min_diff, 1.0f);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -212,12 +233,49 @@ public:
|
||||
{
|
||||
// Note: m_done_count cannot use an atomic without the mutex; this has a race between the
|
||||
// update here and the wait() for other threads
|
||||
std::unique_lock<std::mutex> lck(m_lock);
|
||||
this->m_done_count += count;
|
||||
if (m_done_count == m_task_count)
|
||||
unsigned int local_count;
|
||||
float local_last_value;
|
||||
{
|
||||
lck.unlock();
|
||||
m_complete.notify_all();
|
||||
std::unique_lock<std::mutex> lck(m_lock);
|
||||
m_done_count += count;
|
||||
local_count = m_done_count;
|
||||
local_last_value = m_callback_last_value;
|
||||
|
||||
if (m_done_count == m_task_count)
|
||||
{
|
||||
// Ensure the progress bar hits 100%
|
||||
if (m_callback)
|
||||
{
|
||||
std::unique_lock<std::mutex> cblck(m_callback_lock);
|
||||
m_callback(100.0f);
|
||||
m_callback_last_value = 100.0f;
|
||||
}
|
||||
|
||||
lck.unlock();
|
||||
m_complete.notify_all();
|
||||
}
|
||||
}
|
||||
|
||||
// Process progress callback if we have one
|
||||
if (m_callback)
|
||||
{
|
||||
// Initial lockless test - have we progressed enough to emit?
|
||||
float num = static_cast<float>(local_count);
|
||||
float den = static_cast<float>(m_task_count);
|
||||
float this_value = (num / den) * 100.0f;
|
||||
bool report_test = (this_value - local_last_value) > m_callback_min_diff;
|
||||
|
||||
// Recheck under lock, because another thread might report first
|
||||
if (report_test)
|
||||
{
|
||||
std::unique_lock<std::mutex> cblck(m_callback_lock);
|
||||
bool report_retest = (this_value - m_callback_last_value) > m_callback_min_diff;
|
||||
if (report_retest)
|
||||
{
|
||||
m_callback(this_value);
|
||||
m_callback_last_value = this_value;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
16
3rdparty/astc-encoder/source/astcenc_mathlib.h
vendored
16
3rdparty/astc-encoder/source/astcenc_mathlib.h
vendored
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2021 Arm Limited
|
||||
// Copyright 2011-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -73,10 +73,22 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Force vector-sized SIMD alignment
|
||||
#if ASTCENC_AVX
|
||||
#define ASTCENC_VECALIGN 32
|
||||
#else
|
||||
#elif ASTCENC_SSE || ASTCENC_NEON
|
||||
#define ASTCENC_VECALIGN 16
|
||||
// Use default alignment for non-SIMD builds
|
||||
#else
|
||||
#define ASTCENC_VECALIGN 0
|
||||
#endif
|
||||
|
||||
// C++11 states that alignas(0) should be ignored but GCC doesn't do
|
||||
// this on some versions, so workaround and avoid emitting alignas(0)
|
||||
#if ASTCENC_VECALIGN > 0
|
||||
#define ASTCENC_ALIGNAS alignas(ASTCENC_VECALIGN)
|
||||
#else
|
||||
#define ASTCENC_ALIGNAS
|
||||
#endif
|
||||
|
||||
#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 || ASTCENC_POPCNT != 0
|
||||
|
||||
@@ -15,13 +15,13 @@
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
#include "astcenc_mathlib.h"
|
||||
|
||||
/**
|
||||
* @brief Soft-float library for IEEE-754.
|
||||
*/
|
||||
#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)
|
||||
|
||||
#include "astcenc_mathlib.h"
|
||||
|
||||
/* sized soft-float types. These are mapped to the sized integer
|
||||
types of C99, instead of C's floating-point types; this is because
|
||||
the library needs to maintain exact, bit-level control on all
|
||||
|
||||
@@ -330,12 +330,14 @@ void physical_to_symbolic(
|
||||
return;
|
||||
}
|
||||
|
||||
// Low values span 3 bytes so need two read_bits calls
|
||||
int vx_low_s = read_bits(8, 12, pcb) | (read_bits(5, 12 + 8, pcb) << 8);
|
||||
int vx_high_s = read_bits(8, 25, pcb) | (read_bits(5, 25 + 8, pcb) << 8);
|
||||
int vx_high_s = read_bits(13, 25, pcb);
|
||||
int vx_low_t = read_bits(8, 38, pcb) | (read_bits(5, 38 + 8, pcb) << 8);
|
||||
int vx_high_t = read_bits(8, 51, pcb) | (read_bits(5, 51 + 8, pcb) << 8);
|
||||
int vx_high_t = read_bits(13, 51, pcb);
|
||||
|
||||
int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF && vx_low_t == 0x1FFF && vx_high_t == 0x1FFF;
|
||||
int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF &&
|
||||
vx_low_t == 0x1FFF && vx_high_t == 0x1FFF;
|
||||
|
||||
if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t) && !all_ones)
|
||||
{
|
||||
@@ -350,12 +352,14 @@ void physical_to_symbolic(
|
||||
int vx_high_s = read_bits(9, 19, pcb);
|
||||
int vx_low_t = read_bits(9, 28, pcb);
|
||||
int vx_high_t = read_bits(9, 37, pcb);
|
||||
int vx_low_p = read_bits(9, 46, pcb);
|
||||
int vx_high_p = read_bits(9, 55, pcb);
|
||||
int vx_low_r = read_bits(9, 46, pcb);
|
||||
int vx_high_r = read_bits(9, 55, pcb);
|
||||
|
||||
int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF && vx_low_t == 0x1FF && vx_high_t == 0x1FF && vx_low_p == 0x1FF && vx_high_p == 0x1FF;
|
||||
int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF &&
|
||||
vx_low_t == 0x1FF && vx_high_t == 0x1FF &&
|
||||
vx_low_r == 0x1FF && vx_high_r == 0x1FF;
|
||||
|
||||
if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t || vx_low_p >= vx_high_p) && !all_ones)
|
||||
if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t || vx_low_r >= vx_high_r) && !all_ones)
|
||||
{
|
||||
scb.block_type = SYM_BTYPE_ERROR;
|
||||
return;
|
||||
@@ -470,8 +474,7 @@ void physical_to_symbolic(
|
||||
bitpos += 2;
|
||||
}
|
||||
}
|
||||
scb.partition_index = static_cast<uint16_t>(read_bits(6, 13, pcb) |
|
||||
(read_bits(PARTITION_INDEX_BITS - 6, 19, pcb) << 6));
|
||||
scb.partition_index = static_cast<uint16_t>(read_bits(10, 13, pcb));
|
||||
}
|
||||
|
||||
for (int i = 0; i < partition_count; i++)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2019-2022 Arm Limited
|
||||
// Copyright 2019-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -1170,7 +1170,7 @@ ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint8 data, vmask8 ma
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void print(vint8 a)
|
||||
{
|
||||
alignas(ASTCENC_VECALIGN) int v[8];
|
||||
alignas(32) int v[8];
|
||||
storea(a, v);
|
||||
printf("v8_i32:\n %8d %8d %8d %8d %8d %8d %8d %8d\n",
|
||||
v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
|
||||
@@ -1181,7 +1181,7 @@ ASTCENC_SIMD_INLINE void print(vint8 a)
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void printx(vint8 a)
|
||||
{
|
||||
alignas(ASTCENC_VECALIGN) int v[8];
|
||||
alignas(32) int v[8];
|
||||
storea(a, v);
|
||||
printf("v8_i32:\n %08x %08x %08x %08x %08x %08x %08x %08x\n",
|
||||
v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
|
||||
@@ -1192,7 +1192,7 @@ ASTCENC_SIMD_INLINE void printx(vint8 a)
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void print(vfloat8 a)
|
||||
{
|
||||
alignas(ASTCENC_VECALIGN) float v[8];
|
||||
alignas(32) float v[8];
|
||||
storea(a, v);
|
||||
printf("v8_f32:\n %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n",
|
||||
static_cast<double>(v[0]), static_cast<double>(v[1]),
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2020-2021 Arm Limited
|
||||
// Copyright 2020-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -383,7 +383,7 @@ static ASTCENC_SIMD_INLINE void bit_transfer_signed(
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void print(vint4 a)
|
||||
{
|
||||
alignas(16) int v[4];
|
||||
ASTCENC_ALIGNAS int v[4];
|
||||
storea(a, v);
|
||||
printf("v4_i32:\n %8d %8d %8d %8d\n",
|
||||
v[0], v[1], v[2], v[3]);
|
||||
@@ -394,7 +394,7 @@ ASTCENC_SIMD_INLINE void print(vint4 a)
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void printx(vint4 a)
|
||||
{
|
||||
alignas(16) int v[4];
|
||||
ASTCENC_ALIGNAS int v[4];
|
||||
storea(a, v);
|
||||
printf("v4_i32:\n %08x %08x %08x %08x\n",
|
||||
v[0], v[1], v[2], v[3]);
|
||||
@@ -405,7 +405,7 @@ ASTCENC_SIMD_INLINE void printx(vint4 a)
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void print(vfloat4 a)
|
||||
{
|
||||
alignas(16) float v[4];
|
||||
ASTCENC_ALIGNAS float v[4];
|
||||
storea(a, v);
|
||||
printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n",
|
||||
static_cast<double>(v[0]), static_cast<double>(v[1]),
|
||||
|
||||
@@ -359,9 +359,9 @@ struct vmask4
|
||||
/**
|
||||
* @brief Get the scalar from a single lane.
|
||||
*/
|
||||
template <int32_t l> ASTCENC_SIMD_INLINE uint32_t lane() const
|
||||
template <int32_t l> ASTCENC_SIMD_INLINE bool lane() const
|
||||
{
|
||||
return vgetq_lane_u32(m, l);
|
||||
return vgetq_lane_u32(m, l) != 0;
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2019-2022 Arm Limited
|
||||
// Copyright 2019-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -351,6 +351,13 @@ struct vmask4
|
||||
m[3] = d == false ? 0 : -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Get the scalar value of a single lane.
|
||||
*/
|
||||
template <int l> ASTCENC_SIMD_INLINE float lane() const
|
||||
{
|
||||
return m[l] != 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief The vector ...
|
||||
@@ -549,10 +556,16 @@ ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
|
||||
*/
|
||||
template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
|
||||
{
|
||||
return vint4(a.m[0] << s,
|
||||
a.m[1] << s,
|
||||
a.m[2] << s,
|
||||
a.m[3] << s);
|
||||
// Cast to unsigned to avoid shift in/out of sign bit undefined behavior
|
||||
unsigned int as0 = static_cast<unsigned int>(a.m[0]) << s;
|
||||
unsigned int as1 = static_cast<unsigned int>(a.m[1]) << s;
|
||||
unsigned int as2 = static_cast<unsigned int>(a.m[2]) << s;
|
||||
unsigned int as3 = static_cast<unsigned int>(a.m[3]) << s;
|
||||
|
||||
return vint4(static_cast<int>(as0),
|
||||
static_cast<int>(as1),
|
||||
static_cast<int>(as2),
|
||||
static_cast<int>(as3));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -560,6 +573,7 @@ template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
|
||||
*/
|
||||
template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
|
||||
{
|
||||
// Cast to unsigned to avoid shift in/out of sign bit undefined behavior
|
||||
unsigned int as0 = static_cast<unsigned int>(a.m[0]) >> s;
|
||||
unsigned int as1 = static_cast<unsigned int>(a.m[1]) >> s;
|
||||
unsigned int as2 = static_cast<unsigned int>(a.m[2]) >> s;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2019-2022 Arm Limited
|
||||
// Copyright 2019-2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -379,9 +379,9 @@ struct vmask4
|
||||
/**
|
||||
* @brief Get the scalar value of a single lane.
|
||||
*/
|
||||
template <int l> ASTCENC_SIMD_INLINE float lane() const
|
||||
template <int l> ASTCENC_SIMD_INLINE bool lane() const
|
||||
{
|
||||
return _mm_cvtss_f32(_mm_shuffle_ps(m, m, l));
|
||||
return _mm_cvtss_f32(_mm_shuffle_ps(m, m, l)) != 0.0f;
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2023 Arm Limited
|
||||
// Copyright 2011-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -60,8 +60,8 @@ static const uint8_t steps_for_quant_level[12] {
|
||||
2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32
|
||||
};
|
||||
|
||||
alignas(ASTCENC_VECALIGN) static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
|
||||
alignas(ASTCENC_VECALIGN) static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
|
||||
ASTCENC_ALIGNAS static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
|
||||
ASTCENC_ALIGNAS static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
|
||||
|
||||
#if defined(ASTCENC_DIAGNOSTICS)
|
||||
static bool print_once { true };
|
||||
@@ -99,7 +99,7 @@ static void compute_angular_offsets(
|
||||
promise(weight_count > 0);
|
||||
promise(max_angular_steps > 0);
|
||||
|
||||
alignas(ASTCENC_VECALIGN) int isamplev[BLOCK_MAX_WEIGHTS];
|
||||
ASTCENC_ALIGNAS int isamplev[BLOCK_MAX_WEIGHTS];
|
||||
|
||||
// Precompute isample; arrays are always allocated 64 elements long
|
||||
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
|
||||
@@ -242,16 +242,16 @@ static void compute_angular_endpoints_for_quant_levels(
|
||||
unsigned int max_quant_steps = steps_for_quant_level[max_quant_level];
|
||||
unsigned int max_angular_steps = steps_for_quant_level[max_quant_level];
|
||||
|
||||
alignas(ASTCENC_VECALIGN) float angular_offsets[ANGULAR_STEPS];
|
||||
ASTCENC_ALIGNAS float angular_offsets[ANGULAR_STEPS];
|
||||
|
||||
compute_angular_offsets(weight_count, dec_weight_ideal_value,
|
||||
max_angular_steps, angular_offsets);
|
||||
|
||||
alignas(ASTCENC_VECALIGN) float lowest_weight[ANGULAR_STEPS];
|
||||
alignas(ASTCENC_VECALIGN) int32_t weight_span[ANGULAR_STEPS];
|
||||
alignas(ASTCENC_VECALIGN) float error[ANGULAR_STEPS];
|
||||
alignas(ASTCENC_VECALIGN) float cut_low_weight_error[ANGULAR_STEPS];
|
||||
alignas(ASTCENC_VECALIGN) float cut_high_weight_error[ANGULAR_STEPS];
|
||||
ASTCENC_ALIGNAS float lowest_weight[ANGULAR_STEPS];
|
||||
ASTCENC_ALIGNAS int32_t weight_span[ANGULAR_STEPS];
|
||||
ASTCENC_ALIGNAS float error[ANGULAR_STEPS];
|
||||
ASTCENC_ALIGNAS float cut_low_weight_error[ANGULAR_STEPS];
|
||||
ASTCENC_ALIGNAS float cut_high_weight_error[ANGULAR_STEPS];
|
||||
|
||||
compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value,
|
||||
max_angular_steps, max_quant_steps,
|
||||
|
||||
Reference in New Issue
Block a user