Updated astc-encoder.

This commit is contained in:
Бранимир Караџић
2024-05-24 16:41:07 -07:00
parent e9fa0ceff2
commit 98a40e8533
19 changed files with 373 additions and 160 deletions

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2020-2023 Arm Limited
// Copyright 2020-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -215,6 +215,8 @@ enum astcenc_error {
ASTCENC_ERR_BAD_CONTEXT,
/** @brief The call failed due to unimplemented functionality. */
ASTCENC_ERR_NOT_IMPLEMENTED,
/** @brief The call failed due to an out-of-spec decode mode flag set. */
ASTCENC_ERR_BAD_DECODE_MODE,
#if defined(ASTCENC_DIAGNOSTICS)
/** @brief The call failed due to an issue with diagnostic tracing. */
ASTCENC_ERR_DTRACE_FAILURE,
@@ -302,6 +304,11 @@ enum astcenc_type
ASTCENC_TYPE_F32 = 2
};
/**
* @brief Function pointer type for compression progress reporting callback.
*/
extern "C" typedef void (*astcenc_progress_callback)(float);
/**
* @brief Enable normal map compression.
*
@@ -312,6 +319,19 @@ enum astcenc_type
*/
static const unsigned int ASTCENC_FLG_MAP_NORMAL = 1 << 0;
/**
* @brief Enable compression heuristics that assume use of decode_unorm8 decode mode.
*
* The decode_unorm8 decode mode rounds differently to the decode_fp16 decode mode, so enabling this
* flag during compression will allow the compressor to use the correct rounding when selecting
* encodings. This will improve the compressed image quality if your application is using the
* decode_unorm8 decode mode, but will reduce image quality if using decode_fp16.
*
* Note that LDR_SRGB images will always use decode_unorm8 for the RGB channels, irrespective of
* this setting.
*/
static const unsigned int ASTCENC_FLG_USE_DECODE_UNORM8 = 1 << 1;
/**
* @brief Enable alpha weighting.
*
@@ -378,6 +398,7 @@ static const unsigned int ASTCENC_ALL_FLAGS =
ASTCENC_FLG_MAP_RGBM |
ASTCENC_FLG_USE_ALPHA_WEIGHT |
ASTCENC_FLG_USE_PERCEPTUAL |
ASTCENC_FLG_USE_DECODE_UNORM8 |
ASTCENC_FLG_DECOMPRESS_ONLY |
ASTCENC_FLG_SELF_DECOMPRESS_ONLY;
@@ -550,6 +571,16 @@ struct astcenc_config
*/
float tune_search_mode0_enable;
/**
* @brief The progress callback, can be @c nullptr.
*
* If this is specified the codec will periodically report progress for
* compression as a percentage between 0 and 100. The callback is called from one
* of the compressor threads, so doing significant work in the callback will
* reduce compression performance.
*/
astcenc_progress_callback progress_callback;
#if defined(ASTCENC_DIAGNOSTICS)
/**
* @brief The path to save the diagnostic trace data to.

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -894,32 +894,55 @@ void unpack_color_endpoints(
}
}
vint4 ldr_scale(257);
vint4 hdr_scale(1);
vint4 output_scale = ldr_scale;
// Handle endpoint errors and expansion
// An LDR profile image
if ((decode_mode == ASTCENC_PRF_LDR) ||
(decode_mode == ASTCENC_PRF_LDR_SRGB))
// Linear LDR 8-bit endpoints are expanded to 16-bit by replication
if (decode_mode == ASTCENC_PRF_LDR)
{
// Also matches HDR alpha, as cannot have HDR alpha without HDR RGB
if (rgb_hdr == true)
// Error color - HDR endpoint in an LDR encoding
if (rgb_hdr || alpha_hdr)
{
output0 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00);
output1 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00);
output_scale = hdr_scale;
output0 = vint4(0xFF, 0x00, 0xFF, 0xFF);
output1 = vint4(0xFF, 0x00, 0xFF, 0xFF);
rgb_hdr = false;
alpha_hdr = false;
}
output0 = output0 * 257;
output1 = output1 * 257;
}
// An HDR profile image
// sRGB LDR 8-bit endpoints are expanded to 16 bit by:
// - RGB = shift left by 8 bits and OR with 0x80
// - A = replication
else if (decode_mode == ASTCENC_PRF_LDR_SRGB)
{
// Error color - HDR endpoint in an LDR encoding
if (rgb_hdr || alpha_hdr)
{
output0 = vint4(0xFF, 0x00, 0xFF, 0xFF);
output1 = vint4(0xFF, 0x00, 0xFF, 0xFF);
rgb_hdr = false;
alpha_hdr = false;
}
vmask4 mask(true, true, true, false);
vint4 output0rgb = lsl<8>(output0) | vint4(0x80);
vint4 output0a = output0 * 257;
output0 = select(output0a, output0rgb, mask);
vint4 output1rgb = lsl<8>(output1) | vint4(0x80);
vint4 output1a = output1 * 257;
output1 = select(output1a, output1rgb, mask);
}
// An HDR profile decode, but may be using linear LDR endpoints
// Linear LDR 8-bit endpoints are expanded to 16-bit by replication
// HDR endpoints are already 16-bit
else
{
vmask4 hdr_lanes(rgb_hdr, rgb_hdr, rgb_hdr, alpha_hdr);
output_scale = select(ldr_scale, hdr_scale, hdr_lanes);
}
vint4 output_scale = select(vint4(257), vint4(1), hdr_lanes);
output0 = output0 * output_scale;
output1 = output1 * output_scale;
}
}

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -247,7 +247,7 @@ static bool realign_weights_decimated(
}
// Create an unquantized weight grid for this decimation level
alignas(ASTCENC_VECALIGN) float uq_weightsf[BLOCK_MAX_WEIGHTS];
ASTCENC_ALIGNAS float uq_weightsf[BLOCK_MAX_WEIGHTS];
for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH)
{
vint unquant_value(dec_weights_uquant + we_idx);
@@ -467,7 +467,7 @@ static float compress_symbolic_block_for_partition_1plane(
qwt_bitcounts[i] = static_cast<int8_t>(bitcount);
alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
// Generate the optimized set of weights for the weight mode
compute_quantized_weights_for_decimation(
@@ -830,7 +830,7 @@ static float compress_symbolic_block_for_partition_2planes(
unsigned int decimation_mode = bm.decimation_mode;
const auto& di = bsd.get_decimation_info(decimation_mode);
alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
// Generate the optimized set of weights for the mode
compute_quantized_weights_for_decimation(

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -27,7 +27,7 @@
/**
* @brief Compute the integer linear interpolation of two color endpoints.
*
* @param decode_mode The ASTC profile (linear or sRGB)
* @param u8_mask The mask for lanes using decode_unorm8 rather than decode_f16.
* @param color0 The endpoint0 color.
* @param color1 The endpoint1 color.
* @param weights The interpolation weight (between 0 and 64).
@@ -35,7 +35,7 @@
* @return The interpolated color.
*/
static vint4 lerp_color_int(
astcenc_profile decode_mode,
vmask4 u8_mask,
vint4 color0,
vint4 color1,
vint4 weights
@@ -43,24 +43,18 @@ static vint4 lerp_color_int(
vint4 weight1 = weights;
vint4 weight0 = vint4(64) - weight1;
if (decode_mode == ASTCENC_PRF_LDR_SRGB)
{
color0 = asr<8>(color0);
color1 = asr<8>(color1);
}
vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32);
color = asr<6>(color);
if (decode_mode == ASTCENC_PRF_LDR_SRGB)
{
color = color * vint4(257);
}
// For decode_unorm8 values force the codec to bit replicate. This allows the
// rest of the codec to assume the full 0xFFFF range for everything and ignore
// the decode_mode setting
vint4 color_u8 = asr<8>(color) * vint4(257);
color = select(color, color_u8, u8_mask);
return color;
}
/**
* @brief Convert integer color value into a float value for the decoder.
*
@@ -229,12 +223,13 @@ void decompress_symbolic_block(
{
vint4 colori(scb.constant_color);
// For sRGB decoding a real decoder would just use the top 8 bits for color conversion.
// We don't color convert, so rescale the top 8 bits into the full 16 bit dynamic range.
if (decode_mode == ASTCENC_PRF_LDR_SRGB)
{
colori = asr<8>(colori) * 257;
}
// Determine the UNORM8 rounding on the decode
vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
// The real decoder would just use the top 8 bits, but we rescale
// in to a 16-bit value that rounds correctly.
vint4 colori_u8 = asr<8>(colori) * 257;
colori = select(colori, colori_u8, u8_mask);
vint4 colorf16 = unorm16_to_sf16(colori);
color = float16_to_float(colorf16);
@@ -289,6 +284,8 @@ void decompress_symbolic_block(
int plane2_component = scb.plane2_component;
vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component);
vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
for (int i = 0; i < partition_count; i++)
{
// Decode the color endpoints for this partition
@@ -310,7 +307,7 @@ void decompress_symbolic_block(
{
int tix = pi.texels_of_partition[i][j];
vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask);
vint4 color = lerp_color_int(decode_mode, ep0, ep1, weight);
vint4 color = lerp_color_int(u8_mask, ep0, ep1, weight);
vfloat4 colorf = decode_texel(color, lns_mask);
blk.data_r[tix] = colorf.lane<0>();
@@ -365,12 +362,14 @@ float compute_symbolic_block_difference_2plane(
rgb_lns, a_lns,
ep0, ep1);
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
// Unpack and compute error for each texel in the partition
unsigned int texel_count = bsd.texel_count;
for (unsigned int i = 0; i < texel_count; i++)
{
vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask);
vint4 colori = lerp_color_int(config.profile, ep0, ep1, weight);
vint4 colori = lerp_color_int(u8_mask, ep0, ep1, weight);
vfloat4 color = int_to_float(colori);
vfloat4 oldColor = blk.texel(i);
@@ -444,6 +443,8 @@ float compute_symbolic_block_difference_1plane(
int plane1_weights[BLOCK_MAX_TEXELS];
unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
vfloat4 summa = vfloat4::zero();
for (unsigned int i = 0; i < partition_count; i++)
{
@@ -464,7 +465,7 @@ float compute_symbolic_block_difference_1plane(
for (unsigned int j = 0; j < texel_count; j++)
{
unsigned int tix = pi.texels_of_partition[i][j];
vint4 colori = lerp_color_int(config.profile, ep0, ep1,
vint4 colori = lerp_color_int(u8_mask, ep0, ep1,
vint4(plane1_weights[tix]));
vfloat4 color = int_to_float(colori);
@@ -532,7 +533,7 @@ float compute_symbolic_block_difference_1plane_1partition(
const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
// Unquantize and undecimate the weights
alignas(ASTCENC_VECALIGN) int plane1_weights[BLOCK_MAX_TEXELS];
ASTCENC_ALIGNAS int plane1_weights[BLOCK_MAX_TEXELS];
unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
// Decode the color endpoints for this partition
@@ -547,19 +548,12 @@ float compute_symbolic_block_difference_1plane_1partition(
rgb_lns, a_lns,
ep0, ep1);
// Pre-shift sRGB so things round correctly
if (config.profile == ASTCENC_PRF_LDR_SRGB)
{
ep0 = asr<8>(ep0);
ep1 = asr<8>(ep1);
}
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
// Unpack and compute error for each texel in the partition
vfloatacc summav = vfloatacc::zero();
vint lane_id = vint::lane_id();
vint srgb_scale(config.profile == ASTCENC_PRF_LDR_SRGB ? 257 : 1);
unsigned int texel_count = bsd.texel_count;
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
@@ -578,11 +572,25 @@ float compute_symbolic_block_difference_1plane_1partition(
vint ep0_b = vint(ep0.lane<2>()) * weight0;
vint ep0_a = vint(ep0.lane<3>()) * weight0;
// Shift so things round correctly
vint colori_r = asr<6>(ep0_r + ep1_r + vint(32)) * srgb_scale;
vint colori_g = asr<6>(ep0_g + ep1_g + vint(32)) * srgb_scale;
vint colori_b = asr<6>(ep0_b + ep1_b + vint(32)) * srgb_scale;
vint colori_a = asr<6>(ep0_a + ep1_a + vint(32)) * srgb_scale;
// Combine contributions
vint colori_r = asr<6>(ep0_r + ep1_r + vint(32));
vint colori_g = asr<6>(ep0_g + ep1_g + vint(32));
vint colori_b = asr<6>(ep0_b + ep1_b + vint(32));
vint colori_a = asr<6>(ep0_a + ep1_a + vint(32));
// If using a U8 decode mode bit replicate top 8 bits
// so rest of codec can assume 0xFFFF max range everywhere
vint colori_r8 = asr<8>(colori_r) * vint(257);
colori_r = select(colori_r, colori_r8, vmask(u8_mask.lane<0>()));
vint colori_g8 = asr<8>(colori_g) * vint(257);
colori_g = select(colori_g, colori_g8, vmask(u8_mask.lane<1>()));
vint colori_b8 = asr<8>(colori_b) * vint(257);
colori_b = select(colori_b, colori_b8, vmask(u8_mask.lane<2>()));
vint colori_a8 = asr<8>(colori_a) * vint(257);
colori_a = select(colori_a, colori_a8, vmask(u8_mask.lane<3>()));
// Compute color diff
vfloat color_r = int_to_float(colori_r);

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -217,11 +217,13 @@ static astcenc_error validate_block_size(
/**
* @brief Validate flags.
*
* @param profile The profile to check.
* @param flags The flags to check.
*
* @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
*/
static astcenc_error validate_flags(
astcenc_profile profile,
unsigned int flags
) {
// Flags field must not contain any unknown flag bits
@@ -239,6 +241,14 @@ static astcenc_error validate_flags(
return ASTCENC_ERR_BAD_FLAGS;
}
// Decode_unorm8 must only be used with an LDR profile
bool is_unorm8 = flags & ASTCENC_FLG_USE_DECODE_UNORM8;
bool is_hdr = (profile == ASTCENC_PRF_HDR) || (profile == ASTCENC_PRF_HDR_RGB_LDR_A);
if (is_unorm8 && is_hdr)
{
return ASTCENC_ERR_BAD_DECODE_MODE;
}
return ASTCENC_SUCCESS;
}
@@ -364,7 +374,7 @@ static astcenc_error validate_config(
return status;
}
status = validate_flags(config.flags);
status = validate_flags(config.profile, config.flags);
if (status != ASTCENC_SUCCESS)
{
return status;
@@ -591,7 +601,7 @@ astcenc_error astcenc_config_init(
}
// Flags field must not contain any unknown flag bits
status = validate_flags(flags);
status = validate_flags(profile, flags);
if (status != ASTCENC_SUCCESS)
{
return status;
@@ -689,6 +699,12 @@ astcenc_error astcenc_context_alloc(
}
ctx->bsd = aligned_malloc<block_size_descriptor>(sizeof(block_size_descriptor), ASTCENC_VECALIGN);
if (!ctx->bsd)
{
delete ctxo;
return ASTCENC_ERR_OUT_OF_MEM;
}
bool can_omit_modes = static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY);
init_block_size_descriptor(config.block_x, config.block_y, config.block_z,
can_omit_modes,
@@ -698,7 +714,7 @@ astcenc_error astcenc_context_alloc(
#if !defined(ASTCENC_DECOMPRESS_ONLY)
// Do setup only needed by compression
if (!(status & ASTCENC_FLG_DECOMPRESS_ONLY))
if (!(ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY))
{
// Turn a dB limit into a per-texel error for faster use later
if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB))
@@ -712,7 +728,7 @@ astcenc_error astcenc_context_alloc(
size_t worksize = sizeof(compression_working_buffers) * thread_count;
ctx->working_buffers = aligned_malloc<compression_working_buffers>(worksize, ASTCENC_VECALIGN);
static_assert((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0,
static_assert((ASTCENC_VECALIGN == 0) || ((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0),
"compression_working_buffers size must be multiple of vector alignment");
if (!ctx->working_buffers)
{
@@ -802,6 +818,8 @@ static void compress_image(
int row_blocks = xblocks;
int plane_blocks = xblocks * yblocks;
blk.decode_unorm8 = ctxo.context.config.flags & ASTCENC_FLG_USE_DECODE_UNORM8;
// Populate the block channel weights
blk.channel_weight = vfloat4(ctx.config.cw_r_weight,
ctx.config.cw_g_weight,
@@ -812,7 +830,7 @@ static void compress_image(
auto& temp_buffers = ctx.working_buffers[thread_index];
// Only the first thread actually runs the initializer
ctxo.manage_compress.init(block_count);
ctxo.manage_compress.init(block_count, ctx.config.progress_callback);
// Determine if we can use an optimized load function
bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) ||
@@ -1137,6 +1155,7 @@ astcenc_error astcenc_decompress_image(
unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x;
unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y;
unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z;
unsigned int block_count = zblocks * yblocks * xblocks;
int row_blocks = xblocks;
int plane_blocks = xblocks * yblocks;
@@ -1148,9 +1167,12 @@ astcenc_error astcenc_decompress_image(
return ASTCENC_ERR_OUT_OF_MEM;
}
image_block blk;
image_block blk {};
blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
// Decode mode inferred from the output data type
blk.decode_unorm8 = image_out.data_type == ASTCENC_TYPE_U8;
// If context thread count is one then implicitly reset
if (ctx->thread_count == 1)
{
@@ -1158,7 +1180,7 @@ astcenc_error astcenc_decompress_image(
}
// Only the first thread actually runs the initializer
ctxo->manage_decompress.init(zblocks * yblocks * xblocks);
ctxo->manage_decompress.init(block_count, nullptr);
// All threads run this processing loop until there is no work remaining
while (true)
@@ -1356,6 +1378,8 @@ const char* astcenc_get_error_string(
return "ASTCENC_ERR_BAD_CONTEXT";
case ASTCENC_ERR_NOT_IMPLEMENTED:
return "ASTCENC_ERR_NOT_IMPLEMENTED";
case ASTCENC_ERR_BAD_DECODE_MODE:
return "ASTCENC_ERR_BAD_DECODE_MODE";
#if defined(ASTCENC_DIAGNOSTICS)
case ASTCENC_ERR_DTRACE_FAILURE:
return "ASTCENC_ERR_DTRACE_FAILURE";

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -873,7 +873,7 @@ void compute_ideal_weights_for_decimation(
}
// Otherwise compute an estimate and perform single refinement iteration
alignas(ASTCENC_VECALIGN) float infilled_weights[BLOCK_MAX_TEXELS];
ASTCENC_ALIGNAS float infilled_weights[BLOCK_MAX_TEXELS];
// Compute an initial average for each decimated weight
bool constant_wes = ei.is_constant_weight_error_scale;
@@ -1171,7 +1171,7 @@ void recompute_ideal_colors_1plane(
promise(total_texel_count > 0);
promise(partition_count > 0);
alignas(ASTCENC_VECALIGN) float dec_weight[BLOCK_MAX_WEIGHTS];
ASTCENC_ALIGNAS float dec_weight[BLOCK_MAX_WEIGHTS];
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
{
vint unquant_value(dec_weights_uquant + i);
@@ -1179,7 +1179,7 @@ void recompute_ideal_colors_1plane(
storea(unquant_valuef, dec_weight + i);
}
alignas(ASTCENC_VECALIGN) float undec_weight[BLOCK_MAX_TEXELS];
ASTCENC_ALIGNAS float undec_weight[BLOCK_MAX_TEXELS];
float* undec_weight_ref;
if (di.max_texel_weight_count == 1)
{
@@ -1394,8 +1394,8 @@ void recompute_ideal_colors_2planes(
promise(total_texel_count > 0);
promise(weight_count > 0);
alignas(ASTCENC_VECALIGN) float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE];
alignas(ASTCENC_VECALIGN) float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE];
ASTCENC_ALIGNAS float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE];
ASTCENC_ALIGNAS float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE];
assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE);
@@ -1410,8 +1410,8 @@ void recompute_ideal_colors_2planes(
storea(unquant_value2f, dec_weight_plane2 + i);
}
alignas(ASTCENC_VECALIGN) float undec_weight_plane1[BLOCK_MAX_TEXELS];
alignas(ASTCENC_VECALIGN) float undec_weight_plane2[BLOCK_MAX_TEXELS];
ASTCENC_ALIGNAS float undec_weight_plane1[BLOCK_MAX_TEXELS];
ASTCENC_ALIGNAS float undec_weight_plane2[BLOCK_MAX_TEXELS];
float* undec_weight_plane1_ref;
float* undec_weight_plane2_ref;

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -109,7 +109,7 @@ static vfloat4 swz_texel(
vfloat4 data,
const astcenc_swizzle& swz
) {
alignas(16) float datas[6];
ASTCENC_ALIGNAS float datas[6];
storea(data, datas);
datas[ASTCENC_SWZ_0] = 0.0f;

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -464,10 +464,10 @@ static inline void write_bits(
}
/**
* @brief Read up to 8 bits at an arbitrary bit offset.
* @brief Read up to 16 bits from two bytes.
*
* The stored value is at most 8 bits, but can be stored at an offset of between 0 and 7 bits so may
* span two separate bytes in memory.
* This function reads a packed N-bit field from two bytes in memory. The stored value must exist
* within the two bytes, but can start at an arbitrary bit offset and span the two bytes in memory.
*
* @param bitcount The number of bits to read.
* @param bitoffset The bit offset to read from, between 0 and 7.

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -29,6 +29,7 @@
#include <cstdio>
#endif
#include <cstdlib>
#include <limits>
#include "astcenc.h"
#include "astcenc_mathlib.h"
@@ -325,10 +326,10 @@ struct partition_info
uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS];
/** @brief The partition of each texel in the block. */
uint8_t partition_of_texel[BLOCK_MAX_TEXELS];
ASTCENC_ALIGNAS uint8_t partition_of_texel[BLOCK_MAX_TEXELS];
/** @brief The list of texels in each partition. */
uint8_t texels_of_partition[BLOCK_MAX_PARTITIONS][BLOCK_MAX_TEXELS];
ASTCENC_ALIGNAS uint8_t texels_of_partition[BLOCK_MAX_PARTITIONS][BLOCK_MAX_TEXELS];
};
/**
@@ -366,40 +367,40 @@ struct decimation_info
* @brief The number of weights that contribute to each texel.
* Value is between 1 and 4.
*/
uint8_t texel_weight_count[BLOCK_MAX_TEXELS];
ASTCENC_ALIGNAS uint8_t texel_weight_count[BLOCK_MAX_TEXELS];
/**
* @brief The weight index of the N weights that are interpolated for each texel.
* Stored transposed to improve vectorization.
*/
uint8_t texel_weights_tr[4][BLOCK_MAX_TEXELS];
ASTCENC_ALIGNAS uint8_t texel_weights_tr[4][BLOCK_MAX_TEXELS];
/**
* @brief The bilinear contribution of the N weights that are interpolated for each texel.
* Value is between 0 and 16, stored transposed to improve vectorization.
*/
uint8_t texel_weight_contribs_int_tr[4][BLOCK_MAX_TEXELS];
ASTCENC_ALIGNAS uint8_t texel_weight_contribs_int_tr[4][BLOCK_MAX_TEXELS];
/**
* @brief The bilinear contribution of the N weights that are interpolated for each texel.
* Value is between 0 and 1, stored transposed to improve vectorization.
*/
alignas(ASTCENC_VECALIGN) float texel_weight_contribs_float_tr[4][BLOCK_MAX_TEXELS];
ASTCENC_ALIGNAS float texel_weight_contribs_float_tr[4][BLOCK_MAX_TEXELS];
/** @brief The number of texels that each stored weight contributes to. */
uint8_t weight_texel_count[BLOCK_MAX_WEIGHTS];
ASTCENC_ALIGNAS uint8_t weight_texel_count[BLOCK_MAX_WEIGHTS];
/**
* @brief The list of texels that use a specific weight index.
* Stored transposed to improve vectorization.
*/
uint8_t weight_texels_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
ASTCENC_ALIGNAS uint8_t weight_texels_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
/**
* @brief The bilinear contribution to the N texels that use each weight.
* Value is between 0 and 1, stored transposed to improve vectorization.
*/
alignas(ASTCENC_VECALIGN) float weights_texel_contribs_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
ASTCENC_ALIGNAS float weights_texel_contribs_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
/**
* @brief The bilinear contribution to the Nth texel that uses each weight.
@@ -579,7 +580,7 @@ struct block_size_descriptor
decimation_mode decimation_modes[WEIGHTS_MAX_DECIMATION_MODES];
/** @brief The active decimation tables, stored in low indices. */
alignas(ASTCENC_VECALIGN) decimation_info decimation_tables[WEIGHTS_MAX_DECIMATION_MODES];
ASTCENC_ALIGNAS decimation_info decimation_tables[WEIGHTS_MAX_DECIMATION_MODES];
/** @brief The packed block mode array index, or @c BLOCK_BAD_BLOCK_MODE if not active. */
uint16_t block_mode_packed_index[WEIGHTS_MAX_BLOCK_MODES];
@@ -731,7 +732,11 @@ struct block_size_descriptor
*
* The @c data_[rgba] fields store the image data in an encoded SoA float form designed for easy
* vectorization. Input data is converted to float and stored as values between 0 and 65535. LDR
* data is stored as direct UNORM data, HDR data is stored as LNS data.
* data is stored as direct UNORM data, HDR data is stored as LNS data. They are allocated SIMD
* elements over-size to allow vectorized stores of unaligned and partial SIMD lanes (e.g. in a
* 6x6x6 block the final row write will read elements 210-217 (vec8) or 214-217 (vec4), which is
* two elements above the last real data element). The overspill values are never written to memory,
* and would be benign, but the padding avoids hitting undefined behavior.
*
* The @c rgb_lns and @c alpha_lns fields, which assign a per-texel use of HDR, are only used during
* decompression. The current compressor will always use HDR endpoint formats when in HDR mode.
@@ -739,16 +744,16 @@ struct block_size_descriptor
struct image_block
{
/** @brief The input (compress) or output (decompress) data for the red color component. */
alignas(ASTCENC_VECALIGN) float data_r[BLOCK_MAX_TEXELS];
ASTCENC_ALIGNAS float data_r[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];
/** @brief The input (compress) or output (decompress) data for the green color component. */
alignas(ASTCENC_VECALIGN) float data_g[BLOCK_MAX_TEXELS];
ASTCENC_ALIGNAS float data_g[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];
/** @brief The input (compress) or output (decompress) data for the blue color component. */
alignas(ASTCENC_VECALIGN) float data_b[BLOCK_MAX_TEXELS];
ASTCENC_ALIGNAS float data_b[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];
/** @brief The input (compress) or output (decompress) data for the alpha color component. */
alignas(ASTCENC_VECALIGN) float data_a[BLOCK_MAX_TEXELS];
ASTCENC_ALIGNAS float data_a[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];
/** @brief The number of texels in the block. */
uint8_t texel_count;
@@ -771,6 +776,9 @@ struct image_block
/** @brief Is this grayscale block where R == G == B for all texels? */
bool grayscale;
/** @brief Is the eventual decode using decode_unorm8 rounding? */
bool decode_unorm8;
/** @brief Set to 1 if a texel is using HDR RGB endpoints (decompression only). */
uint8_t rgb_lns[BLOCK_MAX_TEXELS];
@@ -897,10 +905,10 @@ struct endpoints_and_weights
endpoints ep;
/** @brief The ideal weight for each texel; may be undecimated or decimated. */
alignas(ASTCENC_VECALIGN) float weights[BLOCK_MAX_TEXELS];
ASTCENC_ALIGNAS float weights[BLOCK_MAX_TEXELS];
/** @brief The ideal weight error scaling for each texel; may be undecimated or decimated. */
alignas(ASTCENC_VECALIGN) float weight_error_scale[BLOCK_MAX_TEXELS];
ASTCENC_ALIGNAS float weight_error_scale[BLOCK_MAX_TEXELS];
};
/**
@@ -930,7 +938,7 @@ struct encoding_choice_errors
/**
* @brief Preallocated working buffers, allocated per thread during context creation.
*/
struct alignas(ASTCENC_VECALIGN) compression_working_buffers
struct ASTCENC_ALIGNAS compression_working_buffers
{
/** @brief Ideal endpoints and weights for plane 1. */
endpoints_and_weights ei1;
@@ -946,17 +954,17 @@ struct alignas(ASTCENC_VECALIGN) compression_working_buffers
*
* For two planes, second plane starts at @c WEIGHTS_PLANE2_OFFSET offsets.
*/
alignas(ASTCENC_VECALIGN) float dec_weights_ideal[WEIGHTS_MAX_DECIMATION_MODES * BLOCK_MAX_WEIGHTS];
ASTCENC_ALIGNAS float dec_weights_ideal[WEIGHTS_MAX_DECIMATION_MODES * BLOCK_MAX_WEIGHTS];
/**
* @brief Decimated quantized weight values in the unquantized 0-64 range.
*
* For two planes, second plane starts at @c WEIGHTS_PLANE2_OFFSET offsets.
*/
uint8_t dec_weights_uquant[WEIGHTS_MAX_BLOCK_MODES * BLOCK_MAX_WEIGHTS];
ASTCENC_ALIGNAS uint8_t dec_weights_uquant[WEIGHTS_MAX_BLOCK_MODES * BLOCK_MAX_WEIGHTS];
/** @brief Error of the best encoding combination for each block mode. */
alignas(ASTCENC_VECALIGN) float errors_of_best_combination[WEIGHTS_MAX_BLOCK_MODES];
ASTCENC_ALIGNAS float errors_of_best_combination[WEIGHTS_MAX_BLOCK_MODES];
/** @brief The best color quant for each block mode. */
uint8_t best_quant_levels[WEIGHTS_MAX_BLOCK_MODES];
@@ -1107,7 +1115,7 @@ struct symbolic_compressed_block
*
* If dual plane, the second plane starts at @c weights[WEIGHTS_PLANE2_OFFSET].
*/
uint8_t weights[BLOCK_MAX_WEIGHTS];
ASTCENC_ALIGNAS uint8_t weights[BLOCK_MAX_WEIGHTS];
/**
* @brief Get the weight quantization used by this block mode.
@@ -1563,6 +1571,33 @@ unsigned int find_best_partition_candidates(
Functionality for managing images and image related data.
============================================================================ */
/**
 * @brief Build the per-component mask of lanes that decode to a UNORM8 value.
 *
 * A lane set in the returned mask is one whose decoded value the callers
 * bit-replicate from its top 8 bits, rather than keeping the 16-bit value.
 *
 * @param decode_mode   The color profile; only @c ASTCENC_PRF_LDR_SRGB is tested here.
 * @param blk           The image block; only its @c decode_unorm8 flag is read.
 *
 * @return All lanes set if @c blk.decode_unorm8 is set; otherwise the RGB lanes
 *         only for @c ASTCENC_PRF_LDR_SRGB; otherwise no lanes set.
 */
static inline vmask4 get_u8_component_mask(
	astcenc_profile decode_mode,
	const image_block& blk
) {
	// The block-level flag takes priority and covers every component
	if (blk.decode_unorm8)
	{
		return vmask4(true);
	}

	// sRGB selects the three color components; alpha is left out of the mask
	if (decode_mode == ASTCENC_PRF_LDR_SRGB)
	{
		return vmask4(true, true, true, false);
	}

	// Neither condition holds, so no lane is selected
	return vmask4(false);
}
/**
* @brief Setup computation of regional averages in an image.
*
@@ -1816,7 +1851,7 @@ uint8_t pack_color_endpoints(
*
* Endpoints must be unscrambled and converted into the 0-255 range before calling this functions.
*
* @param decode_mode The decode mode (LDR, HDR).
* @param decode_mode The decode mode (LDR, HDR, etc).
* @param format The color endpoint mode used.
* @param input The raw array of encoded input integers. The length of this array
* depends on @c format; it can be safely assumed to be large enough.
@@ -2142,10 +2177,11 @@ Platform-specific functions.
/**
* @brief Allocate an aligned memory buffer.
*
* Allocated memory must be freed by aligned_free;
* Allocated memory must be freed by aligned_free.
*
* @param size The desired buffer size.
* @param align The desired buffer alignment; must be 2^N.
* @param align The desired buffer alignment; must be 2^N, may be increased
* by the implementation to a minimum allowable alignment.
*
* @return The memory buffer pointer or nullptr on allocation failure.
*/
@@ -2155,10 +2191,14 @@ T* aligned_malloc(size_t size, size_t align)
void* ptr;
int error = 0;
// Don't allow this to under-align a type
size_t min_align = astc::max(alignof(T), sizeof(void*));
size_t real_align = astc::max(min_align, align);
#if defined(_WIN32)
ptr = _aligned_malloc(size, align);
ptr = _aligned_malloc(size, real_align);
#else
error = posix_memalign(&ptr, align, size);
error = posix_memalign(&ptr, real_align, size);
#endif
if (error || (!ptr))

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -118,6 +118,18 @@ private:
/** @brief Number of tasks that need to be processed. */
unsigned int m_task_count;
/** @brief Progress callback (optional). */
astcenc_progress_callback m_callback;
/** @brief Lock used for callback synchronization. */
std::mutex m_callback_lock;
/** @brief Minimum progress before making a callback. */
float m_callback_min_diff;
/** @brief Last progress callback value. */
float m_callback_last_value;
public:
/** @brief Create a new ParallelManager. */
ParallelManager()
@@ -138,6 +150,9 @@ public:
m_start_count = 0;
m_done_count = 0;
m_task_count = 0;
m_callback = nullptr;
m_callback_last_value = 0.0f;
m_callback_min_diff = 1.0f;
}
/**
@@ -166,14 +181,20 @@ public:
* initialization. Other threads will block and wait for it to complete.
*
* @param task_count Total number of tasks needing processing.
* @param callback Function pointer for progress status callbacks.
*/
void init(unsigned int task_count)
void init(unsigned int task_count, astcenc_progress_callback callback)
{
std::lock_guard<std::mutex> lck(m_lock);
if (!m_init_done)
{
m_callback = callback;
m_task_count = task_count;
m_init_done = true;
// Report every 1% or 4096 blocks, whichever is larger, to avoid callback overhead
float min_diff = (4096.0f / static_cast<float>(task_count)) * 100.0f;
m_callback_min_diff = astc::max(min_diff, 1.0f);
}
}
@@ -212,15 +233,52 @@ public:
{
// Note: m_done_count cannot use an atomic without the mutex; this has a race between the
// update here and the wait() for other threads
unsigned int local_count;
float local_last_value;
{
std::unique_lock<std::mutex> lck(m_lock);
this->m_done_count += count;
m_done_count += count;
local_count = m_done_count;
local_last_value = m_callback_last_value;
if (m_done_count == m_task_count)
{
// Ensure the progress bar hits 100%
if (m_callback)
{
std::unique_lock<std::mutex> cblck(m_callback_lock);
m_callback(100.0f);
m_callback_last_value = 100.0f;
}
lck.unlock();
m_complete.notify_all();
}
}
// Process progress callback if we have one
if (m_callback)
{
// Initial lockless test - have we progressed enough to emit?
float num = static_cast<float>(local_count);
float den = static_cast<float>(m_task_count);
float this_value = (num / den) * 100.0f;
bool report_test = (this_value - local_last_value) > m_callback_min_diff;
// Recheck under lock, because another thread might report first
if (report_test)
{
std::unique_lock<std::mutex> cblck(m_callback_lock);
bool report_retest = (this_value - m_callback_last_value) > m_callback_min_diff;
if (report_retest)
{
m_callback(this_value);
m_callback_last_value = this_value;
}
}
}
}
/**
* @brief Wait for stage processing to complete.
*/

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -73,10 +73,22 @@
#endif
#endif
// Force vector-sized SIMD alignment
#if ASTCENC_AVX
#define ASTCENC_VECALIGN 32
#else
#elif ASTCENC_SSE || ASTCENC_NEON
#define ASTCENC_VECALIGN 16
// Use default alignment for non-SIMD builds
#else
#define ASTCENC_VECALIGN 0
#endif
// C++11 states that alignas(0) should be ignored, but some GCC versions
// do not do this, so work around it by never emitting alignas(0)
#if ASTCENC_VECALIGN > 0
#define ASTCENC_ALIGNAS alignas(ASTCENC_VECALIGN)
#else
#define ASTCENC_ALIGNAS
#endif
#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 || ASTCENC_POPCNT != 0

View File

@@ -15,13 +15,13 @@
// under the License.
// ----------------------------------------------------------------------------
#include "astcenc_mathlib.h"
/**
* @brief Soft-float library for IEEE-754.
*/
#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)
#include "astcenc_mathlib.h"
/* sized soft-float types. These are mapped to the sized integer
types of C99, instead of C's floating-point types; this is because
the library needs to maintain exact, bit-level control on all

View File

@@ -330,12 +330,14 @@ void physical_to_symbolic(
return;
}
// Low values span 3 bytes so need two read_bits calls
int vx_low_s = read_bits(8, 12, pcb) | (read_bits(5, 12 + 8, pcb) << 8);
int vx_high_s = read_bits(8, 25, pcb) | (read_bits(5, 25 + 8, pcb) << 8);
int vx_high_s = read_bits(13, 25, pcb);
int vx_low_t = read_bits(8, 38, pcb) | (read_bits(5, 38 + 8, pcb) << 8);
int vx_high_t = read_bits(8, 51, pcb) | (read_bits(5, 51 + 8, pcb) << 8);
int vx_high_t = read_bits(13, 51, pcb);
int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF && vx_low_t == 0x1FFF && vx_high_t == 0x1FFF;
int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF &&
vx_low_t == 0x1FFF && vx_high_t == 0x1FFF;
if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t) && !all_ones)
{
@@ -350,12 +352,14 @@ void physical_to_symbolic(
int vx_high_s = read_bits(9, 19, pcb);
int vx_low_t = read_bits(9, 28, pcb);
int vx_high_t = read_bits(9, 37, pcb);
int vx_low_p = read_bits(9, 46, pcb);
int vx_high_p = read_bits(9, 55, pcb);
int vx_low_r = read_bits(9, 46, pcb);
int vx_high_r = read_bits(9, 55, pcb);
int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF && vx_low_t == 0x1FF && vx_high_t == 0x1FF && vx_low_p == 0x1FF && vx_high_p == 0x1FF;
int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF &&
vx_low_t == 0x1FF && vx_high_t == 0x1FF &&
vx_low_r == 0x1FF && vx_high_r == 0x1FF;
if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t || vx_low_p >= vx_high_p) && !all_ones)
if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t || vx_low_r >= vx_high_r) && !all_ones)
{
scb.block_type = SYM_BTYPE_ERROR;
return;
@@ -470,8 +474,7 @@ void physical_to_symbolic(
bitpos += 2;
}
}
scb.partition_index = static_cast<uint16_t>(read_bits(6, 13, pcb) |
(read_bits(PARTITION_INDEX_BITS - 6, 19, pcb) << 6));
scb.partition_index = static_cast<uint16_t>(read_bits(10, 13, pcb));
}
for (int i = 0; i < partition_count; i++)

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2022 Arm Limited
// Copyright 2019-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -1170,7 +1170,7 @@ ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint8 data, vmask8 ma
*/
ASTCENC_SIMD_INLINE void print(vint8 a)
{
alignas(ASTCENC_VECALIGN) int v[8];
alignas(32) int v[8];
storea(a, v);
printf("v8_i32:\n %8d %8d %8d %8d %8d %8d %8d %8d\n",
v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
@@ -1181,7 +1181,7 @@ ASTCENC_SIMD_INLINE void print(vint8 a)
*/
ASTCENC_SIMD_INLINE void printx(vint8 a)
{
alignas(ASTCENC_VECALIGN) int v[8];
alignas(32) int v[8];
storea(a, v);
printf("v8_i32:\n %08x %08x %08x %08x %08x %08x %08x %08x\n",
v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
@@ -1192,7 +1192,7 @@ ASTCENC_SIMD_INLINE void printx(vint8 a)
*/
ASTCENC_SIMD_INLINE void print(vfloat8 a)
{
alignas(ASTCENC_VECALIGN) float v[8];
alignas(32) float v[8];
storea(a, v);
printf("v8_f32:\n %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n",
static_cast<double>(v[0]), static_cast<double>(v[1]),

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2020-2021 Arm Limited
// Copyright 2020-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -383,7 +383,7 @@ static ASTCENC_SIMD_INLINE void bit_transfer_signed(
*/
ASTCENC_SIMD_INLINE void print(vint4 a)
{
alignas(16) int v[4];
ASTCENC_ALIGNAS int v[4];
storea(a, v);
printf("v4_i32:\n %8d %8d %8d %8d\n",
v[0], v[1], v[2], v[3]);
@@ -394,7 +394,7 @@ ASTCENC_SIMD_INLINE void print(vint4 a)
*/
ASTCENC_SIMD_INLINE void printx(vint4 a)
{
alignas(16) int v[4];
ASTCENC_ALIGNAS int v[4];
storea(a, v);
printf("v4_i32:\n %08x %08x %08x %08x\n",
v[0], v[1], v[2], v[3]);
@@ -405,7 +405,7 @@ ASTCENC_SIMD_INLINE void printx(vint4 a)
*/
ASTCENC_SIMD_INLINE void print(vfloat4 a)
{
alignas(16) float v[4];
ASTCENC_ALIGNAS float v[4];
storea(a, v);
printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n",
static_cast<double>(v[0]), static_cast<double>(v[1]),

View File

@@ -359,9 +359,9 @@ struct vmask4
/**
* @brief Get the scalar from a single lane.
*/
template <int32_t l> ASTCENC_SIMD_INLINE uint32_t lane() const
template <int32_t l> ASTCENC_SIMD_INLINE bool lane() const
{
return vgetq_lane_u32(m, l);
return vgetq_lane_u32(m, l) != 0;
}
/**

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2022 Arm Limited
// Copyright 2019-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -351,6 +351,13 @@ struct vmask4
m[3] = d == false ? 0 : -1;
}
/**
 * @brief Get the scalar value of a single lane.
 *
 * @tparam l   The index of the lane to read.
 *
 * @return @c true if the value stored for the lane is non-zero, @c false otherwise.
 */
template <int l> ASTCENC_SIMD_INLINE bool lane() const
{
	// The comparison already yields a boolean; return it as one instead of
	// widening it to float
	return m[l] != 0;
}
/**
* @brief The vector ...
@@ -549,10 +556,16 @@ ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
*/
template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
{
return vint4(a.m[0] << s,
a.m[1] << s,
a.m[2] << s,
a.m[3] << s);
// Cast to unsigned to avoid shift in/out of sign bit undefined behavior
unsigned int as0 = static_cast<unsigned int>(a.m[0]) << s;
unsigned int as1 = static_cast<unsigned int>(a.m[1]) << s;
unsigned int as2 = static_cast<unsigned int>(a.m[2]) << s;
unsigned int as3 = static_cast<unsigned int>(a.m[3]) << s;
return vint4(static_cast<int>(as0),
static_cast<int>(as1),
static_cast<int>(as2),
static_cast<int>(as3));
}
/**
@@ -560,6 +573,7 @@ template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
*/
template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
{
// Cast to unsigned to avoid shift in/out of sign bit undefined behavior
unsigned int as0 = static_cast<unsigned int>(a.m[0]) >> s;
unsigned int as1 = static_cast<unsigned int>(a.m[1]) >> s;
unsigned int as2 = static_cast<unsigned int>(a.m[2]) >> s;

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2022 Arm Limited
// Copyright 2019-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -379,9 +379,9 @@ struct vmask4
/**
* @brief Get the scalar value of a single lane.
*/
template <int l> ASTCENC_SIMD_INLINE float lane() const
template <int l> ASTCENC_SIMD_INLINE bool lane() const
{
return _mm_cvtss_f32(_mm_shuffle_ps(m, m, l));
return _mm_cvtss_f32(_mm_shuffle_ps(m, m, l)) != 0.0f;
}
/**

View File

@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -60,8 +60,8 @@ static const uint8_t steps_for_quant_level[12] {
2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32
};
alignas(ASTCENC_VECALIGN) static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
ASTCENC_ALIGNAS static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
ASTCENC_ALIGNAS static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
#if defined(ASTCENC_DIAGNOSTICS)
static bool print_once { true };
@@ -99,7 +99,7 @@ static void compute_angular_offsets(
promise(weight_count > 0);
promise(max_angular_steps > 0);
alignas(ASTCENC_VECALIGN) int isamplev[BLOCK_MAX_WEIGHTS];
ASTCENC_ALIGNAS int isamplev[BLOCK_MAX_WEIGHTS];
// Precompute isample; arrays are always allocated 64 elements long
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
@@ -242,16 +242,16 @@ static void compute_angular_endpoints_for_quant_levels(
unsigned int max_quant_steps = steps_for_quant_level[max_quant_level];
unsigned int max_angular_steps = steps_for_quant_level[max_quant_level];
alignas(ASTCENC_VECALIGN) float angular_offsets[ANGULAR_STEPS];
ASTCENC_ALIGNAS float angular_offsets[ANGULAR_STEPS];
compute_angular_offsets(weight_count, dec_weight_ideal_value,
max_angular_steps, angular_offsets);
alignas(ASTCENC_VECALIGN) float lowest_weight[ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) int32_t weight_span[ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) float error[ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) float cut_low_weight_error[ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) float cut_high_weight_error[ANGULAR_STEPS];
ASTCENC_ALIGNAS float lowest_weight[ANGULAR_STEPS];
ASTCENC_ALIGNAS int32_t weight_span[ANGULAR_STEPS];
ASTCENC_ALIGNAS float error[ANGULAR_STEPS];
ASTCENC_ALIGNAS float cut_low_weight_error[ANGULAR_STEPS];
ASTCENC_ALIGNAS float cut_high_weight_error[ANGULAR_STEPS];
compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value,
max_angular_steps, max_quant_steps,