diff --git a/3rdparty/astc-encoder/include/astcenc.h b/3rdparty/astc-encoder/include/astcenc.h index c6c8c14..3d04b4e 100644 --- a/3rdparty/astc-encoder/include/astcenc.h +++ b/3rdparty/astc-encoder/include/astcenc.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2020-2023 Arm Limited +// Copyright 2020-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -215,6 +215,8 @@ enum astcenc_error { ASTCENC_ERR_BAD_CONTEXT, /** @brief The call failed due to unimplemented functionality. */ ASTCENC_ERR_NOT_IMPLEMENTED, + /** @brief The call failed due to an out-of-spec decode mode flag set. */ + ASTCENC_ERR_BAD_DECODE_MODE, #if defined(ASTCENC_DIAGNOSTICS) /** @brief The call failed due to an issue with diagnostic tracing. */ ASTCENC_ERR_DTRACE_FAILURE, @@ -302,6 +304,11 @@ enum astcenc_type ASTCENC_TYPE_F32 = 2 }; +/** + * @brief Function pointer type for compression progress reporting callback. + */ +extern "C" typedef void (*astcenc_progress_callback)(float); + /** * @brief Enable normal map compression. * @@ -312,6 +319,19 @@ enum astcenc_type */ static const unsigned int ASTCENC_FLG_MAP_NORMAL = 1 << 0; +/** + * @brief Enable compression heuristics that assume use of decode_unorm8 decode mode. + * + * The decode_unorm8 decode mode rounds differently to the decode_fp16 decode mode, so enabling this + * flag during compression will allow the compressor to use the correct rounding when selecting + * encodings. This will improve the compressed image quality if your application is using the + * decode_unorm8 decode mode, but will reduce image quality if using decode_fp16. + * + * Note that LDR_SRGB images will always use decode_unorm8 for the RGB channels, irrespective of + * this setting. 
+ */ +static const unsigned int ASTCENC_FLG_USE_DECODE_UNORM8 = 1 << 1; + /** * @brief Enable alpha weighting. * @@ -378,6 +398,7 @@ static const unsigned int ASTCENC_ALL_FLAGS = ASTCENC_FLG_MAP_RGBM | ASTCENC_FLG_USE_ALPHA_WEIGHT | ASTCENC_FLG_USE_PERCEPTUAL | + ASTCENC_FLG_USE_DECODE_UNORM8 | ASTCENC_FLG_DECOMPRESS_ONLY | ASTCENC_FLG_SELF_DECOMPRESS_ONLY; @@ -550,6 +571,16 @@ struct astcenc_config */ float tune_search_mode0_enable; + /** + * @brief The progress callback, can be @c nullptr. + * + * If this is specified the codec will periodically report progress for + * compression as a percentage between 0 and 100. The callback is called from one + * of the compressor threads, so doing significant work in the callback will + * reduce compression performance. + */ + astcenc_progress_callback progress_callback; + #if defined(ASTCENC_DIAGNOSTICS) /** * @brief The path to save the diagnostic trace data to. diff --git a/3rdparty/astc-encoder/source/astcenc_color_unquantize.cpp b/3rdparty/astc-encoder/source/astcenc_color_unquantize.cpp index 10fb6bc..2daa515 100644 --- a/3rdparty/astc-encoder/source/astcenc_color_unquantize.cpp +++ b/3rdparty/astc-encoder/source/astcenc_color_unquantize.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2021 Arm Limited +// Copyright 2011-2023 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. 
You may obtain a copy @@ -894,32 +894,55 @@ void unpack_color_endpoints( } } - vint4 ldr_scale(257); - vint4 hdr_scale(1); - vint4 output_scale = ldr_scale; + // Handle endpoint errors and expansion - // An LDR profile image - if ((decode_mode == ASTCENC_PRF_LDR) || - (decode_mode == ASTCENC_PRF_LDR_SRGB)) + // Linear LDR 8-bit endpoints are expanded to 16-bit by replication + if (decode_mode == ASTCENC_PRF_LDR) { - // Also matches HDR alpha, as cannot have HDR alpha without HDR RGB - if (rgb_hdr == true) + // Error color - HDR endpoint in an LDR encoding + if (rgb_hdr || alpha_hdr) { - output0 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00); - output1 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00); - output_scale = hdr_scale; - + output0 = vint4(0xFF, 0x00, 0xFF, 0xFF); + output1 = vint4(0xFF, 0x00, 0xFF, 0xFF); rgb_hdr = false; alpha_hdr = false; } + + output0 = output0 * 257; + output1 = output1 * 257; } - // An HDR profile image + // sRGB LDR 8-bit endpoints are expanded to 16 bit by: + // - RGB = shift left by 8 bits and OR with 0x80 + // - A = replication + else if (decode_mode == ASTCENC_PRF_LDR_SRGB) + { + // Error color - HDR endpoint in an LDR encoding + if (rgb_hdr || alpha_hdr) + { + output0 = vint4(0xFF, 0x00, 0xFF, 0xFF); + output1 = vint4(0xFF, 0x00, 0xFF, 0xFF); + rgb_hdr = false; + alpha_hdr = false; + } + + vmask4 mask(true, true, true, false); + + vint4 output0rgb = lsl<8>(output0) | vint4(0x80); + vint4 output0a = output0 * 257; + output0 = select(output0a, output0rgb, mask); + + vint4 output1rgb = lsl<8>(output1) | vint4(0x80); + vint4 output1a = output1 * 257; + output1 = select(output1a, output1rgb, mask); + } + // An HDR profile decode, but may be using linear LDR endpoints + // Linear LDR 8-bit endpoints are expanded to 16-bit by replication + // HDR endpoints are already 16-bit else { vmask4 hdr_lanes(rgb_hdr, rgb_hdr, rgb_hdr, alpha_hdr); - output_scale = select(ldr_scale, hdr_scale, hdr_lanes); + vint4 output_scale = select(vint4(257), vint4(1), 
hdr_lanes); + output0 = output0 * output_scale; + output1 = output1 * output_scale; } - - output0 = output0 * output_scale; - output1 = output1 * output_scale; } diff --git a/3rdparty/astc-encoder/source/astcenc_compress_symbolic.cpp b/3rdparty/astc-encoder/source/astcenc_compress_symbolic.cpp index 0c90540..98d2495 100644 --- a/3rdparty/astc-encoder/source/astcenc_compress_symbolic.cpp +++ b/3rdparty/astc-encoder/source/astcenc_compress_symbolic.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2023 Arm Limited +// Copyright 2011-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -247,7 +247,7 @@ static bool realign_weights_decimated( } // Create an unquantized weight grid for this decimation level - alignas(ASTCENC_VECALIGN) float uq_weightsf[BLOCK_MAX_WEIGHTS]; + ASTCENC_ALIGNAS float uq_weightsf[BLOCK_MAX_WEIGHTS]; for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH) { vint unquant_value(dec_weights_uquant + we_idx); @@ -467,7 +467,7 @@ static float compress_symbolic_block_for_partition_1plane( qwt_bitcounts[i] = static_cast(bitcount); - alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS]; + ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS]; // Generate the optimized set of weights for the weight mode compute_quantized_weights_for_decimation( @@ -830,7 +830,7 @@ static float compress_symbolic_block_for_partition_2planes( unsigned int decimation_mode = bm.decimation_mode; const auto& di = bsd.get_decimation_info(decimation_mode); - alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS]; + ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS]; // Generate the optimized set of weights for the mode compute_quantized_weights_for_decimation( diff --git 
a/3rdparty/astc-encoder/source/astcenc_decompress_symbolic.cpp b/3rdparty/astc-encoder/source/astcenc_decompress_symbolic.cpp index dd331a9..7463f7e 100644 --- a/3rdparty/astc-encoder/source/astcenc_decompress_symbolic.cpp +++ b/3rdparty/astc-encoder/source/astcenc_decompress_symbolic.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2023 Arm Limited +// Copyright 2011-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -27,15 +27,15 @@ /** * @brief Compute the integer linear interpolation of two color endpoints. * - * @param decode_mode The ASTC profile (linear or sRGB) + * @param u8_mask The mask for lanes using decode_unorm8 rather than decode_f16. * @param color0 The endpoint0 color. * @param color1 The endpoint1 color. - * @param weights The interpolation weight (between 0 and 64). + * @param weights The interpolation weight (between 0 and 64). * * @return The interpolated color. */ static vint4 lerp_color_int( - astcenc_profile decode_mode, + vmask4 u8_mask, vint4 color0, vint4 color1, vint4 weights @@ -43,24 +43,18 @@ static vint4 lerp_color_int( vint4 weight1 = weights; vint4 weight0 = vint4(64) - weight1; - if (decode_mode == ASTCENC_PRF_LDR_SRGB) - { - color0 = asr<8>(color0); - color1 = asr<8>(color1); - } - vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32); color = asr<6>(color); - if (decode_mode == ASTCENC_PRF_LDR_SRGB) - { - color = color * vint4(257); - } + // For decode_unorm8 values force the codec to bit replicate. 
This allows the + // rest of the codec to assume the full 0xFFFF range for everything and ignore + // the decode_mode setting + vint4 color_u8 = asr<8>(color) * vint4(257); + color = select(color, color_u8, u8_mask); return color; } - /** * @brief Convert integer color value into a float value for the decoder. * @@ -229,12 +223,13 @@ void decompress_symbolic_block( { vint4 colori(scb.constant_color); - // For sRGB decoding a real decoder would just use the top 8 bits for color conversion. - // We don't color convert, so rescale the top 8 bits into the full 16 bit dynamic range. - if (decode_mode == ASTCENC_PRF_LDR_SRGB) - { - colori = asr<8>(colori) * 257; - } + // Determine the UNORM8 rounding on the decode + vmask4 u8_mask = get_u8_component_mask(decode_mode, blk); + + // The real decoder would just use the top 8 bits, but we rescale + // in to a 16-bit value that rounds correctly. + vint4 colori_u8 = asr<8>(colori) * 257; + colori = select(colori, colori_u8, u8_mask); vint4 colorf16 = unorm16_to_sf16(colori); color = float16_to_float(colorf16); @@ -289,6 +284,8 @@ void decompress_symbolic_block( int plane2_component = scb.plane2_component; vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component); + vmask4 u8_mask = get_u8_component_mask(decode_mode, blk); + for (int i = 0; i < partition_count; i++) { // Decode the color endpoints for this partition @@ -310,7 +307,7 @@ void decompress_symbolic_block( { int tix = pi.texels_of_partition[i][j]; vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask); - vint4 color = lerp_color_int(decode_mode, ep0, ep1, weight); + vint4 color = lerp_color_int(u8_mask, ep0, ep1, weight); vfloat4 colorf = decode_texel(color, lns_mask); blk.data_r[tix] = colorf.lane<0>(); @@ -365,12 +362,14 @@ float compute_symbolic_block_difference_2plane( rgb_lns, a_lns, ep0, ep1); + vmask4 u8_mask = get_u8_component_mask(config.profile, blk); + // Unpack and compute error for each texel in the partition 
unsigned int texel_count = bsd.texel_count; for (unsigned int i = 0; i < texel_count; i++) { vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask); - vint4 colori = lerp_color_int(config.profile, ep0, ep1, weight); + vint4 colori = lerp_color_int(u8_mask, ep0, ep1, weight); vfloat4 color = int_to_float(colori); vfloat4 oldColor = blk.texel(i); @@ -444,6 +443,8 @@ float compute_symbolic_block_difference_1plane( int plane1_weights[BLOCK_MAX_TEXELS]; unpack_weights(bsd, scb, di, false, plane1_weights, nullptr); + vmask4 u8_mask = get_u8_component_mask(config.profile, blk); + vfloat4 summa = vfloat4::zero(); for (unsigned int i = 0; i < partition_count; i++) { @@ -464,7 +465,7 @@ float compute_symbolic_block_difference_1plane( for (unsigned int j = 0; j < texel_count; j++) { unsigned int tix = pi.texels_of_partition[i][j]; - vint4 colori = lerp_color_int(config.profile, ep0, ep1, + vint4 colori = lerp_color_int(u8_mask, ep0, ep1, vint4(plane1_weights[tix])); vfloat4 color = int_to_float(colori); @@ -532,7 +533,7 @@ float compute_symbolic_block_difference_1plane_1partition( const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode); // Unquantize and undecimate the weights - alignas(ASTCENC_VECALIGN) int plane1_weights[BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS int plane1_weights[BLOCK_MAX_TEXELS]; unpack_weights(bsd, scb, di, false, plane1_weights, nullptr); // Decode the color endpoints for this partition @@ -547,19 +548,12 @@ float compute_symbolic_block_difference_1plane_1partition( rgb_lns, a_lns, ep0, ep1); - - // Pre-shift sRGB so things round correctly - if (config.profile == ASTCENC_PRF_LDR_SRGB) - { - ep0 = asr<8>(ep0); - ep1 = asr<8>(ep1); - } + vmask4 u8_mask = get_u8_component_mask(config.profile, blk); // Unpack and compute error for each texel in the partition vfloatacc summav = vfloatacc::zero(); vint lane_id = vint::lane_id(); - vint srgb_scale(config.profile == ASTCENC_PRF_LDR_SRGB ? 
257 : 1); unsigned int texel_count = bsd.texel_count; for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) @@ -578,11 +572,25 @@ float compute_symbolic_block_difference_1plane_1partition( vint ep0_b = vint(ep0.lane<2>()) * weight0; vint ep0_a = vint(ep0.lane<3>()) * weight0; - // Shift so things round correctly - vint colori_r = asr<6>(ep0_r + ep1_r + vint(32)) * srgb_scale; - vint colori_g = asr<6>(ep0_g + ep1_g + vint(32)) * srgb_scale; - vint colori_b = asr<6>(ep0_b + ep1_b + vint(32)) * srgb_scale; - vint colori_a = asr<6>(ep0_a + ep1_a + vint(32)) * srgb_scale; + // Combine contributions + vint colori_r = asr<6>(ep0_r + ep1_r + vint(32)); + vint colori_g = asr<6>(ep0_g + ep1_g + vint(32)); + vint colori_b = asr<6>(ep0_b + ep1_b + vint(32)); + vint colori_a = asr<6>(ep0_a + ep1_a + vint(32)); + + // If using a U8 decode mode bit replicate top 8 bits + // so rest of codec can assume 0xFFFF max range everywhere + vint colori_r8 = asr<8>(colori_r) * vint(257); + colori_r = select(colori_r, colori_r8, vmask(u8_mask.lane<0>())); + + vint colori_g8 = asr<8>(colori_g) * vint(257); + colori_g = select(colori_g, colori_g8, vmask(u8_mask.lane<1>())); + + vint colori_b8 = asr<8>(colori_b) * vint(257); + colori_b = select(colori_b, colori_b8, vmask(u8_mask.lane<2>())); + + vint colori_a8 = asr<8>(colori_a) * vint(257); + colori_a = select(colori_a, colori_a8, vmask(u8_mask.lane<3>())); // Compute color diff vfloat color_r = int_to_float(colori_r); diff --git a/3rdparty/astc-encoder/source/astcenc_entry.cpp b/3rdparty/astc-encoder/source/astcenc_entry.cpp index 03cf6a8..5dc3801 100644 --- a/3rdparty/astc-encoder/source/astcenc_entry.cpp +++ b/3rdparty/astc-encoder/source/astcenc_entry.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2023 Arm Limited +// Copyright 2011-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); 
you may not // use this file except in compliance with the License. You may obtain a copy @@ -217,11 +217,13 @@ static astcenc_error validate_block_size( /** * @brief Validate flags. * - * @param flags The flags to check. + * @param profile The profile to check. + * @param flags The flags to check. * * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. */ static astcenc_error validate_flags( + astcenc_profile profile, unsigned int flags ) { // Flags field must not contain any unknown flag bits @@ -239,6 +241,14 @@ static astcenc_error validate_flags( return ASTCENC_ERR_BAD_FLAGS; } + // Decode_unorm8 must only be used with an LDR profile + bool is_unorm8 = flags & ASTCENC_FLG_USE_DECODE_UNORM8; + bool is_hdr = (profile == ASTCENC_PRF_HDR) || (profile == ASTCENC_PRF_HDR_RGB_LDR_A); + if (is_unorm8 && is_hdr) + { + return ASTCENC_ERR_BAD_DECODE_MODE; + } + return ASTCENC_SUCCESS; } @@ -364,7 +374,7 @@ static astcenc_error validate_config( return status; } - status = validate_flags(config.flags); + status = validate_flags(config.profile, config.flags); if (status != ASTCENC_SUCCESS) { return status; @@ -591,7 +601,7 @@ astcenc_error astcenc_config_init( } // Flags field must not contain any unknown flag bits - status = validate_flags(flags); + status = validate_flags(profile, flags); if (status != ASTCENC_SUCCESS) { return status; @@ -689,6 +699,12 @@ astcenc_error astcenc_context_alloc( } ctx->bsd = aligned_malloc(sizeof(block_size_descriptor), ASTCENC_VECALIGN); + if (!ctx->bsd) + { + delete ctxo; + return ASTCENC_ERR_OUT_OF_MEM; + } + bool can_omit_modes = static_cast(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY); init_block_size_descriptor(config.block_x, config.block_y, config.block_z, can_omit_modes, @@ -698,7 +714,7 @@ astcenc_error astcenc_context_alloc( #if !defined(ASTCENC_DECOMPRESS_ONLY) // Do setup only needed by compression - if (!(status & ASTCENC_FLG_DECOMPRESS_ONLY)) + if (!(ctx->config.flags & 
ASTCENC_FLG_DECOMPRESS_ONLY)) { // Turn a dB limit into a per-texel error for faster use later if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB)) @@ -712,7 +728,7 @@ astcenc_error astcenc_context_alloc( size_t worksize = sizeof(compression_working_buffers) * thread_count; ctx->working_buffers = aligned_malloc(worksize, ASTCENC_VECALIGN); - static_assert((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0, + static_assert((ASTCENC_VECALIGN == 0) || ((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0), "compression_working_buffers size must be multiple of vector alignment"); if (!ctx->working_buffers) { @@ -802,6 +818,8 @@ static void compress_image( int row_blocks = xblocks; int plane_blocks = xblocks * yblocks; + blk.decode_unorm8 = ctxo.context.config.flags & ASTCENC_FLG_USE_DECODE_UNORM8; + // Populate the block channel weights blk.channel_weight = vfloat4(ctx.config.cw_r_weight, ctx.config.cw_g_weight, @@ -812,7 +830,7 @@ static void compress_image( auto& temp_buffers = ctx.working_buffers[thread_index]; // Only the first thread actually runs the initializer - ctxo.manage_compress.init(block_count); + ctxo.manage_compress.init(block_count, ctx.config.progress_callback); // Determine if we can use an optimized load function bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) || @@ -1137,6 +1155,7 @@ astcenc_error astcenc_decompress_image( unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x; unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y; unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z; + unsigned int block_count = zblocks * yblocks * xblocks; int row_blocks = xblocks; int plane_blocks = xblocks * yblocks; @@ -1148,9 +1167,12 @@ astcenc_error astcenc_decompress_image( return ASTCENC_ERR_OUT_OF_MEM; } - image_block blk; + image_block blk {}; blk.texel_count = static_cast(block_x * block_y * block_z); + // Decode mode inferred 
from the output data type + blk.decode_unorm8 = image_out.data_type == ASTCENC_TYPE_U8; + // If context thread count is one then implicitly reset if (ctx->thread_count == 1) { @@ -1158,7 +1180,7 @@ astcenc_error astcenc_decompress_image( } // Only the first thread actually runs the initializer - ctxo->manage_decompress.init(zblocks * yblocks * xblocks); + ctxo->manage_decompress.init(block_count, nullptr); // All threads run this processing loop until there is no work remaining while (true) @@ -1356,6 +1378,8 @@ const char* astcenc_get_error_string( return "ASTCENC_ERR_BAD_CONTEXT"; case ASTCENC_ERR_NOT_IMPLEMENTED: return "ASTCENC_ERR_NOT_IMPLEMENTED"; + case ASTCENC_ERR_BAD_DECODE_MODE: + return "ASTCENC_ERR_BAD_DECODE_MODE"; #if defined(ASTCENC_DIAGNOSTICS) case ASTCENC_ERR_DTRACE_FAILURE: return "ASTCENC_ERR_DTRACE_FAILURE"; diff --git a/3rdparty/astc-encoder/source/astcenc_ideal_endpoints_and_weights.cpp b/3rdparty/astc-encoder/source/astcenc_ideal_endpoints_and_weights.cpp index 89ec9dc..051782f 100644 --- a/3rdparty/astc-encoder/source/astcenc_ideal_endpoints_and_weights.cpp +++ b/3rdparty/astc-encoder/source/astcenc_ideal_endpoints_and_weights.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2023 Arm Limited +// Copyright 2011-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. 
You may obtain a copy @@ -873,7 +873,7 @@ void compute_ideal_weights_for_decimation( } // Otherwise compute an estimate and perform single refinement iteration - alignas(ASTCENC_VECALIGN) float infilled_weights[BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS float infilled_weights[BLOCK_MAX_TEXELS]; // Compute an initial average for each decimated weight bool constant_wes = ei.is_constant_weight_error_scale; @@ -1171,7 +1171,7 @@ void recompute_ideal_colors_1plane( promise(total_texel_count > 0); promise(partition_count > 0); - alignas(ASTCENC_VECALIGN) float dec_weight[BLOCK_MAX_WEIGHTS]; + ASTCENC_ALIGNAS float dec_weight[BLOCK_MAX_WEIGHTS]; for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) { vint unquant_value(dec_weights_uquant + i); @@ -1179,7 +1179,7 @@ void recompute_ideal_colors_1plane( storea(unquant_valuef, dec_weight + i); } - alignas(ASTCENC_VECALIGN) float undec_weight[BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS float undec_weight[BLOCK_MAX_TEXELS]; float* undec_weight_ref; if (di.max_texel_weight_count == 1) { @@ -1394,8 +1394,8 @@ void recompute_ideal_colors_2planes( promise(total_texel_count > 0); promise(weight_count > 0); - alignas(ASTCENC_VECALIGN) float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE]; - alignas(ASTCENC_VECALIGN) float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE]; + ASTCENC_ALIGNAS float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE]; + ASTCENC_ALIGNAS float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE]; assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE); @@ -1410,8 +1410,8 @@ void recompute_ideal_colors_2planes( storea(unquant_value2f, dec_weight_plane2 + i); } - alignas(ASTCENC_VECALIGN) float undec_weight_plane1[BLOCK_MAX_TEXELS]; - alignas(ASTCENC_VECALIGN) float undec_weight_plane2[BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS float undec_weight_plane1[BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS float undec_weight_plane2[BLOCK_MAX_TEXELS]; float* undec_weight_plane1_ref; float* undec_weight_plane2_ref; diff --git 
a/3rdparty/astc-encoder/source/astcenc_image.cpp b/3rdparty/astc-encoder/source/astcenc_image.cpp index b60d9cd..079f69f 100644 --- a/3rdparty/astc-encoder/source/astcenc_image.cpp +++ b/3rdparty/astc-encoder/source/astcenc_image.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2022 Arm Limited +// Copyright 2011-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -109,7 +109,7 @@ static vfloat4 swz_texel( vfloat4 data, const astcenc_swizzle& swz ) { - alignas(16) float datas[6]; + ASTCENC_ALIGNAS float datas[6]; storea(data, datas); datas[ASTCENC_SWZ_0] = 0.0f; diff --git a/3rdparty/astc-encoder/source/astcenc_integer_sequence.cpp b/3rdparty/astc-encoder/source/astcenc_integer_sequence.cpp index 4167503..41dc38b 100644 --- a/3rdparty/astc-encoder/source/astcenc_integer_sequence.cpp +++ b/3rdparty/astc-encoder/source/astcenc_integer_sequence.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2021 Arm Limited +// Copyright 2011-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -464,10 +464,10 @@ static inline void write_bits( } /** - * @brief Read up to 8 bits at an arbitrary bit offset. + * @brief Read up to 16 bits from two bytes. * - * The stored value is at most 8 bits, but can be stored at an offset of between 0 and 7 bits so may - * span two separate bytes in memory. + * This function reads a packed N-bit field from two bytes in memory. The stored value must exist + * within the two bytes, but can start at an arbitrary bit offset and span the two bytes in memory. 
* * @param bitcount The number of bits to read. * @param bitoffset The bit offset to read from, between 0 and 7. diff --git a/3rdparty/astc-encoder/source/astcenc_internal.h b/3rdparty/astc-encoder/source/astcenc_internal.h index b1da41b..df6e07f 100644 --- a/3rdparty/astc-encoder/source/astcenc_internal.h +++ b/3rdparty/astc-encoder/source/astcenc_internal.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2023 Arm Limited +// Copyright 2011-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -29,6 +29,7 @@ #include #endif #include +#include #include "astcenc.h" #include "astcenc_mathlib.h" @@ -325,10 +326,10 @@ struct partition_info uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS]; /** @brief The partition of each texel in the block. */ - uint8_t partition_of_texel[BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS uint8_t partition_of_texel[BLOCK_MAX_TEXELS]; /** @brief The list of texels in each partition. */ - uint8_t texels_of_partition[BLOCK_MAX_PARTITIONS][BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS uint8_t texels_of_partition[BLOCK_MAX_PARTITIONS][BLOCK_MAX_TEXELS]; }; /** @@ -366,40 +367,40 @@ struct decimation_info * @brief The number of weights that contribute to each texel. * Value is between 1 and 4. */ - uint8_t texel_weight_count[BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS uint8_t texel_weight_count[BLOCK_MAX_TEXELS]; /** * @brief The weight index of the N weights that are interpolated for each texel. * Stored transposed to improve vectorization. */ - uint8_t texel_weights_tr[4][BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS uint8_t texel_weights_tr[4][BLOCK_MAX_TEXELS]; /** * @brief The bilinear contribution of the N weights that are interpolated for each texel. * Value is between 0 and 16, stored transposed to improve vectorization. 
*/ - uint8_t texel_weight_contribs_int_tr[4][BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS uint8_t texel_weight_contribs_int_tr[4][BLOCK_MAX_TEXELS]; /** * @brief The bilinear contribution of the N weights that are interpolated for each texel. * Value is between 0 and 1, stored transposed to improve vectorization. */ - alignas(ASTCENC_VECALIGN) float texel_weight_contribs_float_tr[4][BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS float texel_weight_contribs_float_tr[4][BLOCK_MAX_TEXELS]; /** @brief The number of texels that each stored weight contributes to. */ - uint8_t weight_texel_count[BLOCK_MAX_WEIGHTS]; + ASTCENC_ALIGNAS uint8_t weight_texel_count[BLOCK_MAX_WEIGHTS]; /** * @brief The list of texels that use a specific weight index. * Stored transposed to improve vectorization. */ - uint8_t weight_texels_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS]; + ASTCENC_ALIGNAS uint8_t weight_texels_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS]; /** * @brief The bilinear contribution to the N texels that use each weight. * Value is between 0 and 1, stored transposed to improve vectorization. */ - alignas(ASTCENC_VECALIGN) float weights_texel_contribs_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS]; + ASTCENC_ALIGNAS float weights_texel_contribs_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS]; /** * @brief The bilinear contribution to the Nth texel that uses each weight. @@ -579,7 +580,7 @@ struct block_size_descriptor decimation_mode decimation_modes[WEIGHTS_MAX_DECIMATION_MODES]; /** @brief The active decimation tables, stored in low indices. */ - alignas(ASTCENC_VECALIGN) decimation_info decimation_tables[WEIGHTS_MAX_DECIMATION_MODES]; + ASTCENC_ALIGNAS decimation_info decimation_tables[WEIGHTS_MAX_DECIMATION_MODES]; /** @brief The packed block mode array index, or @c BLOCK_BAD_BLOCK_MODE if not active. 
*/ uint16_t block_mode_packed_index[WEIGHTS_MAX_BLOCK_MODES]; @@ -731,7 +732,11 @@ struct block_size_descriptor * * The @c data_[rgba] fields store the image data in an encoded SoA float form designed for easy * vectorization. Input data is converted to float and stored as values between 0 and 65535. LDR - * data is stored as direct UNORM data, HDR data is stored as LNS data. + * data is stored as direct UNORM data, HDR data is stored as LNS data. They are allocated SIMD + * elements over-size to allow vectorized stores of unaligned and partial SIMD lanes (e.g. in a + * 6x6x6 block the final row write will read elements 210-217 (vec8) or 214-217 (vec4), which is + * two elements above the last real data element). The overspill values are never written to memory, + * and would be benign, but the padding avoids hitting undefined behavior. * * The @c rgb_lns and @c alpha_lns fields that assigned a per-texel use of HDR are only used during * decompression. The current compressor will always use HDR endpoint formats when in HDR mode. @@ -739,16 +744,16 @@ struct block_size_descriptor struct image_block { /** @brief The input (compress) or output (decompress) data for the red color component. */ - alignas(ASTCENC_VECALIGN) float data_r[BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS float data_r[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1]; /** @brief The input (compress) or output (decompress) data for the green color component. */ - alignas(ASTCENC_VECALIGN) float data_g[BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS float data_g[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1]; /** @brief The input (compress) or output (decompress) data for the blue color component. */ - alignas(ASTCENC_VECALIGN) float data_b[BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS float data_b[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1]; /** @brief The input (compress) or output (decompress) data for the alpha color component. 
*/ - alignas(ASTCENC_VECALIGN) float data_a[BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS float data_a[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1]; /** @brief The number of texels in the block. */ uint8_t texel_count; @@ -771,6 +776,9 @@ struct image_block /** @brief Is this grayscale block where R == G == B for all texels? */ bool grayscale; + /** @brief Is the eventual decode using decode_unorm8 rounding? */ + bool decode_unorm8; + /** @brief Set to 1 if a texel is using HDR RGB endpoints (decompression only). */ uint8_t rgb_lns[BLOCK_MAX_TEXELS]; @@ -897,10 +905,10 @@ struct endpoints_and_weights endpoints ep; /** @brief The ideal weight for each texel; may be undecimated or decimated. */ - alignas(ASTCENC_VECALIGN) float weights[BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS float weights[BLOCK_MAX_TEXELS]; /** @brief The ideal weight error scaling for each texel; may be undecimated or decimated. */ - alignas(ASTCENC_VECALIGN) float weight_error_scale[BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS float weight_error_scale[BLOCK_MAX_TEXELS]; }; /** @@ -930,7 +938,7 @@ struct encoding_choice_errors /** * @brief Preallocated working buffers, allocated per thread during context creation. */ -struct alignas(ASTCENC_VECALIGN) compression_working_buffers +struct ASTCENC_ALIGNAS compression_working_buffers { /** @brief Ideal endpoints and weights for plane 1. */ endpoints_and_weights ei1; @@ -946,17 +954,17 @@ struct alignas(ASTCENC_VECALIGN) compression_working_buffers * * For two planes, second plane starts at @c WEIGHTS_PLANE2_OFFSET offsets. */ - alignas(ASTCENC_VECALIGN) float dec_weights_ideal[WEIGHTS_MAX_DECIMATION_MODES * BLOCK_MAX_WEIGHTS]; + ASTCENC_ALIGNAS float dec_weights_ideal[WEIGHTS_MAX_DECIMATION_MODES * BLOCK_MAX_WEIGHTS]; /** * @brief Decimated quantized weight values in the unquantized 0-64 range. * * For two planes, second plane starts at @c WEIGHTS_PLANE2_OFFSET offsets. 
*/ - uint8_t dec_weights_uquant[WEIGHTS_MAX_BLOCK_MODES * BLOCK_MAX_WEIGHTS]; + ASTCENC_ALIGNAS uint8_t dec_weights_uquant[WEIGHTS_MAX_BLOCK_MODES * BLOCK_MAX_WEIGHTS]; /** @brief Error of the best encoding combination for each block mode. */ - alignas(ASTCENC_VECALIGN) float errors_of_best_combination[WEIGHTS_MAX_BLOCK_MODES]; + ASTCENC_ALIGNAS float errors_of_best_combination[WEIGHTS_MAX_BLOCK_MODES]; /** @brief The best color quant for each block mode. */ uint8_t best_quant_levels[WEIGHTS_MAX_BLOCK_MODES]; @@ -1107,7 +1115,7 @@ struct symbolic_compressed_block * * If dual plane, the second plane starts at @c weights[WEIGHTS_PLANE2_OFFSET]. */ - uint8_t weights[BLOCK_MAX_WEIGHTS]; + ASTCENC_ALIGNAS uint8_t weights[BLOCK_MAX_WEIGHTS]; /** * @brief Get the weight quantization used by this block mode. @@ -1563,6 +1571,33 @@ unsigned int find_best_partition_candidates( Functionality for managing images and image related data. ============================================================================ */ +/** + * @brief Get a vector mask indicating lanes decompressing into a UNORM8 value. + * + * @param decode_mode The color profile for LDR_SRGB settings. + * @param blk The image block for output image bitness settings. + * + * @return The component mask vector. + */ +static inline vmask4 get_u8_component_mask( + astcenc_profile decode_mode, + const image_block& blk +) { + vmask4 u8_mask(false); + // Decode mode writing to a unorm8 output value + if (blk.decode_unorm8) + { + u8_mask = vmask4(true); + } + // SRGB writing to a unorm8 RGB value + else if (decode_mode == ASTCENC_PRF_LDR_SRGB) + { + u8_mask = vmask4(true, true, true, false); + } + + return u8_mask; +} + /** * @brief Setup computation of regional averages in an image. * @@ -1816,7 +1851,7 @@ uint8_t pack_color_endpoints( * * Endpoints must be unscrambled and converted into the 0-255 range before calling this functions. * - * @param decode_mode The decode mode (LDR, HDR). 
+ * @param decode_mode The decode mode (LDR, HDR, etc). * @param format The color endpoint mode used. * @param input The raw array of encoded input integers. The length of this array * depends on @c format; it can be safely assumed to be large enough. @@ -2142,10 +2177,11 @@ Platform-specific functions. /** * @brief Allocate an aligned memory buffer. * - * Allocated memory must be freed by aligned_free; + * Allocated memory must be freed by aligned_free. * * @param size The desired buffer size. - * @param align The desired buffer alignment; must be 2^N. + * @param align The desired buffer alignment; must be 2^N, may be increased + * by the implementation to a minimum allowable alignment. * * @return The memory buffer pointer or nullptr on allocation failure. */ @@ -2155,10 +2191,14 @@ T* aligned_malloc(size_t size, size_t align) void* ptr; int error = 0; + // Don't allow this to under-align a type + size_t min_align = astc::max(alignof(T), sizeof(void*)); + size_t real_align = astc::max(min_align, align); + #if defined(_WIN32) - ptr = _aligned_malloc(size, align); + ptr = _aligned_malloc(size, real_align); #else - error = posix_memalign(&ptr, align, size); + error = posix_memalign(&ptr, real_align, size); #endif if (error || (!ptr)) diff --git a/3rdparty/astc-encoder/source/astcenc_internal_entry.h b/3rdparty/astc-encoder/source/astcenc_internal_entry.h index 4e87945..c283c5a 100644 --- a/3rdparty/astc-encoder/source/astcenc_internal_entry.h +++ b/3rdparty/astc-encoder/source/astcenc_internal_entry.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2022 Arm Limited +// Copyright 2011-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -118,6 +118,18 @@ private: /** @brief Number of tasks that need to be processed. 
*/ unsigned int m_task_count; + /** @brief Progress callback (optional). */ + astcenc_progress_callback m_callback; + + /** @brief Lock used for callback synchronization. */ + std::mutex m_callback_lock; + + /** @brief Minimum progress before making a callback. */ + float m_callback_min_diff; + + /** @brief Last progress callback value. */ + float m_callback_last_value; + public: /** @brief Create a new ParallelManager. */ ParallelManager() @@ -138,6 +150,9 @@ public: m_start_count = 0; m_done_count = 0; m_task_count = 0; + m_callback = nullptr; + m_callback_last_value = 0.0f; + m_callback_min_diff = 1.0f; } /** @@ -166,14 +181,20 @@ public: * initialization. Other threads will block and wait for it to complete. * * @param task_count Total number of tasks needing processing. + * @param callback Function pointer for progress status callbacks. */ - void init(unsigned int task_count) + void init(unsigned int task_count, astcenc_progress_callback callback) { std::lock_guard lck(m_lock); if (!m_init_done) { + m_callback = callback; m_task_count = task_count; m_init_done = true; + + // Report every 1% or 4096 blocks, whichever is larger, to avoid callback overhead + float min_diff = (4096.0f / static_cast(task_count)) * 100.0f; + m_callback_min_diff = astc::max(min_diff, 1.0f); } } @@ -212,12 +233,49 @@ public: { // Note: m_done_count cannot use an atomic without the mutex; this has a race between the // update here and the wait() for other threads - std::unique_lock lck(m_lock); - this->m_done_count += count; - if (m_done_count == m_task_count) + unsigned int local_count; + float local_last_value; { - lck.unlock(); - m_complete.notify_all(); + std::unique_lock lck(m_lock); + m_done_count += count; + local_count = m_done_count; + local_last_value = m_callback_last_value; + + if (m_done_count == m_task_count) + { + // Ensure the progress bar hits 100% + if (m_callback) + { + std::unique_lock cblck(m_callback_lock); + m_callback(100.0f); + m_callback_last_value = 100.0f; + 
} + + lck.unlock(); + m_complete.notify_all(); + } + } + + // Process progress callback if we have one + if (m_callback) + { + // Initial lockless test - have we progressed enough to emit? + float num = static_cast(local_count); + float den = static_cast(m_task_count); + float this_value = (num / den) * 100.0f; + bool report_test = (this_value - local_last_value) > m_callback_min_diff; + + // Recheck under lock, because another thread might report first + if (report_test) + { + std::unique_lock cblck(m_callback_lock); + bool report_retest = (this_value - m_callback_last_value) > m_callback_min_diff; + if (report_retest) + { + m_callback(this_value); + m_callback_last_value = this_value; + } + } } } diff --git a/3rdparty/astc-encoder/source/astcenc_mathlib.h b/3rdparty/astc-encoder/source/astcenc_mathlib.h index 0540c4f..562d659 100644 --- a/3rdparty/astc-encoder/source/astcenc_mathlib.h +++ b/3rdparty/astc-encoder/source/astcenc_mathlib.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2021 Arm Limited +// Copyright 2011-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. 
You may obtain a copy @@ -73,10 +73,22 @@ #endif #endif +// Force vector-sized SIMD alignment #if ASTCENC_AVX #define ASTCENC_VECALIGN 32 -#else +#elif ASTCENC_SSE || ASTCENC_NEON #define ASTCENC_VECALIGN 16 +// Use default alignment for non-SIMD builds +#else + #define ASTCENC_VECALIGN 0 +#endif + +// C++11 states that alignas(0) should be ignored but GCC doesn't do +// this on some versions, so workaround and avoid emitting alignas(0) +#if ASTCENC_VECALIGN > 0 + #define ASTCENC_ALIGNAS alignas(ASTCENC_VECALIGN) +#else + #define ASTCENC_ALIGNAS #endif #if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 || ASTCENC_POPCNT != 0 diff --git a/3rdparty/astc-encoder/source/astcenc_mathlib_softfloat.cpp b/3rdparty/astc-encoder/source/astcenc_mathlib_softfloat.cpp index fa66036..42db764 100644 --- a/3rdparty/astc-encoder/source/astcenc_mathlib_softfloat.cpp +++ b/3rdparty/astc-encoder/source/astcenc_mathlib_softfloat.cpp @@ -15,13 +15,13 @@ // under the License. // ---------------------------------------------------------------------------- -#include "astcenc_mathlib.h" - /** * @brief Soft-float library for IEEE-754. */ #if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0) +#include "astcenc_mathlib.h" + /* sized soft-float types. 
These are mapped to the sized integer types of C99, instead of C's floating-point types; this is because the library needs to maintain exact, bit-level control on all diff --git a/3rdparty/astc-encoder/source/astcenc_symbolic_physical.cpp b/3rdparty/astc-encoder/source/astcenc_symbolic_physical.cpp index c4da678..45d9abb 100644 --- a/3rdparty/astc-encoder/source/astcenc_symbolic_physical.cpp +++ b/3rdparty/astc-encoder/source/astcenc_symbolic_physical.cpp @@ -330,12 +330,14 @@ void physical_to_symbolic( return; } + // Low values span 3 bytes so need two read_bits calls int vx_low_s = read_bits(8, 12, pcb) | (read_bits(5, 12 + 8, pcb) << 8); - int vx_high_s = read_bits(8, 25, pcb) | (read_bits(5, 25 + 8, pcb) << 8); + int vx_high_s = read_bits(13, 25, pcb); int vx_low_t = read_bits(8, 38, pcb) | (read_bits(5, 38 + 8, pcb) << 8); - int vx_high_t = read_bits(8, 51, pcb) | (read_bits(5, 51 + 8, pcb) << 8); + int vx_high_t = read_bits(13, 51, pcb); - int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF && vx_low_t == 0x1FFF && vx_high_t == 0x1FFF; + int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF && + vx_low_t == 0x1FFF && vx_high_t == 0x1FFF; if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t) && !all_ones) { @@ -350,12 +352,14 @@ void physical_to_symbolic( int vx_high_s = read_bits(9, 19, pcb); int vx_low_t = read_bits(9, 28, pcb); int vx_high_t = read_bits(9, 37, pcb); - int vx_low_p = read_bits(9, 46, pcb); - int vx_high_p = read_bits(9, 55, pcb); + int vx_low_r = read_bits(9, 46, pcb); + int vx_high_r = read_bits(9, 55, pcb); - int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF && vx_low_t == 0x1FF && vx_high_t == 0x1FF && vx_low_p == 0x1FF && vx_high_p == 0x1FF; + int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF && + vx_low_t == 0x1FF && vx_high_t == 0x1FF && + vx_low_r == 0x1FF && vx_high_r == 0x1FF; - if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t || vx_low_p >= vx_high_p) && !all_ones) + if ((vx_low_s >= vx_high_s || vx_low_t >= 
vx_high_t || vx_low_r >= vx_high_r) && !all_ones) { scb.block_type = SYM_BTYPE_ERROR; return; @@ -470,8 +474,7 @@ void physical_to_symbolic( bitpos += 2; } } - scb.partition_index = static_cast(read_bits(6, 13, pcb) | - (read_bits(PARTITION_INDEX_BITS - 6, 19, pcb) << 6)); + scb.partition_index = static_cast(read_bits(10, 13, pcb)); } for (int i = 0; i < partition_count; i++) diff --git a/3rdparty/astc-encoder/source/astcenc_vecmathlib_avx2_8.h b/3rdparty/astc-encoder/source/astcenc_vecmathlib_avx2_8.h index 72ed19f..3ca25e3 100644 --- a/3rdparty/astc-encoder/source/astcenc_vecmathlib_avx2_8.h +++ b/3rdparty/astc-encoder/source/astcenc_vecmathlib_avx2_8.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2019-2022 Arm Limited +// Copyright 2019-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. 
You may obtain a copy @@ -1170,7 +1170,7 @@ ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint8 data, vmask8 ma */ ASTCENC_SIMD_INLINE void print(vint8 a) { - alignas(ASTCENC_VECALIGN) int v[8]; + alignas(32) int v[8]; storea(a, v); printf("v8_i32:\n %8d %8d %8d %8d %8d %8d %8d %8d\n", v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]); @@ -1181,7 +1181,7 @@ ASTCENC_SIMD_INLINE void print(vint8 a) */ ASTCENC_SIMD_INLINE void printx(vint8 a) { - alignas(ASTCENC_VECALIGN) int v[8]; + alignas(32) int v[8]; storea(a, v); printf("v8_i32:\n %08x %08x %08x %08x %08x %08x %08x %08x\n", v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]); @@ -1192,7 +1192,7 @@ ASTCENC_SIMD_INLINE void printx(vint8 a) */ ASTCENC_SIMD_INLINE void print(vfloat8 a) { - alignas(ASTCENC_VECALIGN) float v[8]; + alignas(32) float v[8]; storea(a, v); printf("v8_f32:\n %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n", static_cast(v[0]), static_cast(v[1]), diff --git a/3rdparty/astc-encoder/source/astcenc_vecmathlib_common_4.h b/3rdparty/astc-encoder/source/astcenc_vecmathlib_common_4.h index 86ee4fd..1e04367 100644 --- a/3rdparty/astc-encoder/source/astcenc_vecmathlib_common_4.h +++ b/3rdparty/astc-encoder/source/astcenc_vecmathlib_common_4.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2020-2021 Arm Limited +// Copyright 2020-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. 
You may obtain a copy @@ -383,7 +383,7 @@ static ASTCENC_SIMD_INLINE void bit_transfer_signed( */ ASTCENC_SIMD_INLINE void print(vint4 a) { - alignas(16) int v[4]; + ASTCENC_ALIGNAS int v[4]; storea(a, v); printf("v4_i32:\n %8d %8d %8d %8d\n", v[0], v[1], v[2], v[3]); @@ -394,7 +394,7 @@ ASTCENC_SIMD_INLINE void print(vint4 a) */ ASTCENC_SIMD_INLINE void printx(vint4 a) { - alignas(16) int v[4]; + ASTCENC_ALIGNAS int v[4]; storea(a, v); printf("v4_i32:\n %08x %08x %08x %08x\n", v[0], v[1], v[2], v[3]); @@ -405,7 +405,7 @@ ASTCENC_SIMD_INLINE void printx(vint4 a) */ ASTCENC_SIMD_INLINE void print(vfloat4 a) { - alignas(16) float v[4]; + ASTCENC_ALIGNAS float v[4]; storea(a, v); printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n", static_cast(v[0]), static_cast(v[1]), diff --git a/3rdparty/astc-encoder/source/astcenc_vecmathlib_neon_4.h b/3rdparty/astc-encoder/source/astcenc_vecmathlib_neon_4.h index c5ad872..42545e7 100644 --- a/3rdparty/astc-encoder/source/astcenc_vecmathlib_neon_4.h +++ b/3rdparty/astc-encoder/source/astcenc_vecmathlib_neon_4.h @@ -359,9 +359,9 @@ struct vmask4 /** * @brief Get the scalar from a single lane. */ - template ASTCENC_SIMD_INLINE uint32_t lane() const + template ASTCENC_SIMD_INLINE bool lane() const { - return vgetq_lane_u32(m, l); + return vgetq_lane_u32(m, l) != 0; } /** diff --git a/3rdparty/astc-encoder/source/astcenc_vecmathlib_none_4.h b/3rdparty/astc-encoder/source/astcenc_vecmathlib_none_4.h index 6dbb659..be7348e 100644 --- a/3rdparty/astc-encoder/source/astcenc_vecmathlib_none_4.h +++ b/3rdparty/astc-encoder/source/astcenc_vecmathlib_none_4.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2019-2022 Arm Limited +// Copyright 2019-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. 
You may obtain a copy @@ -351,6 +351,13 @@ struct vmask4 m[3] = d == false ? 0 : -1; } + /** + * @brief Get the scalar value of a single lane; true if the lane is non-zero. + */ + template ASTCENC_SIMD_INLINE bool lane() const + { + return m[l] != 0; + } /** * @brief The vector ... @@ -549,10 +556,16 @@ ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b) */ template ASTCENC_SIMD_INLINE vint4 lsl(vint4 a) { - return vint4(a.m[0] << s, - a.m[1] << s, - a.m[2] << s, - a.m[3] << s); + // Cast to unsigned to avoid shift in/out of sign bit undefined behavior + unsigned int as0 = static_cast(a.m[0]) << s; + unsigned int as1 = static_cast(a.m[1]) << s; + unsigned int as2 = static_cast(a.m[2]) << s; + unsigned int as3 = static_cast(a.m[3]) << s; + + return vint4(static_cast(as0), + static_cast(as1), + static_cast(as2), + static_cast(as3)); } /** @@ -560,6 +573,7 @@ template ASTCENC_SIMD_INLINE vint4 lsl(vint4 a) */ template ASTCENC_SIMD_INLINE vint4 lsr(vint4 a) { + // Cast to unsigned to avoid shift in/out of sign bit undefined behavior unsigned int as0 = static_cast(a.m[0]) >> s; unsigned int as1 = static_cast(a.m[1]) >> s; unsigned int as2 = static_cast(a.m[2]) >> s; diff --git a/3rdparty/astc-encoder/source/astcenc_vecmathlib_sse_4.h b/3rdparty/astc-encoder/source/astcenc_vecmathlib_sse_4.h index 4dd58d2..3dce5ba 100644 --- a/3rdparty/astc-encoder/source/astcenc_vecmathlib_sse_4.h +++ b/3rdparty/astc-encoder/source/astcenc_vecmathlib_sse_4.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2019-2022 Arm Limited +// Copyright 2019-2023 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -379,9 +379,9 @@ struct vmask4 /** * @brief Get the scalar value of a single lane.
*/ - template ASTCENC_SIMD_INLINE float lane() const + template ASTCENC_SIMD_INLINE bool lane() const { - return _mm_cvtss_f32(_mm_shuffle_ps(m, m, l)); + return _mm_cvtss_f32(_mm_shuffle_ps(m, m, l)) != 0.0f; } /** diff --git a/3rdparty/astc-encoder/source/astcenc_weight_align.cpp b/3rdparty/astc-encoder/source/astcenc_weight_align.cpp index aa6ab61..4e993e7 100644 --- a/3rdparty/astc-encoder/source/astcenc_weight_align.cpp +++ b/3rdparty/astc-encoder/source/astcenc_weight_align.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2023 Arm Limited +// Copyright 2011-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -60,8 +60,8 @@ static const uint8_t steps_for_quant_level[12] { 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32 }; -alignas(ASTCENC_VECALIGN) static float sin_table[SINCOS_STEPS][ANGULAR_STEPS]; -alignas(ASTCENC_VECALIGN) static float cos_table[SINCOS_STEPS][ANGULAR_STEPS]; +ASTCENC_ALIGNAS static float sin_table[SINCOS_STEPS][ANGULAR_STEPS]; +ASTCENC_ALIGNAS static float cos_table[SINCOS_STEPS][ANGULAR_STEPS]; #if defined(ASTCENC_DIAGNOSTICS) static bool print_once { true }; @@ -99,7 +99,7 @@ static void compute_angular_offsets( promise(weight_count > 0); promise(max_angular_steps > 0); - alignas(ASTCENC_VECALIGN) int isamplev[BLOCK_MAX_WEIGHTS]; + ASTCENC_ALIGNAS int isamplev[BLOCK_MAX_WEIGHTS]; // Precompute isample; arrays are always allocated 64 elements long for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) @@ -242,16 +242,16 @@ static void compute_angular_endpoints_for_quant_levels( unsigned int max_quant_steps = steps_for_quant_level[max_quant_level]; unsigned int max_angular_steps = steps_for_quant_level[max_quant_level]; - alignas(ASTCENC_VECALIGN) float angular_offsets[ANGULAR_STEPS]; + 
ASTCENC_ALIGNAS float angular_offsets[ANGULAR_STEPS]; compute_angular_offsets(weight_count, dec_weight_ideal_value, max_angular_steps, angular_offsets); - alignas(ASTCENC_VECALIGN) float lowest_weight[ANGULAR_STEPS]; - alignas(ASTCENC_VECALIGN) int32_t weight_span[ANGULAR_STEPS]; - alignas(ASTCENC_VECALIGN) float error[ANGULAR_STEPS]; - alignas(ASTCENC_VECALIGN) float cut_low_weight_error[ANGULAR_STEPS]; - alignas(ASTCENC_VECALIGN) float cut_high_weight_error[ANGULAR_STEPS]; + ASTCENC_ALIGNAS float lowest_weight[ANGULAR_STEPS]; + ASTCENC_ALIGNAS int32_t weight_span[ANGULAR_STEPS]; + ASTCENC_ALIGNAS float error[ANGULAR_STEPS]; + ASTCENC_ALIGNAS float cut_low_weight_error[ANGULAR_STEPS]; + ASTCENC_ALIGNAS float cut_high_weight_error[ANGULAR_STEPS]; compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value, max_angular_steps, max_quant_steps,