mirror of
https://github.com/bkaradzic/bimg.git
synced 2026-02-17 12:42:35 +01:00
Updated astc-encoder.
This commit is contained in:
8
3rdparty/astc-encoder/include/astcenc.h
vendored
8
3rdparty/astc-encoder/include/astcenc.h
vendored
@@ -542,6 +542,14 @@ struct astcenc_config
|
||||
*/
|
||||
float tune_2plane_early_out_limit_correlation;
|
||||
|
||||
/**
|
||||
* @brief The config enable for the mode0 fast-path search.
|
||||
*
|
||||
* If this is set to TUNE_MIN_TEXELS_MODE0 or higher then the early-out fast mode0
|
||||
* search is enabled. This option is ineffective for 3D block sizes.
|
||||
*/
|
||||
float tune_search_mode0_enable;
|
||||
|
||||
#if defined(ASTCENC_DIAGNOSTICS)
|
||||
/**
|
||||
* @brief The path to save the diagnostic trace data to.
|
||||
|
||||
@@ -40,6 +40,27 @@
|
||||
|
||||
#include "astcenc_internal.h"
|
||||
|
||||
/**
|
||||
* @brief Compute the squared error of an LDR RGB or RGBA encoding.
|
||||
*
|
||||
* @param uquant0 The original (unquantized, floating-point) endpoint 0 color.
|
||||
* @param uquant1 The original (unquantized, floating-point) endpoint 1 color.
|
||||
* @param quant0 The unpacked quantized endpoint 0 color.
|
||||
* @param quant1 The unpacked quantized endpoint 1 color.
|
||||
*
|
||||
* @return The hadd_s() reduction of the squared per-lane differences of both endpoints; no division by a sample count is applied.
|
||||
*/
|
||||
static float get_rgba_encoding_error(
|
||||
vfloat4 uquant0,
|
||||
vfloat4 uquant1,
|
||||
vint4 quant0,
|
||||
vint4 quant1
|
||||
) {
|
||||
vfloat4 error0 = uquant0 - int_to_float(quant0);
|
||||
vfloat4 error1 = uquant1 - int_to_float(quant1);
|
||||
return hadd_s(error0 * error0 + error1 * error1);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Determine the quantized value given a quantization level.
|
||||
*
|
||||
@@ -56,6 +77,26 @@ static inline uint8_t quant_color(
|
||||
return color_unquant_to_uquant_tables[quant_level - QUANT_6][index];
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Determine the quantized values for lanes 0-2 of a color vector given a quantization level.
|
||||
*
|
||||
* @param quant_level The quantization level to use.
|
||||
* @param value The per-lane values to convert. These must be in the 0-255 range.
|
||||
*
|
||||
* @return The unpacked quantized values for lanes 0, 1 and 2, read from table entry (value * 2 + 1); lane 3 is set to 0.
|
||||
*/
|
||||
static inline vint4 quant_color3(
|
||||
quant_method quant_level,
|
||||
vint4 value
|
||||
) {
|
||||
vint4 index = value * 2 + 1;
|
||||
return vint4(
|
||||
color_unquant_to_uquant_tables[quant_level - QUANT_6][index.lane<0>()],
|
||||
color_unquant_to_uquant_tables[quant_level - QUANT_6][index.lane<1>()],
|
||||
color_unquant_to_uquant_tables[quant_level - QUANT_6][index.lane<2>()],
|
||||
0);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Determine the quantized value given a quantization level and residual.
|
||||
*
|
||||
@@ -83,6 +124,35 @@ static inline uint8_t quant_color(
|
||||
return color_unquant_to_uquant_tables[quant_level - QUANT_6][index];
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Determine the quantized values for lanes 0-2 of a color vector given a quantization level and residual.
|
||||
*
|
||||
* @param quant_level The quantization level to use.
|
||||
* @param value The per-lane values to convert. These must be in the 0-255 range.
|
||||
* @param valuef The original per-lane values before rounding, used to compute a residual.
|
||||
*
|
||||
* @return The unpacked quantized values for lanes 0, 1 and 2; lane 3 is set to 0.
|
||||
*/
|
||||
static inline vint4 quant_color3(
|
||||
quant_method quant_level,
|
||||
vint4 value,
|
||||
vfloat4 valuef
|
||||
) {
|
||||
vint4 index = value * 2;
|
||||
|
||||
// Compute the residual to decide whether ties round down (entry value * 2) or up (entry value * 2 + 1).
|
||||
// Test should be residual >= 0, but empirical testing shows a small bias (-0.1) helps.
|
||||
vfloat4 residual = valuef - int_to_float(value);
|
||||
vmask4 mask = residual >= vfloat4(-0.1f);
|
||||
index = select(index, index + 1, mask);
|
||||
|
||||
return vint4(
|
||||
color_unquant_to_uquant_tables[quant_level - QUANT_6][index.lane<0>()],
|
||||
color_unquant_to_uquant_tables[quant_level - QUANT_6][index.lane<1>()],
|
||||
color_unquant_to_uquant_tables[quant_level - QUANT_6][index.lane<2>()],
|
||||
0);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Quantize an LDR RGB color.
|
||||
*
|
||||
@@ -92,47 +162,33 @@ static inline uint8_t quant_color(
|
||||
*
|
||||
* @param color0 The input unquantized color0 endpoint.
|
||||
* @param color1 The input unquantized color1 endpoint.
|
||||
* @param[out] output The output endpoints, returned as (r0, r1, g0, g1, b0, b1).
|
||||
* @param[out] color0_out The output quantized color0 endpoint.
|
||||
* @param[out] color1_out The output quantized color1 endpoint.
|
||||
* @param quant_level The quantization level to use.
|
||||
*/
|
||||
static void quantize_rgb(
|
||||
vfloat4 color0,
|
||||
vfloat4 color1,
|
||||
uint8_t output[6],
|
||||
vint4& color0_out,
|
||||
vint4& color1_out,
|
||||
quant_method quant_level
|
||||
) {
|
||||
float scale = 1.0f / 257.0f;
|
||||
vint4 color0i, color1i;
|
||||
vfloat4 nudge(0.2f);
|
||||
|
||||
float r0 = astc::clamp255f(color0.lane<0>() * scale);
|
||||
float g0 = astc::clamp255f(color0.lane<1>() * scale);
|
||||
float b0 = astc::clamp255f(color0.lane<2>() * scale);
|
||||
|
||||
float r1 = astc::clamp255f(color1.lane<0>() * scale);
|
||||
float g1 = astc::clamp255f(color1.lane<1>() * scale);
|
||||
float b1 = astc::clamp255f(color1.lane<2>() * scale);
|
||||
|
||||
int ri0, gi0, bi0, ri1, gi1, bi1;
|
||||
float rgb0_addon = 0.0f;
|
||||
float rgb1_addon = 0.0f;
|
||||
do
|
||||
{
|
||||
ri0 = quant_color(quant_level, astc::max(astc::flt2int_rtn(r0 + rgb0_addon), 0), r0 + rgb0_addon);
|
||||
gi0 = quant_color(quant_level, astc::max(astc::flt2int_rtn(g0 + rgb0_addon), 0), g0 + rgb0_addon);
|
||||
bi0 = quant_color(quant_level, astc::max(astc::flt2int_rtn(b0 + rgb0_addon), 0), b0 + rgb0_addon);
|
||||
ri1 = quant_color(quant_level, astc::min(astc::flt2int_rtn(r1 + rgb1_addon), 255), r1 + rgb1_addon);
|
||||
gi1 = quant_color(quant_level, astc::min(astc::flt2int_rtn(g1 + rgb1_addon), 255), g1 + rgb1_addon);
|
||||
bi1 = quant_color(quant_level, astc::min(astc::flt2int_rtn(b1 + rgb1_addon), 255), b1 + rgb1_addon);
|
||||
vint4 color0q = max(float_to_int_rtn(color0), vint4(0));
|
||||
color0i = quant_color3(quant_level, color0q, color0);
|
||||
color0 = color0 - nudge;
|
||||
|
||||
rgb0_addon -= 0.2f;
|
||||
rgb1_addon += 0.2f;
|
||||
} while (ri0 + gi0 + bi0 > ri1 + gi1 + bi1);
|
||||
vint4 color1q = min(float_to_int_rtn(color1), vint4(255));
|
||||
color1i = quant_color3(quant_level, color1q, color1);
|
||||
color1 = color1 + nudge;
|
||||
} while (hadd_rgb_s(color0i) > hadd_rgb_s(color1i));
|
||||
|
||||
output[0] = static_cast<uint8_t>(ri0);
|
||||
output[1] = static_cast<uint8_t>(ri1);
|
||||
output[2] = static_cast<uint8_t>(gi0);
|
||||
output[3] = static_cast<uint8_t>(gi1);
|
||||
output[4] = static_cast<uint8_t>(bi0);
|
||||
output[5] = static_cast<uint8_t>(bi1);
|
||||
color0_out = color0i;
|
||||
color1_out = color1i;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -145,24 +201,24 @@ static void quantize_rgb(
|
||||
*
|
||||
* @param color0 The input unquantized color0 endpoint.
|
||||
* @param color1 The input unquantized color1 endpoint.
|
||||
* @param[out] output The output endpoints, returned as (r0, r1, g0, g1, b0, b1, a0, a1).
|
||||
* @param[out] color0_out The output quantized color0 endpoint.
|
||||
* @param[out] color1_out The output quantized color1 endpoint.
|
||||
* @param quant_level The quantization level to use.
|
||||
*/
|
||||
static void quantize_rgba(
|
||||
vfloat4 color0,
|
||||
vfloat4 color1,
|
||||
uint8_t output[8],
|
||||
vint4& color0_out,
|
||||
vint4& color1_out,
|
||||
quant_method quant_level
|
||||
) {
|
||||
float scale = 1.0f / 257.0f;
|
||||
quantize_rgb(color0, color1, color0_out, color1_out, quant_level);
|
||||
|
||||
float a0 = astc::clamp255f(color0.lane<3>() * scale);
|
||||
float a1 = astc::clamp255f(color1.lane<3>() * scale);
|
||||
float a0 = color0.lane<3>();
|
||||
float a1 = color1.lane<3>();
|
||||
|
||||
output[6] = quant_color(quant_level, astc::flt2int_rtn(a0), a0);
|
||||
output[7] = quant_color(quant_level, astc::flt2int_rtn(a1), a1);
|
||||
|
||||
quantize_rgb(color0, color1, output, quant_level);
|
||||
color0_out.set_lane<3>(quant_color(quant_level, astc::flt2int_rtn(a0), a0));
|
||||
color1_out.set_lane<3>(quant_color(quant_level, astc::flt2int_rtn(a1), a1));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -172,7 +228,8 @@ static void quantize_rgba(
|
||||
*
|
||||
* @param color0 The input unquantized color0 endpoint.
|
||||
* @param color1 The input unquantized color1 endpoint.
|
||||
* @param[out] output The output endpoints, returned as (r1, r0, g1, g0, b1, b0).
|
||||
* @param[out] color0_out The output quantized color0 endpoint.
|
||||
* @param[out] color1_out The output quantized color1 endpoint.
|
||||
* @param quant_level The quantization level to use.
|
||||
*
|
||||
* @return Returns @c false on failure, @c true on success.
|
||||
@@ -180,54 +237,35 @@ static void quantize_rgba(
|
||||
static bool try_quantize_rgb_blue_contract(
|
||||
vfloat4 color0,
|
||||
vfloat4 color1,
|
||||
uint8_t output[6],
|
||||
vint4& color0_out,
|
||||
vint4& color1_out,
|
||||
quant_method quant_level
|
||||
) {
|
||||
float scale = 1.0f / 257.0f;
|
||||
// Apply inverse blue-contraction
|
||||
color0 += color0 - color0.swz<2, 2, 2, 3>();
|
||||
color1 += color1 - color1.swz<2, 2, 2, 3>();
|
||||
|
||||
float r0 = color0.lane<0>() * scale;
|
||||
float g0 = color0.lane<1>() * scale;
|
||||
float b0 = color0.lane<2>() * scale;
|
||||
|
||||
float r1 = color1.lane<0>() * scale;
|
||||
float g1 = color1.lane<1>() * scale;
|
||||
float b1 = color1.lane<2>() * scale;
|
||||
|
||||
// Apply inverse blue-contraction. This can produce an overflow; which means BC cannot be used.
|
||||
r0 += (r0 - b0);
|
||||
g0 += (g0 - b0);
|
||||
r1 += (r1 - b1);
|
||||
g1 += (g1 - b1);
|
||||
|
||||
if (r0 < 0.0f || r0 > 255.0f || g0 < 0.0f || g0 > 255.0f || b0 < 0.0f || b0 > 255.0f ||
|
||||
r1 < 0.0f || r1 > 255.0f || g1 < 0.0f || g1 > 255.0f || b1 < 0.0f || b1 > 255.0f)
|
||||
// If anything overflows BC cannot be used
|
||||
vmask4 color0_error = (color0 < vfloat4(0.0f)) | (color0 > vfloat4(255.0f));
|
||||
vmask4 color1_error = (color1 < vfloat4(0.0f)) | (color1 > vfloat4(255.0f));
|
||||
if (any(color0_error | color1_error))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// Quantize the inverse-blue-contracted color
|
||||
int ri0 = quant_color(quant_level, astc::flt2int_rtn(r0), r0);
|
||||
int gi0 = quant_color(quant_level, astc::flt2int_rtn(g0), g0);
|
||||
int bi0 = quant_color(quant_level, astc::flt2int_rtn(b0), b0);
|
||||
// Quantize the inverse blue-contracted color
|
||||
vint4 color0i = quant_color3(quant_level, float_to_int_rtn(color0), color0);
|
||||
vint4 color1i = quant_color3(quant_level, float_to_int_rtn(color1), color1);
|
||||
|
||||
int ri1 = quant_color(quant_level, astc::flt2int_rtn(r1), r1);
|
||||
int gi1 = quant_color(quant_level, astc::flt2int_rtn(g1), g1);
|
||||
int bi1 = quant_color(quant_level, astc::flt2int_rtn(b1), b1);
|
||||
|
||||
// If color #1 is not larger than color #0 then blue-contraction cannot be used. Note that
|
||||
// blue-contraction and quantization change this order, which is why we must test afterwards.
|
||||
if (ri1 + gi1 + bi1 <= ri0 + gi0 + bi0)
|
||||
// If color #1 is not larger than color #0 then blue-contraction cannot be used
|
||||
// We must test afterwards because quantization can change the order
|
||||
if (hadd_rgb_s(color1i) <= hadd_rgb_s(color0i))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
output[0] = static_cast<uint8_t>(ri1);
|
||||
output[1] = static_cast<uint8_t>(ri0);
|
||||
output[2] = static_cast<uint8_t>(gi1);
|
||||
output[3] = static_cast<uint8_t>(gi0);
|
||||
output[4] = static_cast<uint8_t>(bi1);
|
||||
output[5] = static_cast<uint8_t>(bi0);
|
||||
|
||||
color0_out = color1i;
|
||||
color1_out = color0i;
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -238,7 +276,8 @@ static bool try_quantize_rgb_blue_contract(
|
||||
*
|
||||
* @param color0 The input unquantized color0 endpoint.
|
||||
* @param color1 The input unquantized color1 endpoint.
|
||||
* @param[out] output The output endpoints, returned as (r1, r0, g1, g0, b1, b0, a1, a0).
|
||||
* @param[out] color0_out The output quantized color0 endpoint.
|
||||
* @param[out] color1_out The output quantized color1 endpoint.
|
||||
* @param quant_level The quantization level to use.
|
||||
*
|
||||
* @return Returns @c false on failure, @c true on success.
|
||||
@@ -246,18 +285,22 @@ static bool try_quantize_rgb_blue_contract(
|
||||
static bool try_quantize_rgba_blue_contract(
|
||||
vfloat4 color0,
|
||||
vfloat4 color1,
|
||||
uint8_t output[8],
|
||||
vint4& color0_out,
|
||||
vint4& color1_out,
|
||||
quant_method quant_level
|
||||
) {
|
||||
float scale = 1.0f / 257.0f;
|
||||
if (try_quantize_rgb_blue_contract(color0, color1, color0_out, color1_out, quant_level))
|
||||
{
|
||||
float a0 = color0.lane<3>();
|
||||
float a1 = color1.lane<3>();
|
||||
|
||||
float a0 = astc::clamp255f(color0.lane<3>() * scale);
|
||||
float a1 = astc::clamp255f(color1.lane<3>() * scale);
|
||||
color0_out.set_lane<3>(quant_color(quant_level, astc::flt2int_rtn(a1), a1));
|
||||
color1_out.set_lane<3>(quant_color(quant_level, astc::flt2int_rtn(a0), a0));
|
||||
|
||||
output[6] = quant_color(quant_level, astc::flt2int_rtn(a1), a1);
|
||||
output[7] = quant_color(quant_level, astc::flt2int_rtn(a0), a0);
|
||||
return true;
|
||||
}
|
||||
|
||||
return try_quantize_rgb_blue_contract(color0, color1, output, quant_level);
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -269,7 +312,8 @@ static bool try_quantize_rgba_blue_contract(
|
||||
*
|
||||
* @param color0 The input unquantized color0 endpoint.
|
||||
* @param color1 The input unquantized color1 endpoint.
|
||||
* @param[out] output The output endpoints, returned as (r0, r1, g0, g1, b0, b1).
|
||||
* @param[out] color0_out The output quantized color0 endpoint.
|
||||
* @param[out] color1_out The output quantized color1 endpoint.
|
||||
* @param quant_level The quantization level to use.
|
||||
*
|
||||
* @return Returns @c false on failure, @c true on success.
|
||||
@@ -277,85 +321,54 @@ static bool try_quantize_rgba_blue_contract(
|
||||
static bool try_quantize_rgb_delta(
|
||||
vfloat4 color0,
|
||||
vfloat4 color1,
|
||||
uint8_t output[6],
|
||||
vint4& color0_out,
|
||||
vint4& color1_out,
|
||||
quant_method quant_level
|
||||
) {
|
||||
float scale = 1.0f / 257.0f;
|
||||
|
||||
float r0 = astc::clamp255f(color0.lane<0>() * scale);
|
||||
float g0 = astc::clamp255f(color0.lane<1>() * scale);
|
||||
float b0 = astc::clamp255f(color0.lane<2>() * scale);
|
||||
|
||||
float r1 = astc::clamp255f(color1.lane<0>() * scale);
|
||||
float g1 = astc::clamp255f(color1.lane<1>() * scale);
|
||||
float b1 = astc::clamp255f(color1.lane<2>() * scale);
|
||||
|
||||
// Transform r0 to unorm9
|
||||
int r0a = astc::flt2int_rtn(r0);
|
||||
int g0a = astc::flt2int_rtn(g0);
|
||||
int b0a = astc::flt2int_rtn(b0);
|
||||
|
||||
r0a <<= 1;
|
||||
g0a <<= 1;
|
||||
b0a <<= 1;
|
||||
// Transform color0 to unorm9
|
||||
vint4 color0a = float_to_int_rtn(color0);
|
||||
color0.set_lane<3>(0.0f);
|
||||
color0a = lsl<1>(color0a);
|
||||
|
||||
// Mask off the top bit
|
||||
int r0b = r0a & 0xFF;
|
||||
int g0b = g0a & 0xFF;
|
||||
int b0b = b0a & 0xFF;
|
||||
vint4 color0b = color0a & 0xFF;
|
||||
|
||||
// Quantize then unquantize in order to get a value that we take differences against
|
||||
int r0be = quant_color(quant_level, r0b);
|
||||
int g0be = quant_color(quant_level, g0b);
|
||||
int b0be = quant_color(quant_level, b0b);
|
||||
|
||||
r0b = r0be | (r0a & 0x100);
|
||||
g0b = g0be | (g0a & 0x100);
|
||||
b0b = b0be | (b0a & 0x100);
|
||||
vint4 color0be = quant_color3(quant_level, color0b);
|
||||
color0b = color0be | (color0a & 0x100);
|
||||
|
||||
// Get hold of the second value
|
||||
int r1d = astc::flt2int_rtn(r1);
|
||||
int g1d = astc::flt2int_rtn(g1);
|
||||
int b1d = astc::flt2int_rtn(b1);
|
||||
|
||||
r1d <<= 1;
|
||||
g1d <<= 1;
|
||||
b1d <<= 1;
|
||||
vint4 color1d = float_to_int_rtn(color1);
|
||||
color1d = lsl<1>(color1d);
|
||||
|
||||
// ... and take differences
|
||||
r1d -= r0b;
|
||||
g1d -= g0b;
|
||||
b1d -= b0b;
|
||||
color1d = color1d - color0b;
|
||||
color1d.set_lane<3>(0);
|
||||
|
||||
// Check if the difference is too large to be encodable
|
||||
if (r1d > 63 || g1d > 63 || b1d > 63 || r1d < -64 || g1d < -64 || b1d < -64)
|
||||
if (any((color1d > vint4(63)) | (color1d < vint4(-64))))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// Insert top bit of the base into the offset
|
||||
r1d &= 0x7F;
|
||||
g1d &= 0x7F;
|
||||
b1d &= 0x7F;
|
||||
|
||||
r1d |= (r0b & 0x100) >> 1;
|
||||
g1d |= (g0b & 0x100) >> 1;
|
||||
b1d |= (b0b & 0x100) >> 1;
|
||||
color1d = color1d & 0x7F;
|
||||
color1d = color1d | lsr<1>(color0b & 0x100);
|
||||
|
||||
// Then quantize and unquantize; if this causes either top two bits to flip, then encoding fails
|
||||
// since we have then corrupted either the top bit of the base or the sign bit of the offset
|
||||
int r1de = quant_color(quant_level, r1d);
|
||||
int g1de = quant_color(quant_level, g1d);
|
||||
int b1de = quant_color(quant_level, b1d);
|
||||
vint4 color1de = quant_color3(quant_level, color1d);
|
||||
|
||||
if (((r1d ^ r1de) | (g1d ^ g1de) | (b1d ^ b1de)) & 0xC0)
|
||||
vint4 color_flips = (color1d ^ color1de) & 0xC0;
|
||||
color_flips.set_lane<3>(0);
|
||||
if (any(color_flips != vint4::zero()))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// If the sum of offsets triggers blue-contraction then encoding fails
|
||||
vint4 ep0(r0be, g0be, b0be, 0);
|
||||
vint4 ep1(r1de, g1de, b1de, 0);
|
||||
vint4 ep0 = color0be;
|
||||
vint4 ep1 = color1de;
|
||||
bit_transfer_signed(ep1, ep0);
|
||||
if (hadd_rgb_s(ep1) < 0)
|
||||
{
|
||||
@@ -369,111 +382,90 @@ static bool try_quantize_rgb_delta(
|
||||
return false;
|
||||
}
|
||||
|
||||
output[0] = static_cast<uint8_t>(r0be);
|
||||
output[1] = static_cast<uint8_t>(r1de);
|
||||
output[2] = static_cast<uint8_t>(g0be);
|
||||
output[3] = static_cast<uint8_t>(g1de);
|
||||
output[4] = static_cast<uint8_t>(b0be);
|
||||
output[5] = static_cast<uint8_t>(b1de);
|
||||
|
||||
color0_out = color0be;
|
||||
color1_out = color1de;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Try to quantize an LDR RGB color using delta encoding and blue-contraction.
|
||||
*
|
||||
* Blue-contraction is only usable if encoded color 1 RGB is larger than color 0 RGB.
|
||||
*
|
||||
* @param color0 The input unquantized color0 endpoint.
|
||||
* @param color1 The input unquantized color1 endpoint.
|
||||
* @param[out] color0_out The output quantized color0 endpoint.
|
||||
* @param[out] color1_out The output quantized color1 endpoint.
|
||||
* @param quant_level The quantization level to use.
|
||||
*
|
||||
* @return Returns @c false on failure, @c true on success.
|
||||
*/
|
||||
static bool try_quantize_rgb_delta_blue_contract(
|
||||
vfloat4 color0,
|
||||
vfloat4 color1,
|
||||
uint8_t output[6],
|
||||
vint4& color0_out,
|
||||
vint4& color1_out,
|
||||
quant_method quant_level
|
||||
) {
|
||||
// Note: Switch around endpoint colors already at start
|
||||
float scale = 1.0f / 257.0f;
|
||||
std::swap(color0, color1);
|
||||
|
||||
float r1 = color0.lane<0>() * scale;
|
||||
float g1 = color0.lane<1>() * scale;
|
||||
float b1 = color0.lane<2>() * scale;
|
||||
// Apply inverse blue-contraction
|
||||
color0 += color0 - color0.swz<2, 2, 2, 3>();
|
||||
color1 += color1 - color1.swz<2, 2, 2, 3>();
|
||||
|
||||
float r0 = color1.lane<0>() * scale;
|
||||
float g0 = color1.lane<1>() * scale;
|
||||
float b0 = color1.lane<2>() * scale;
|
||||
|
||||
// Apply inverse blue-contraction. This can produce an overflow; which means BC cannot be used.
|
||||
r0 += (r0 - b0);
|
||||
g0 += (g0 - b0);
|
||||
r1 += (r1 - b1);
|
||||
g1 += (g1 - b1);
|
||||
|
||||
if (r0 < 0.0f || r0 > 255.0f || g0 < 0.0f || g0 > 255.0f || b0 < 0.0f || b0 > 255.0f ||
|
||||
r1 < 0.0f || r1 > 255.0f || g1 < 0.0f || g1 > 255.0f || b1 < 0.0f || b1 > 255.0f)
|
||||
// If anything overflows BC cannot be used
|
||||
vmask4 color0_error = (color0 < vfloat4(0.0f)) | (color0 > vfloat4(255.0f));
|
||||
vmask4 color1_error = (color1 < vfloat4(0.0f)) | (color1 > vfloat4(255.0f));
|
||||
if (any(color0_error | color1_error))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// Transform r0 to unorm9
|
||||
int r0a = astc::flt2int_rtn(r0);
|
||||
int g0a = astc::flt2int_rtn(g0);
|
||||
int b0a = astc::flt2int_rtn(b0);
|
||||
r0a <<= 1;
|
||||
g0a <<= 1;
|
||||
b0a <<= 1;
|
||||
// Transform color0 to unorm9
|
||||
vint4 color0a = float_to_int_rtn(color0);
|
||||
color0.set_lane<3>(0.0f);
|
||||
color0a = lsl<1>(color0a);
|
||||
|
||||
// Mask off the top bit
|
||||
int r0b = r0a & 0xFF;
|
||||
int g0b = g0a & 0xFF;
|
||||
int b0b = b0a & 0xFF;
|
||||
vint4 color0b = color0a & 0xFF;
|
||||
|
||||
// Quantize, then unquantize in order to get a value that we take differences against.
|
||||
int r0be = quant_color(quant_level, r0b);
|
||||
int g0be = quant_color(quant_level, g0b);
|
||||
int b0be = quant_color(quant_level, b0b);
|
||||
|
||||
r0b = r0be | (r0a & 0x100);
|
||||
g0b = g0be | (g0a & 0x100);
|
||||
b0b = b0be | (b0a & 0x100);
|
||||
// Quantize then unquantize in order to get a value that we take differences against
|
||||
vint4 color0be = quant_color3(quant_level, color0b);
|
||||
color0b = color0be | (color0a & 0x100);
|
||||
|
||||
// Get hold of the second value
|
||||
int r1d = astc::flt2int_rtn(r1);
|
||||
int g1d = astc::flt2int_rtn(g1);
|
||||
int b1d = astc::flt2int_rtn(b1);
|
||||
vint4 color1d = float_to_int_rtn(color1);
|
||||
color1d = lsl<1>(color1d);
|
||||
|
||||
r1d <<= 1;
|
||||
g1d <<= 1;
|
||||
b1d <<= 1;
|
||||
|
||||
// .. and take differences!
|
||||
r1d -= r0b;
|
||||
g1d -= g0b;
|
||||
b1d -= b0b;
|
||||
// ... and take differences
|
||||
color1d = color1d - color0b;
|
||||
color1d.set_lane<3>(0);
|
||||
|
||||
// Check if the difference is too large to be encodable
|
||||
if (r1d > 63 || g1d > 63 || b1d > 63 || r1d < -64 || g1d < -64 || b1d < -64)
|
||||
if (any((color1d > vint4(63)) | (color1d < vint4(-64))))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// Insert top bit of the base into the offset
|
||||
r1d &= 0x7F;
|
||||
g1d &= 0x7F;
|
||||
b1d &= 0x7F;
|
||||
color1d = color1d & 0x7F;
|
||||
color1d = color1d | lsr<1>(color0b & 0x100);
|
||||
|
||||
r1d |= (r0b & 0x100) >> 1;
|
||||
g1d |= (g0b & 0x100) >> 1;
|
||||
b1d |= (b0b & 0x100) >> 1;
|
||||
// Then quantize and unquantize; if this causes either top two bits to flip, then encoding fails
|
||||
// since we have then corrupted either the top bit of the base or the sign bit of the offset
|
||||
vint4 color1de = quant_color3(quant_level, color1d);
|
||||
|
||||
// Then quantize and unquantize; if this causes any of the top two bits to flip,
|
||||
// then encoding fails, since we have then corrupted either the top bit of the base
|
||||
// or the sign bit of the offset.
|
||||
int r1de = quant_color(quant_level, r1d);
|
||||
int g1de = quant_color(quant_level, g1d);
|
||||
int b1de = quant_color(quant_level, b1d);
|
||||
|
||||
if (((r1d ^ r1de) | (g1d ^ g1de) | (b1d ^ b1de)) & 0xC0)
|
||||
vint4 color_flips = (color1d ^ color1de) & 0xC0;
|
||||
color_flips.set_lane<3>(0);
|
||||
if (any(color_flips != vint4::zero()))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// If the sum of offsets does not trigger blue-contraction then encoding fails
|
||||
vint4 ep0(r0be, g0be, b0be, 0);
|
||||
vint4 ep1(r1de, g1de, b1de, 0);
|
||||
vint4 ep0 = color0be;
|
||||
vint4 ep1 = color1de;
|
||||
bit_transfer_signed(ep1, ep0);
|
||||
if (hadd_rgb_s(ep1) >= 0)
|
||||
{
|
||||
@@ -487,13 +479,8 @@ static bool try_quantize_rgb_delta_blue_contract(
|
||||
return false;
|
||||
}
|
||||
|
||||
output[0] = static_cast<uint8_t>(r0be);
|
||||
output[1] = static_cast<uint8_t>(r1de);
|
||||
output[2] = static_cast<uint8_t>(g0be);
|
||||
output[3] = static_cast<uint8_t>(g1de);
|
||||
output[4] = static_cast<uint8_t>(b0be);
|
||||
output[5] = static_cast<uint8_t>(b1de);
|
||||
|
||||
color0_out = color0be;
|
||||
color1_out = color1de;
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -508,7 +495,8 @@ static bool try_quantize_rgb_delta_blue_contract(
|
||||
*
|
||||
* @param color0 The input unquantized color0 endpoint.
|
||||
* @param color1 The input unquantized color1 endpoint.
|
||||
* @param[out] output The output endpoints, returned as (x, x, x, x, x, x, a0, a1).
|
||||
* @param[out] color0_out The output quantized color0 endpoint; must preserve lane 0/1/2.
|
||||
* @param[out] color1_out The output quantized color1 endpoint; must preserve lane 0/1/2.
|
||||
* @param quant_level The quantization level to use.
|
||||
*
|
||||
* @return Returns @c false on failure, @c true on success.
|
||||
@@ -516,13 +504,12 @@ static bool try_quantize_rgb_delta_blue_contract(
|
||||
static bool try_quantize_alpha_delta(
|
||||
vfloat4 color0,
|
||||
vfloat4 color1,
|
||||
uint8_t output[8],
|
||||
vint4& color0_out,
|
||||
vint4& color1_out,
|
||||
quant_method quant_level
|
||||
) {
|
||||
float scale = 1.0f / 257.0f;
|
||||
|
||||
float a0 = astc::clamp255f(color0.lane<3>() * scale);
|
||||
float a1 = astc::clamp255f(color1.lane<3>() * scale);
|
||||
float a0 = color0.lane<3>();
|
||||
float a1 = color1.lane<3>();
|
||||
|
||||
int a0a = astc::flt2int_rtn(a0);
|
||||
a0a <<= 1;
|
||||
@@ -561,8 +548,8 @@ static bool try_quantize_alpha_delta(
|
||||
return false;
|
||||
}
|
||||
|
||||
output[6] = static_cast<uint8_t>(a0be);
|
||||
output[7] = static_cast<uint8_t>(a1de);
|
||||
color0_out.set_lane<3>(a0be);
|
||||
color1_out.set_lane<3>(a1de);
|
||||
|
||||
return true;
|
||||
}
|
||||
@@ -589,13 +576,11 @@ static bool try_quantize_luminance_alpha_delta(
|
||||
uint8_t output[4],
|
||||
quant_method quant_level
|
||||
) {
|
||||
float scale = 1.0f / 257.0f;
|
||||
float l0 = hadd_rgb_s(color0) * (1.0f / 3.0f);
|
||||
float l1 = hadd_rgb_s(color1) * (1.0f / 3.0f);
|
||||
|
||||
float l0 = astc::clamp255f(hadd_rgb_s(color0) * ((1.0f / 3.0f) * scale));
|
||||
float l1 = astc::clamp255f(hadd_rgb_s(color1) * ((1.0f / 3.0f) * scale));
|
||||
|
||||
float a0 = astc::clamp255f(color0.lane<3>() * scale);
|
||||
float a1 = astc::clamp255f(color1.lane<3>() * scale);
|
||||
float a0 = color0.lane<3>();
|
||||
float a1 = color1.lane<3>();
|
||||
|
||||
int l0a = astc::flt2int_rtn(l0);
|
||||
int a0a = astc::flt2int_rtn(a0);
|
||||
@@ -693,7 +678,8 @@ static bool try_quantize_luminance_alpha_delta(
|
||||
*
|
||||
* @param color0 The input unquantized color0 endpoint.
|
||||
* @param color1 The input unquantized color1 endpoint.
|
||||
* @param[out] output The output endpoints, returned as (r0, r1, b0, b1, g0, g1, a0, a1).
|
||||
* @param[out] color0_out The output quantized color0 endpoint
|
||||
* @param[out] color1_out The output quantized color1 endpoint
|
||||
* @param quant_level The quantization level to use.
|
||||
*
|
||||
* @return Returns @c false on failure, @c true on success.
|
||||
@@ -701,14 +687,14 @@ static bool try_quantize_luminance_alpha_delta(
|
||||
static bool try_quantize_rgba_delta(
|
||||
vfloat4 color0,
|
||||
vfloat4 color1,
|
||||
uint8_t output[8],
|
||||
vint4& color0_out,
|
||||
vint4& color1_out,
|
||||
quant_method quant_level
|
||||
) {
|
||||
return try_quantize_rgb_delta(color0, color1, output, quant_level) &&
|
||||
try_quantize_alpha_delta(color0, color1, output, quant_level);
|
||||
return try_quantize_rgb_delta(color0, color1, color0_out, color1_out, quant_level) &&
|
||||
try_quantize_alpha_delta(color0, color1, color0_out, color1_out, quant_level);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @brief Try to quantize an LDR RGBA color using delta and blue contract encoding.
|
||||
*
|
||||
@@ -720,7 +706,8 @@ static bool try_quantize_rgba_delta(
|
||||
*
|
||||
* @param color0 The input unquantized color0 endpoint.
|
||||
* @param color1 The input unquantized color1 endpoint.
|
||||
* @param[out] output The output endpoints, returned as (r0, r1, b0, b1, g0, g1, a0, a1).
|
||||
* @param[out] color0_out The output quantized color0 endpoint
|
||||
* @param[out] color1_out The output quantized color1 endpoint
|
||||
* @param quant_level The quantization level to use.
|
||||
*
|
||||
* @return Returns @c false on failure, @c true on success.
|
||||
@@ -728,12 +715,13 @@ static bool try_quantize_rgba_delta(
|
||||
static bool try_quantize_rgba_delta_blue_contract(
|
||||
vfloat4 color0,
|
||||
vfloat4 color1,
|
||||
uint8_t output[8],
|
||||
vint4& color0_out,
|
||||
vint4& color1_out,
|
||||
quant_method quant_level
|
||||
) {
|
||||
// Note that we swap the color0 and color1 ordering for alpha to match RGB blue-contract
|
||||
return try_quantize_rgb_delta_blue_contract(color0, color1, output, quant_level) &&
|
||||
try_quantize_alpha_delta(color1, color0, output, quant_level);
|
||||
return try_quantize_rgb_delta_blue_contract(color0, color1, color0_out, color1_out, quant_level) &&
|
||||
try_quantize_alpha_delta(color1, color0, color0_out, color1_out, quant_level);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -774,6 +762,8 @@ static void quantize_rgbs(
|
||||
/**
|
||||
* @brief Quantize an LDR RGBA color using scale encoding.
|
||||
*
|
||||
* @param color0 The input unquantized color0 alpha endpoint.
|
||||
* @param color1 The input unquantized color1 alpha endpoint.
|
||||
* @param color The input unquantized color endpoint and scale factor.
|
||||
* @param[out] output The output endpoints, returned as (r0, g0, b0, s, a0, a1).
|
||||
* @param quant_level The quantization level to use.
|
||||
@@ -785,10 +775,8 @@ static void quantize_rgbs_alpha(
|
||||
uint8_t output[6],
|
||||
quant_method quant_level
|
||||
) {
|
||||
float scale = 1.0f / 257.0f;
|
||||
|
||||
float a0 = astc::clamp255f(color0.lane<3>() * scale);
|
||||
float a1 = astc::clamp255f(color1.lane<3>() * scale);
|
||||
float a0 = color0.lane<3>();
|
||||
float a1 = color1.lane<3>();
|
||||
|
||||
output[4] = quant_color(quant_level, astc::flt2int_rtn(a0), a0);
|
||||
output[5] = quant_color(quant_level, astc::flt2int_rtn(a1), a1);
|
||||
@@ -810,13 +798,8 @@ static void quantize_luminance(
|
||||
uint8_t output[2],
|
||||
quant_method quant_level
|
||||
) {
|
||||
float scale = 1.0f / 257.0f;
|
||||
|
||||
color0 = color0 * scale;
|
||||
color1 = color1 * scale;
|
||||
|
||||
float lum0 = astc::clamp255f(hadd_rgb_s(color0) * (1.0f / 3.0f));
|
||||
float lum1 = astc::clamp255f(hadd_rgb_s(color1) * (1.0f / 3.0f));
|
||||
float lum0 = hadd_rgb_s(color0) * (1.0f / 3.0f);
|
||||
float lum1 = hadd_rgb_s(color1) * (1.0f / 3.0f);
|
||||
|
||||
if (lum0 > lum1)
|
||||
{
|
||||
@@ -843,16 +826,11 @@ static void quantize_luminance_alpha(
|
||||
uint8_t output[4],
|
||||
quant_method quant_level
|
||||
) {
|
||||
float scale = 1.0f / 257.0f;
|
||||
float lum0 = hadd_rgb_s(color0) * (1.0f / 3.0f);
|
||||
float lum1 = hadd_rgb_s(color1) * (1.0f / 3.0f);
|
||||
|
||||
color0 = color0 * scale;
|
||||
color1 = color1 * scale;
|
||||
|
||||
float lum0 = astc::clamp255f(hadd_rgb_s(color0) * (1.0f / 3.0f));
|
||||
float lum1 = astc::clamp255f(hadd_rgb_s(color1) * (1.0f / 3.0f));
|
||||
|
||||
float a0 = astc::clamp255f(color0.lane<3>());
|
||||
float a1 = astc::clamp255f(color1.lane<3>());
|
||||
float a0 = color0.lane<3>();
|
||||
float a1 = color1.lane<3>();
|
||||
|
||||
output[0] = quant_color(quant_level, astc::flt2int_rtn(lum0), lum0);
|
||||
output[1] = quant_color(quant_level, astc::flt2int_rtn(lum1), lum1);
|
||||
@@ -1939,58 +1917,170 @@ uint8_t pack_color_endpoints(
|
||||
) {
|
||||
assert(QUANT_6 <= quant_level && quant_level <= QUANT_256);
|
||||
|
||||
// We do not support negative colors
|
||||
color0 = max(color0, 0.0f);
|
||||
color1 = max(color1, 0.0f);
|
||||
// Clamp colors to a valid LDR range
|
||||
// Note that HDR has a lower max, handled in the conversion functions
|
||||
color0 = clamp(0.0f, 65535.0f, color0);
|
||||
color1 = clamp(0.0f, 65535.0f, color1);
|
||||
|
||||
// Pre-scale the LDR value we need to the 0-255 quantizable range
|
||||
vfloat4 color0_ldr = color0 * (1.0f / 257.0f);
|
||||
vfloat4 color1_ldr = color1 * (1.0f / 257.0f);
|
||||
|
||||
uint8_t retval = 0;
|
||||
float best_error = ERROR_CALC_DEFAULT;
|
||||
vint4 color0_out, color1_out;
|
||||
vint4 color0_out2, color1_out2;
|
||||
|
||||
switch (format)
|
||||
{
|
||||
case FMT_RGB:
|
||||
if (quant_level <= QUANT_160)
|
||||
{
|
||||
if (try_quantize_rgb_delta_blue_contract(color0, color1, output, quant_level))
|
||||
if (try_quantize_rgb_delta_blue_contract(color0_ldr, color1_ldr, color0_out, color1_out, quant_level))
|
||||
{
|
||||
vint4 color0_unpack;
|
||||
vint4 color1_unpack;
|
||||
rgba_delta_unpack(color0_out, color1_out, color0_unpack, color1_unpack);
|
||||
|
||||
retval = FMT_RGB_DELTA;
|
||||
break;
|
||||
best_error = get_rgba_encoding_error(color0_ldr, color1_ldr, color0_unpack, color1_unpack);
|
||||
}
|
||||
if (try_quantize_rgb_delta(color0, color1, output, quant_level))
|
||||
|
||||
if (try_quantize_rgb_delta(color0_ldr, color1_ldr, color0_out2, color1_out2, quant_level))
|
||||
{
|
||||
retval = FMT_RGB_DELTA;
|
||||
break;
|
||||
vint4 color0_unpack;
|
||||
vint4 color1_unpack;
|
||||
rgba_delta_unpack(color0_out2, color1_out2, color0_unpack, color1_unpack);
|
||||
|
||||
float error = get_rgba_encoding_error(color0_ldr, color1_ldr, color0_unpack, color1_unpack);
|
||||
if (error < best_error)
|
||||
{
|
||||
retval = FMT_RGB_DELTA;
|
||||
best_error = error;
|
||||
color0_out = color0_out2;
|
||||
color1_out = color1_out2;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (quant_level < QUANT_256 && try_quantize_rgb_blue_contract(color0, color1, output, quant_level))
|
||||
|
||||
if (quant_level < QUANT_256)
|
||||
{
|
||||
retval = FMT_RGB;
|
||||
break;
|
||||
if (try_quantize_rgb_blue_contract(color0_ldr, color1_ldr, color0_out2, color1_out2, quant_level))
|
||||
{
|
||||
vint4 color0_unpack;
|
||||
vint4 color1_unpack;
|
||||
rgba_unpack(color0_out2, color1_out2, color0_unpack, color1_unpack);
|
||||
|
||||
float error = get_rgba_encoding_error(color0_ldr, color1_ldr, color0_unpack, color1_unpack);
|
||||
if (error < best_error)
|
||||
{
|
||||
retval = FMT_RGB;
|
||||
best_error = error;
|
||||
color0_out = color0_out2;
|
||||
color1_out = color1_out2;
|
||||
}
|
||||
}
|
||||
}
|
||||
quantize_rgb(color0, color1, output, quant_level);
|
||||
retval = FMT_RGB;
|
||||
|
||||
{
|
||||
quantize_rgb(color0_ldr, color1_ldr, color0_out2, color1_out2, quant_level);
|
||||
|
||||
vint4 color0_unpack;
|
||||
vint4 color1_unpack;
|
||||
rgba_unpack(color0_out2, color1_out2, color0_unpack, color1_unpack);
|
||||
|
||||
float error = get_rgba_encoding_error(color0_ldr, color1_ldr, color0_unpack, color1_unpack);
|
||||
if (error < best_error)
|
||||
{
|
||||
retval = FMT_RGB;
|
||||
color0_out = color0_out2;
|
||||
color1_out = color1_out2;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Can we vectorize this?
|
||||
output[0] = static_cast<uint8_t>(color0_out.lane<0>());
|
||||
output[1] = static_cast<uint8_t>(color1_out.lane<0>());
|
||||
output[2] = static_cast<uint8_t>(color0_out.lane<1>());
|
||||
output[3] = static_cast<uint8_t>(color1_out.lane<1>());
|
||||
output[4] = static_cast<uint8_t>(color0_out.lane<2>());
|
||||
output[5] = static_cast<uint8_t>(color1_out.lane<2>());
|
||||
break;
|
||||
|
||||
case FMT_RGBA:
|
||||
if (quant_level <= QUANT_160)
|
||||
{
|
||||
if (try_quantize_rgba_delta_blue_contract(color0, color1, output, quant_level))
|
||||
if (try_quantize_rgba_delta_blue_contract(color0_ldr, color1_ldr, color0_out, color1_out, quant_level))
|
||||
{
|
||||
vint4 color0_unpack;
|
||||
vint4 color1_unpack;
|
||||
rgba_delta_unpack(color0_out, color1_out, color0_unpack, color1_unpack);
|
||||
|
||||
retval = FMT_RGBA_DELTA;
|
||||
break;
|
||||
best_error = get_rgba_encoding_error(color0_ldr, color1_ldr, color0_unpack, color1_unpack);
|
||||
}
|
||||
if (try_quantize_rgba_delta(color0, color1, output, quant_level))
|
||||
|
||||
if (try_quantize_rgba_delta(color0_ldr, color1_ldr, color0_out2, color1_out2, quant_level))
|
||||
{
|
||||
retval = FMT_RGBA_DELTA;
|
||||
break;
|
||||
vint4 color0_unpack;
|
||||
vint4 color1_unpack;
|
||||
rgba_delta_unpack(color0_out2, color1_out2, color0_unpack, color1_unpack);
|
||||
|
||||
float error = get_rgba_encoding_error(color0_ldr, color1_ldr, color0_unpack, color1_unpack);
|
||||
if (error < best_error)
|
||||
{
|
||||
retval = FMT_RGBA_DELTA;
|
||||
best_error = error;
|
||||
color0_out = color0_out2;
|
||||
color1_out = color1_out2;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (quant_level < QUANT_256 && try_quantize_rgba_blue_contract(color0, color1, output, quant_level))
|
||||
|
||||
if (quant_level < QUANT_256)
|
||||
{
|
||||
retval = FMT_RGBA;
|
||||
break;
|
||||
if (try_quantize_rgba_blue_contract(color0_ldr, color1_ldr, color0_out2, color1_out2, quant_level))
|
||||
{
|
||||
vint4 color0_unpack;
|
||||
vint4 color1_unpack;
|
||||
rgba_unpack(color0_out2, color1_out2, color0_unpack, color1_unpack);
|
||||
|
||||
float error = get_rgba_encoding_error(color0_ldr, color1_ldr, color0_unpack, color1_unpack);
|
||||
if (error < best_error)
|
||||
{
|
||||
retval = FMT_RGBA;
|
||||
best_error = error;
|
||||
color0_out = color0_out2;
|
||||
color1_out = color1_out2;
|
||||
}
|
||||
}
|
||||
}
|
||||
quantize_rgba(color0, color1, output, quant_level);
|
||||
retval = FMT_RGBA;
|
||||
|
||||
{
|
||||
quantize_rgba(color0_ldr, color1_ldr, color0_out2, color1_out2, quant_level);
|
||||
|
||||
vint4 color0_unpack;
|
||||
vint4 color1_unpack;
|
||||
rgba_unpack(color0_out2, color1_out2, color0_unpack, color1_unpack);
|
||||
|
||||
float error = get_rgba_encoding_error(color0_ldr, color1_ldr, color0_unpack, color1_unpack);
|
||||
if (error < best_error)
|
||||
{
|
||||
retval = FMT_RGBA;
|
||||
color0_out = color0_out2;
|
||||
color1_out = color1_out2;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Can we vectorize this?
|
||||
output[0] = static_cast<uint8_t>(color0_out.lane<0>());
|
||||
output[1] = static_cast<uint8_t>(color1_out.lane<0>());
|
||||
output[2] = static_cast<uint8_t>(color0_out.lane<1>());
|
||||
output[3] = static_cast<uint8_t>(color1_out.lane<1>());
|
||||
output[4] = static_cast<uint8_t>(color0_out.lane<2>());
|
||||
output[5] = static_cast<uint8_t>(color1_out.lane<2>());
|
||||
output[6] = static_cast<uint8_t>(color0_out.lane<3>());
|
||||
output[7] = static_cast<uint8_t>(color1_out.lane<3>());
|
||||
break;
|
||||
|
||||
case FMT_RGB_SCALE:
|
||||
@@ -2009,7 +2099,7 @@ uint8_t pack_color_endpoints(
|
||||
break;
|
||||
|
||||
case FMT_RGB_SCALE_ALPHA:
|
||||
quantize_rgbs_alpha(color0, color1, rgbs_color, output, quant_level);
|
||||
quantize_rgbs_alpha(color0_ldr, color1_ldr, rgbs_color, output, quant_level);
|
||||
retval = FMT_RGB_SCALE_ALPHA;
|
||||
break;
|
||||
|
||||
@@ -2025,20 +2115,20 @@ uint8_t pack_color_endpoints(
|
||||
break;
|
||||
|
||||
case FMT_LUMINANCE:
|
||||
quantize_luminance(color0, color1, output, quant_level);
|
||||
quantize_luminance(color0_ldr, color1_ldr, output, quant_level);
|
||||
retval = FMT_LUMINANCE;
|
||||
break;
|
||||
|
||||
case FMT_LUMINANCE_ALPHA:
|
||||
if (quant_level <= 18)
|
||||
{
|
||||
if (try_quantize_luminance_alpha_delta(color0, color1, output, quant_level))
|
||||
if (try_quantize_luminance_alpha_delta(color0_ldr, color1_ldr, output, quant_level))
|
||||
{
|
||||
retval = FMT_LUMINANCE_ALPHA_DELTA;
|
||||
break;
|
||||
}
|
||||
}
|
||||
quantize_luminance_alpha(color0, color1, output, quant_level);
|
||||
quantize_luminance_alpha(color0_ldr, color1_ldr, output, quant_level);
|
||||
retval = FMT_LUMINANCE_ALPHA;
|
||||
break;
|
||||
|
||||
|
||||
@@ -40,15 +40,7 @@ static ASTCENC_SIMD_INLINE vint4 uncontract_color(
|
||||
return select(input, bc0, mask);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Unpack an LDR RGBA color that uses delta encoding.
|
||||
*
|
||||
* @param input0 The packed endpoint 0 color.
|
||||
* @param input1 The packed endpoint 1 color deltas.
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void rgba_delta_unpack(
|
||||
void rgba_delta_unpack(
|
||||
vint4 input0,
|
||||
vint4 input1,
|
||||
vint4& output0,
|
||||
@@ -92,15 +84,7 @@ static void rgb_delta_unpack(
|
||||
output1.set_lane<3>(255);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Unpack an LDR RGBA color that uses direct encoding.
|
||||
*
|
||||
* @param input0 The packed endpoint 0 color.
|
||||
* @param input1 The packed endpoint 1 color.
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void rgba_unpack(
|
||||
void rgba_unpack(
|
||||
vint4 input0,
|
||||
vint4 input1,
|
||||
vint4& output0,
|
||||
|
||||
@@ -1163,7 +1163,7 @@ static float prepare_block_statistics(
|
||||
void compress_block(
|
||||
const astcenc_contexti& ctx,
|
||||
const image_block& blk,
|
||||
physical_compressed_block& pcb,
|
||||
uint8_t pcb[16],
|
||||
compression_working_buffers& tmpbuf)
|
||||
{
|
||||
astcenc_profile decode_mode = ctx.config.profile;
|
||||
@@ -1282,9 +1282,10 @@ void compress_block(
|
||||
|
||||
static const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;
|
||||
|
||||
// Only enable MODE0 fast path (trial 0) if 2D, and more than 25 texels
|
||||
// Only enable MODE0 fast path if enabled
|
||||
// Never enable for 3D blocks as no "always" block modes are available
|
||||
int start_trial = 1;
|
||||
if ((bsd.texel_count >= TUNE_MIN_TEXELS_MODE0_FASTPATH) && (bsd.zdim == 1))
|
||||
if ((ctx.config.tune_search_mode0_enable >= TUNE_MIN_SEARCH_MODE0) && (bsd.zdim == 1))
|
||||
{
|
||||
start_trial = 0;
|
||||
}
|
||||
|
||||
@@ -104,10 +104,10 @@ void unpack_weights(
|
||||
if (!is_dual_plane)
|
||||
{
|
||||
// Build full 64-entry weight lookup table
|
||||
vint4 tab0(reinterpret_cast<const int*>(scb.weights + 0));
|
||||
vint4 tab1(reinterpret_cast<const int*>(scb.weights + 16));
|
||||
vint4 tab2(reinterpret_cast<const int*>(scb.weights + 32));
|
||||
vint4 tab3(reinterpret_cast<const int*>(scb.weights + 48));
|
||||
vint4 tab0 = vint4::load(scb.weights + 0);
|
||||
vint4 tab1 = vint4::load(scb.weights + 16);
|
||||
vint4 tab2 = vint4::load(scb.weights + 32);
|
||||
vint4 tab3 = vint4::load(scb.weights + 48);
|
||||
|
||||
vint tab0p, tab1p, tab2p, tab3p;
|
||||
vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p);
|
||||
@@ -134,14 +134,14 @@ void unpack_weights(
|
||||
{
|
||||
// Build a 32-entry weight lookup table per plane
|
||||
// Plane 1
|
||||
vint4 tab0_plane1(reinterpret_cast<const int*>(scb.weights + 0));
|
||||
vint4 tab1_plane1(reinterpret_cast<const int*>(scb.weights + 16));
|
||||
vint4 tab0_plane1 = vint4::load(scb.weights + 0);
|
||||
vint4 tab1_plane1 = vint4::load(scb.weights + 16);
|
||||
vint tab0_plane1p, tab1_plane1p;
|
||||
vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p);
|
||||
|
||||
// Plane 2
|
||||
vint4 tab0_plane2(reinterpret_cast<const int*>(scb.weights + 32));
|
||||
vint4 tab1_plane2(reinterpret_cast<const int*>(scb.weights + 48));
|
||||
vint4 tab0_plane2 = vint4::load(scb.weights + 32);
|
||||
vint4 tab1_plane2 = vint4::load(scb.weights + 48);
|
||||
vint tab0_plane2p, tab1_plane2p;
|
||||
vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p);
|
||||
|
||||
|
||||
69
3rdparty/astc-encoder/source/astcenc_entry.cpp
vendored
69
3rdparty/astc-encoder/source/astcenc_entry.cpp
vendored
@@ -55,6 +55,7 @@ struct astcenc_preset_config
|
||||
float tune_2partition_early_out_limit_factor;
|
||||
float tune_3partition_early_out_limit_factor;
|
||||
float tune_2plane_early_out_limit_correlation;
|
||||
float tune_search_mode0_enable;
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -63,22 +64,22 @@ struct astcenc_preset_config
|
||||
static const std::array<astcenc_preset_config, 6> preset_configs_high {{
|
||||
{
|
||||
ASTCENC_PRE_FASTEST,
|
||||
2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f
|
||||
2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 0.0f
|
||||
}, {
|
||||
ASTCENC_PRE_FAST,
|
||||
3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.90f
|
||||
3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.90f, 0.0f
|
||||
}, {
|
||||
ASTCENC_PRE_MEDIUM,
|
||||
4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 1.1f, 1.05f, 0.95f
|
||||
4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 1.1f, 1.05f, 0.95f, 0.0f
|
||||
}, {
|
||||
ASTCENC_PRE_THOROUGH,
|
||||
4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.35f, 1.15f, 0.97f
|
||||
4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.35f, 1.15f, 0.97f, 0.0f
|
||||
}, {
|
||||
ASTCENC_PRE_VERYTHOROUGH,
|
||||
4, 256, 128, 64, 98, 4, 6, 20, 14, 8, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f
|
||||
4, 256, 128, 64, 98, 4, 6, 8, 6, 4, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f
|
||||
}, {
|
||||
ASTCENC_PRE_EXHAUSTIVE,
|
||||
4, 512, 512, 512, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f
|
||||
4, 512, 512, 512, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f
|
||||
}
|
||||
}};
|
||||
|
||||
@@ -88,22 +89,22 @@ static const std::array<astcenc_preset_config, 6> preset_configs_high {{
|
||||
static const std::array<astcenc_preset_config, 6> preset_configs_mid {{
|
||||
{
|
||||
ASTCENC_PRE_FASTEST,
|
||||
2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.80f
|
||||
2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f
|
||||
}, {
|
||||
ASTCENC_PRE_FAST,
|
||||
3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f
|
||||
3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f
|
||||
}, {
|
||||
ASTCENC_PRE_MEDIUM,
|
||||
4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 1.1f, 1.05f, 0.90f
|
||||
3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 1.1f, 1.05f, 0.90f, 1.0f
|
||||
}, {
|
||||
ASTCENC_PRE_THOROUGH,
|
||||
4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.4f, 1.2f, 0.95f
|
||||
4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.4f, 1.2f, 0.95f, 0.0f
|
||||
}, {
|
||||
ASTCENC_PRE_VERYTHOROUGH,
|
||||
4, 256, 128, 64, 98, 4, 6, 12, 8, 3, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f
|
||||
4, 256, 128, 64, 98, 4, 6, 8, 6, 3, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f
|
||||
}, {
|
||||
ASTCENC_PRE_EXHAUSTIVE,
|
||||
4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f
|
||||
4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f
|
||||
}
|
||||
}};
|
||||
|
||||
@@ -113,22 +114,22 @@ static const std::array<astcenc_preset_config, 6> preset_configs_mid {{
|
||||
static const std::array<astcenc_preset_config, 6> preset_configs_low {{
|
||||
{
|
||||
ASTCENC_PRE_FASTEST,
|
||||
2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.80f
|
||||
2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f
|
||||
}, {
|
||||
ASTCENC_PRE_FAST,
|
||||
2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.85f
|
||||
2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f
|
||||
}, {
|
||||
ASTCENC_PRE_MEDIUM,
|
||||
3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 1.1f, 1.05f, 0.90f
|
||||
3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 1.1f, 1.05f, 0.90f, 1.0f
|
||||
}, {
|
||||
ASTCENC_PRE_THOROUGH,
|
||||
4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.3f, 1.2f, 0.97f
|
||||
4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.3f, 1.2f, 0.97f, 1.0f
|
||||
}, {
|
||||
ASTCENC_PRE_VERYTHOROUGH,
|
||||
4, 256, 128, 64, 98, 4, 6, 9, 5, 2, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f
|
||||
4, 256, 128, 64, 98, 4, 6, 8, 5, 2, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 1.0f
|
||||
}, {
|
||||
ASTCENC_PRE_EXHAUSTIVE,
|
||||
4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f
|
||||
4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 1.0f
|
||||
}
|
||||
}};
|
||||
|
||||
@@ -504,10 +505,10 @@ astcenc_error astcenc_config_init(
|
||||
config.tune_4partition_index_limit = (*preset_configs)[start].tune_4partition_index_limit;
|
||||
config.tune_block_mode_limit = (*preset_configs)[start].tune_block_mode_limit;
|
||||
config.tune_refinement_limit = (*preset_configs)[start].tune_refinement_limit;
|
||||
config.tune_candidate_limit = astc::min((*preset_configs)[start].tune_candidate_limit, TUNE_MAX_TRIAL_CANDIDATES);
|
||||
config.tune_2partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_2partitioning_candidate_limit, TUNE_MAX_PARTITIONING_CANDIDATES);
|
||||
config.tune_3partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_3partitioning_candidate_limit, TUNE_MAX_PARTITIONING_CANDIDATES);
|
||||
config.tune_4partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_4partitioning_candidate_limit, TUNE_MAX_PARTITIONING_CANDIDATES);
|
||||
config.tune_candidate_limit = (*preset_configs)[start].tune_candidate_limit;
|
||||
config.tune_2partitioning_candidate_limit = (*preset_configs)[start].tune_2partitioning_candidate_limit;
|
||||
config.tune_3partitioning_candidate_limit = (*preset_configs)[start].tune_3partitioning_candidate_limit;
|
||||
config.tune_4partitioning_candidate_limit = (*preset_configs)[start].tune_4partitioning_candidate_limit;
|
||||
config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels,
|
||||
(*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels);
|
||||
|
||||
@@ -516,6 +517,7 @@ astcenc_error astcenc_config_init(
|
||||
config.tune_2partition_early_out_limit_factor = (*preset_configs)[start].tune_2partition_early_out_limit_factor;
|
||||
config.tune_3partition_early_out_limit_factor = (*preset_configs)[start].tune_3partition_early_out_limit_factor;
|
||||
config.tune_2plane_early_out_limit_correlation = (*preset_configs)[start].tune_2plane_early_out_limit_correlation;
|
||||
config.tune_search_mode0_enable = (*preset_configs)[start].tune_search_mode0_enable;
|
||||
}
|
||||
// Start and end node are not the same - so interpolate between them
|
||||
else
|
||||
@@ -542,14 +544,10 @@ astcenc_error astcenc_config_init(
|
||||
config.tune_4partition_index_limit = LERPI(tune_4partition_index_limit);
|
||||
config.tune_block_mode_limit = LERPI(tune_block_mode_limit);
|
||||
config.tune_refinement_limit = LERPI(tune_refinement_limit);
|
||||
config.tune_candidate_limit = astc::min(LERPUI(tune_candidate_limit),
|
||||
TUNE_MAX_TRIAL_CANDIDATES);
|
||||
config.tune_2partitioning_candidate_limit = astc::min(LERPUI(tune_2partitioning_candidate_limit),
|
||||
BLOCK_MAX_PARTITIONINGS);
|
||||
config.tune_3partitioning_candidate_limit = astc::min(LERPUI(tune_3partitioning_candidate_limit),
|
||||
BLOCK_MAX_PARTITIONINGS);
|
||||
config.tune_4partitioning_candidate_limit = astc::min(LERPUI(tune_4partitioning_candidate_limit),
|
||||
BLOCK_MAX_PARTITIONINGS);
|
||||
config.tune_candidate_limit = LERPUI(tune_candidate_limit);
|
||||
config.tune_2partitioning_candidate_limit = LERPUI(tune_2partitioning_candidate_limit);
|
||||
config.tune_3partitioning_candidate_limit = LERPUI(tune_3partitioning_candidate_limit);
|
||||
config.tune_4partitioning_candidate_limit = LERPUI(tune_4partitioning_candidate_limit);
|
||||
config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels,
|
||||
LERP(tune_db_limit_b_base) - 19 * ltexels);
|
||||
|
||||
@@ -558,6 +556,7 @@ astcenc_error astcenc_config_init(
|
||||
config.tune_2partition_early_out_limit_factor = LERP(tune_2partition_early_out_limit_factor);
|
||||
config.tune_3partition_early_out_limit_factor = LERP(tune_3partition_early_out_limit_factor);
|
||||
config.tune_2plane_early_out_limit_correlation = LERP(tune_2plane_early_out_limit_correlation);
|
||||
config.tune_search_mode0_enable = LERP(tune_search_mode0_enable);
|
||||
#undef LERP
|
||||
#undef LERPI
|
||||
#undef LERPUI
|
||||
@@ -585,6 +584,7 @@ astcenc_error astcenc_config_init(
|
||||
case ASTCENC_PRF_HDR_RGB_LDR_A:
|
||||
case ASTCENC_PRF_HDR:
|
||||
config.tune_db_limit = 999.0f;
|
||||
config.tune_search_mode0_enable = 0.0f;
|
||||
break;
|
||||
default:
|
||||
return ASTCENC_ERR_BAD_PROFILE;
|
||||
@@ -914,8 +914,7 @@ static void compress_image(
|
||||
|
||||
int offset = ((z * yblocks + y) * xblocks + x) * 16;
|
||||
uint8_t *bp = buffer + offset;
|
||||
physical_compressed_block* pcb = reinterpret_cast<physical_compressed_block*>(bp);
|
||||
compress_block(ctx, blk, *pcb, temp_buffers);
|
||||
compress_block(ctx, blk, bp, temp_buffers);
|
||||
}
|
||||
|
||||
ctxo.manage_compress.complete_task_assignment(count);
|
||||
@@ -1182,10 +1181,9 @@ astcenc_error astcenc_decompress_image(
|
||||
unsigned int offset = (((z * yblocks + y) * xblocks) + x) * 16;
|
||||
const uint8_t* bp = data + offset;
|
||||
|
||||
const physical_compressed_block& pcb = *reinterpret_cast<const physical_compressed_block*>(bp);
|
||||
symbolic_compressed_block scb;
|
||||
|
||||
physical_to_symbolic(*ctx->bsd, pcb, scb);
|
||||
physical_to_symbolic(*ctx->bsd, bp, scb);
|
||||
|
||||
decompress_symbolic_block(ctx->config.profile, *ctx->bsd,
|
||||
x * block_x, y * block_y, z * block_z,
|
||||
@@ -1224,9 +1222,8 @@ astcenc_error astcenc_get_block_info(
|
||||
astcenc_contexti* ctx = &ctxo->context;
|
||||
|
||||
// Decode the compressed data into a symbolic form
|
||||
const physical_compressed_block&pcb = *reinterpret_cast<const physical_compressed_block*>(data);
|
||||
symbolic_compressed_block scb;
|
||||
physical_to_symbolic(*ctx->bsd, pcb, scb);
|
||||
physical_to_symbolic(*ctx->bsd, data, scb);
|
||||
|
||||
// Fetch the appropriate partition and decimation tables
|
||||
block_size_descriptor& bsd = *ctx->bsd;
|
||||
|
||||
@@ -250,13 +250,16 @@ static void kmeans_update(
|
||||
*
|
||||
* @return The number of bit mismatches.
|
||||
*/
|
||||
static inline unsigned int partition_mismatch2(
|
||||
static inline uint8_t partition_mismatch2(
|
||||
const uint64_t a[2],
|
||||
const uint64_t b[2]
|
||||
) {
|
||||
int v1 = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]);
|
||||
int v2 = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]);
|
||||
return astc::min(v1, v2);
|
||||
|
||||
// Divide by 2 because XOR always counts errors twice, once when missing
|
||||
// in the expected position, and again when present in the wrong partition
|
||||
return static_cast<uint8_t>(astc::min(v1, v2) / 2);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -267,7 +270,7 @@ static inline unsigned int partition_mismatch2(
|
||||
*
|
||||
* @return The number of bit mismatches.
|
||||
*/
|
||||
static inline unsigned int partition_mismatch3(
|
||||
static inline uint8_t partition_mismatch3(
|
||||
const uint64_t a[3],
|
||||
const uint64_t b[3]
|
||||
) {
|
||||
@@ -295,7 +298,9 @@ static inline unsigned int partition_mismatch3(
|
||||
int s5 = p11 + p20;
|
||||
int v2 = astc::min(s4, s5) + p02;
|
||||
|
||||
return astc::min(v0, v1, v2);
|
||||
// Divide by 2 because XOR always counts errors twice, once when missing
|
||||
// in the expected position, and again when present in the wrong partition
|
||||
return static_cast<uint8_t>(astc::min(v0, v1, v2) / 2);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -306,7 +311,7 @@ static inline unsigned int partition_mismatch3(
|
||||
*
|
||||
* @return The number of bit mismatches.
|
||||
*/
|
||||
static inline unsigned int partition_mismatch4(
|
||||
static inline uint8_t partition_mismatch4(
|
||||
const uint64_t a[4],
|
||||
const uint64_t b[4]
|
||||
) {
|
||||
@@ -342,7 +347,9 @@ static inline unsigned int partition_mismatch4(
|
||||
int v2 = p02 + astc::min(p11 + mx03, p10 + mx13, p13 + mx01);
|
||||
int v3 = p03 + astc::min(p11 + mx02, p12 + mx01, p10 + mx12);
|
||||
|
||||
return astc::min(v0, v1, v2, v3);
|
||||
// Divide by 2 because XOR always counts errors twice, once when missing
|
||||
// in the expected position, and again when present in the wrong partition
|
||||
return static_cast<uint8_t>(astc::min(v0, v1, v2, v3) / 2);
|
||||
}
|
||||
|
||||
using mismatch_dispatch = unsigned int (*)(const uint64_t*, const uint64_t*);
|
||||
@@ -359,7 +366,7 @@ static void count_partition_mismatch_bits(
|
||||
const block_size_descriptor& bsd,
|
||||
unsigned int partition_count,
|
||||
const uint64_t bitmaps[BLOCK_MAX_PARTITIONS],
|
||||
unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS]
|
||||
uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS]
|
||||
) {
|
||||
unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1];
|
||||
promise(active_count > 0);
|
||||
@@ -369,6 +376,8 @@ static void count_partition_mismatch_bits(
|
||||
for (unsigned int i = 0; i < active_count; i++)
|
||||
{
|
||||
mismatch_counts[i] = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[i]);
|
||||
assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
|
||||
assert(mismatch_counts[i] < bsd.texel_count);
|
||||
}
|
||||
}
|
||||
else if (partition_count == 3)
|
||||
@@ -376,6 +385,8 @@ static void count_partition_mismatch_bits(
|
||||
for (unsigned int i = 0; i < active_count; i++)
|
||||
{
|
||||
mismatch_counts[i] = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[i]);
|
||||
assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
|
||||
assert(mismatch_counts[i] < bsd.texel_count);
|
||||
}
|
||||
}
|
||||
else
|
||||
@@ -383,6 +394,8 @@ static void count_partition_mismatch_bits(
|
||||
for (unsigned int i = 0; i < active_count; i++)
|
||||
{
|
||||
mismatch_counts[i] = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[i]);
|
||||
assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
|
||||
assert(mismatch_counts[i] < bsd.texel_count);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -397,12 +410,13 @@ static void count_partition_mismatch_bits(
|
||||
* @return The number of active partitions in this selection.
|
||||
*/
|
||||
static unsigned int get_partition_ordering_by_mismatch_bits(
|
||||
unsigned int texel_count,
|
||||
unsigned int partitioning_count,
|
||||
const unsigned int mismatch_count[BLOCK_MAX_PARTITIONINGS],
|
||||
unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS]
|
||||
const uint8_t mismatch_count[BLOCK_MAX_PARTITIONINGS],
|
||||
uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]
|
||||
) {
|
||||
promise(partitioning_count > 0);
|
||||
unsigned int mscount[256] { 0 };
|
||||
uint16_t mscount[BLOCK_MAX_KMEANS_TEXELS] { 0 };
|
||||
|
||||
// Create the histogram of mismatch counts
|
||||
for (unsigned int i = 0; i < partitioning_count; i++)
|
||||
@@ -410,16 +424,14 @@ static unsigned int get_partition_ordering_by_mismatch_bits(
|
||||
mscount[mismatch_count[i]]++;
|
||||
}
|
||||
|
||||
unsigned int active_count = partitioning_count - mscount[255];
|
||||
|
||||
// Create a running sum from the histogram array
|
||||
// Cells store previous values only; i.e. exclude self after sum
|
||||
unsigned int summa = 0;
|
||||
for (unsigned int i = 0; i < 256; i++)
|
||||
unsigned int sum = 0;
|
||||
for (unsigned int i = 0; i < texel_count; i++)
|
||||
{
|
||||
unsigned int cnt = mscount[i];
|
||||
mscount[i] = summa;
|
||||
summa += cnt;
|
||||
uint16_t cnt = mscount[i];
|
||||
mscount[i] = sum;
|
||||
sum += cnt;
|
||||
}
|
||||
|
||||
// Use the running sum as the index, incrementing after read to allow
|
||||
@@ -427,10 +439,10 @@ static unsigned int get_partition_ordering_by_mismatch_bits(
|
||||
for (unsigned int i = 0; i < partitioning_count; i++)
|
||||
{
|
||||
unsigned int idx = mscount[mismatch_count[i]]++;
|
||||
partition_ordering[idx] = i;
|
||||
partition_ordering[idx] = static_cast<uint16_t>(i);
|
||||
}
|
||||
|
||||
return active_count;
|
||||
return partitioning_count;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -447,7 +459,7 @@ static unsigned int compute_kmeans_partition_ordering(
|
||||
const block_size_descriptor& bsd,
|
||||
const image_block& blk,
|
||||
unsigned int partition_count,
|
||||
unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS]
|
||||
uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]
|
||||
) {
|
||||
vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS];
|
||||
uint8_t texel_partitions[BLOCK_MAX_TEXELS];
|
||||
@@ -478,11 +490,12 @@ static unsigned int compute_kmeans_partition_ordering(
|
||||
}
|
||||
|
||||
// Count the mismatch between the block and the format's partition tables
|
||||
unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS];
|
||||
uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS];
|
||||
count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts);
|
||||
|
||||
// Sort the partitions based on the number of mismatched bits
|
||||
return get_partition_ordering_by_mismatch_bits(
|
||||
texels_to_process,
|
||||
bsd.partitioning_count_selected[partition_count - 1],
|
||||
mismatch_counts, partition_ordering);
|
||||
}
|
||||
@@ -565,7 +578,7 @@ unsigned int find_best_partition_candidates(
|
||||
|
||||
weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim;
|
||||
|
||||
unsigned int partition_sequence[BLOCK_MAX_PARTITIONINGS];
|
||||
uint16_t partition_sequence[BLOCK_MAX_PARTITIONINGS];
|
||||
unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence);
|
||||
partition_search_limit = astc::min(partition_search_limit, sequence_len);
|
||||
requested_candidates = astc::min(partition_search_limit, requested_candidates);
|
||||
|
||||
@@ -1023,7 +1023,7 @@ void compute_quantized_weights_for_decimation(
|
||||
// safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements
|
||||
if (get_quant_level(quant_level) <= 16)
|
||||
{
|
||||
vint4 tab0(reinterpret_cast<const int*>(qat.quant_to_unquant));
|
||||
vint4 tab0 = vint4::load(qat.quant_to_unquant);
|
||||
vint tab0p;
|
||||
vtable_prepare(tab0, tab0p);
|
||||
|
||||
@@ -1056,8 +1056,8 @@ void compute_quantized_weights_for_decimation(
|
||||
}
|
||||
else
|
||||
{
|
||||
vint4 tab0(reinterpret_cast<const int*>(qat.quant_to_unquant));
|
||||
vint4 tab1(reinterpret_cast<const int*>(qat.quant_to_unquant + 16));
|
||||
vint4 tab0 = vint4::load(qat.quant_to_unquant + 0);
|
||||
vint4 tab1 = vint4::load(qat.quant_to_unquant + 16);
|
||||
vint tab0p, tab1p;
|
||||
vtable_prepare(tab0, tab1, tab0p, tab1p);
|
||||
|
||||
|
||||
@@ -433,7 +433,7 @@ void store_image_block(
|
||||
|
||||
vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai);
|
||||
vmask store_mask = vint::lane_id() < vint(used_texels);
|
||||
store_lanes_masked(reinterpret_cast<int*>(data8_row), data_rgbai, store_mask);
|
||||
store_lanes_masked(data8_row, data_rgbai, store_mask);
|
||||
|
||||
data8_row += ASTCENC_SIMD_WIDTH * 4;
|
||||
idx += used_texels;
|
||||
|
||||
72
3rdparty/astc-encoder/source/astcenc_internal.h
vendored
72
3rdparty/astc-encoder/source/astcenc_internal.h
vendored
@@ -79,7 +79,7 @@ static constexpr unsigned int BLOCK_MAX_PARTITIONS { 4 };
|
||||
/** @brief The number of partitionings, per partition count, supported by the ASTC format. */
|
||||
static constexpr unsigned int BLOCK_MAX_PARTITIONINGS { 1024 };
|
||||
|
||||
/** @brief The maximum number of weights used during partition selection for texel clustering. */
|
||||
/** @brief The maximum number of texels used during partition selection for texel clustering. */
|
||||
static constexpr uint8_t BLOCK_MAX_KMEANS_TEXELS { 64 };
|
||||
|
||||
/** @brief The maximum number of weights a block can support. */
|
||||
@@ -119,11 +119,9 @@ static constexpr unsigned int WEIGHTS_MAX_DECIMATION_MODES { 87 };
|
||||
static constexpr float ERROR_CALC_DEFAULT { 1e30f };
|
||||
|
||||
/**
|
||||
* @brief The minimum texel count for a block to use the one partition fast path.
|
||||
*
|
||||
* This setting skips 4x4 and 5x4 block sizes.
|
||||
* @brief The minimum tuning setting threshold for the one partition fast path.
|
||||
*/
|
||||
static constexpr unsigned int TUNE_MIN_TEXELS_MODE0_FASTPATH { 24 };
|
||||
static constexpr float TUNE_MIN_SEARCH_MODE0 { 0.85f };
|
||||
|
||||
/**
|
||||
* @brief The maximum number of candidate encodings tested for each encoding mode.
|
||||
@@ -137,7 +135,7 @@ static constexpr unsigned int TUNE_MAX_TRIAL_CANDIDATES { 8 };
|
||||
*
|
||||
* This can be dynamically reduced by the compression quality preset.
|
||||
*/
|
||||
static constexpr unsigned int TUNE_MAX_PARTITIONING_CANDIDATES { 32 };
|
||||
static constexpr unsigned int TUNE_MAX_PARTITIONING_CANDIDATES { 8 };
|
||||
|
||||
/**
|
||||
* @brief The maximum quant level using full angular endpoint search method.
|
||||
@@ -1025,13 +1023,13 @@ struct dt_init_working_buffers
|
||||
struct quant_and_transfer_table
|
||||
{
|
||||
/** @brief The unscrambled unquantized value. */
|
||||
int8_t quant_to_unquant[32];
|
||||
uint8_t quant_to_unquant[32];
|
||||
|
||||
/** @brief The scrambling order: scrambled_quant = map[unscrambled_quant]. */
|
||||
int8_t scramble_map[32];
|
||||
uint8_t scramble_map[32];
|
||||
|
||||
/** @brief The unscrambling order: unscrambled_unquant = map[scrambled_quant]. */
|
||||
int8_t unscramble_and_unquant_map[32];
|
||||
uint8_t unscramble_and_unquant_map[32];
|
||||
|
||||
/**
|
||||
* @brief A table of previous-and-next weights, indexed by the current unquantized value.
|
||||
@@ -1060,7 +1058,7 @@ static constexpr uint8_t SYM_BTYPE_NONCONST { 3 };
|
||||
* @brief A symbolic representation of a compressed block.
|
||||
*
|
||||
* The symbolic representation stores the unpacked content of a single
|
||||
* @c physical_compressed_block, in a form which is much easier to access for
|
||||
* physical compressed block, in a form which is much easier to access for
|
||||
* the rest of the compressor code.
|
||||
*/
|
||||
struct symbolic_compressed_block
|
||||
@@ -1122,18 +1120,6 @@ struct symbolic_compressed_block
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief A physical representation of a compressed block.
|
||||
*
|
||||
* The physical representation stores the raw bytes of the format in memory.
|
||||
*/
|
||||
struct physical_compressed_block
|
||||
{
|
||||
/** @brief The ASTC encoded data for a single block. */
|
||||
uint8_t data[16];
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* @brief Parameter structure for @c compute_pixel_region_variance().
|
||||
*
|
||||
@@ -1848,6 +1834,34 @@ void unpack_color_endpoints(
|
||||
vint4& output0,
|
||||
vint4& output1);
|
||||
|
||||
/**
|
||||
* @brief Unpack an LDR RGBA color that uses delta encoding.
|
||||
*
|
||||
* @param input0 The packed endpoint 0 color.
|
||||
* @param input1 The packed endpoint 1 color deltas.
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
void rgba_delta_unpack(
|
||||
vint4 input0,
|
||||
vint4 input1,
|
||||
vint4& output0,
|
||||
vint4& output1);
|
||||
|
||||
/**
|
||||
* @brief Unpack an LDR RGBA color that uses direct encoding.
|
||||
*
|
||||
* @param input0 The packed endpoint 0 color.
|
||||
* @param input1 The packed endpoint 1 color.
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
void rgba_unpack(
|
||||
vint4 input0,
|
||||
vint4 input1,
|
||||
vint4& output0,
|
||||
vint4& output1);
|
||||
|
||||
/**
|
||||
* @brief Unpack a set of quantized and decimated weights.
|
||||
*
|
||||
@@ -2007,7 +2021,7 @@ void compute_angular_endpoints_2planes(
|
||||
void compress_block(
|
||||
const astcenc_contexti& ctx,
|
||||
const image_block& blk,
|
||||
physical_compressed_block& pcb,
|
||||
uint8_t pcb[16],
|
||||
compression_working_buffers& tmpbuf);
|
||||
|
||||
/**
|
||||
@@ -2100,12 +2114,12 @@ float compute_symbolic_block_difference_1plane_1partition(
|
||||
*
|
||||
* @param bsd The block size information.
|
||||
* @param scb The symbolic representation.
|
||||
* @param[out] pcb The binary encoded data.
|
||||
* @param[out] pcb The physical compressed block output.
|
||||
*/
|
||||
void symbolic_to_physical(
|
||||
const block_size_descriptor& bsd,
|
||||
const symbolic_compressed_block& scb,
|
||||
physical_compressed_block& pcb);
|
||||
uint8_t pcb[16]);
|
||||
|
||||
/**
|
||||
* @brief Convert a binary physical encoding into a symbolic representation.
|
||||
@@ -2114,12 +2128,12 @@ void symbolic_to_physical(
|
||||
* flagged as an error block if the encoding is invalid.
|
||||
*
|
||||
* @param bsd The block size information.
|
||||
* @param pcb The binary encoded data.
|
||||
* @param pcb The physical compressed block input.
|
||||
* @param[out] scb The output symbolic representation.
|
||||
*/
|
||||
void physical_to_symbolic(
|
||||
const block_size_descriptor& bsd,
|
||||
const physical_compressed_block& pcb,
|
||||
const uint8_t pcb[16],
|
||||
symbolic_compressed_block& scb);
|
||||
|
||||
/* ============================================================================
|
||||
@@ -2164,9 +2178,9 @@ template<typename T>
|
||||
void aligned_free(T* ptr)
|
||||
{
|
||||
#if defined(_WIN32)
|
||||
_aligned_free(reinterpret_cast<void*>(ptr));
|
||||
_aligned_free(ptr);
|
||||
#else
|
||||
free(reinterpret_cast<void*>(ptr));
|
||||
free(ptr);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -102,7 +102,7 @@ static inline void write_bits(
|
||||
void symbolic_to_physical(
|
||||
const block_size_descriptor& bsd,
|
||||
const symbolic_compressed_block& scb,
|
||||
physical_compressed_block& pcb
|
||||
uint8_t pcb[16]
|
||||
) {
|
||||
assert(scb.block_type != SYM_BTYPE_ERROR);
|
||||
|
||||
@@ -113,13 +113,13 @@ void symbolic_to_physical(
|
||||
static const uint8_t cbytes[8] { 0xFC, 0xFD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
|
||||
for (unsigned int i = 0; i < 8; i++)
|
||||
{
|
||||
pcb.data[i] = cbytes[i];
|
||||
pcb[i] = cbytes[i];
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++)
|
||||
{
|
||||
pcb.data[2 * i + 8] = scb.constant_color[i] & 0xFF;
|
||||
pcb.data[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
|
||||
pcb[2 * i + 8] = scb.constant_color[i] & 0xFF;
|
||||
pcb[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
|
||||
}
|
||||
|
||||
return;
|
||||
@@ -132,13 +132,13 @@ void symbolic_to_physical(
|
||||
static const uint8_t cbytes[8] { 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
|
||||
for (unsigned int i = 0; i < 8; i++)
|
||||
{
|
||||
pcb.data[i] = cbytes[i];
|
||||
pcb[i] = cbytes[i];
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++)
|
||||
{
|
||||
pcb.data[2 * i + 8] = scb.constant_color[i] & 0xFF;
|
||||
pcb.data[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
|
||||
pcb[2 * i + 8] = scb.constant_color[i] & 0xFF;
|
||||
pcb[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
|
||||
}
|
||||
|
||||
return;
|
||||
@@ -194,23 +194,23 @@ void symbolic_to_physical(
|
||||
|
||||
for (int i = 0; i < 16; i++)
|
||||
{
|
||||
pcb.data[i] = static_cast<uint8_t>(bitrev8(weightbuf[15 - i]));
|
||||
pcb[i] = static_cast<uint8_t>(bitrev8(weightbuf[15 - i]));
|
||||
}
|
||||
|
||||
write_bits(scb.block_mode, 11, 0, pcb.data);
|
||||
write_bits(partition_count - 1, 2, 11, pcb.data);
|
||||
write_bits(scb.block_mode, 11, 0, pcb);
|
||||
write_bits(partition_count - 1, 2, 11, pcb);
|
||||
|
||||
int below_weights_pos = 128 - bits_for_weights;
|
||||
|
||||
// Encode partition index and color endpoint types for blocks with 2+ partitions
|
||||
if (partition_count > 1)
|
||||
{
|
||||
write_bits(scb.partition_index, 6, 13, pcb.data);
|
||||
write_bits(scb.partition_index >> 6, PARTITION_INDEX_BITS - 6, 19, pcb.data);
|
||||
write_bits(scb.partition_index, 6, 13, pcb);
|
||||
write_bits(scb.partition_index >> 6, PARTITION_INDEX_BITS - 6, 19, pcb);
|
||||
|
||||
if (scb.color_formats_matched)
|
||||
{
|
||||
write_bits(scb.color_formats[0] << 2, 6, 13 + PARTITION_INDEX_BITS, pcb.data);
|
||||
write_bits(scb.color_formats[0] << 2, 6, 13 + PARTITION_INDEX_BITS, pcb);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -249,20 +249,20 @@ void symbolic_to_physical(
|
||||
int encoded_type_highpart = encoded_type >> 6;
|
||||
int encoded_type_highpart_size = (3 * partition_count) - 4;
|
||||
int encoded_type_highpart_pos = 128 - bits_for_weights - encoded_type_highpart_size;
|
||||
write_bits(encoded_type_lowpart, 6, 13 + PARTITION_INDEX_BITS, pcb.data);
|
||||
write_bits(encoded_type_highpart, encoded_type_highpart_size, encoded_type_highpart_pos, pcb.data);
|
||||
write_bits(encoded_type_lowpart, 6, 13 + PARTITION_INDEX_BITS, pcb);
|
||||
write_bits(encoded_type_highpart, encoded_type_highpart_size, encoded_type_highpart_pos, pcb);
|
||||
below_weights_pos -= encoded_type_highpart_size;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
write_bits(scb.color_formats[0], 4, 13, pcb.data);
|
||||
write_bits(scb.color_formats[0], 4, 13, pcb);
|
||||
}
|
||||
|
||||
// In dual-plane mode, encode the color component of the second plane of weights
|
||||
if (is_dual_plane)
|
||||
{
|
||||
write_bits(scb.plane2_component, 2, below_weights_pos - 2, pcb.data);
|
||||
write_bits(scb.plane2_component, 2, below_weights_pos - 2, pcb);
|
||||
}
|
||||
|
||||
// Encode the color components
|
||||
@@ -281,7 +281,7 @@ void symbolic_to_physical(
|
||||
valuecount_to_encode += vals;
|
||||
}
|
||||
|
||||
encode_ise(scb.get_color_quant_mode(), valuecount_to_encode, values_to_encode, pcb.data,
|
||||
encode_ise(scb.get_color_quant_mode(), valuecount_to_encode, values_to_encode, pcb,
|
||||
scb.partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS);
|
||||
}
|
||||
|
||||
@@ -290,7 +290,7 @@ void symbolic_to_physical(
|
||||
/* See header for documentation. */
|
||||
void physical_to_symbolic(
|
||||
const block_size_descriptor& bsd,
|
||||
const physical_compressed_block& pcb,
|
||||
const uint8_t pcb[16],
|
||||
symbolic_compressed_block& scb
|
||||
) {
|
||||
uint8_t bswapped[16];
|
||||
@@ -298,7 +298,7 @@ void physical_to_symbolic(
|
||||
scb.block_type = SYM_BTYPE_NONCONST;
|
||||
|
||||
// Extract header fields
|
||||
int block_mode = read_bits(11, 0, pcb.data);
|
||||
int block_mode = read_bits(11, 0, pcb);
|
||||
if ((block_mode & 0x1FF) == 0x1FC)
|
||||
{
|
||||
// Constant color block
|
||||
@@ -316,24 +316,24 @@ void physical_to_symbolic(
|
||||
scb.partition_count = 0;
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
scb.constant_color[i] = pcb.data[2 * i + 8] | (pcb.data[2 * i + 9] << 8);
|
||||
scb.constant_color[i] = pcb[2 * i + 8] | (pcb[2 * i + 9] << 8);
|
||||
}
|
||||
|
||||
// Additionally, check that the void-extent fields are valid
|
||||
if (bsd.zdim == 1)
|
||||
{
|
||||
// 2D void-extent
|
||||
int rsvbits = read_bits(2, 10, pcb.data);
|
||||
int rsvbits = read_bits(2, 10, pcb);
|
||||
if (rsvbits != 3)
|
||||
{
|
||||
scb.block_type = SYM_BTYPE_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
int vx_low_s = read_bits(8, 12, pcb.data) | (read_bits(5, 12 + 8, pcb.data) << 8);
|
||||
int vx_high_s = read_bits(8, 25, pcb.data) | (read_bits(5, 25 + 8, pcb.data) << 8);
|
||||
int vx_low_t = read_bits(8, 38, pcb.data) | (read_bits(5, 38 + 8, pcb.data) << 8);
|
||||
int vx_high_t = read_bits(8, 51, pcb.data) | (read_bits(5, 51 + 8, pcb.data) << 8);
|
||||
int vx_low_s = read_bits(8, 12, pcb) | (read_bits(5, 12 + 8, pcb) << 8);
|
||||
int vx_high_s = read_bits(8, 25, pcb) | (read_bits(5, 25 + 8, pcb) << 8);
|
||||
int vx_low_t = read_bits(8, 38, pcb) | (read_bits(5, 38 + 8, pcb) << 8);
|
||||
int vx_high_t = read_bits(8, 51, pcb) | (read_bits(5, 51 + 8, pcb) << 8);
|
||||
|
||||
int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF && vx_low_t == 0x1FFF && vx_high_t == 0x1FFF;
|
||||
|
||||
@@ -346,12 +346,12 @@ void physical_to_symbolic(
|
||||
else
|
||||
{
|
||||
// 3D void-extent
|
||||
int vx_low_s = read_bits(9, 10, pcb.data);
|
||||
int vx_high_s = read_bits(9, 19, pcb.data);
|
||||
int vx_low_t = read_bits(9, 28, pcb.data);
|
||||
int vx_high_t = read_bits(9, 37, pcb.data);
|
||||
int vx_low_p = read_bits(9, 46, pcb.data);
|
||||
int vx_high_p = read_bits(9, 55, pcb.data);
|
||||
int vx_low_s = read_bits(9, 10, pcb);
|
||||
int vx_high_s = read_bits(9, 19, pcb);
|
||||
int vx_low_t = read_bits(9, 28, pcb);
|
||||
int vx_high_t = read_bits(9, 37, pcb);
|
||||
int vx_low_p = read_bits(9, 46, pcb);
|
||||
int vx_high_p = read_bits(9, 55, pcb);
|
||||
|
||||
int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF && vx_low_t == 0x1FF && vx_high_t == 0x1FF && vx_low_p == 0x1FF && vx_high_p == 0x1FF;
|
||||
|
||||
@@ -383,7 +383,7 @@ void physical_to_symbolic(
|
||||
|
||||
int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count;
|
||||
|
||||
int partition_count = read_bits(2, 11, pcb.data) + 1;
|
||||
int partition_count = read_bits(2, 11, pcb) + 1;
|
||||
promise(partition_count > 0);
|
||||
|
||||
scb.block_mode = static_cast<uint16_t>(block_mode);
|
||||
@@ -391,7 +391,7 @@ void physical_to_symbolic(
|
||||
|
||||
for (int i = 0; i < 16; i++)
|
||||
{
|
||||
bswapped[i] = static_cast<uint8_t>(bitrev8(pcb.data[15 - i]));
|
||||
bswapped[i] = static_cast<uint8_t>(bitrev8(pcb[15 - i]));
|
||||
}
|
||||
|
||||
int bits_for_weights = get_ise_sequence_bitcount(real_weight_count, weight_quant_method);
|
||||
@@ -432,14 +432,15 @@ void physical_to_symbolic(
|
||||
int encoded_type_highpart_size = 0;
|
||||
if (partition_count == 1)
|
||||
{
|
||||
color_formats[0] = read_bits(4, 13, pcb.data);
|
||||
color_formats[0] = read_bits(4, 13, pcb);
|
||||
scb.partition_index = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
encoded_type_highpart_size = (3 * partition_count) - 4;
|
||||
below_weights_pos -= encoded_type_highpart_size;
|
||||
int encoded_type = read_bits(6, 13 + PARTITION_INDEX_BITS, pcb.data) | (read_bits(encoded_type_highpart_size, below_weights_pos, pcb.data) << 6);
|
||||
int encoded_type = read_bits(6, 13 + PARTITION_INDEX_BITS, pcb) |
|
||||
(read_bits(encoded_type_highpart_size, below_weights_pos, pcb) << 6);
|
||||
int baseclass = encoded_type & 0x3;
|
||||
if (baseclass == 0)
|
||||
{
|
||||
@@ -469,7 +470,8 @@ void physical_to_symbolic(
|
||||
bitpos += 2;
|
||||
}
|
||||
}
|
||||
scb.partition_index = static_cast<uint16_t>(read_bits(6, 13, pcb.data) | (read_bits(PARTITION_INDEX_BITS - 6, 19, pcb.data) << 6));
|
||||
scb.partition_index = static_cast<uint16_t>(read_bits(6, 13, pcb) |
|
||||
(read_bits(PARTITION_INDEX_BITS - 6, 19, pcb) << 6));
|
||||
}
|
||||
|
||||
for (int i = 0; i < partition_count; i++)
|
||||
@@ -515,7 +517,7 @@ void physical_to_symbolic(
|
||||
scb.quant_mode = static_cast<quant_method>(color_quant_level);
|
||||
|
||||
uint8_t values_to_decode[32];
|
||||
decode_ise(static_cast<quant_method>(color_quant_level), color_integer_count, pcb.data,
|
||||
decode_ise(static_cast<quant_method>(color_quant_level), color_integer_count, pcb,
|
||||
values_to_decode, (partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS));
|
||||
|
||||
int valuecount_to_decode = 0;
|
||||
@@ -534,6 +536,6 @@ void physical_to_symbolic(
|
||||
scb.plane2_component = -1;
|
||||
if (is_dual_plane)
|
||||
{
|
||||
scb.plane2_component = static_cast<int8_t>(read_bits(2, below_weights_pos - 2, pcb.data));
|
||||
scb.plane2_component = static_cast<int8_t>(read_bits(2, below_weights_pos - 2, pcb));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -241,6 +241,14 @@ struct vint8
|
||||
return vint8(_mm256_broadcastd_epi32(a));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Factory that returns a vector loaded from unaligned memory.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vint8 load(const uint8_t* p)
|
||||
{
|
||||
return vint8(_mm256_lddqu_si256(reinterpret_cast<const __m256i*>(p)));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Factory that returns a vector loaded from 32B aligned memory.
|
||||
*/
|
||||
@@ -1000,7 +1008,7 @@ ASTCENC_SIMD_INLINE vint8 float_to_int(vfloat8 a)
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint8 float_to_int_rtn(vfloat8 a)
|
||||
{
|
||||
a = round(a);
|
||||
a = a + vfloat8(0.5f);
|
||||
return vint8(_mm256_cvttps_epi32(a.m));
|
||||
}
|
||||
|
||||
@@ -1152,9 +1160,9 @@ ASTCENC_SIMD_INLINE vint8 interleave_rgba8(vint8 r, vint8 g, vint8 b, vint8 a)
|
||||
*
|
||||
* All masked lanes must be at the end of vector, after all non-masked lanes.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint8 data, vmask8 mask)
|
||||
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint8 data, vmask8 mask)
|
||||
{
|
||||
_mm256_maskstore_epi32(base, _mm256_castps_si256(mask.m), data.m);
|
||||
_mm256_maskstore_epi32(reinterpret_cast<int*>(base), _mm256_castps_si256(mask.m), data.m);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2019-2022 Arm Limited
|
||||
// Copyright 2019-2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -38,6 +38,7 @@
|
||||
#endif
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
|
||||
// ============================================================================
|
||||
// vfloat4 data type
|
||||
@@ -269,6 +270,16 @@ struct vint4
|
||||
return vint4(*p);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Factory that returns a vector loaded from unaligned memory.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
|
||||
{
|
||||
vint4 data;
|
||||
std::memcpy(&data.m, p, 4 * sizeof(int));
|
||||
return data;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Factory that returns a vector loaded from 16B aligned memory.
|
||||
*/
|
||||
@@ -584,6 +595,14 @@ ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
|
||||
vst1q_s32(p, a.m);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store a vector to an unaligned memory address.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
|
||||
{
|
||||
std::memcpy(p, &a.m, sizeof(int) * 4);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store lowest N (vector width) bytes into an unaligned address.
|
||||
*/
|
||||
@@ -849,7 +868,7 @@ ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
|
||||
{
|
||||
a = round(a);
|
||||
a = a + vfloat4(0.5f);
|
||||
return vint4(vcvtq_s32_f32(a.m));
|
||||
}
|
||||
|
||||
@@ -1027,31 +1046,39 @@ ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
|
||||
return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store a single vector lane to an unaligned address.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
|
||||
{
|
||||
std::memcpy(base, &data, sizeof(int));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store a vector, skipping masked lanes.
|
||||
*
|
||||
* All masked lanes must be at the end of vector, after all non-masked lanes.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint4 data, vmask4 mask)
|
||||
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
|
||||
{
|
||||
if (mask.lane<3>())
|
||||
{
|
||||
store(data, base);
|
||||
}
|
||||
else if (mask.lane<2>())
|
||||
else if (mask.lane<2>() != 0.0f)
|
||||
{
|
||||
base[0] = data.lane<0>();
|
||||
base[1] = data.lane<1>();
|
||||
base[2] = data.lane<2>();
|
||||
store_lane(base + 0, data.lane<0>());
|
||||
store_lane(base + 4, data.lane<1>());
|
||||
store_lane(base + 8, data.lane<2>());
|
||||
}
|
||||
else if (mask.lane<1>())
|
||||
else if (mask.lane<1>() != 0.0f)
|
||||
{
|
||||
base[0] = data.lane<0>();
|
||||
base[1] = data.lane<1>();
|
||||
store_lane(base + 0, data.lane<0>());
|
||||
store_lane(base + 4, data.lane<1>());
|
||||
}
|
||||
else if (mask.lane<0>())
|
||||
else if (mask.lane<0>() != 0.0f)
|
||||
{
|
||||
base[0] = data.lane<0>();
|
||||
store_lane(base + 0, data.lane<0>());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -275,6 +275,16 @@ struct vint4
|
||||
return vint4(*p);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Factory that returns a vector loaded from unaligned memory.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
|
||||
{
|
||||
vint4 data;
|
||||
std::memcpy(&data.m, p, 4 * sizeof(int));
|
||||
return data;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Factory that returns a vector loaded from 16B aligned memory.
|
||||
*/
|
||||
@@ -644,13 +654,20 @@ ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
|
||||
p[3] = a.m[3];
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store a vector to an unaligned memory address.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
|
||||
{
|
||||
std::memcpy(p, a.m, sizeof(int) * 4);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store lowest N (vector width) bytes into an unaligned address.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
|
||||
{
|
||||
int* pi = reinterpret_cast<int*>(p);
|
||||
*pi = a.m[0];
|
||||
std::memcpy(p, a.m, sizeof(uint8_t) * 4);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -963,10 +980,11 @@ ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
|
||||
{
|
||||
return vint4(static_cast<int>(a.m[0] + 0.5f),
|
||||
static_cast<int>(a.m[1] + 0.5f),
|
||||
static_cast<int>(a.m[2] + 0.5f),
|
||||
static_cast<int>(a.m[3] + 0.5f));
|
||||
a = a + vfloat4(0.5f);
|
||||
return vint4(static_cast<int>(a.m[0]),
|
||||
static_cast<int>(a.m[1]),
|
||||
static_cast<int>(a.m[2]),
|
||||
static_cast<int>(a.m[3]));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -1030,7 +1048,7 @@ ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
|
||||
ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
|
||||
{
|
||||
vint4 r;
|
||||
memcpy(r.m, a.m, 4 * 4);
|
||||
std::memcpy(r.m, a.m, 4 * 4);
|
||||
return r;
|
||||
}
|
||||
|
||||
@@ -1044,7 +1062,7 @@ ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
|
||||
ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 a)
|
||||
{
|
||||
vfloat4 r;
|
||||
memcpy(r.m, a.m, 4 * 4);
|
||||
std::memcpy(r.m, a.m, 4 * 4);
|
||||
return r;
|
||||
}
|
||||
|
||||
@@ -1079,12 +1097,13 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
|
||||
* @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
|
||||
{
|
||||
uint8_t table[16];
|
||||
storea(t0, reinterpret_cast<int*>(table + 0));
|
||||
|
||||
std::memcpy(table + 0, t0.m, 4 * sizeof(int));
|
||||
|
||||
return vint4(table[idx.lane<0>()],
|
||||
table[idx.lane<1>()],
|
||||
@@ -1099,8 +1118,9 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
|
||||
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
|
||||
{
|
||||
uint8_t table[32];
|
||||
storea(t0, reinterpret_cast<int*>(table + 0));
|
||||
storea(t1, reinterpret_cast<int*>(table + 16));
|
||||
|
||||
std::memcpy(table + 0, t0.m, 4 * sizeof(int));
|
||||
std::memcpy(table + 16, t1.m, 4 * sizeof(int));
|
||||
|
||||
return vint4(table[idx.lane<0>()],
|
||||
table[idx.lane<1>()],
|
||||
@@ -1114,10 +1134,11 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
|
||||
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
|
||||
{
|
||||
uint8_t table[64];
|
||||
storea(t0, reinterpret_cast<int*>(table + 0));
|
||||
storea(t1, reinterpret_cast<int*>(table + 16));
|
||||
storea(t2, reinterpret_cast<int*>(table + 32));
|
||||
storea(t3, reinterpret_cast<int*>(table + 48));
|
||||
|
||||
std::memcpy(table + 0, t0.m, 4 * sizeof(int));
|
||||
std::memcpy(table + 16, t1.m, 4 * sizeof(int));
|
||||
std::memcpy(table + 32, t2.m, 4 * sizeof(int));
|
||||
std::memcpy(table + 48, t3.m, 4 * sizeof(int));
|
||||
|
||||
return vint4(table[idx.lane<0>()],
|
||||
table[idx.lane<1>()],
|
||||
@@ -1138,12 +1159,21 @@ ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
|
||||
return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store a single vector lane to an unaligned address.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
|
||||
{
|
||||
std::memcpy(base, &data, sizeof(int));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store a vector, skipping masked lanes.
|
||||
*
|
||||
* All masked lanes must be at the end of vector, after all non-masked lanes.
|
||||
* The destination is a byte array of at least 4 bytes per unmasked entry.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint4 data, vmask4 mask)
|
||||
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
|
||||
{
|
||||
if (mask.m[3])
|
||||
{
|
||||
@@ -1151,18 +1181,18 @@ ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint4 data, vmask4 mask)
|
||||
}
|
||||
else if (mask.m[2])
|
||||
{
|
||||
base[0] = data.lane<0>();
|
||||
base[1] = data.lane<1>();
|
||||
base[2] = data.lane<2>();
|
||||
store_lane(base + 0, data.lane<0>());
|
||||
store_lane(base + 4, data.lane<1>());
|
||||
store_lane(base + 8, data.lane<2>());
|
||||
}
|
||||
else if (mask.m[1])
|
||||
{
|
||||
base[0] = data.lane<0>();
|
||||
base[1] = data.lane<1>();
|
||||
store_lane(base + 0, data.lane<0>());
|
||||
store_lane(base + 4, data.lane<1>());
|
||||
}
|
||||
else if (mask.m[0])
|
||||
{
|
||||
base[0] = data.lane<0>();
|
||||
store_lane(base + 0, data.lane<0>());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -39,6 +39,7 @@
|
||||
#endif
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
|
||||
// ============================================================================
|
||||
// vfloat4 data type
|
||||
@@ -292,6 +293,18 @@ struct vint4
|
||||
return vint4(*p);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Factory that returns a vector loaded from unaligned memory.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
|
||||
{
|
||||
#if ASTCENC_SSE >= 41
|
||||
return vint4(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(p)));
|
||||
#else
|
||||
return vint4(_mm_loadu_si128(reinterpret_cast<const __m128i*>(p)));
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Factory that returns a vector loaded from 16B aligned memory.
|
||||
*/
|
||||
@@ -633,6 +646,14 @@ ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
|
||||
_mm_storeu_ps(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store a vector to an unaligned memory address.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
|
||||
{
|
||||
std::memcpy(p, &a.m, sizeof(int) * 4);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store lowest N (vector width) bytes into an unaligned address.
|
||||
*/
|
||||
@@ -934,7 +955,7 @@ ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
|
||||
{
|
||||
a = round(a);
|
||||
a = a + vfloat4(0.5f);
|
||||
return vint4(_mm_cvttps_epi32(a.m));
|
||||
}
|
||||
|
||||
@@ -1087,8 +1108,9 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
|
||||
__m128i result = _mm_shuffle_epi8(t0.m, idxx);
|
||||
return vint4(result);
|
||||
#else
|
||||
alignas(ASTCENC_VECALIGN) uint8_t table[16];
|
||||
storea(t0, reinterpret_cast<int*>(table + 0));
|
||||
uint8_t table[16];
|
||||
|
||||
std::memcpy(table + 0, &t0.m, 4 * sizeof(int));
|
||||
|
||||
return vint4(table[idx.lane<0>()],
|
||||
table[idx.lane<1>()],
|
||||
@@ -1114,9 +1136,10 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
|
||||
|
||||
return vint4(result);
|
||||
#else
|
||||
alignas(ASTCENC_VECALIGN) uint8_t table[32];
|
||||
storea(t0, reinterpret_cast<int*>(table + 0));
|
||||
storea(t1, reinterpret_cast<int*>(table + 16));
|
||||
uint8_t table[32];
|
||||
|
||||
std::memcpy(table + 0, &t0.m, 4 * sizeof(int));
|
||||
std::memcpy(table + 16, &t1.m, 4 * sizeof(int));
|
||||
|
||||
return vint4(table[idx.lane<0>()],
|
||||
table[idx.lane<1>()],
|
||||
@@ -1150,11 +1173,12 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3
|
||||
|
||||
return vint4(result);
|
||||
#else
|
||||
alignas(ASTCENC_VECALIGN) uint8_t table[64];
|
||||
storea(t0, reinterpret_cast<int*>(table + 0));
|
||||
storea(t1, reinterpret_cast<int*>(table + 16));
|
||||
storea(t2, reinterpret_cast<int*>(table + 32));
|
||||
storea(t3, reinterpret_cast<int*>(table + 48));
|
||||
uint8_t table[64];
|
||||
|
||||
std::memcpy(table + 0, &t0.m, 4 * sizeof(int));
|
||||
std::memcpy(table + 16, &t1.m, 4 * sizeof(int));
|
||||
std::memcpy(table + 32, &t2.m, 4 * sizeof(int));
|
||||
std::memcpy(table + 48, &t3.m, 4 * sizeof(int));
|
||||
|
||||
return vint4(table[idx.lane<0>()],
|
||||
table[idx.lane<1>()],
|
||||
@@ -1190,15 +1214,23 @@ ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store a single vector lane to an unaligned address.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
|
||||
{
|
||||
std::memcpy(base, &data, sizeof(int));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store a vector, skipping masked lanes.
|
||||
*
|
||||
* All masked lanes must be at the end of vector, after all non-masked lanes.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint4 data, vmask4 mask)
|
||||
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
|
||||
{
|
||||
#if ASTCENC_AVX >= 2
|
||||
_mm_maskstore_epi32(base, _mm_castps_si128(mask.m), data.m);
|
||||
_mm_maskstore_epi32(reinterpret_cast<int*>(base), _mm_castps_si128(mask.m), data.m);
|
||||
#else
|
||||
// Note - we cannot use _mm_maskmoveu_si128 as the underlying hardware doesn't guarantee
|
||||
// fault suppression on masked lanes so we can get page faults at the end of an image.
|
||||
@@ -1208,18 +1240,18 @@ ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint4 data, vmask4 mask)
|
||||
}
|
||||
else if (mask.lane<2>() != 0.0f)
|
||||
{
|
||||
base[0] = data.lane<0>();
|
||||
base[1] = data.lane<1>();
|
||||
base[2] = data.lane<2>();
|
||||
store_lane(base + 0, data.lane<0>());
|
||||
store_lane(base + 4, data.lane<1>());
|
||||
store_lane(base + 8, data.lane<2>());
|
||||
}
|
||||
else if (mask.lane<1>() != 0.0f)
|
||||
{
|
||||
base[0] = data.lane<0>();
|
||||
base[1] = data.lane<1>();
|
||||
store_lane(base + 0, data.lane<0>());
|
||||
store_lane(base + 4, data.lane<1>());
|
||||
}
|
||||
else if (mask.lane<0>() != 0.0f)
|
||||
{
|
||||
base[0] = data.lane<0>();
|
||||
store_lane(base + 0, data.lane<0>());
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user