ASTC encoding support

- Add 3rdparty/astc with lib version of the standard encoder - Add astc_compress call for ASTC formats - Add BIMG_CONFIG_ASTC_DECODE-gated decompression support. This is just for testing, the decompress code is currently too heavy to include in the core lib. - Add fourcc support for DDS decode so ASTC and other formats not covered by dxgi can be read - Add --formats option to texturec, lists all supported formats - Update genie files -- add astc to bimg_encode and remove redundant files from texturec
2026-02-17 20:52:38 +01:00 · 2018-07-23 19:05:11 +01:00
parent 746f1053d7
commit 03ad3921ef
36 changed files with 40258 additions and 46 deletions
--- a/3rdparty/astc/astc_averages_and_directions.cpp
+++ b/3rdparty/astc/astc_averages_and_directions.cpp
@@ -0,0 +1,627 @@
+/*----------------------------------------------------------------------------*/  
+/**
+ *	This confidential and proprietary software may be used only as
+ *	authorised by a licensing agreement from ARM Limited
+ *	(C) COPYRIGHT 2011-2012 ARM Limited
+ *	ALL RIGHTS RESERVED
+ *
+ *	The entire notice above must be reproduced on all authorised
+ *	copies and copies may only be made to the extent permitted
+ *	by a licensing agreement from ARM Limited.
+ *
+ *	@brief	Implements functions for finding dominant direction of a set of
+ *			colors, using ARM patent pending method.
+ */ 
+/*----------------------------------------------------------------------------*/ 
+
+#include "astc_codec_internals.h"
+
+#include <math.h>
+#include "mathlib.h"
+
+#ifdef DEBUG_CAPTURE_NAN
+	#ifndef _GNU_SOURCE
+		#define _GNU_SOURCE
+	#endif
+
+	#include <fenv.h>
+#endif
+
+/* routines to compute average colors and dominant directions for blocks with 3 and 4 components. */
+
+/*
+	for a full block, functions to compute averages and dominant directions. The averages and directions are computed separately for each partition.
+	We have separate versions for blocks with and without alpha, since the processing for blocks with alpha is significantly more expensive.
+	The direction vectors it produces are NOT normalized. 
+*/
+/**
+ *	For every partition of a block, compute the error-weighted average RGBA
+ *	color and an (un-normalized) estimate of the dominant color direction.
+ *
+ *	The direction is estimated per axis: the weighted offsets from the average
+ *	of all texels lying on the positive side of that axis are summed, and the
+ *	sum with the largest squared length wins.
+ *
+ *	@param pt					partitioning; supplies the texel list of each partition.
+ *	@param blk					block whose work_data holds four values per texel.
+ *	@param ewb					error weights; texel_weight[] is the per-texel weight.
+ *	@param color_scalefactors	per-partition scale applied to the stored average only.
+ *	@param averages				out: scaled weighted average per partition.
+ *	@param directions_rgba		out: dominant direction per partition.
+ *	@param directions_gba		out: .yzw of that direction.
+ *	@param directions_rba		out: .xzw of that direction.
+ *	@param directions_rga		out: .xyw of that direction.
+ *	@param directions_rgb		out: .xyz of that direction.
+ */
+void compute_averages_and_directions_rgba(const partition_info * pt,
+										  const imageblock * blk,
+										  const error_weight_block * ewb,
+										  const float4 * color_scalefactors,
+										  float4 * averages, float4 * directions_rgba, float3 * directions_gba, float3 * directions_rba, float3 * directions_rga, float3 * directions_rgb)
+{
+	const int num_partitions = pt->partition_count;
+
+	for (int part = 0; part < num_partitions; part++)
+	{
+		const uint8_t *texel_ids = pt->texels_of_partition[part];
+		const int num_texels = pt->texels_per_partition[part];
+
+		// Pass 1: weighted color sum and total weight of the partition.
+		float4 weighted_sum = float4(0, 0, 0, 0);
+		float weight_total = 0.0f;
+
+		for (int t = 0; t < num_texels; t++)
+		{
+			const int tix = texel_ids[t];
+			const int base = 4 * tix;
+			float w = ewb->texel_weight[tix];
+			float4 scaled_color = float4(blk->work_data[base],
+										 blk->work_data[base + 1],
+										 blk->work_data[base + 2],
+										 blk->work_data[base + 3]) * w;
+
+			weight_total += w;
+			weighted_sum = weighted_sum + scaled_color;
+		}
+
+		// The clamp keeps the division finite for partitions with (almost) no weight.
+		float4 mean = weighted_sum * 1.0f / MAX(weight_total, 1e-7f);
+		averages[part] = mean * color_scalefactors[part];
+
+		// Pass 2: one accumulator per axis, fed by texels on its positive side.
+		// Note that the unscaled mean is used here.
+		float4 acc_x = float4(0, 0, 0, 0);
+		float4 acc_y = float4(0, 0, 0, 0);
+		float4 acc_z = float4(0, 0, 0, 0);
+		float4 acc_w = float4(0, 0, 0, 0);
+
+		for (int t = 0; t < num_texels; t++)
+		{
+			const int tix = texel_ids[t];
+			const int base = 4 * tix;
+			float w = ewb->texel_weight[tix];
+			float4 offset = float4(blk->work_data[base],
+								   blk->work_data[base + 1],
+								   blk->work_data[base + 2],
+								   blk->work_data[base + 3]);
+			offset = (offset - mean) * w;
+
+			if (offset.x > 0.0f)
+				acc_x = acc_x + offset;
+			if (offset.y > 0.0f)
+				acc_y = acc_y + offset;
+			if (offset.z > 0.0f)
+				acc_z = acc_z + offset;
+			if (offset.w > 0.0f)
+				acc_w = acc_w + offset;
+		}
+
+		// Keep the accumulator with the largest squared length; ties keep the earlier axis.
+		float4 dominant = acc_x;
+		float dominant_len2 = dot(acc_x, acc_x);
+
+		float len2_y = dot(acc_y, acc_y);
+		if (len2_y > dominant_len2)
+		{
+			dominant = acc_y;
+			dominant_len2 = len2_y;
+		}
+
+		float len2_z = dot(acc_z, acc_z);
+		if (len2_z > dominant_len2)
+		{
+			dominant = acc_z;
+			dominant_len2 = len2_z;
+		}
+
+		float len2_w = dot(acc_w, acc_w);
+		if (len2_w > dominant_len2)
+		{
+			dominant = acc_w;
+			dominant_len2 = len2_w;
+		}
+
+		directions_rgba[part] = dominant;
+		directions_rgb[part] = dominant.xyz;
+		directions_rga[part] = dominant.xyw;
+		directions_rba[part] = dominant.xzw;
+		directions_gba[part] = dominant.yzw;
+	}
+}
+
+
+
+
+/**
+ *	RGB-only variant of the average/direction computation: alpha is ignored
+ *	and the per-texel weights come from ewb->texel_weight_rgb.
+ *
+ *	@param pt					partitioning; supplies the texel list of each partition.
+ *	@param blk					block whose work_data holds four values per texel.
+ *	@param ewb					error weights for the block.
+ *	@param color_scalefactors	per-partition scale; only .xyz is used, on the stored average.
+ *	@param averages				out: scaled weighted RGB average per partition.
+ *	@param directions_rgb		out: dominant RGB direction per partition (not normalized).
+ *	@param directions_rg		out: .xy of that direction.
+ *	@param directions_rb		out: .xz of that direction.
+ *	@param directions_gb		out: .yz of that direction.
+ */
+void compute_averages_and_directions_rgb(const partition_info * pt,
+										 const imageblock * blk,
+										 const error_weight_block * ewb,
+										 const float4 * color_scalefactors, float3 * averages, float3 * directions_rgb, float2 * directions_rg, float2 * directions_rb, float2 * directions_gb)
+{
+	const float *rgb_weights = ewb->texel_weight_rgb;
+	const int num_partitions = pt->partition_count;
+
+	for (int part = 0; part < num_partitions; part++)
+	{
+		const uint8_t *texel_ids = pt->texels_of_partition[part];
+		const int num_texels = pt->texels_per_partition[part];
+
+		// Pass 1: weighted color sum and total weight of the partition.
+		float3 weighted_sum = float3(0, 0, 0);
+		float weight_total = 0.0f;
+
+		for (int t = 0; t < num_texels; t++)
+		{
+			const int tix = texel_ids[t];
+			const int base = 4 * tix;
+			float w = rgb_weights[tix];
+			float3 scaled_color = float3(blk->work_data[base],
+										 blk->work_data[base + 1],
+										 blk->work_data[base + 2]) * w;
+
+			weight_total += w;
+			weighted_sum = weighted_sum + scaled_color;
+		}
+
+		float4 scale = color_scalefactors[part];
+		float3 mean = weighted_sum * 1.0f / MAX(weight_total, 1e-7f);
+		averages[part] = mean * scale.xyz;
+
+		// Pass 2: one accumulator per axis, fed by texels on its positive side.
+		float3 acc_x = float3(0, 0, 0);
+		float3 acc_y = float3(0, 0, 0);
+		float3 acc_z = float3(0, 0, 0);
+
+		for (int t = 0; t < num_texels; t++)
+		{
+			const int tix = texel_ids[t];
+			const int base = 4 * tix;
+			float w = rgb_weights[tix];
+			float3 offset = float3(blk->work_data[base],
+								   blk->work_data[base + 1],
+								   blk->work_data[base + 2]);
+			offset = (offset - mean) * w;
+
+			if (offset.x > 0.0f)
+				acc_x = acc_x + offset;
+			if (offset.y > 0.0f)
+				acc_y = acc_y + offset;
+			if (offset.z > 0.0f)
+				acc_z = acc_z + offset;
+		}
+
+		// Keep the accumulator with the largest squared length; ties keep the earlier axis.
+		float3 dominant = acc_x;
+		float dominant_len2 = dot(acc_x, acc_x);
+
+		float len2_y = dot(acc_y, acc_y);
+		if (len2_y > dominant_len2)
+		{
+			dominant = acc_y;
+			dominant_len2 = len2_y;
+		}
+
+		float len2_z = dot(acc_z, acc_z);
+		if (len2_z > dominant_len2)
+		{
+			dominant = acc_z;
+			dominant_len2 = len2_z;
+		}
+
+		directions_rgb[part] = dominant;
+		directions_gb[part] = dominant.yz;
+		directions_rb[part] = dominant.xz;
+		directions_rg[part] = dominant.xy;
+	}
+}
+
+/**
+ *	Average/direction computation for an arbitrary triple of color components.
+ *
+ *	The component triple selects which of the ewb->texel_weight_* arrays is
+ *	used; an unsupported triple raises ASTC_CODEC_INTERNAL_ERROR. Unlike the
+ *	fixed RGB/RGBA variants, a (near) zero direction is replaced by (1,1,1).
+ *
+ *	@param pt					partitioning; supplies the texel list of each partition.
+ *	@param blk					block whose work_data holds four values per texel.
+ *	@param ewb					error weights for the block.
+ *	@param color_scalefactors	per-partition scale applied to the stored average only.
+ *	@param component1			work_data offset (0..3) of the first component.
+ *	@param component2			work_data offset (0..3) of the second component.
+ *	@param component3			work_data offset (0..3) of the third component.
+ *	@param averages				out: scaled weighted average per partition.
+ *	@param directions			out: dominant direction per partition (not normalized).
+ */
+void compute_averages_and_directions_3_components(const partition_info * pt,
+												  const imageblock * blk,
+												  const error_weight_block * ewb,
+												  const float3 * color_scalefactors, int component1, int component2, int component3, float3 * averages, float3 * directions)
+{
+	// Select the weight array that matches the requested component triple.
+	const float *texel_weights;
+	if (component1 == 1 && component2 == 2 && component3 == 3)
+	{
+		texel_weights = ewb->texel_weight_gba;
+	}
+	else if (component1 == 0 && component2 == 2 && component3 == 3)
+	{
+		texel_weights = ewb->texel_weight_rba;
+	}
+	else if (component1 == 0 && component2 == 1 && component3 == 3)
+	{
+		texel_weights = ewb->texel_weight_rga;
+	}
+	else if (component1 == 0 && component2 == 1 && component3 == 2)
+	{
+		texel_weights = ewb->texel_weight_rgb;
+	}
+	else
+	{
+		// unsupported set of color components.
+		texel_weights = ewb->texel_weight_gba;
+		ASTC_CODEC_INTERNAL_ERROR;
+	}
+
+	const int num_partitions = pt->partition_count;
+
+	for (int part = 0; part < num_partitions; part++)
+	{
+		const uint8_t *texel_ids = pt->texels_of_partition[part];
+		const int num_texels = pt->texels_per_partition[part];
+
+		// Pass 1: weighted color sum and total weight of the partition.
+		float3 weighted_sum = float3(0, 0, 0);
+		float weight_total = 0.0f;
+
+		for (int t = 0; t < num_texels; t++)
+		{
+			const int tix = texel_ids[t];
+			float w = texel_weights[tix];
+			float3 scaled_color = float3(blk->work_data[4 * tix + component1],
+										 blk->work_data[4 * tix + component2],
+										 blk->work_data[4 * tix + component3]) * w;
+
+			weight_total += w;
+			weighted_sum = weighted_sum + scaled_color;
+		}
+
+		float3 scale = color_scalefactors[part];
+		float3 mean = weighted_sum * 1.0f / MAX(weight_total, 1e-7f);
+		averages[part] = mean * scale.xyz;
+
+		// Pass 2: one accumulator per axis, fed by texels on its positive side.
+		float3 acc_x = float3(0, 0, 0);
+		float3 acc_y = float3(0, 0, 0);
+		float3 acc_z = float3(0, 0, 0);
+
+		for (int t = 0; t < num_texels; t++)
+		{
+			const int tix = texel_ids[t];
+			float w = texel_weights[tix];
+			float3 offset = float3(blk->work_data[4 * tix + component1],
+								   blk->work_data[4 * tix + component2],
+								   blk->work_data[4 * tix + component3]);
+			offset = (offset - mean) * w;
+
+			if (offset.x > 0.0f)
+				acc_x = acc_x + offset;
+			if (offset.y > 0.0f)
+				acc_y = acc_y + offset;
+			if (offset.z > 0.0f)
+				acc_z = acc_z + offset;
+		}
+
+		// Keep the accumulator with the largest squared length; ties keep the earlier axis.
+		float3 dominant = acc_x;
+		float dominant_len2 = dot(acc_x, acc_x);
+
+		float len2_y = dot(acc_y, acc_y);
+		if (len2_y > dominant_len2)
+		{
+			dominant = acc_y;
+			dominant_len2 = len2_y;
+		}
+
+		float len2_z = dot(acc_z, acc_z);
+		if (len2_z > dominant_len2)
+		{
+			dominant = acc_z;
+			dominant_len2 = len2_z;
+		}
+
+		// Degenerate partition: fall back to the grey diagonal.
+		if (dot(dominant, dominant) < 1e-18)
+			dominant = float3(1, 1, 1);
+
+		directions[part] = dominant;
+	}
+}
+
+
+
+
+/**
+ *	Average/direction computation for a pair of color components.
+ *
+ *	Supported pairs are (0,1), (0,2) and (1,2), which select
+ *	ewb->texel_weight_rg, _rb and _gb respectively. Any other pair raises
+ *	ASTC_CODEC_INTERNAL_ERROR and terminates the process.
+ *
+ *	@param pt					partitioning; supplies the texel list of each partition.
+ *	@param blk					block whose work_data holds four values per texel.
+ *	@param ewb					error weights for the block.
+ *	@param color_scalefactors	per-partition scale applied to the stored average only.
+ *	@param component1			work_data offset of the first component.
+ *	@param component2			work_data offset of the second component.
+ *	@param averages				out: scaled weighted average per partition.
+ *	@param directions			out: dominant direction per partition (not normalized).
+ */
+void compute_averages_and_directions_2_components(const partition_info * pt,
+												  const imageblock * blk,
+												  const error_weight_block * ewb, const float2 * color_scalefactors, int component1, int component2, float2 * averages, float2 * directions)
+{
+	// Select the weight array that matches the requested component pair.
+	const float *texel_weights;
+	if (component1 == 0 && component2 == 1)
+	{
+		texel_weights = ewb->texel_weight_rg;
+	}
+	else if (component1 == 0 && component2 == 2)
+	{
+		texel_weights = ewb->texel_weight_rb;
+	}
+	else if (component1 == 1 && component2 == 2)
+	{
+		texel_weights = ewb->texel_weight_gb;
+	}
+	else
+	{
+		// unsupported set of color components.
+		texel_weights = ewb->texel_weight_rg;
+		ASTC_CODEC_INTERNAL_ERROR;
+		exit(1);
+	}
+
+	const int num_partitions = pt->partition_count;
+
+	for (int part = 0; part < num_partitions; part++)
+	{
+		const uint8_t *texel_ids = pt->texels_of_partition[part];
+		const int num_texels = pt->texels_per_partition[part];
+
+		// Pass 1: weighted color sum and total weight of the partition.
+		float2 weighted_sum = float2(0, 0);
+		float weight_total = 0.0f;
+
+		for (int t = 0; t < num_texels; t++)
+		{
+			const int tix = texel_ids[t];
+			float w = texel_weights[tix];
+			float2 scaled_color = float2(blk->work_data[4 * tix + component1],
+										 blk->work_data[4 * tix + component2]) * w;
+
+			weight_total += w;
+			weighted_sum = weighted_sum + scaled_color;
+		}
+
+		float2 scale = color_scalefactors[part];
+		float2 mean = weighted_sum * 1.0f / MAX(weight_total, 1e-7f);
+		averages[part] = mean * scale.xy;
+
+		// Pass 2: one accumulator per axis, fed by texels on its positive side.
+		float2 acc_x = float2(0, 0);
+		float2 acc_y = float2(0, 0);
+
+		for (int t = 0; t < num_texels; t++)
+		{
+			const int tix = texel_ids[t];
+			float w = texel_weights[tix];
+			float2 offset = float2(blk->work_data[4 * tix + component1],
+								   blk->work_data[4 * tix + component2]);
+			offset = (offset - mean) * w;
+
+			if (offset.x > 0.0f)
+				acc_x = acc_x + offset;
+			if (offset.y > 0.0f)
+				acc_y = acc_y + offset;
+		}
+
+		// Keep the accumulator with the larger squared length; a tie keeps the x one.
+		float2 dominant = acc_x;
+		float dominant_len2 = dot(acc_x, acc_x);
+
+		float len2_y = dot(acc_y, acc_y);
+		if (len2_y > dominant_len2)
+		{
+			dominant = acc_y;
+			dominant_len2 = len2_y;
+		}
+
+		directions[part] = dominant;
+	}
+}
+
+/* Token pasting helpers: PASTE(x,y) forwards to XPASTE(x,y), which joins the
+   two tokens with ##. The error-function macros below use PASTE to build the
+   ewb member name texel_weight_<suffix>. */
+#define XPASTE(x,y) x##y
+#define PASTE(x,y) XPASTE(x,y)
+
+/*
+	Generates a function that measures how well the texels of each partition
+	fit a two-component color line.
+
+	Macro arguments:
+		funcname	name of the generated function
+		c0_iwt		work_data offset of the first component
+		c1_iwt		work_data offset of the second component
+		c01_name	swizzle applied to ewb->error_weights[] for the two components
+		c01_rname	suffix selecting ewb->texel_weight_<suffix>
+
+	Generated function:
+		- for every texel, projects the color onto the partition's line
+		  (param = dot(point, l.bs), closest point = l.amod + param * l.bis)
+		  and accumulates the error-weighted squared distance;
+		- writes the spread of 'param' per partition to length_of_lines[],
+		  clamped to at least 1e-7;
+		- returns the accumulated error over all partitions.
+	When the block contains zero-weight texels, texels whose weight is not
+	above 1e-20 are skipped. The weight is looked up by texel index (iwt),
+	not by the position of the texel inside the partition list.
+*/
+#define TWO_COMPONENT_ERROR_FUNC( funcname, c0_iwt, c1_iwt, c01_name, c01_rname ) \
+float funcname( \
+	const partition_info *pt, \
+	const imageblock *blk, \
+	const error_weight_block *ewb, \
+	const processed_line2 *plines, \
+	float *length_of_lines \
+	) \
+	{ \
+	int i; \
+	float errorsum = 0.0f; \
+	int partition; \
+	for(partition=0; partition<pt->partition_count; partition++) \
+		{ \
+		const uint8_t *weights = pt->texels_of_partition[ partition ]; \
+		int texelcount = pt->texels_per_partition[ partition ]; \
+		float lowparam = 1e10f; \
+		float highparam = -1e10f; \
+		processed_line2 l = plines[partition]; \
+		if( ewb->contains_zeroweight_texels ) \
+			{ \
+			for(i=0;i<texelcount;i++) \
+				{ \
+				int iwt = weights[i]; \
+				/* per-texel arrays are indexed by texel index, not by list position */ \
+				float texel_weight = ewb-> PASTE(texel_weight_ , c01_rname) [iwt]; \
+				if( texel_weight > 1e-20f ) \
+					{ \
+					float2 point = float2(blk->work_data[4*iwt + c0_iwt], blk->work_data[4*iwt + c1_iwt] ); \
+					float param = dot( point, l.bs ); \
+					float2 rp1 = l.amod + param*l.bis; \
+					float2 dist = rp1 - point; \
+					float4 ews = ewb->error_weights[iwt]; \
+					errorsum += dot( ews. c01_name, dist*dist ); \
+					if( param < lowparam ) lowparam = param; \
+					if( param > highparam ) highparam = param; \
+					} \
+				} \
+			} \
+		else \
+			{ \
+			for(i=0;i<texelcount;i++) \
+				{ \
+				int iwt = weights[i]; \
+				float2 point = float2(blk->work_data[4*iwt + c0_iwt], blk->work_data[4*iwt + c1_iwt] ); \
+				float param = dot( point, l.bs ); \
+				float2 rp1 = l.amod + param*l.bis; \
+				float2 dist = rp1 - point; \
+				float4 ews = ewb->error_weights[iwt]; \
+				errorsum += dot( ews. c01_name, dist*dist ); \
+				if( param < lowparam ) lowparam = param; \
+				if( param > highparam ) highparam = param; \
+				} \
+			} \
+		float linelen = highparam - lowparam; \
+		/* written as !(x > y) so that a NaN or inverted range is clamped too */ \
+		if( !(linelen > 1e-7f) ) \
+			linelen = 1e-7f; \
+		length_of_lines[partition] = linelen; \
+		} \
+	return errorsum; \
+	}
+
+
+TWO_COMPONENT_ERROR_FUNC(compute_error_squared_rg, 0, 1, xy, rg)
+TWO_COMPONENT_ERROR_FUNC(compute_error_squared_rb, 0, 2, xz, rb)
+TWO_COMPONENT_ERROR_FUNC(compute_error_squared_gb, 1, 2, yz, gb)
+TWO_COMPONENT_ERROR_FUNC(compute_error_squared_ra, 0, 3, zw, ra)
+
+/*
+	Generates a function that computes the error across a tile when using a
+	particular set of three-component lines for a particular partitioning, and
+	that also computes the length of each color-space line.
+
+	Macro arguments:
+		funcname	name of the generated function
+		c0_iwt		work_data offset of the first component
+		c1_iwt		work_data offset of the second component
+		c2_iwt		work_data offset of the third component
+		c012_name	swizzle applied to ewb->error_weights[] for the three components
+		c012_rname	suffix selecting ewb->texel_weight_<suffix>
+
+	Generated function:
+		- for every texel, projects the color onto the partition's line
+		  (param = dot(point, l.bs), closest point = l.amod + param * l.bis)
+		  and accumulates the error-weighted squared distance;
+		- writes the spread of 'param' per partition to length_of_lines[],
+		  clamped to at least 1e-7;
+		- returns the accumulated error over all partitions.
+	When the block contains zero-weight texels, texels whose weight is not
+	above 1e-20 are skipped. The weight is looked up by texel index (iwt),
+	not by the position of the texel inside the partition list.
+*/
+#define THREE_COMPONENT_ERROR_FUNC( funcname, c0_iwt, c1_iwt, c2_iwt, c012_name, c012_rname ) \
+float funcname( \
+	const partition_info *pt, \
+	const imageblock *blk, \
+	const error_weight_block *ewb, \
+	const processed_line3 *plines, \
+	float *length_of_lines \
+	) \
+	{ \
+	int i; \
+	float errorsum = 0.0f; \
+	int partition; \
+	for(partition=0; partition<pt->partition_count; partition++) \
+		{ \
+		const uint8_t *weights = pt->texels_of_partition[ partition ]; \
+		int texelcount = pt->texels_per_partition[ partition ]; \
+		float lowparam = 1e10f; \
+		float highparam = -1e10f; \
+		processed_line3 l = plines[partition]; \
+		if( ewb->contains_zeroweight_texels ) \
+			{ \
+			for(i=0;i<texelcount;i++) \
+				{ \
+				int iwt = weights[i]; \
+				/* per-texel arrays are indexed by texel index, not by list position */ \
+				float texel_weight = ewb-> PASTE(texel_weight_ , c012_rname) [iwt]; \
+				if( texel_weight > 1e-20f ) \
+					{ \
+					float3 point = float3(blk->work_data[4*iwt + c0_iwt], blk->work_data[4*iwt + c1_iwt], blk->work_data[4*iwt + c2_iwt] ); \
+					float param = dot( point, l.bs ); \
+					float3 rp1 = l.amod + param*l.bis; \
+					float3 dist = rp1 - point; \
+					float4 ews = ewb->error_weights[iwt]; \
+					errorsum += dot( ews. c012_name, dist*dist ); \
+					if( param < lowparam ) lowparam = param; \
+					if( param > highparam ) highparam = param; \
+					} \
+				} \
+			} \
+		else \
+			{ \
+			for(i=0;i<texelcount;i++) \
+				{ \
+				int iwt = weights[i]; \
+				float3 point = float3(blk->work_data[4*iwt + c0_iwt], blk->work_data[4*iwt + c1_iwt], blk->work_data[4*iwt + c2_iwt] ); \
+				float param = dot( point, l.bs ); \
+				float3 rp1 = l.amod + param*l.bis; \
+				float3 dist = rp1 - point; \
+				float4 ews = ewb->error_weights[iwt]; \
+				errorsum += dot( ews. c012_name, dist*dist ); \
+				if( param < lowparam ) lowparam = param; \
+				if( param > highparam ) highparam = param; \
+				} \
+			} \
+		float linelen = highparam - lowparam; \
+		/* written as !(x > y) so that a NaN or inverted range is clamped too */ \
+		if( !(linelen > 1e-7f) ) \
+			linelen = 1e-7f; \
+		length_of_lines[partition] = linelen; \
+		} \
+	return errorsum; \
+	}
+/* Three-component error functions. Arguments: function name, the three
+   work_data component offsets, the matching error_weights swizzle, and the
+   texel_weight_<suffix> array suffix. */
+THREE_COMPONENT_ERROR_FUNC(compute_error_squared_gba, 1, 2, 3, yzw, gba)
+THREE_COMPONENT_ERROR_FUNC(compute_error_squared_rba, 0, 2, 3, xzw, rba)
+THREE_COMPONENT_ERROR_FUNC(compute_error_squared_rga, 0, 1, 3, xyw, rga)
+THREE_COMPONENT_ERROR_FUNC(compute_error_squared_rgb, 0, 1, 2, xyz, rgb)
+
+/**
+ *	Compute the error across a block when each partition is approximated by
+ *	the given four-component color line, and record the length of each line.
+ *
+ *	@param pt				partitioning; supplies the texel list of each partition.
+ *	@param blk				block whose work_data holds four values per texel.
+ *	@param ewb				error weights; texels whose texel_weight is not above
+ *							1e-20 are skipped when contains_zeroweight_texels is set.
+ *	@param plines			one processed line per partition.
+ *	@param length_of_lines	out: per partition, the spread of the projection
+ *							parameter, clamped to at least 1e-7.
+ *	@return					sum over all considered texels of the error-weighted
+ *							squared distance to the line.
+ */
+float compute_error_squared_rgba(const partition_info * pt,	// the partition that we use when computing the squared-error.
+								 const imageblock * blk, const error_weight_block * ewb, const processed_line4 * plines, float *length_of_lines)
+{
+	float total_error = 0.0f;
+
+	for (int part = 0; part < pt->partition_count; part++)
+	{
+		const uint8_t *texel_ids = pt->texels_of_partition[part];
+		const int num_texels = pt->texels_per_partition[part];
+
+		float min_param = 1e10;
+		float max_param = -1e10;
+
+		processed_line4 line = plines[part];
+
+		for (int t = 0; t < num_texels; t++)
+		{
+			const int tix = texel_ids[t];
+
+			// Only blocks flagged as containing zero-weight texels need the per-texel check.
+			if (ewb->contains_zeroweight_texels && !(ewb->texel_weight[tix] > 1e-20))
+				continue;
+
+			const int base = 4 * tix;
+			float4 color = float4(blk->work_data[base], blk->work_data[base + 1], blk->work_data[base + 2], blk->work_data[base + 3]);
+
+			// Project onto the line, then measure the weighted squared distance to the projection.
+			float param = dot(color, line.bs);
+			float4 nearest = line.amod + param * line.bis;
+			float4 delta = nearest - color;
+			float4 channel_weights = ewb->error_weights[tix];
+			total_error += dot(channel_weights, delta * delta);
+
+			if (param < min_param)
+				min_param = param;
+			if (param > max_param)
+				max_param = param;
+		}
+
+		// Written as !(x > y) so that a NaN or inverted range is clamped as well.
+		float span = max_param - min_param;
+		if (!(span > 1e-7f))
+			span = 1e-7f;
+		length_of_lines[part] = span;
+	}
+
+	return total_error;
+}
+
+
+
+// function to compute the error across a tile when using a particular line for
+// a particular partition.
+/**
+ *	Compute the RGB error of a single partition against one color line.
+ *
+ *	Every texel of the xdim*ydim*zdim block is visited; texels that belong to a
+ *	different partition, or whose texel_weight_rgb is below 1e-20, contribute
+ *	nothing.
+ *
+ *	@return	sum of the error-weighted squared distances to the line.
+ */
+float compute_error_squared_rgb_single_partition(int partition_to_test, int xdim, int ydim, int zdim, const partition_info * pt,	// the partition that we use when computing the squared-error.
+												 const imageblock * blk, const error_weight_block * ewb, const processed_line3 * lin	// the line for the partition.
+	)
+{
+	const int num_texels = xdim * ydim * zdim;
+	float total_error = 0.0f;
+
+	for (int tix = 0; tix < num_texels; tix++)
+	{
+		const bool in_partition = (pt->partition_of_texel[tix] == partition_to_test);
+		const float w = ewb->texel_weight_rgb[tix];
+
+		if (in_partition && !(w < 1e-20))
+		{
+			float3 color = float3(blk->work_data[4 * tix], blk->work_data[4 * tix + 1], blk->work_data[4 * tix + 2]);
+
+			// Project onto the line, then measure the weighted squared distance to the projection.
+			float param = dot(color, lin->bs);
+			float3 nearest = lin->amod + param * lin->bis;
+			float3 delta = nearest - color;
+			float4 channel_weights = ewb->error_weights[tix];
+
+			total_error += dot(channel_weights.xyz, delta * delta);
+		}
+	}
+
+	return total_error;
+}
--- a/3rdparty/astc/astc_block_sizes2.cpp
+++ b/3rdparty/astc/astc_block_sizes2.cpp
@@ -0,0 +1,977 @@
+/*----------------------------------------------------------------------------*/
+/**
+ *	This confidential and proprietary software may be used only as
+ *	authorised by a licensing agreement from ARM Limited
+ *	(C) COPYRIGHT 2011-2012 ARM Limited
+ *	ALL RIGHTS RESERVED
+ *
+ *	The entire notice above must be reproduced on all authorised
+ *	copies and copies may only be made to the extent permitted
+ *	by a licensing agreement from ARM Limited.
+ *
+ *	@brief	For ASTC, generate the block size descriptor and the associated
+ *			decimation tables.
+ */
+/*----------------------------------------------------------------------------*/
+
+#include "astc_codec_internals.h"
+/* Percentile tables, one per 2D block footprint: every combination of x and y
+   block dimension drawn from {4, 5, 6, 8, 10, 12}, 2048 floats each. They are
+   only declared here; the definitions live in another translation unit.
+   get_2d_percentile_table() selects among them. */
+extern const float percentile_table_4x4[2048];
+extern const float percentile_table_4x5[2048];
+extern const float percentile_table_4x6[2048];
+extern const float percentile_table_4x8[2048];
+extern const float percentile_table_4x10[2048];
+extern const float percentile_table_4x12[2048];
+extern const float percentile_table_5x4[2048];
+extern const float percentile_table_5x5[2048];
+extern const float percentile_table_5x6[2048];
+extern const float percentile_table_5x8[2048];
+extern const float percentile_table_5x10[2048];
+extern const float percentile_table_5x12[2048];
+extern const float percentile_table_6x4[2048];
+extern const float percentile_table_6x5[2048];
+extern const float percentile_table_6x6[2048];
+extern const float percentile_table_6x8[2048];
+extern const float percentile_table_6x10[2048];
+extern const float percentile_table_6x12[2048];
+extern const float percentile_table_8x4[2048];
+extern const float percentile_table_8x5[2048];
+extern const float percentile_table_8x6[2048];
+extern const float percentile_table_8x8[2048];
+extern const float percentile_table_8x10[2048];
+extern const float percentile_table_8x12[2048];
+extern const float percentile_table_10x4[2048];
+extern const float percentile_table_10x5[2048];
+extern const float percentile_table_10x6[2048];
+extern const float percentile_table_10x8[2048];
+extern const float percentile_table_10x10[2048];
+extern const float percentile_table_10x12[2048];
+extern const float percentile_table_12x4[2048];
+extern const float percentile_table_12x5[2048];
+extern const float percentile_table_12x6[2048];
+extern const float percentile_table_12x8[2048];
+extern const float percentile_table_12x10[2048];
+extern const float percentile_table_12x12[2048];
+
+// Map a block dimension to its row/column in the percentile table grid.
+// Returns -1 for a dimension that has no table.
+static int percentile_table_dim_index(int blockdim)
+{
+	switch (blockdim)
+	{
+	case 4:
+		return 0;
+	case 5:
+		return 1;
+	case 6:
+		return 2;
+	case 8:
+		return 3;
+	case 10:
+		return 4;
+	case 12:
+		return 5;
+	default:
+		return -1;
+	}
+}
+
+/**
+ *	Look up the percentile table for a 2D block footprint.
+ *
+ *	@param blockdim_x	block width; one of 4, 5, 6, 8, 10, 12.
+ *	@param blockdim_y	block height; one of 4, 5, 6, 8, 10, 12.
+ *	@return				the matching 2048-entry table, or NULL when either
+ *						dimension is unsupported (should never happen).
+ */
+const float *get_2d_percentile_table(int blockdim_x, int blockdim_y)
+{
+	// First index: block width, second index: block height.
+	static const float *const tables[6][6] = {
+		{percentile_table_4x4, percentile_table_4x5, percentile_table_4x6, percentile_table_4x8, percentile_table_4x10, percentile_table_4x12},
+		{percentile_table_5x4, percentile_table_5x5, percentile_table_5x6, percentile_table_5x8, percentile_table_5x10, percentile_table_5x12},
+		{percentile_table_6x4, percentile_table_6x5, percentile_table_6x6, percentile_table_6x8, percentile_table_6x10, percentile_table_6x12},
+		{percentile_table_8x4, percentile_table_8x5, percentile_table_8x6, percentile_table_8x8, percentile_table_8x10, percentile_table_8x12},
+		{percentile_table_10x4, percentile_table_10x5, percentile_table_10x6, percentile_table_10x8, percentile_table_10x10, percentile_table_10x12},
+		{percentile_table_12x4, percentile_table_12x5, percentile_table_12x6, percentile_table_12x8, percentile_table_12x10, percentile_table_12x12},
+	};
+
+	const int xi = percentile_table_dim_index(blockdim_x);
+	const int yi = percentile_table_dim_index(blockdim_y);
+
+	if (xi < 0 || yi < 0)
+		return NULL;			// should never happen.
+
+	return tables[xi][yi];
+}
+
+// 3D percentile tables are stubbed for the time being: every 3D footprint
+// shares one all-zero 2048-entry table.
+static const float dummy_percentile_table_3d[2048] = { 0.0f };
+
+/**
+ *	Look up the percentile table for a 3D block footprint.
+ *
+ *	The block dimensions are currently ignored; the shared zero-filled
+ *	placeholder table is always returned.
+ */
+const float *get_3d_percentile_table(int blockdim_x, int blockdim_y, int blockdim_z)
+{
+	IGNORE(blockdim_x);
+	IGNORE(blockdim_y);
+	IGNORE(blockdim_z);
+
+	const float *placeholder = dummy_percentile_table_3d;
+	return placeholder;
+}
+
+
+
+/*
+	Decode a 2D block mode word into its weight-grid description.
+
+	Returns 0 on invalid mode, 1 on valid mode. The output parameters are
+	written only for a valid mode:
+		*Nval, *Mval		the two weight-grid dimensions
+		*dual_weight_plane	value of the D bit (bit 10), forced to 0 for the A+6 x B+6 layout
+		*quant_mode			weight quantization mode, (range bits - 2) + 6 * H
+
+	A mode is also rejected when the resulting weight count or the number of
+	bits needed to store the weights falls outside the per-block limits.
+*/
+static int decode_block_mode_2d(int blockmode, int *Nval, int *Mval, int *dual_weight_plane, int *quant_mode)
+{
+	const int low_bits = blockmode & 3;
+	const int bits_2_3 = (blockmode >> 2) & 3;
+	const int field_a = (blockmode >> 5) & 0x3;
+
+	int range_bits = (blockmode >> 4) & 1;
+	int h_bit = (blockmode >> 9) & 1;
+	int d_bit = (blockmode >> 10) & 1;
+
+	int grid_n = 0;
+	int grid_m = 0;
+
+	if (low_bits != 0)
+	{
+		// Bits 0-1 carry the upper range bits; bits 2-3 select the grid layout.
+		range_bits |= low_bits << 1;
+		int field_b = (blockmode >> 7) & 3;
+
+		if (bits_2_3 == 0)
+		{
+			grid_n = field_b + 4;
+			grid_m = field_a + 2;
+		}
+		else if (bits_2_3 == 1)
+		{
+			grid_n = field_b + 8;
+			grid_m = field_a + 2;
+		}
+		else if (bits_2_3 == 2)
+		{
+			grid_n = field_a + 2;
+			grid_m = field_b + 8;
+		}
+		else
+		{
+			// Only bit 7 belongs to B here; bit 8 picks between the two layouts.
+			field_b &= 1;
+			if (blockmode & 0x100)
+			{
+				grid_n = field_b + 2;
+				grid_m = field_a + 2;
+			}
+			else
+			{
+				grid_n = field_a + 2;
+				grid_m = field_b + 6;
+			}
+		}
+	}
+	else
+	{
+		// Bits 0-1 are zero: the upper range bits move to bits 2-3 and must not both be zero.
+		range_bits |= bits_2_3 << 1;
+		if (bits_2_3 == 0)
+			return 0;
+
+		const int field_b = (blockmode >> 9) & 3;
+		const int layout = (blockmode >> 7) & 3;
+
+		if (layout == 0)
+		{
+			grid_n = 12;
+			grid_m = field_a + 2;
+		}
+		else if (layout == 1)
+		{
+			grid_n = field_a + 2;
+			grid_m = 12;
+		}
+		else if (layout == 2)
+		{
+			// Bits 9-10 are used for B in this layout, so D and H are cleared.
+			grid_n = field_a + 6;
+			grid_m = field_b + 6;
+			d_bit = 0;
+			h_bit = 0;
+		}
+		else
+		{
+			// Bits 5-6 choose between the two fixed grids; other values are invalid.
+			if (field_a == 0)
+			{
+				grid_n = 6;
+				grid_m = 10;
+			}
+			else if (field_a == 1)
+			{
+				grid_n = 10;
+				grid_m = 6;
+			}
+			else
+			{
+				return 0;
+			}
+		}
+	}
+
+	const int weight_count = grid_n * grid_m * (d_bit + 1);
+	const int qmode = (range_bits - 2) + 6 * h_bit;
+	const int weightbits = compute_ise_bitcount(weight_count, (quantization_method) qmode);
+
+	const int too_many_weights = weight_count > MAX_WEIGHTS_PER_BLOCK;
+	const int bits_out_of_range = weightbits < MIN_WEIGHT_BITS_PER_BLOCK || weightbits > MAX_WEIGHT_BITS_PER_BLOCK;
+	if (too_many_weights || bits_out_of_range)
+		return 0;
+
+	*Nval = grid_n;
+	*Mval = grid_m;
+	*dual_weight_plane = d_bit;
+	*quant_mode = qmode;
+	return 1;
+}
+
+
+/*
+	Decode a 3D block mode word into its weight-grid description.
+
+	Returns 0 on invalid mode, 1 on valid mode. The output parameters are
+	written only for a valid mode:
+		*Nval, *Mval, *Qval	the three weight-grid dimensions
+		*dual_weight_plane	value of the D bit (bit 10), cleared where bits 9-10 carry B
+		*quant_mode			weight quantization mode, (range bits - 2) + 6 * H
+
+	A mode is also rejected when the resulting weight count or the number of
+	bits needed to store the weights falls outside the per-block limits.
+*/
+static int decode_block_mode_3d(int blockmode, int *Nval, int *Mval, int *Qval, int *dual_weight_plane, int *quant_mode)
+{
+	const int low_bits = blockmode & 3;
+	const int bits_2_3 = (blockmode >> 2) & 3;
+	const int field_a = (blockmode >> 5) & 0x3;
+
+	int range_bits = (blockmode >> 4) & 1;
+	int h_bit = (blockmode >> 9) & 1;
+	int d_bit = (blockmode >> 10) & 1;
+
+	int grid_n = 0;
+	int grid_m = 0;
+	int grid_q = 0;
+
+	if (low_bits != 0)
+	{
+		// Bits 0-1 carry the upper range bits; A, B and C give the three grid sizes.
+		range_bits |= low_bits << 1;
+		const int field_b = (blockmode >> 7) & 3;
+		const int field_c = bits_2_3;
+		grid_n = field_a + 2;
+		grid_m = field_b + 2;
+		grid_q = field_c + 2;
+	}
+	else
+	{
+		// Bits 0-1 are zero: the upper range bits move to bits 2-3 and must not both be zero.
+		range_bits |= bits_2_3 << 1;
+		if (bits_2_3 == 0)
+			return 0;
+
+		const int field_b = (blockmode >> 9) & 3;
+		const int layout = (blockmode >> 7) & 3;
+
+		// Bits 9-10 carry B in every layout except 3, so D and H are cleared there.
+		if (layout != 3)
+		{
+			d_bit = 0;
+			h_bit = 0;
+		}
+
+		if (layout == 0)
+		{
+			grid_n = 6;
+			grid_m = field_b + 2;
+			grid_q = field_a + 2;
+		}
+		else if (layout == 1)
+		{
+			grid_n = field_a + 2;
+			grid_m = 6;
+			grid_q = field_b + 2;
+		}
+		else if (layout == 2)
+		{
+			grid_n = field_a + 2;
+			grid_m = field_b + 2;
+			grid_q = 6;
+		}
+		else
+		{
+			// A 2x2x2 grid with exactly one axis widened to 6; bits 5-6 pick the axis.
+			grid_n = 2;
+			grid_m = 2;
+			grid_q = 2;
+			if (field_a == 0)
+				grid_n = 6;
+			else if (field_a == 1)
+				grid_m = 6;
+			else if (field_a == 2)
+				grid_q = 6;
+			else
+				return 0;
+		}
+	}
+
+	const int weight_count = grid_n * grid_m * grid_q * (d_bit + 1);
+	const int qmode = (range_bits - 2) + 6 * h_bit;
+	const int weightbits = compute_ise_bitcount(weight_count, (quantization_method) qmode);
+
+	const int too_many_weights = weight_count > MAX_WEIGHTS_PER_BLOCK;
+	const int bits_out_of_range = weightbits < MIN_WEIGHT_BITS_PER_BLOCK || weightbits > MAX_WEIGHT_BITS_PER_BLOCK;
+	if (too_many_weights || bits_out_of_range)
+		return 0;
+
+	*Nval = grid_n;
+	*Mval = grid_m;
+	*Qval = grid_q;
+	*dual_weight_plane = d_bit;
+	*quant_mode = qmode;
+	return 1;
+}
+
+
+
+/* Fill a decimation table for a 2D block of xdim x ydim texels whose weights
+   live on an x_weights x y_weights grid.
+
+   For each texel a fixed-point position in the weight grid is computed (low
+   4 bits: fraction, remaining bits: integer grid coordinate). The texel is
+   then expressed as a bilinear blend of the four surrounding grid points
+   with integer weights that sum to 16. Only non-zero contributions are
+   recorded, both texel -> weights and weight -> texels, and the result is
+   copied into *dt.
+
+   NOTE(review): the position formula divides by (xdim - 1) and (ydim - 1),
+   so both block dimensions are assumed to be greater than 1 -- confirm
+   against callers. */
+static void initialize_decimation_table_2d(
+											  // dimensions of the block
+											  int xdim, int ydim,
+											  // number of grid points in 2d weight grid
+											  int x_weights, int y_weights, decimation_table * dt)
+{
+	int i, j;
+	int x, y;
+
+	int texels_per_block = xdim * ydim;
+	int weights_per_block = x_weights * y_weights;
+
+	int weightcount_of_texel[MAX_TEXELS_PER_BLOCK];	// number of grid weights recorded for each texel
+	int grid_weights_of_texel[MAX_TEXELS_PER_BLOCK][4];	// grid-point indices contributing to each texel
+	int weights_of_texel[MAX_TEXELS_PER_BLOCK][4];	// matching integer blend weights
+
+	int texelcount_of_weight[MAX_WEIGHTS_PER_BLOCK];	// number of texels recorded for each grid weight
+	int texels_of_weight[MAX_WEIGHTS_PER_BLOCK][MAX_TEXELS_PER_BLOCK];	// texels influenced by each grid weight
+	int texelweights_of_weight[MAX_WEIGHTS_PER_BLOCK][MAX_TEXELS_PER_BLOCK];	// matching integer blend weights
+
+	for (i = 0; i < weights_per_block; i++)
+		texelcount_of_weight[i] = 0;
+	for (i = 0; i < texels_per_block; i++)
+		weightcount_of_texel[i] = 0;
+
+	for (y = 0; y < ydim; y++)
+		for (x = 0; x < xdim; x++)
+		{
+			int texel = y * xdim + x;
+
+			// fixed-point position of the texel in the weight grid, 4 fractional bits.
+			int x_weight = (((1024 + xdim / 2) / (xdim - 1)) * x * (x_weights - 1) + 32) >> 6;
+			int y_weight = (((1024 + ydim / 2) / (ydim - 1)) * y * (y_weights - 1) + 32) >> 6;
+
+			int x_weight_frac = x_weight & 0xF;
+			int y_weight_frac = y_weight & 0xF;
+			int x_weight_int = x_weight >> 4;
+			int y_weight_int = y_weight >> 4;
+			int qweight[4];
+			int weight[4];
+			qweight[0] = x_weight_int + y_weight_int * x_weights;	// grid point at (x_int, y_int)
+			qweight[1] = qweight[0] + 1;	// next grid point in x
+			qweight[2] = qweight[0] + x_weights;	// next grid point in y
+			qweight[3] = qweight[2] + 1;	// next grid point in both x and y
+
+			// truncated-precision bilinear interpolation.
+			int prod = x_weight_frac * y_weight_frac;
+
+			weight[3] = (prod + 8) >> 4;
+			weight[1] = x_weight_frac - weight[3];
+			weight[2] = y_weight_frac - weight[3];
+			weight[0] = 16 - x_weight_frac - y_weight_frac + weight[3];	// the four weights always sum to 16
+
+			// record only non-zero contributions; a grid index whose weight is zero is never used.
+			for (i = 0; i < 4; i++)
+				if (weight[i] != 0)
+				{
+					grid_weights_of_texel[texel][weightcount_of_texel[texel]] = qweight[i];
+					weights_of_texel[texel][weightcount_of_texel[texel]] = weight[i];
+					weightcount_of_texel[texel]++;
+					texels_of_weight[qweight[i]][texelcount_of_weight[qweight[i]]] = texel;
+					texelweights_of_weight[qweight[i]][texelcount_of_weight[qweight[i]]] = weight[i];
+					texelcount_of_weight[qweight[i]]++;
+				}
+		}
+
+	for (i = 0; i < texels_per_block; i++)
+	{
+		dt->texel_num_weights[i] = weightcount_of_texel[i];
+
+		// ensure that all 4 entries are actually initialized.
+		// This allows a branch-free implementation of compute_value_of_texel_flt()
+		for (j = 0; j < 4; j++)
+		{
+			dt->texel_weights_int[i][j] = 0;
+			dt->texel_weights_float[i][j] = 0.0f;
+			dt->texel_weights[i][j] = 0;
+		}
+
+		for (j = 0; j < weightcount_of_texel[i]; j++)
+		{
+			dt->texel_weights_int[i][j] = weights_of_texel[i][j];
+			dt->texel_weights_float[i][j] = static_cast < float >(weights_of_texel[i][j]) * (1.0f / TEXEL_WEIGHT_SUM);	// scaled by 1/TEXEL_WEIGHT_SUM (presumably 16 -- confirm)
+			dt->texel_weights[i][j] = grid_weights_of_texel[i][j];
+		}
+	}
+
+	for (i = 0; i < weights_per_block; i++)
+	{
+		dt->weight_num_texels[i] = texelcount_of_weight[i];
+
+
+		for (j = 0; j < texelcount_of_weight[i]; j++)
+		{
+			dt->weight_texel[i][j] = texels_of_weight[i][j];
+			dt->weights_int[i][j] = texelweights_of_weight[i][j];
+			dt->weights_flt[i][j] = static_cast < float >(texelweights_of_weight[i][j]);	// stored as the raw integer weight, not divided by the weight sum
+		}
+	}
+
+	dt->num_texels = texels_per_block;
+	dt->num_weights = weights_per_block;
+
+
+}
+
+
+
+
+static void initialize_decimation_table_3d(
+											  // dimensions of the block
+											  int xdim, int ydim, int zdim,
+											  // number of grid points in 3d weight grid
+											  int x_weights, int y_weights, int z_weights, decimation_table * dt)
+{
+	// Builds, for a 3D block, the two-way mapping between texels and the points of
+	// an x_weights * y_weights * z_weights weight grid, and stores it in *dt.
+	// Every texel draws on at most four grid points.
+	const int texel_total = xdim * ydim * zdim;
+	const int grid_total = x_weights * y_weights * z_weights;
+
+	// texel -> grid point scratch lists
+	int texel_fanin[MAX_TEXELS_PER_BLOCK];
+	int texel_grid_index[MAX_TEXELS_PER_BLOCK][4];
+	int texel_grid_factor[MAX_TEXELS_PER_BLOCK][4];
+
+	// grid point -> texel scratch lists
+	int grid_fanout[MAX_WEIGHTS_PER_BLOCK];
+	int grid_texel_index[MAX_WEIGHTS_PER_BLOCK][MAX_TEXELS_PER_BLOCK];
+	int grid_texel_factor[MAX_WEIGHTS_PER_BLOCK][MAX_TEXELS_PER_BLOCK];
+
+	for (int t = 0; t < texel_total; t++)
+		texel_fanin[t] = 0;
+	for (int g = 0; g < grid_total; g++)
+		grid_fanout[g] = 0;
+
+	// index strides through the weight grid along y and z (the x stride is 1)
+	const int stride_y = x_weights;
+	const int stride_z = x_weights * y_weights;
+
+	// x is the innermost loop, so this running counter equals (z * ydim + y) * xdim + x
+	int texel = 0;
+	for (int z = 0; z < zdim; z++)
+		for (int y = 0; y < ydim; y++)
+			for (int x = 0; x < xdim; x++, texel++)
+			{
+				// texel position inside the weight grid, with 4 fractional bits
+				const int gx = (((1024 + xdim / 2) / (xdim - 1)) * x * (x_weights - 1) + 32) >> 6;
+				const int gy = (((1024 + ydim / 2) / (ydim - 1)) * y * (y_weights - 1) + 32) >> 6;
+				const int gz = (((1024 + zdim / 2) / (zdim - 1)) * z * (z_weights - 1) + 32) >> 6;
+
+				const int fs = gx & 0xF;
+				const int ft = gy & 0xF;
+				const int fp = gz & 0xF;
+				const int ix = gx >> 4;
+				const int iy = gy >> 4;
+				const int iz = gz >> 4;
+
+				// first and last vertex: the grid point at the integer position and the diagonally opposite one
+				const int base_point = (iz * y_weights + iy) * x_weights + ix;
+				const int opposite_point = ((iz + 1) * y_weights + (iy + 1)) * x_weights + (ix + 1);
+
+				// simplex interpolation: the ordering of the three fractions picks the two
+				// intermediate vertices (step1, then step2) and the four factors.
+				int step1, step2;
+				int f0, f1, f2, f3;
+				switch (((fs > ft) << 2) + ((ft > fp) << 1) + ((fs > fp)))
+				{
+				case 7:	// fs > ft > fp
+					step1 = 1;        step2 = stride_y;
+					f0 = 16 - fs; f1 = fs - ft; f2 = ft - fp; f3 = fp;
+					break;
+				case 3:	// ft >= fs > fp
+					step1 = stride_y; step2 = 1;
+					f0 = 16 - ft; f1 = ft - fs; f2 = fs - fp; f3 = fp;
+					break;
+				case 5:	// fs > fp >= ft
+					step1 = 1;        step2 = stride_z;
+					f0 = 16 - fs; f1 = fs - fp; f2 = fp - ft; f3 = ft;
+					break;
+				case 4:	// fp >= fs > ft
+					step1 = stride_z; step2 = 1;
+					f0 = 16 - fp; f1 = fp - fs; f2 = fs - ft; f3 = ft;
+					break;
+				case 2:	// ft > fp >= fs
+					step1 = stride_y; step2 = stride_z;
+					f0 = 16 - ft; f1 = ft - fp; f2 = fp - fs; f3 = fs;
+					break;
+				case 0:	// fp >= ft >= fs
+				default:	// 1 and 6 would need contradictory comparisons; treated like case 0
+					step1 = stride_z; step2 = stride_y;
+					f0 = 16 - fp; f1 = fp - ft; f2 = ft - fs; f3 = fs;
+					break;
+				}
+
+				const int grid_point[4] = { base_point, base_point + step1, base_point + step1 + step2, opposite_point };
+				const int factor[4] = { f0, f1, f2, f3 };
+
+				// record every non-zero contribution in both directions
+				for (int k = 0; k < 4; k++)
+				{
+					if (factor[k] == 0)
+						continue;
+
+					const int g = grid_point[k];
+
+					const int in_slot = texel_fanin[texel]++;
+					texel_grid_index[texel][in_slot] = g;
+					texel_grid_factor[texel][in_slot] = factor[k];
+
+					const int out_slot = grid_fanout[g]++;
+					grid_texel_index[g][out_slot] = texel;
+					grid_texel_factor[g][out_slot] = factor[k];
+				}
+			}
+
+	// publish the texel -> grid point lists
+	for (int t = 0; t < texel_total; t++)
+	{
+		const int used = texel_fanin[t];
+		dt->texel_num_weights[t] = used;
+
+		// all 4 slots are always written, unused ones as zero.
+		// This allows a branch-free implementation of compute_value_of_texel_flt()
+		for (int s = 0; s < 4; s++)
+		{
+			if (s < used)
+			{
+				dt->texel_weights_int[t][s] = texel_grid_factor[t][s];
+				dt->texel_weights_float[t][s] = static_cast < float >(texel_grid_factor[t][s]) * (1.0f / TEXEL_WEIGHT_SUM);
+				dt->texel_weights[t][s] = texel_grid_index[t][s];
+			}
+			else
+			{
+				dt->texel_weights_int[t][s] = 0;
+				dt->texel_weights_float[t][s] = 0.0f;
+				dt->texel_weights[t][s] = 0;
+			}
+		}
+	}
+
+	// publish the grid point -> texel lists
+	for (int g = 0; g < grid_total; g++)
+	{
+		const int used = grid_fanout[g];
+		dt->weight_num_texels[g] = used;
+		for (int s = 0; s < used; s++)
+		{
+			dt->weight_texel[g][s] = grid_texel_index[g][s];
+			dt->weights_int[g][s] = grid_texel_factor[g][s];
+			dt->weights_flt[g][s] = static_cast < float >(grid_texel_factor[g][s]);
+		}
+	}
+
+	dt->num_texels = texel_total;
+	dt->num_weights = grid_total;
+}
+
+
+
+void construct_block_size_descriptor_2d(int xdim, int ydim, block_size_descriptor * bsd)
+{
+	// Fills *bsd for a 2D block of xdim * ydim texels: one decimation table per
+	// usable weight grid size, the 2048-entry block mode list, and the set of
+	// texels used for bitmap partitioning.
+
+	// maps a packed grid size (y_weights * 16 + x_weights) to its slot in the
+	// per-decimation-mode arrays of bsd; -1 means no table exists for that size
+	int slot_of_grid[256];
+	for (int k = 0; k < 256; k++)
+		slot_of_grid[k] = -1;
+
+	int slots_used = 0;
+
+	// gather all the infill-modes that can be used with the current block size
+	for (int x_weights = 2; x_weights <= 12; x_weights++)
+		for (int y_weights = 2; y_weights <= 12; y_weights++)
+		{
+			const int weight_count = x_weights * y_weights;
+			if (weight_count > MAX_WEIGHTS_PER_BLOCK)
+				continue;
+
+			decimation_table *dt = new decimation_table;
+			slot_of_grid[y_weights * 16 + x_weights] = slots_used;
+			initialize_decimation_table_2d(xdim, ydim, x_weights, y_weights, dt);
+
+			// highest quantization level in 0..11 whose bit count lies in the permitted range
+			int best_1plane = -1;
+			int best_2planes = -1;
+			for (int q = 0; q < 12; q++)
+			{
+				const int bits_1plane = compute_ise_bitcount(weight_count, (quantization_method) q);
+				const int bits_2planes = compute_ise_bitcount(2 * weight_count, (quantization_method) q);
+				if (bits_1plane >= MIN_WEIGHT_BITS_PER_BLOCK && bits_1plane <= MAX_WEIGHT_BITS_PER_BLOCK)
+					best_1plane = q;
+				if (bits_2planes >= MIN_WEIGHT_BITS_PER_BLOCK && bits_2planes <= MAX_WEIGHT_BITS_PER_BLOCK)
+					best_2planes = q;
+			}
+
+			// two planes need twice the weights; rule them out if that exceeds the limit
+			if (2 * weight_count > MAX_WEIGHTS_PER_BLOCK)
+				best_2planes = -1;
+
+			bsd->permit_encode[slots_used] = (x_weights <= xdim && y_weights <= ydim);
+			bsd->decimation_mode_samples[slots_used] = weight_count;
+			bsd->decimation_mode_maxprec_1plane[slots_used] = best_1plane;
+			bsd->decimation_mode_maxprec_2planes[slots_used] = best_2planes;
+			bsd->decimation_tables[slots_used] = dt;
+
+			slots_used++;
+		}
+
+	// every percentile starts at 1.0f; slots beyond the ones filled above are marked unusable
+	for (int k = 0; k < MAX_DECIMATION_MODES; k++)
+	{
+		bsd->decimation_mode_percentile[k] = 1.0f;
+		if (k >= slots_used)
+		{
+			bsd->permit_encode[k] = 0;
+			bsd->decimation_mode_samples[k] = 0;
+			bsd->decimation_mode_maxprec_1plane[k] = -1;
+			bsd->decimation_mode_maxprec_2planes[k] = -1;
+		}
+	}
+
+	bsd->decimation_mode_count = slots_used;
+
+	const float *percentiles = get_2d_percentile_table(xdim, ydim);
+
+	// then construct the list of block formats
+	for (int mode = 0; mode < 2048; mode++)
+	{
+		block_mode &bm = bsd->block_modes[mode];
+
+		int grid_x, grid_y;
+		int dual_plane;
+		int quant_mode;
+
+		if (!decode_block_mode_2d(mode, &grid_x, &grid_y, &dual_plane, &quant_mode))
+		{
+			// not a decodable block mode
+			bm.decimation_mode = -1;
+			bm.quantization_mode = -1;
+			bm.is_dual_plane = -1;
+			bm.permit_encode = 0;
+			bm.permit_decode = 0;
+			bm.percentile = 1.0f;
+			continue;
+		}
+
+		// disallow encode and decode of a grid size larger than the block size.
+		const int usable = (grid_x > xdim || grid_y > ydim) ? 0 : 1;
+		const int slot = slot_of_grid[grid_y * 16 + grid_x];
+
+		bm.decimation_mode = slot;
+		bm.quantization_mode = quant_mode;
+		bm.is_dual_plane = dual_plane;
+		bm.permit_encode = usable;
+		bm.permit_decode = usable;
+		bm.percentile = percentiles[mode];
+
+		// a decimation mode keeps the smallest percentile of the block modes that use it
+		if (bsd->decimation_mode_percentile[slot] > percentiles[mode])
+			bsd->decimation_mode_percentile[slot] = percentiles[mode];
+	}
+
+	const int texel_total = xdim * ydim;
+
+	if (texel_total <= 64)
+	{
+		// small block: every texel takes part in bitmap partitioning
+		for (int t = 0; t < texel_total; t++)
+			bsd->texels_for_bitmap_partitioning[t] = t;
+		bsd->texelcount_for_bitmap_partitioning = texel_total;
+		return;
+	}
+
+	// pick 64 random texels for use with bitmap partitioning.
+	int chosen[MAX_TEXELS_PER_BLOCK];
+	for (int t = 0; t < texel_total; t++)
+		chosen[t] = 0;
+
+	for (int picked = 0; picked < 64;)
+	{
+		const int candidate = rand() % texel_total;
+		if (chosen[candidate] == 0)
+		{
+			chosen[candidate] = 1;
+			picked++;
+		}
+	}
+
+	// store the chosen texels in ascending index order
+	int written = 0;
+	for (int t = 0; written < 64; t++)
+		if (chosen[t])
+			bsd->texels_for_bitmap_partitioning[written++] = t;
+
+	bsd->texelcount_for_bitmap_partitioning = 64;
+}
+
+
+
+void construct_block_size_descriptor_3d(int xdim, int ydim, int zdim, block_size_descriptor * bsd)
+{
+	// Fills *bsd for a 3D block of xdim * ydim * zdim texels: one decimation table
+	// per usable weight grid size, the 2048-entry block mode list, and the set of
+	// texels used for bitmap partitioning.
+
+	// maps a packed grid size (z_weights * 64 + y_weights * 8 + x_weights) to its slot in
+	// the per-decimation-mode arrays of bsd; -1 means no table exists for that size
+	int slot_of_grid[512];
+	for (int k = 0; k < 512; k++)
+		slot_of_grid[k] = -1;
+
+	int slots_used = 0;
+
+	// gather all the infill-modes that can be used with the current block size
+	for (int x_weights = 2; x_weights <= 6; x_weights++)
+		for (int y_weights = 2; y_weights <= 6; y_weights++)
+			for (int z_weights = 2; z_weights <= 6; z_weights++)
+			{
+				const int weight_count = x_weights * y_weights * z_weights;
+				if (weight_count > MAX_WEIGHTS_PER_BLOCK)
+					continue;
+
+				decimation_table *dt = new decimation_table;
+				slot_of_grid[z_weights * 64 + y_weights * 8 + x_weights] = slots_used;
+				initialize_decimation_table_3d(xdim, ydim, zdim, x_weights, y_weights, z_weights, dt);
+
+				// highest quantization level in 0..11 whose bit count lies in the permitted range
+				int best_1plane = -1;
+				int best_2planes = -1;
+				for (int q = 0; q < 12; q++)
+				{
+					const int bits_1plane = compute_ise_bitcount(weight_count, (quantization_method) q);
+					const int bits_2planes = compute_ise_bitcount(2 * weight_count, (quantization_method) q);
+					if (bits_1plane >= MIN_WEIGHT_BITS_PER_BLOCK && bits_1plane <= MAX_WEIGHT_BITS_PER_BLOCK)
+						best_1plane = q;
+					if (bits_2planes >= MIN_WEIGHT_BITS_PER_BLOCK && bits_2planes <= MAX_WEIGHT_BITS_PER_BLOCK)
+						best_2planes = q;
+				}
+
+				// two planes need twice the weights; rule them out if that exceeds the limit
+				if (2 * weight_count > MAX_WEIGHTS_PER_BLOCK)
+					best_2planes = -1;
+
+				bsd->permit_encode[slots_used] = (x_weights <= xdim && y_weights <= ydim && z_weights <= zdim);
+				bsd->decimation_mode_samples[slots_used] = weight_count;
+				bsd->decimation_mode_maxprec_1plane[slots_used] = best_1plane;
+				bsd->decimation_mode_maxprec_2planes[slots_used] = best_2planes;
+				bsd->decimation_tables[slots_used] = dt;
+
+				slots_used++;
+			}
+
+	// every percentile starts at 1.0f; slots beyond the ones filled above are marked unusable
+	for (int k = 0; k < MAX_DECIMATION_MODES; k++)
+	{
+		bsd->decimation_mode_percentile[k] = 1.0f;
+		if (k >= slots_used)
+		{
+			bsd->permit_encode[k] = 0;
+			bsd->decimation_mode_samples[k] = 0;
+			bsd->decimation_mode_maxprec_1plane[k] = -1;
+			bsd->decimation_mode_maxprec_2planes[k] = -1;
+		}
+	}
+
+	bsd->decimation_mode_count = slots_used;
+
+	const float *percentiles = get_3d_percentile_table(xdim, ydim, zdim);
+
+	// then construct the list of block formats
+	for (int mode = 0; mode < 2048; mode++)
+	{
+		block_mode &bm = bsd->block_modes[mode];
+
+		int grid_x, grid_y, grid_z;
+		int dual_plane;
+		int quant_mode;
+
+		if (!decode_block_mode_3d(mode, &grid_x, &grid_y, &grid_z, &dual_plane, &quant_mode))
+		{
+			// not a decodable block mode
+			bm.decimation_mode = -1;
+			bm.quantization_mode = -1;
+			bm.is_dual_plane = -1;
+			bm.permit_encode = 0;
+			bm.permit_decode = 0;
+			bm.percentile = 1.0f;
+			continue;
+		}
+
+		// a grid size larger than the block size may neither be encoded nor decoded
+		const int usable = (grid_x > xdim || grid_y > ydim || grid_z > zdim) ? 0 : 1;
+		const int slot = slot_of_grid[grid_z * 64 + grid_y * 8 + grid_x];
+
+		bm.decimation_mode = slot;
+		bm.quantization_mode = quant_mode;
+		bm.is_dual_plane = dual_plane;
+		bm.permit_encode = usable;
+		bm.permit_decode = usable;
+		bm.percentile = percentiles[mode];
+
+		// a decimation mode keeps the smallest percentile of the block modes that use it
+		if (bsd->decimation_mode_percentile[slot] > percentiles[mode])
+			bsd->decimation_mode_percentile[slot] = percentiles[mode];
+	}
+
+	const int texel_total = xdim * ydim * zdim;
+
+	if (texel_total <= 64)
+	{
+		// small block: every texel takes part in bitmap partitioning
+		for (int t = 0; t < texel_total; t++)
+			bsd->texels_for_bitmap_partitioning[t] = t;
+		bsd->texelcount_for_bitmap_partitioning = texel_total;
+		return;
+	}
+
+	// pick 64 random texels for use with bitmap partitioning.
+	int chosen[MAX_TEXELS_PER_BLOCK];
+	for (int t = 0; t < texel_total; t++)
+		chosen[t] = 0;
+
+	for (int picked = 0; picked < 64;)
+	{
+		const int candidate = rand() % texel_total;
+		if (chosen[candidate] == 0)
+		{
+			chosen[candidate] = 1;
+			picked++;
+		}
+	}
+
+	// store the chosen texels in ascending index order
+	int written = 0;
+	for (int t = 0; written < 64; t++)
+		if (chosen[t])
+			bsd->texels_for_bitmap_partitioning[written++] = t;
+
+	bsd->texelcount_for_bitmap_partitioning = 64;
+}
+
+
+
+
+// cache of block size descriptors, indexed by xdim + (ydim << 4) + (zdim << 8);
+// entries are created on first use and never released.
+static block_size_descriptor *bsd_pointers[4096];
+
+// function to obtain a block size descriptor. If the descriptor does not exist,
+// it is created as needed. Should not be called from within multi-threaded code.
+//
+// xdim, ydim, zdim: block dimensions; each must lie in 0..15 because each one
+// occupies a 4-bit field of the cache index. A zdim greater than 1 selects the
+// 3D construction, anything else the 2D construction.
+//
+// Returns the cached descriptor. For an out-of-range dimension the internal
+// error handler is invoked and, should it return, NULL is returned.
+const block_size_descriptor *get_block_size_descriptor(int xdim, int ydim, int zdim)
+{
+	// an out-of-range dimension would alias the slot of another block size or
+	// index outside bsd_pointers[], so reject it before forming the index.
+	if (xdim < 0 || xdim > 15 || ydim < 0 || ydim > 15 || zdim < 0 || zdim > 15)
+	{
+		ASTC_CODEC_INTERNAL_ERROR;
+		return NULL;
+	}
+
+	int bsd_index = xdim + (ydim << 4) + (zdim << 8);
+	if (bsd_pointers[bsd_index] == NULL)
+	{
+		block_size_descriptor *bsd = new block_size_descriptor;
+		if (zdim > 1)
+			construct_block_size_descriptor_3d(xdim, ydim, zdim, bsd);
+		else
+			construct_block_size_descriptor_2d(xdim, ydim, bsd);
+
+		// publish only after construction has finished
+		bsd_pointers[bsd_index] = bsd;
+	}
+	return bsd_pointers[bsd_index];
+}
--- a/3rdparty/astc/astc_codec_internals.h
+++ b/3rdparty/astc/astc_codec_internals.h
@@ -0,0 +1,815 @@
+/*----------------------------------------------------------------------------*/
+/**
+ *	This confidential and proprietary software may be used only as
+ *	authorised by a licensing agreement from ARM Limited
+ *	(C) COPYRIGHT 2011-2012, 2018 ARM Limited
+ *	ALL RIGHTS RESERVED
+ *
+ *	The entire notice above must be reproduced on all authorised
+ *	copies and copies may only be made to the extent permitted
+ *	by a licensing agreement from ARM Limited.
+ *
+ *	@brief	Internal function and data declarations for ASTC codec.
+ */
+/*----------------------------------------------------------------------------*/
+
+#ifndef ASTC_CODEC_INTERNALS_INCLUDED
+
+#define ASTC_CODEC_INTERNALS_INCLUDED
+
+#include <stdint.h>
+#include <stdlib.h>
+#include "mathlib.h"
+
+#ifndef MIN
+	#define MIN(x,y) ((x)<(y)?(x):(y))	// function-like macro: an argument may be evaluated twice, so avoid side effects
+#endif
+
+#ifndef MAX
+	#define MAX(x,y) ((x)>(y)?(x):(y))	// function-like macro: an argument may be evaluated twice, so avoid side effects
+#endif
+
+// Macro to silence warnings on ignored parameters.
+// The presence of this macro should be a signal to look at refactoring.
+#define IGNORE(param) ((void)&param)
+
+#define astc_isnan(p) ((p)!=(p))	// true only when p compares unequal to itself; p is evaluated twice
+
+// ASTC parameters
+#define MAX_TEXELS_PER_BLOCK 216	// sizes all per-texel arrays; 216 = 6 * 6 * 6, presumably the largest 3D block - confirm
+#define MAX_WEIGHTS_PER_BLOCK 64	// sizes all per-weight arrays; weight grids with more points are skipped
+#define MIN_WEIGHT_BITS_PER_BLOCK 24	// lower bound applied to compute_ise_bitcount() results for weights
+#define MAX_WEIGHT_BITS_PER_BLOCK 96	// upper bound applied to compute_ise_bitcount() results for weights
+#define PARTITION_BITS 10
+#define PARTITION_COUNT (1 << PARTITION_BITS)	// 1024
+
+// the sum of weights for one texel.
+#define TEXEL_WEIGHT_SUM 16
+#define MAX_DECIMATION_MODES 87	// sizes the per-decimation-mode arrays of block_size_descriptor
+#define MAX_WEIGHT_MODES 2048	// sizes block_size_descriptor::block_modes
+
+// error reporting for codec internal errors.
+#define ASTC_CODEC_INTERNAL_ERROR astc_codec_internal_error(__FILE__, __LINE__)
+
+void astc_codec_internal_error(const char *filename, int linenumber);
+
+// uncomment this macro to enable checking for inappropriate NaNs;
+// works on Linux only, and slows down encoding significantly.
+// #define DEBUG_CAPTURE_NAN
+
+// the PRINT_DIAGNOSTICS macro enables the -diag command line switch,
+// which can be used to look for codec bugs
+#define DEBUG_PRINT_DIAGNOSTICS
+
+#ifdef DEBUG_PRINT_DIAGNOSTICS
+	extern int print_diagnostics;
+#endif
+
+extern int print_tile_errors;
+extern int print_statistics;
+
+extern int perform_srgb_transform;
+extern int rgb_force_use_of_hdr;
+extern int alpha_force_use_of_hdr;
+
+struct processed_line2	// 2-component line record; arrays of these are the plines argument of compute_error_squared_rg/rb/gb/ra
+{
+	float2 amod;	// NOTE(review): the meaning of amod/bs/bis is not stated in this header - confirm where they are filled in
+	float2 bs;
+	float2 bis;
+};
+struct processed_line3	// 3-component line record; arrays of these are the plines argument of compute_error_squared_gba/rba/rga/rgb
+{
+	float3 amod;	// NOTE(review): the meaning of amod/bs/bis is not stated in this header - confirm where they are filled in
+	float3 bs;
+	float3 bis;
+};
+struct processed_line4	// 4-component line record; compute_error_squared_rgba takes one per partition
+{
+	float4 amod;	// NOTE(review): the meaning of amod/bs/bis is not stated in this header - confirm where they are filled in
+	float4 bs;
+	float4 bis;
+};
+
+enum astc_decode_mode	// decode mode selector; enumerator values are implicit
+{
+	DECODE_LDR_SRGB,	// = 0
+	DECODE_LDR,	// = 1
+	DECODE_HDR	// = 2
+};
+
+
+/*
+	Partition table representation:
+	For each block size, we have 3 tables, each with 1024 partitionings;
+	these three tables correspond to 2, 3 and 4 partitions respectively.
+	For each partitioning, we have:
+	* a 4-entry table indicating how many texels there are in each of the 4 partitions.
+	  This may be from 0 to a very large value.
+	* a table indicating the partition index of each of the texels in the block.
+	  Each index may be 0, 1, 2 or 3.
+	* Each element in the table is an uint8_t indicating partition index (0, 1, 2 or 3)
+*/
+
+struct partition_info	// one partitioning of a block, as described in the comment above
+{
+	int partition_count;	// the partition tables described above are for 2, 3 and 4 partitions
+	uint8_t texels_per_partition[4];	// how many texels there are in each of the 4 partitions (stored as uint8_t)
+	uint8_t partition_of_texel[MAX_TEXELS_PER_BLOCK];	// partition index (0, 1, 2 or 3) of each texel in the block
+	uint8_t texels_of_partition[4][MAX_TEXELS_PER_BLOCK];	// presumably the texel indices belonging to each partition - confirm
+
+	uint64_t coverage_bitmaps[4];	// used for the purposes of k-means partition search.
+};
+
+
+
+
+/*
+   In ASTC, we don't necessarily provide a weight for every texel.
+   As such, for each block size, there are a number of patterns where some texels
+   have their weights computed as a weighted average of more than 1 weight.
+   As such, the codec uses a data structure that tells us: for each texel, which
+   weights it is a combination of; and for each weight, which texels it contributes to.
+   The decimation_table is this data structure.
+*/
+struct decimation_table	// texel <-> weight grid mapping for one block size and one weight grid size
+{
+	int num_texels;	// number of texels in the block
+	int num_weights;	// number of points in the weight grid
+	uint8_t texel_num_weights[MAX_TEXELS_PER_BLOCK];	// per texel: how many grid weights contribute with a non-zero factor (at most 4)
+	uint8_t texel_weights_int[MAX_TEXELS_PER_BLOCK][4];	// per texel: integer interpolation factor of each contributing grid weight; unused slots are 0
+	float texel_weights_float[MAX_TEXELS_PER_BLOCK][4];	// the same factors multiplied by 1 / TEXEL_WEIGHT_SUM; unused slots are 0.0f
+	uint8_t texel_weights[MAX_TEXELS_PER_BLOCK][4];	// per texel: grid index of each contributing weight; unused slots are 0
+	uint8_t weight_num_texels[MAX_WEIGHTS_PER_BLOCK];	// per grid weight: the number of texels it contributes to
+	uint8_t weight_texel[MAX_WEIGHTS_PER_BLOCK][MAX_TEXELS_PER_BLOCK];	// per grid weight: indices of the texels it contributes to
+	uint8_t weights_int[MAX_WEIGHTS_PER_BLOCK][MAX_TEXELS_PER_BLOCK];	// per grid weight: integer factor it contributes to each of those texels
+	float weights_flt[MAX_WEIGHTS_PER_BLOCK][MAX_TEXELS_PER_BLOCK];	// the same factors converted to float (not divided by TEXEL_WEIGHT_SUM)
+};
+
+
+
+
+/*
+   data structure describing information that pertains to a block size and its associated block modes.
+*/
+struct block_mode	// one of the 2048 block modes, as it applies to a particular block size
+{
+	int8_t decimation_mode;	// slot in the block_size_descriptor decimation arrays; -1 if the mode cannot be decoded
+	int8_t quantization_mode;	// as reported by decode_block_mode_2d/3d; -1 if the mode cannot be decoded
+	int8_t is_dual_plane;	// as reported by decode_block_mode_2d/3d; -1 if the mode cannot be decoded
+	int8_t permit_encode;	// 0 if the mode cannot be decoded or its weight grid is larger than the block, else 1
+	int8_t permit_decode;	// set to the same value as permit_encode
+	float percentile;	// entry of the percentile table for this mode; 1.0f if the mode cannot be decoded
+};
+
+
+struct block_size_descriptor	// everything precomputed for one block size; obtained through get_block_size_descriptor()
+{
+	int decimation_mode_count;	// number of slots in the arrays below that are in use
+	int decimation_mode_samples[MAX_DECIMATION_MODES];	// number of weights in the grid of each decimation mode; 0 for unused slots
+	int decimation_mode_maxprec_1plane[MAX_DECIMATION_MODES];	// highest quantization level (0..11) with an acceptable weight bit count, -1 if none
+	int decimation_mode_maxprec_2planes[MAX_DECIMATION_MODES];	// same for two weight planes; -1 if none or if twice the weights exceed MAX_WEIGHTS_PER_BLOCK
+	float decimation_mode_percentile[MAX_DECIMATION_MODES];	// smallest percentile among the block modes using the slot; starts at 1.0f
+	int permit_encode[MAX_DECIMATION_MODES];	// 1 if the weight grid is no larger than the block in every dimension, else 0
+	const decimation_table *decimation_tables[MAX_DECIMATION_MODES + 1];	// one heap-allocated table per slot in use
+	block_mode block_modes[MAX_WEIGHT_MODES];	// indexed by block mode number
+
+	// for the k-means bed bitmap partitioning algorithm, we don't
+	// want to consider more than 64 texels; this array specifies
+	// which 64 texels (if that many) to consider.
+	int texelcount_for_bitmap_partitioning;	// the block's texel count if that is <= 64, otherwise 64
+	int texels_for_bitmap_partitioning[64];	// texel indices in ascending order; chosen with rand() when the block has more than 64 texels
+};
+
+// data structure representing one block of an image.
+// it is expanded to float prior to processing to save some computation time
+// on conversions to/from uint8_t (this also allows us to handle HDR textures easily)
+struct imageblock	// float representation of one block; the data arrays hold 4 values per texel
+{
+	float orig_data[MAX_TEXELS_PER_BLOCK * 4];  // original input data
+	float work_data[MAX_TEXELS_PER_BLOCK * 4];  // the data that we will compress, either linear or LNS (0..65535 in both cases)
+	float deriv_data[MAX_TEXELS_PER_BLOCK * 4]; // derivative of the conversion function used, used to modify error weighting
+
+	uint8_t rgb_lns[MAX_TEXELS_PER_BLOCK];      // 1 if RGB data are being treated as LNS
+	uint8_t alpha_lns[MAX_TEXELS_PER_BLOCK];    // 1 if Alpha data are being treated as LNS
+	uint8_t nan_texel[MAX_TEXELS_PER_BLOCK];    // 1 if the texel is a NaN-texel.
+
+	float red_min, red_max;	// presumably the per-channel value range over the block - confirm against update_imageblock_flags()
+	float green_min, green_max;
+	float blue_min, blue_max;
+	float alpha_min, alpha_max;
+	int grayscale;				// 1 if R=G=B for every pixel, 0 otherwise
+
+	int xpos, ypos, zpos;	// NOTE(review): presumably the position of the block within the image - confirm
+};
+
+
+struct error_weighting_params	// error weighting settings followed by the heuristic speed-up settings
+{
+	float rgb_power;	// NOTE(review): the exact formula these rgb_*/alpha_* terms enter is not visible in this header
+	float rgb_base_weight;
+	float rgb_mean_weight;
+	float rgb_stdev_weight;
+	float alpha_power;
+	float alpha_base_weight;
+	float alpha_mean_weight;
+	float alpha_stdev_weight;
+	float rgb_mean_and_stdev_mixing;
+	int mean_stdev_radius;
+	int enable_rgb_scale_with_alpha;
+	int alpha_radius;
+	int ra_normal_angular_scale;
+	float block_artifact_suppression;
+	float rgba_weights[4];	// four entries; presumably in R, G, B, A order - confirm
+
+	float block_artifact_suppression_expanded[MAX_TEXELS_PER_BLOCK];	// one entry per texel of a block
+
+	// parameters that deal with heuristic codec speedups
+	int partition_search_limit;
+	float block_mode_cutoff;
+	float texel_avg_error_limit;
+	float partition_1_to_2_limit;
+	float lowest_correlation_cutoff;
+	int max_refinement_iters;
+};
+
+
+
+
+void update_imageblock_flags(imageblock * pb, int xdim, int ydim, int zdim);
+
+
+void imageblock_initialize_orig_from_work(imageblock * pb, int pixelcount);
+
+
+void imageblock_initialize_work_from_orig(imageblock * pb, int pixelcount);
+
+
+
+/*
+	Data structure representing error weighting for one block of an image. This is used as
+	a multiplier for the error weight to apply to each color component when computing PSNR.
+
+	This weighting has several uses:
+	* it's usable for RA, GA, BA, A weighting, which is useful for alpha-textures;
+	* it's usable for HDR textures, where weighting should be approximately inverse to luminance;
+	* it's usable for perceptual weighting, where we assign higher weight to low-variability
+	  regions than to high-variability regions;
+	* it's usable for suppressing off-edge block content in case the texture doesn't
+	  actually extend to the edge of the block.
+
+	For the default case (everything is evenly weighted), every weight is 1. For the RA,GA,BA,A case,
+	we multiply the R,G,B weights with that of the alpha.
+
+	Putting the same weight in every component should result in the default case.
+	The following relations should hold:
+
+	texel_weight_rg[i] = (texel_weight_r[i] + texel_weight_g[i]) / 2
+	texel_weight_lum[i] = (texel_weight_r[i] + texel_weight_g[i] + texel_weight_b[i]) / 3
+	texel_weight[i] = (texel_weight_r[i] + texel_weight_g[i] + texel_weight_b[i] + texel_weight_a[i]) / 4
+ */
+
+struct error_weight_block	// per-texel error weights of one block, plus combinations over channel subsets
+{
+	float4 error_weights[MAX_TEXELS_PER_BLOCK];	// one float4 of weights per texel
+	float texel_weight[MAX_TEXELS_PER_BLOCK];	// (r + g + b + a) / 4, per the relations in the comment above
+	float texel_weight_gba[MAX_TEXELS_PER_BLOCK];	// three-channel combinations, named after the channels involved
+	float texel_weight_rba[MAX_TEXELS_PER_BLOCK];
+	float texel_weight_rga[MAX_TEXELS_PER_BLOCK];
+	float texel_weight_rgb[MAX_TEXELS_PER_BLOCK];
+
+	float texel_weight_rg[MAX_TEXELS_PER_BLOCK];	// (r + g) / 2, per the relations in the comment above
+	float texel_weight_rb[MAX_TEXELS_PER_BLOCK];
+	float texel_weight_gb[MAX_TEXELS_PER_BLOCK];
+	float texel_weight_ra[MAX_TEXELS_PER_BLOCK];
+
+	float texel_weight_r[MAX_TEXELS_PER_BLOCK];	// single-channel weights
+	float texel_weight_g[MAX_TEXELS_PER_BLOCK];
+	float texel_weight_b[MAX_TEXELS_PER_BLOCK];
+	float texel_weight_a[MAX_TEXELS_PER_BLOCK];
+
+	int contains_zeroweight_texels;	// presumably non-zero when at least one texel has zero weight - confirm
+};
+
+
+
+struct error_weight_block_orig	// holds only the per-texel float4 weights, without the combinations kept in error_weight_block
+{
+	float4 error_weights[MAX_TEXELS_PER_BLOCK];	// one float4 of weights per texel
+};
+
+
+// enumeration of all the quantization methods we support under this format.
+enum quantization_method	// 21 levels, valued 0..20; values are cast from plain ints, so the numbering must not change
+{
+	QUANT_2 = 0,
+	QUANT_3 = 1,
+	QUANT_4 = 2,
+	QUANT_5 = 3,
+	QUANT_6 = 4,
+	QUANT_8 = 5,
+	QUANT_10 = 6,
+	QUANT_12 = 7,
+	QUANT_16 = 8,
+	QUANT_20 = 9,
+	QUANT_24 = 10,
+	QUANT_32 = 11,	// last level covered by quant_and_xfer_tables[12] and by the weight precision search (0..11)
+	QUANT_40 = 12,
+	QUANT_48 = 13,
+	QUANT_64 = 14,
+	QUANT_80 = 15,
+	QUANT_96 = 16,
+	QUANT_128 = 17,
+	QUANT_160 = 18,
+	QUANT_192 = 19,
+	QUANT_256 = 20	// the color (un)quantization tables have 21 rows, one per level
+};
+
+
+/*
+	In ASTC, we support relatively many combinations of weight precisions and weight transfer functions.
+	As such, for each combination we support, we have a hardwired data structure.
+
+	This structure provides the following information: A table, used to estimate the closest quantized
+	weight for a given floating-point weight. For each quantized weight, the corresponding unquantized
+	and floating-point values. For each quantized weight, a previous-value and a next-value.
+*/
+
+struct quantization_and_transfer_table	// hardwired per-method weight quantization data, as described in the comment above
+{
+	quantization_method method;	// the method this table belongs to
+	uint8_t unquantized_value[32];	// 0..64
+	float unquantized_value_flt[32];	// 0..1
+	uint8_t prev_quantized_value[32];	// for each quantized weight, its previous-value
+	uint8_t next_quantized_value[32];	// for each quantized weight, its next-value
+	uint8_t closest_quantized_weight[1025];	// used to estimate the closest quantized weight for a floating-point weight; 1025 entries
+};
+
+extern const quantization_and_transfer_table quant_and_xfer_tables[12];
+
+
+
+enum endpoint_formats	// 16 endpoint color formats with explicit values 0..15
+{
+	FMT_LUMINANCE = 0,
+	FMT_LUMINANCE_DELTA = 1,
+	FMT_HDR_LUMINANCE_LARGE_RANGE = 2,
+	FMT_HDR_LUMINANCE_SMALL_RANGE = 3,
+	FMT_LUMINANCE_ALPHA = 4,
+	FMT_LUMINANCE_ALPHA_DELTA = 5,
+	FMT_RGB_SCALE = 6,
+	FMT_HDR_RGB_SCALE = 7,
+	FMT_RGB = 8,
+	FMT_RGB_DELTA = 9,
+	FMT_RGB_SCALE_ALPHA = 10,
+	FMT_HDR_RGB = 11,
+	FMT_RGBA = 12,
+	FMT_RGBA_DELTA = 13,
+	FMT_HDR_RGB_LDR_ALPHA = 14,
+	FMT_HDR_RGBA = 15,	// last value; the explicit numbering is presumably fixed by the format - confirm before reordering
+};
+
+
+
+struct symbolic_compressed_block	// field-by-field form of a compressed block (physical_compressed_block holds the 16 raw bytes)
+{
+	int error_block;			// 1 marks error block, 0 marks non-error-block.
+	int block_mode;				// 0 to 2047. Negative value marks constant-color block (-1: FP16, -2:UINT16)
+	int partition_count;		// 1 to 4; Zero marks a constant-color block.
+	int partition_index;		// 0 to 1023
+	int color_formats[4];		// color format for each endpoint color pair.
+	int color_formats_matched;	// color format for all endpoint pairs are matched.
+	int color_values[4][12];	// quantized endpoint color pairs.
+	int color_quantization_level;	// NOTE(review): presumably a quantization_method value - confirm
+	uint8_t plane1_weights[MAX_WEIGHTS_PER_BLOCK];	// quantized and decimated weights
+	uint8_t plane2_weights[MAX_WEIGHTS_PER_BLOCK];	// presumably the same for the second weight plane - confirm
+	int plane2_color_component;	// color component for the secondary plane of weights
+	int constant_color[4];		// constant-color, as FP16 or UINT16. Used for constant-color blocks only.
+};
+
+
+struct physical_compressed_block	// raw byte form of one compressed block
+{
+	uint8_t data[16];	// 16 bytes = 128 bits
+};
+
+
+
+
+const block_size_descriptor *get_block_size_descriptor(int xdim, int ydim, int zdim);
+
+
+// ***********************************************************
+// functions and data pertaining to quantization and encoding
+// **********************************************************
+extern const uint8_t color_quantization_tables[21][256];
+extern const uint8_t color_unquantization_tables[21][256];
+
+void encode_ise(int quantization_level, int elements, const uint8_t * input_data, uint8_t * output_data, int bit_offset);
+
+void decode_ise(int quantization_level, int elements, const uint8_t * input_data, uint8_t * output_data, int bit_offset);
+
+int compute_ise_bitcount(int items, quantization_method quant);
+
+void build_quantization_mode_table(void);
+extern int quantization_mode_table[17][128];
+
+
+// **********************************************
+// functions and data pertaining to partitioning
+// **********************************************
+
+// function to get a pointer to a partition table or an array thereof.
+const partition_info *get_partition_table(int xdim, int ydim, int zdim, int partition_count);
+
+
+
+
+// functions to compute color averages and dominant directions
+// for each partition in a block
+
+
+void compute_averages_and_directions_rgb(const partition_info * pt,
+										 const imageblock * blk,
+										 const error_weight_block * ewb,
+										 const float4 * color_scalefactors, float3 * averages, float3 * directions_rgb, float2 * directions_rg, float2 * directions_rb, float2 * directions_gb);
+
+
+
+void compute_averages_and_directions_rgba(const partition_info * pt,
+										  const imageblock * blk,
+										  const error_weight_block * ewb,
+										  const float4 * color_scalefactors,
+										  float4 * averages, float4 * directions_rgba, float3 * directions_gba, float3 * directions_rba, float3 * directions_rga, float3 * directions_rgb);
+
+
+void compute_averages_and_directions_3_components(const partition_info * pt,
+												  const imageblock * blk,
+												  const error_weight_block * ewb,
+												  const float3 * color_scalefactors, int component1, int component2, int component3, float3 * averages, float3 * directions);
+
+void compute_averages_and_directions_2_components(const partition_info * pt,
+												  const imageblock * blk,
+												  const error_weight_block * ewb, const float2 * color_scalefactors, int component1, int component2, float2 * averages, float2 * directions);
+
+// functions to compute error value across a tile given a partitioning
+// (with the assumption that each partitioning has colors lying on a line where
+// they are represented with infinite precision). Also return the length of the line
+// segments that the partition's colors are actually projected onto.
+float compute_error_squared_gba(const partition_info * pt,	// the partition that we use when computing the squared-error.
+								const imageblock * blk, const error_weight_block * ewb, const processed_line3 * plines,
+								// output: computed length of the partitioning's line. This is not part of the
+								// error introduced by partitioning itself, but is used to estimate the error introduced by quantization
+								float *length_of_lines);
+
+float compute_error_squared_rba(const partition_info * pt,	// the partition that we use when computing the squared-error.
+								const imageblock * blk, const error_weight_block * ewb, const processed_line3 * plines,
+								// output: computed length of the partitioning's line. This is not part of the
+								// error introduced by partitioning itself, but is used to estimate the error introduced by quantization
+								float *length_of_lines);
+
+float compute_error_squared_rga(const partition_info * pt,	// the partition that we use when computing the squared-error.
+								const imageblock * blk, const error_weight_block * ewb, const processed_line3 * plines,
+								// output: computed length of the partitioning's line. This is not part of the
+								// error introduced by partitioning itself, but is used to estimate the error introduced by quantization
+								float *length_of_lines);
+
+float compute_error_squared_rgb(const partition_info * pt,	// the partition that we use when computing the squared-error.
+								const imageblock * blk, const error_weight_block * ewb, const processed_line3 * plines,
+								// output: computed length of the partitioning's line. This is not part of the
+								// error introduced by partitioning itself, but is used to estimate the error introduced by quantization
+								float *length_of_lines);
+
+
+// Four-component variant: takes processed_line4 lines instead of processed_line3.
+float compute_error_squared_rgba(const partition_info * pt,	// the partition that we use when computing the squared-error.
+								 const imageblock * blk, const error_weight_block * ewb, const processed_line4 * lines,	// one line for each of the partitions. The lines are assumed to be normalized.
+								 float *length_of_lines);
+
+// Two-component variants: these take processed_line2 lines.
+// NOTE(review): the name suffix (rg, rb, gb, ra) presumably names the two color channels considered - confirm in the definitions.
+float compute_error_squared_rg(const partition_info * pt,	// the partition that we use when computing the squared-error.
+							   const imageblock * blk, const error_weight_block * ewb, const processed_line2 * plines, float *length_of_lines);
+
+float compute_error_squared_rb(const partition_info * pt,	// the partition that we use when computing the squared-error.
+							   const imageblock * blk, const error_weight_block * ewb, const processed_line2 * plines, float *length_of_lines);
+
+float compute_error_squared_gb(const partition_info * pt,	// the partition that we use when computing the squared-error.
+							   const imageblock * blk, const error_weight_block * ewb, const processed_line2 * plines, float *length_of_lines);
+
+float compute_error_squared_ra(const partition_info * pt,	// the partition that we use when computing the squared-error.
+							   const imageblock * blk, const error_weight_block * ewb, const processed_line2 * plines, float *length_of_lines);
+
+
+// Function to compute the error value across a tile for a particular line function,
+// for a single partition (selected by 'partition_to_test').
+// xdim/ydim/zdim are the block dimensions.
+float compute_error_squared_rgb_single_partition(int partition_to_test, int xdim, int ydim, int zdim, const partition_info * pt,	// the partition that we use when computing the squared-error.
+												 const imageblock * blk, const error_weight_block * ewb, const processed_line3 * lin	// the line for the partition.
+	);
+
+
+
+// For each partition, compute its color weightings.
+// 'error_weightings' and 'color_scalefactors' are the non-const arrays of the signature
+// (4 entries each) and are presumably the outputs - TODO confirm in the definition.
+void compute_partition_error_color_weightings(int xdim, int ydim, int zdim, const error_weight_block * ewb, const partition_info * pi, float4 error_weightings[4], float4 color_scalefactors[4]);
+
+
+
+// Function to find the best partitionings for a given block.
+// Three candidate lists are returned through the int pointers below, one per
+// assumption about the endpoint colors.
+// NOTE(review): each list presumably needs room for 'candidates_to_return' entries - confirm in the definition.
+
+void find_best_partitionings(int partition_search_limit, int xdim, int ydim, int zdim, int partition_count, const imageblock * pb, const error_weight_block * ewb, int candidates_to_return,
+							 // best partitionings to use if the endpoint colors are assumed to be uncorrelated
+							 int *best_partitions_uncorrellated,
+							 // best partitionings to use if the endpoint colors have the same chroma
+							 int *best_partitions_samechroma,
+							 // best partitionings to use if dual plane of weights are present
+							 int *best_partitions_dual_weight_planes);
+
+
+// Use k-means clustering to compute a partition ordering for a block.
+// The result is returned through 'ordering' (the only non-const pointer argument).
+void kmeans_compute_partition_ordering(int xdim, int ydim, int zdim, int partition_count, const imageblock * blk, int *ordering);
+
+
+
+
+// *********************************************************
+// functions and data pertaining to images and imageblocks
+// *********************************************************
+
+// Image object used by the load/store, fetch/write and compression functions declared below.
+struct astc_codec_image
+{
+	uint8_t ***imagedata8;	// 8-bit sample storage (three levels of indirection; presumably [z][y][x] - TODO confirm in allocate_image)
+	uint16_t ***imagedata16;	// 16-bit sample storage; NOTE(review): presumably only one of the two pointers is used, chosen by allocate_image()'s 'bitness' - confirm
+	int xsize;	// image dimensions
+	int ysize;
+	int zsize;
+	int padding;	// NOTE(review): presumably the width of the border handled by fill_image_padding_area() - confirm
+};
+
+// Lifetime management for astc_codec_image objects.
+// NOTE(review): images returned by allocate_image() are presumably released with destroy_image() - confirm in the definitions.
+void destroy_image(astc_codec_image * img);
+astc_codec_image *allocate_image(int bitness, int xsize, int ysize, int zsize, int padding);
+void initialize_image(astc_codec_image * img);
+void fill_image_padding_area(astc_codec_image * img);
+
+
+// Global per-image tables, defined elsewhere.
+// NOTE(review): presumably filled in by compute_averages_and_variances() - confirm.
+extern float4 ***input_averages;
+extern float4 ***input_variances;
+extern float ***input_alpha_averages;
+
+
+// Channel swizzle. Each entry is a selector with the following meaning:
+// 0=red, 1=green, 2=blue, 3=alpha, 4=constant 0.0, 5=constant 1.0
+struct swizzlepattern
+{
+	uint8_t r;	// selector for the red channel
+	uint8_t g;	// selector for the green channel
+	uint8_t b;	// selector for the blue channel
+	uint8_t a;	// selector for the alpha channel
+};
+
+
+
+// NOTE(review): presumably returns the number of channels actually used by 'img' - confirm in the definition.
+int determine_image_channels(const astc_codec_image * img);
+
+// function to compute regional averages and variances for an image
+// NOTE(review): no result is returned; presumably the results land in the global input_* tables - confirm.
+void compute_averages_and_variances(const astc_codec_image * img, float rgb_power_to_use, float alpha_power_to_use, int avg_kernel_radius, int var_kernel_radius, swizzlepattern swz);
+
+
+/*
+	Functions to load an image from a file.
+	If successful, they return an astc_codec_image object.
+	If unsuccessful, they return NULL.
+
+	*result is used to return a result. In case of a successfully loaded image, bits[2:0]
+	of *result indicate how many components are present, and bit[7] indicates whether
+	the input image was LDR or HDR (0=LDR, 1=HDR).
+
+	In case of failure, *result is given a negative value.
+*/
+
+
+astc_codec_image *load_ktx_uncompressed_image(const char *filename, int padding, int *result);
+astc_codec_image *load_dds_uncompressed_image(const char *filename, int padding, int *result);
+astc_codec_image *load_tga_image(const char *tga_filename, int padding, int *result);
+astc_codec_image *load_image_with_stb(const char *filename, int padding, int *result);
+
+// NOTE(review): presumably the format-independent entry point that dispatches to one of the loaders above - confirm.
+astc_codec_image *astc_codec_load_image(const char *filename, int padding, int *result);
+// NOTE(review): presumably removes the named file; the meaning of the return value is not visible here.
+int astc_codec_unlink(const char *filename);
+
+// Functions to store an image to a file.
+// If successful, they return the number of channels in the input image.
+// If unsuccessful, they return a negative number.
+int store_ktx_uncompressed_image(const astc_codec_image * img, const char *filename, int bitness);
+int store_dds_uncompressed_image(const astc_codec_image * img, const char *filename, int bitness);
+int store_tga_image(const astc_codec_image * img, const char *tga_filename, int bitness);
+
+// NOTE(review): presumably the format-independent entry point; '*format_string' looks like an
+// output describing the format that was written - confirm in the definition.
+int astc_codec_store_image(const astc_codec_image * img, const char *filename, int bitness, const char **format_string);
+
+// NOTE(review): presumably maps an output filename to the bitness its format requires - confirm.
+int get_output_filename_enforced_bitness(const char *filename);
+
+
+// Compute a bunch of error metrics between two images (img1 and img2).
+// Nothing is returned; NOTE(review): the metrics are presumably printed - confirm in the definition.
+void compute_error_metrics(int input_image_is_hdr, int input_components, const astc_codec_image * img1, const astc_codec_image * img2, int low_fstop, int high_fstop, int psnrmode);
+
+// Fetch an image-block from the input image: reads from 'img' and initializes 'pb'.
+// 'swz' is the channel swizzle passed along with the request.
+void fetch_imageblock(const astc_codec_image * img, imageblock * pb,	// picture-block to initialize with image data
+					  // block dimensions
+					  int xdim, int ydim, int zdim,
+					  // position in picture to fetch block from
+					  int xpos, int ypos, int zpos, swizzlepattern swz);
+
+
+// Write an image block to the output image: 'pb' is the (const) source, 'img' the destination.
+// The data written are taken from orig_data.
+void write_imageblock(astc_codec_image * img, const imageblock * pb,	// picture-block whose data is written to the image
+					  // block dimensions
+					  int xdim, int ydim, int zdim,
+					  // position in picture to write block to.
+					  int xpos, int ypos, int zpos, swizzlepattern swz);
+
+
+// Helper function to check whether a given picture-block has alpha that is not
+// just uniformly 1.
+int imageblock_uses_alpha(int xdim, int ydim, int zdim, const imageblock * pb);
+
+
+// NOTE(review): presumably an error-weighted (via 'ewb') difference between blocks p1 and p2 - confirm in the definition.
+float compute_imageblock_difference(int xdim, int ydim, int zdim, const imageblock * p1, const imageblock * p2, const error_weight_block * ewb);
+
+
+
+
+
+// ***********************************************************
+// functions pertaining to computing texel weights for a block
+// ***********************************************************
+
+
+// A pair of color endpoints for each of up to 4 partitions.
+struct endpoints
+{
+	int partition_count;	// NOTE(review): presumably the number of valid entries in the arrays below - confirm
+	float4 endpt0[4];	// first endpoint, one entry per partition
+	float4 endpt1[4];	// second endpoint, one entry per partition
+};
+
+
+// Endpoints together with per-texel weight data (arrays sized MAX_TEXELS_PER_BLOCK).
+struct endpoints_and_weights
+{
+	endpoints ep;
+	float weights[MAX_TEXELS_PER_BLOCK];	// one weight per texel
+	float weight_error_scale[MAX_TEXELS_PER_BLOCK];	// NOTE(review): presumably a per-texel scale factor applied to weight errors - confirm
+};
+
+
+// Single-plane variant; the result is returned through 'ei' (the only non-const pointer argument).
+void compute_endpoints_and_ideal_weights_1_plane(int xdim, int ydim, int zdim, const partition_info * pt, const imageblock * blk, const error_weight_block * ewb, endpoints_and_weights * ei);
+
+// Two-plane variant; 'separate_component' selects the component that gets its own result in 'ei2'.
+void compute_endpoints_and_ideal_weights_2_planes(int xdim, int ydim, int zdim, const partition_info * pt, const imageblock * blk, const error_weight_block * ewb, int separate_component,
+												  endpoints_and_weights * ei1,	// for the three components of the primary plane of weights
+												  endpoints_and_weights * ei2	// for the remaining component.
+	);
+
+// NOTE(review): 'weight_set' and 'weights' are the non-const pointers and presumably the outputs - confirm in the definition.
+void compute_ideal_weights_for_decimation_table(const endpoints_and_weights * eai, const decimation_table * it, float *weight_set, float *weights);
+
+// NOTE(review): reads 'weight_set_in'; presumably writes 'weight_set_out' and 'quantized_weight_set'
+// for the given 'quantization_level' - confirm in the definition.
+void compute_ideal_quantized_weights_for_decimation_table(const endpoints_and_weights * eai,
+														  const decimation_table * it,
+														  float low_bound, float high_bound, const float *weight_set_in, float *weight_set_out, uint8_t * quantized_weight_set, int quantization_level);
+
+
+float compute_error_of_weight_set(const endpoints_and_weights * eai, const decimation_table * it, const float *weights);
+
+
+// Float and integer variants of the same lookup; they differ in the type of 'weights' and of the result.
+float compute_value_of_texel_flt(int texel_to_get, const decimation_table * it, const float *weights);
+
+
+int compute_value_of_texel_int(int texel_to_get, const decimation_table * it, const int *weights);
+
+
+// Combine two sets of endpoints into 'res'; 'separate_component' is the
+// component that comes from 'ep2'.
+void merge_endpoints(const endpoints * ep1,	// contains three of the color components
+					 const endpoints * ep2,	// contains the remaining color component
+					 int separate_component, endpoints * res);
+
+// functions dealing with color endpoints
+
+// Function to pack a pair of color endpoints into a series of integers (written to 'output').
+// The format used may or may not match the format specified in 'format';
+// the return value is the format actually used.
+int pack_color_endpoints(astc_decode_mode decode_mode, float4 color0, float4 color1, float4 rgbs_color, float4 rgbo_color, float2 luminances, int format, int *output, int quantization_level);
+
+
+// Unpack a pair of color endpoints from a series of integers ('input').
+// Outputs: *rgb_hdr, *alpha_hdr and *nan_endpoint flags, plus the two endpoints
+// in *output0 and *output1.
+void unpack_color_endpoints(astc_decode_mode decode_mode, int format, int quantization_level, const int *input, int *rgb_hdr, int *alpha_hdr, int *nan_endpoint, ushort4 * output0, ushort4 * output1);
+
+
+// Error estimates for the various endpoint-encoding choices
+// (filled in by compute_encoding_choice_errors(), which takes a pointer to this struct).
+struct encoding_choice_errors
+{
+	float rgb_scale_error;		// error of using LDR RGB-scale instead of complete endpoints.
+	float rgb_luma_error;		// error of using HDR RGB-scale instead of complete endpoints.
+	float luminance_error;		// error of using luminance instead of RGB
+	float alpha_drop_error;		// error of discarding alpha
+	float rgb_drop_error;		// error of discarding RGB
+	int can_offset_encode;		// NOTE(review): presumably a boolean flag: offset encoding is usable - confirm
+	int can_blue_contract;		// NOTE(review): presumably a boolean flag: blue-contraction is usable - confirm
+};
+
+// Buffers used to store intermediate data in compress_symbolic_block_fixed_partition_*().
+// The struct only holds pointers; NOTE(review): allocation and sizing presumably happen
+// in the caller - confirm.
+struct compress_fixed_partition_buffers
+{
+	endpoints_and_weights* ei1;
+	endpoints_and_weights* ei2;
+	endpoints_and_weights* eix1;
+	endpoints_and_weights* eix2;
+	float *decimated_quantized_weights;
+	float *decimated_weights;
+	float *flt_quantized_decimated_quantized_weights;
+	uint8_t *u8_quantized_decimated_quantized_weights;
+};
+
+// Temporary buffers handed to compress_symbolic_block() through its 'tmpbuf' argument.
+struct compress_symbolic_block_buffers
+{
+	error_weight_block *ewb;
+	error_weight_block_orig *ewbo;
+	symbolic_compressed_block *tempblocks;
+	imageblock *temp;
+	compress_fixed_partition_buffers *plane1;	// NOTE(review): presumably scratch space for the 1-plane path - confirm
+	compress_fixed_partition_buffers *planes2;	// NOTE(review): presumably scratch space for the 2-plane path - confirm
+};
+
+// Results are returned through 'eci' (the only non-const pointer argument).
+void compute_encoding_choice_errors(int xdim, int ydim, int zdim, const imageblock * pb, const partition_info * pi, const error_weight_block * ewb,
+									int separate_component,	// component that is separated out in 2-plane mode, -1 in 1-plane mode
+									encoding_choice_errors * eci);
+
+
+
+// Inputs are the block, the endpoints and the per-quantization-method bitcounts/errors;
+// the four trailing arrays are the outputs (4 entries each).
+void determine_optimal_set_of_endpoint_formats_to_use(int xdim, int ydim, int zdim, const partition_info * pt, const imageblock * blk, const error_weight_block * ewb, const endpoints * ep,
+													  int separate_component,	// separate color component for 2-plane mode; -1 for single-plane mode
+													  // bitcounts and errors computed for the various quantization methods
+													  const int *qwt_bitcounts, const float *qwt_errors,
+													  // output data
+													  int partition_format_specifiers[4][4], int quantized_weight[4], int quantization_level[4], int quantization_level_mod[4]);
+
+
+// Updates the endpoints in 'ep' and additionally returns RGBS, RGBO and luminance
+// vectors through the three output pointers that follow it.
+void recompute_ideal_colors(int xdim, int ydim, int zdim, int weight_quantization_mode, endpoints * ep,	// contains the endpoints we wish to update
+							float4 * rgbs_vectors,	// used to return RGBS-vectors for endpoint mode #6
+							float4 * rgbo_vectors,	// used to return RGBO-vectors for endpoint mode #7
+							float2 * lum_vectors,	// used to return luminance-vectors.
+							const uint8_t * weight_set,	// the current set of weight values
+							const uint8_t * plane2_weight_set,	// NULL if plane 2 is not actually used.
+							int plane2_color_component,	// color component for 2nd plane of weights; -1 if the 2nd plane of weights is not present
+							const partition_info * pi, const decimation_table * it, const imageblock * pb,	// picture-block containing the actual data.
+							const error_weight_block * ewb);
+
+
+
+// NOTE(review): 'ewp' is the only non-const argument, so this presumably updates the
+// error weighting parameters in place for the given block dimensions - confirm.
+void expand_block_artifact_suppression(int xdim, int ydim, int zdim, error_weighting_params * ewp);
+
+// Function to set error weights for each color component for each texel in a block.
+// The weights are written to 'ewb' and 'ewbo' (the non-const pointer arguments).
+// Returns the sum of all the error values set.
+float prepare_error_weight_block(const astc_codec_image * input_image,
+								 // dimensions of error weight block.
+								 int xdim, int ydim, int zdim, const error_weighting_params * ewp, const imageblock * blk, error_weight_block * ewb, error_weight_block_orig * ewbo);
+
+
+// functions pertaining to weight alignment
+
+// NOTE(review): presumably a one-time initialization needed before the functions below - confirm.
+void prepare_angular_tables(void);
+
+// The low/high value arrays (MAX_WEIGHT_MODES entries each) are the non-const
+// arguments and therefore the outputs.
+void compute_angular_endpoints_1plane(float mode_cutoff,
+									  const block_size_descriptor * bsd,
+									  const float *decimated_quantized_weights, const float *decimated_weights, float low_value[MAX_WEIGHT_MODES], float high_value[MAX_WEIGHT_MODES]);
+
+// Two-plane variant: one low/high output pair per plane of weights.
+void compute_angular_endpoints_2planes(float mode_cutoff,
+									   const block_size_descriptor * bsd,
+									   const float *decimated_quantized_weights,
+									   const float *decimated_weights,
+									   float low_value1[MAX_WEIGHT_MODES], float high_value1[MAX_WEIGHT_MODES], float low_value2[MAX_WEIGHT_MODES], float high_value2[MAX_WEIGHT_MODES]);
+
+
+
+
+/* *********************************** high-level encode and decode functions ************************************ */
+
+// Compress the image block 'blk' into the symbolic form '*scb', using 'tmpbuf' for temporary storage.
+// NOTE(review): the meaning of the float return value is not visible here - check the definition.
+float compress_symbolic_block(const astc_codec_image * input_image,
+							  astc_decode_mode decode_mode, int xdim, int ydim, int zdim, const error_weighting_params * ewp, const imageblock * blk, symbolic_compressed_block * scb,
+							  compress_symbolic_block_buffers * tmpbuf);
+
+
+// Floating-point interpolation between two colors; weights are in the range 0..1.
+float4 lerp_color_flt(const float4 color0, const float4 color1, float weight,	// 0..1
+					  float plane2_weight,	// 0..1
+					  int plane2_color_component	// 0..3; -1 if only one plane of weights is present.
+	);
+
+
+// Integer interpolation between two colors; weights are in the range 0..64.
+ushort4 lerp_color_int(astc_decode_mode decode_mode, ushort4 color0, ushort4 color1, int weight,	// 0..64
+					   int plane2_weight,	// 0..64
+					   int plane2_color_component	// 0..3; -1 if only one plane of weights is present.
+	);
+
+
+// Decompress the symbolic block 'scb' into the image block 'blk'.
+void decompress_symbolic_block(astc_decode_mode decode_mode,
+							   // dimensions of block
+							   int xdim, int ydim, int zdim,
+							   // position of block
+							   int xpos, int ypos, int zpos, const symbolic_compressed_block * scb, imageblock * blk);
+
+
+// Conversions between the symbolic and the physical block representations.
+physical_compressed_block symbolic_to_physical(int xdim, int ydim, int zdim, const symbolic_compressed_block * sc);
+
+void physical_to_symbolic(int xdim, int ydim, int zdim, physical_compressed_block pb, symbolic_compressed_block * res);
+
+
+// NOTE(review): by their names these convert a 16-bit UNORM value and an LNS-encoded value,
+// respectively, to an FP16 bit pattern - confirm in the definitions.
+uint16_t unorm16_to_sf16(uint16_t p);
+uint16_t lns_to_sf16(uint16_t p);
+
+
+#endif
--- a/3rdparty/astc/astc_color_quantize.cpp
+++ b/3rdparty/astc/astc_color_quantize.cpp
--- a/3rdparty/astc/astc_color_unquantize.cpp
+++ b/3rdparty/astc/astc_color_unquantize.cpp
@@ -0,0 +1,970 @@
+/*----------------------------------------------------------------------------*/
+/**
+ *	This confidential and proprietary software may be used only as
+ *	authorised by a licensing agreement from ARM Limited
+ *	(C) COPYRIGHT 2011-2012 ARM Limited
+ *	ALL RIGHTS RESERVED
+ *
+ *	The entire notice above must be reproduced on all authorised
+ *	copies and copies may only be made to the extent permitted
+ *	by a licensing agreement from ARM Limited.
+ *
+ *	@brief	Color unquantization functions for ASTC.
+ */
+/*----------------------------------------------------------------------------*/
+
+#include "astc_codec_internals.h"
+
+#include "mathlib.h"
+#include "softfloat.h"
+
+// Decode an LDR RGB "base + delta" endpoint pair.
+// input[0], input[2], input[4] hold the quantized base values and input[1], input[3],
+// input[5] the quantized deltas for red, green and blue respectively.
+// Both endpoints are written with every color channel clamped to 0..255 and w = 0xFF.
+// Returns 0 if the endpoints were decoded in their stored order, 1 if the sum of the
+// deltas was negative, in which case the colors are blue-contracted and exchanged.
+int rgb_delta_unpack(const int input[6], int quantization_level, ushort4 * output0, ushort4 * output1)
+{
+	int base[3];				// base color: r, g, b
+	int offs[3];				// base color plus delta
+	int delta_sum = 0;
+
+	for (int ch = 0; ch < 3; ch++)
+	{
+		// unquantize the color endpoints
+		int lo = color_unquantization_tables[quantization_level][input[2 * ch]];
+		int hi = color_unquantization_tables[quantization_level][input[2 * ch + 1]];
+
+		// bit-transfer procedure: the top bit of the delta becomes bit 8 of the base,
+		// the remaining 7 bits of the delta are sign-extended, then both are halved.
+		lo |= (hi & 0x80) << 1;
+		hi &= 0x7F;
+		if (hi & 0x40)
+			hi -= 0x80;
+
+		lo >>= 1;
+		hi >>= 1;
+
+		delta_sum += hi;
+		base[ch] = lo;
+		offs[ch] = hi + lo;
+	}
+
+	int first[3];
+	int second[3];
+	int swapped;
+
+	if (delta_sum < 0)
+	{
+		// negative delta sum: apply blue-contraction and exchange the two colors
+		first[0] = (offs[0] + offs[2]) >> 1;
+		first[1] = (offs[1] + offs[2]) >> 1;
+		first[2] = offs[2];
+
+		second[0] = (base[0] + base[2]) >> 1;
+		second[1] = (base[1] + base[2]) >> 1;
+		second[2] = base[2];
+
+		swapped = 1;
+	}
+	else
+	{
+		for (int ch = 0; ch < 3; ch++)
+		{
+			first[ch] = base[ch];
+			second[ch] = offs[ch];
+		}
+
+		swapped = 0;
+	}
+
+	// clamp every channel to 0..255
+	for (int ch = 0; ch < 3; ch++)
+	{
+		first[ch] = (first[ch] < 0) ? 0 : ((first[ch] > 255) ? 255 : first[ch]);
+		second[ch] = (second[ch] < 0) ? 0 : ((second[ch] > 255) ? 255 : second[ch]);
+	}
+
+	output0->x = first[0];
+	output0->y = first[1];
+	output0->z = first[2];
+	output0->w = 0xFF;
+
+	output1->x = second[0];
+	output1->y = second[1];
+	output1->z = second[2];
+	output1->w = 0xFF;
+
+	return swapped;
+}
+
+
+// Decode a directly stored LDR RGB endpoint pair.
+// input[0..5] hold r0, r1, g0, g1, b0, b1 (quantized). Alpha (w) is set to 255.
+// Returns 0 if the endpoints are written in stored order; returns 1 if the first
+// color sums to more than the second, in which case both colors are
+// blue-contracted and written in exchanged order.
+int rgb_unpack(const int input[6], int quantization_level, ushort4 * output0, ushort4 * output1)
+{
+	int red_a = color_unquantization_tables[quantization_level][input[0]];
+	int red_b = color_unquantization_tables[quantization_level][input[1]];
+	int green_a = color_unquantization_tables[quantization_level][input[2]];
+	int green_b = color_unquantization_tables[quantization_level][input[3]];
+	int blue_a = color_unquantization_tables[quantization_level][input[4]];
+	int blue_b = color_unquantization_tables[quantization_level][input[5]];
+
+	int sum_a = red_a + green_a + blue_a;
+	int sum_b = red_b + green_b + blue_b;
+
+	if (!(sum_a > sum_b))
+	{
+		// stored order, no contraction
+		output0->x = red_a;
+		output0->y = green_a;
+		output0->z = blue_a;
+		output0->w = 255;
+
+		output1->x = red_b;
+		output1->y = green_b;
+		output1->z = blue_b;
+		output1->w = 255;
+		return 0;
+	}
+
+	// blue-contraction: red and green are averaged with blue
+	red_a = (red_a + blue_a) >> 1;
+	green_a = (green_a + blue_a) >> 1;
+	red_b = (red_b + blue_b) >> 1;
+	green_b = (green_b + blue_b) >> 1;
+
+	// the two colors are written in exchanged order
+	output0->x = red_b;
+	output0->y = green_b;
+	output0->z = blue_b;
+	output0->w = 255;
+
+	output1->x = red_a;
+	output1->y = green_a;
+	output1->z = blue_a;
+	output1->w = 255;
+	return 1;
+}
+
+
+
+
+// Decode a directly stored LDR RGBA endpoint pair.
+// RGB is decoded by rgb_unpack(); input[6] and input[7] are the quantized alphas.
+// If rgb_unpack() reports that it exchanged the colors, the alphas are exchanged too.
+void rgba_unpack(const int input[8], int quantization_level, ushort4 * output0, ushort4 * output1)
+{
+	const int swapped = rgb_unpack(input, quantization_level, output0, output1);
+
+	// pick which stored alpha goes with which output endpoint
+	const int idx0 = (swapped == 0) ? 6 : 7;
+	const int idx1 = (swapped == 0) ? 7 : 6;
+
+	output0->w = color_unquantization_tables[quantization_level][input[idx0]];
+	output1->w = color_unquantization_tables[quantization_level][input[idx1]];
+}
+
+
+
+// Decode an LDR RGBA "base + delta" endpoint pair.
+// RGB is decoded by rgb_delta_unpack(); input[6] is the quantized alpha base and
+// input[7] the quantized alpha delta. The second alpha is clamped to 0..255.
+// If rgb_delta_unpack() reports that it exchanged the colors, the alphas are exchanged too.
+void rgba_delta_unpack(const int input[8], int quantization_level, ushort4 * output0, ushort4 * output1)
+{
+	int alpha_base = color_unquantization_tables[quantization_level][input[6]];
+	int alpha_delta = color_unquantization_tables[quantization_level][input[7]];
+
+	// bit-transfer: the top bit of the delta moves into the base, the rest is sign-extended
+	alpha_base |= (alpha_delta & 0x80) << 1;
+	alpha_delta &= 0x7F;
+	if (alpha_delta & 0x40)
+		alpha_delta -= 0x80;
+	alpha_base >>= 1;
+	alpha_delta >>= 1;
+
+	int alpha_end = alpha_delta + alpha_base;
+	alpha_end = (alpha_end < 0) ? 0 : ((alpha_end > 255) ? 255 : alpha_end);
+
+	const int swapped = rgb_delta_unpack(input, quantization_level, output0, output1);
+
+	output0->w = (swapped == 0) ? alpha_base : alpha_end;
+	output1->w = (swapped == 0) ? alpha_end : alpha_base;
+}
+
+
+// Decode an LDR "RGB + scale" endpoint pair.
+// input[0..2] are the quantized r, g, b and input[3] the quantized scale.
+// output1 receives the color itself; output0 receives (color * scale) >> 8.
+// Alpha is 255 on both.
+void rgb_scale_unpack(const int input[4], int quantization_level, ushort4 * output0, ushort4 * output1)
+{
+	const int red = color_unquantization_tables[quantization_level][input[0]];
+	const int green = color_unquantization_tables[quantization_level][input[1]];
+	const int blue = color_unquantization_tables[quantization_level][input[2]];
+	const int scale = color_unquantization_tables[quantization_level][input[3]];
+
+	const int red_scaled = (red * scale) >> 8;
+	const int green_scaled = (green * scale) >> 8;
+	const int blue_scaled = (blue * scale) >> 8;
+
+	*output1 = ushort4(red, green, blue, 255);
+	*output0 = ushort4(red_scaled, green_scaled, blue_scaled, 255);
+}
+
+
+
+// Decode an LDR "RGB + scale" endpoint pair with two explicit alphas.
+// RGB is decoded by rgb_scale_unpack(); input[4] and input[5] are the quantized
+// alphas of output0 and output1 respectively.
+void rgb_scale_alpha_unpack(const int input[6], int quantization_level, ushort4 * output0, ushort4 * output1)
+{
+	rgb_scale_unpack(input, quantization_level, output0, output1);
+
+	const int alpha_lo = color_unquantization_tables[quantization_level][input[4]];
+	const int alpha_hi = color_unquantization_tables[quantization_level][input[5]];
+
+	output0->w = alpha_lo;
+	output1->w = alpha_hi;
+}
+
+
+// Decode a directly stored LDR luminance endpoint pair.
+// Each unquantized luminance is replicated to r, g and b; alpha is 255.
+void luminance_unpack(const int input[2], int quantization_level, ushort4 * output0, ushort4 * output1)
+{
+	const int luma_first = color_unquantization_tables[quantization_level][input[0]];
+	const int luma_second = color_unquantization_tables[quantization_level][input[1]];
+
+	*output0 = ushort4(luma_first, luma_first, luma_first, 255);
+	*output1 = ushort4(luma_second, luma_second, luma_second, 255);
+}
+
+
+// Decode an LDR "luminance + delta" endpoint pair.
+// The first luminance is built from the upper six bits of v0 and the top two bits
+// of v1; the low six bits of v1 are an unsigned delta added to obtain the second
+// luminance, which is capped at 255. Alpha is 255 on both.
+void luminance_delta_unpack(const int input[2], int quantization_level, ushort4 * output0, ushort4 * output1)
+{
+	const int packed0 = color_unquantization_tables[quantization_level][input[0]];
+	const int packed1 = color_unquantization_tables[quantization_level][input[1]];
+
+	const int luma_lo = (packed0 >> 2) | (packed1 & 0xC0);
+	const int luma_sum = luma_lo + (packed1 & 0x3F);
+	const int luma_hi = (luma_sum > 255) ? 255 : luma_sum;
+
+	*output0 = ushort4(luma_lo, luma_lo, luma_lo, 255);
+	*output1 = ushort4(luma_hi, luma_hi, luma_hi, 255);
+}
+
+
+
+
+// Decode a directly stored LDR luminance + alpha endpoint pair.
+// input[0], input[1] are the luminances and input[2], input[3] the alphas of the
+// first and second endpoint; luminance is replicated to r, g and b.
+void luminance_alpha_unpack(const int input[4], int quantization_level, ushort4 * output0, ushort4 * output1)
+{
+	int unq[4];
+	for (int i = 0; i < 4; i++)
+		unq[i] = color_unquantization_tables[quantization_level][input[i]];
+
+	*output0 = ushort4(unq[0], unq[0], unq[0], unq[2]);
+	*output1 = ushort4(unq[1], unq[1], unq[1], unq[3]);
+}
+
+
+// Decode an LDR luminance + alpha "base + delta" endpoint pair.
+// input[0]/input[1] are base and delta for luminance, input[2]/input[3] base and
+// delta for alpha. The second endpoint (base + delta) is clamped to 0..255;
+// luminance is replicated to r, g and b.
+void luminance_alpha_delta_unpack(const int input[4], int quantization_level, ushort4 * output0, ushort4 * output1)
+{
+	int start[2];				// [0] = luminance, [1] = alpha of the first endpoint
+	int finish[2];				// same, for the second endpoint
+
+	for (int k = 0; k < 2; k++)
+	{
+		int lo = color_unquantization_tables[quantization_level][input[2 * k]];
+		int hi = color_unquantization_tables[quantization_level][input[2 * k + 1]];
+
+		// bit-transfer: the top bit of the delta moves into the base, the rest is sign-extended
+		lo |= (hi & 0x80) << 1;
+		hi &= 0x7F;
+		if (hi & 0x40)
+			hi -= 0x80;
+
+		lo >>= 1;
+		hi >>= 1;
+		hi += lo;
+
+		start[k] = lo;
+		finish[k] = (hi < 0) ? 0 : ((hi > 255) ? 255 : hi);
+	}
+
+	*output0 = ushort4(start[0], start[0], start[0], start[1]);
+	*output1 = ushort4(finish[0], finish[0], finish[0], finish[1]);
+}
+
+
+
+
+// RGB-offset format
+// Decodes four unquantized values into a color (red, green, blue) and a scale.
+// output1 receives the color and output0 the color minus 'scale' per channel
+// (negative results become 0); all channels are shifted left by 4 on output and
+// w is set to the constant 0x7800 on both endpoints.
+// NOTE(review): 0x7800 is presumably the encoding of alpha = 1.0 in the HDR
+// representation used by the caller - confirm.
+void hdr_rgbo_unpack3(const int input[4], int quantization_level, ushort4 * output0, ushort4 * output1)
+{
+	int v0 = color_unquantization_tables[quantization_level][input[0]];
+	int v1 = color_unquantization_tables[quantization_level][input[1]];
+	int v2 = color_unquantization_tables[quantization_level][input[2]];
+	int v3 = color_unquantization_tables[quantization_level][input[3]];
+
+	// 4-bit mode value: bits 1:0 are the top two bits of v0, bit 2 is the top bit
+	// of v1 and bit 3 is the top bit of v2.
+	int modeval = ((v0 & 0xC0) >> 6) | (((v1 & 0x80) >> 7) << 2) | (((v2 & 0x80) >> 7) << 3);
+
+	// split the mode value into a major component (0..2) and a mode (0..5)
+	int majcomp;
+	int mode;
+	if ((modeval & 0xC) != 0xC)
+	{
+		majcomp = modeval >> 2;
+		mode = modeval & 3;
+	}
+	else if (modeval != 0xF)
+	{
+		majcomp = modeval & 3;
+		mode = 4;
+	}
+	else
+	{
+		majcomp = 0;
+		mode = 5;
+	}
+
+	// fixed-placement low bits of each field
+	int red = v0 & 0x3F;
+	int green = v1 & 0x1F;
+	int blue = v2 & 0x1F;
+	int scale = v3 & 0x1F;
+
+	// the seven variable-placement bits; where they go depends on the mode
+	int bit0 = (v1 >> 6) & 1;
+	int bit1 = (v1 >> 5) & 1;
+	int bit2 = (v2 >> 6) & 1;
+	int bit3 = (v2 >> 5) & 1;
+	int bit4 = (v3 >> 7) & 1;
+	int bit5 = (v3 >> 6) & 1;
+	int bit6 = (v3 >> 5) & 1;
+
+	// one-hot encoding of the mode, so that each mask below selects a set of modes
+	int ohcomp = 1 << mode;
+
+	if (ohcomp & 0x30)
+		green |= bit0 << 6;
+	if (ohcomp & 0x3A)
+		green |= bit1 << 5;
+	if (ohcomp & 0x30)
+		blue |= bit2 << 6;
+	if (ohcomp & 0x3A)
+		blue |= bit3 << 5;
+
+	if (ohcomp & 0x3D)
+		scale |= bit6 << 5;
+	if (ohcomp & 0x2D)
+		scale |= bit5 << 6;
+	if (ohcomp & 0x04)
+		scale |= bit4 << 7;
+
+	if (ohcomp & 0x3B)
+		red |= bit4 << 6;
+	if (ohcomp & 0x04)
+		red |= bit3 << 6;
+
+	if (ohcomp & 0x10)
+		red |= bit5 << 7;
+	if (ohcomp & 0x0F)
+		red |= bit2 << 7;
+
+	if (ohcomp & 0x05)
+		red |= bit1 << 8;
+	if (ohcomp & 0x0A)
+		red |= bit0 << 8;
+
+	if (ohcomp & 0x05)
+		red |= bit0 << 9;
+	if (ohcomp & 0x02)
+		red |= bit6 << 9;
+
+	if (ohcomp & 0x01)
+		red |= bit3 << 10;
+	if (ohcomp & 0x02)
+		red |= bit5 << 10;
+
+
+	// expand to 12 bits.
+	static const int shamts[6] = { 1, 1, 2, 3, 4, 5 };
+	int shamt = shamts[mode];
+	red <<= shamt;
+	green <<= shamt;
+	blue <<= shamt;
+	scale <<= shamt;
+
+	// on modes 0 to 4, the values stored for "green" and "blue" are differentials,
+	// not absolute values.
+	if (mode != 5)
+	{
+		green = red - green;
+		blue = red - blue;
+	}
+
+	// switch around components: majcomp 1 exchanges red and green, majcomp 2
+	// exchanges red and blue, anything else leaves the order unchanged.
+	int temp;
+	switch (majcomp)
+	{
+	case 1:
+		temp = red;
+		red = green;
+		green = temp;
+		break;
+	case 2:
+		temp = red;
+		red = blue;
+		blue = temp;
+		break;
+	default:
+		break;
+	}
+
+
+	// the first endpoint is the color with the scale subtracted from every channel
+	int red0 = red - scale;
+	int green0 = green - scale;
+	int blue0 = blue - scale;
+
+	// clamp to [0,0xFFF]. Only the lower bound is enforced here.
+	if (red < 0)
+		red = 0;
+	if (green < 0)
+		green = 0;
+	if (blue < 0)
+		blue = 0;
+
+	if (red0 < 0)
+		red0 = 0;
+	if (green0 < 0)
+		green0 = 0;
+	if (blue0 < 0)
+		blue0 = 0;
+
+	*output0 = ushort4(red0 << 4, green0 << 4, blue0 << 4, 0x7800);
+	*output1 = ushort4(red << 4, green << 4, blue << 4, 0x7800);
+}
+
+
+
+// Decode an HDR RGB endpoint pair from six unquantized values.
+// The top bits of v1..v3 form a 3-bit mode and the top bits of v4/v5 a 2-bit
+// major component. Major component 3 is a direct encoding; otherwise the values
+// a, b0, b1, c, d0, d1 are assembled from fixed and mode-dependent bits, expanded
+// to 12 bits and combined into the two colors.
+// Both endpoints are written with channels shifted left by 4 (direct encoding:
+// by 8, and blue by 9 after dropping its top bit) and w = 0x7800.
+// NOTE(review): 0x7800 is presumably the encoding of alpha = 1.0 in the HDR
+// representation used by the caller - confirm.
+void hdr_rgb_unpack3(const int input[6], int quantization_level, ushort4 * output0, ushort4 * output1)
+{
+
+	int v0 = color_unquantization_tables[quantization_level][input[0]];
+	int v1 = color_unquantization_tables[quantization_level][input[1]];
+	int v2 = color_unquantization_tables[quantization_level][input[2]];
+	int v3 = color_unquantization_tables[quantization_level][input[3]];
+	int v4 = color_unquantization_tables[quantization_level][input[4]];
+	int v5 = color_unquantization_tables[quantization_level][input[5]];
+
+	// extract all the fixed-placement bitfields
+	int modeval = ((v1 & 0x80) >> 7) | (((v2 & 0x80) >> 7) << 1) | (((v3 & 0x80) >> 7) << 2);
+
+	int majcomp = ((v4 & 0x80) >> 7) | (((v5 & 0x80) >> 7) << 1);
+
+	// major component 3: the values are stored directly, no further decoding
+	if (majcomp == 3)
+	{
+		*output0 = ushort4(v0 << 8, v2 << 8, (v4 & 0x7F) << 9, 0x7800);
+		*output1 = ushort4(v1 << 8, v3 << 8, (v5 & 0x7F) << 9, 0x7800);
+		return;
+	}
+
+	int a = v0 | ((v1 & 0x40) << 2);
+	int b0 = v2 & 0x3f;
+	int b1 = v3 & 0x3f;
+	int c = v1 & 0x3f;
+	int d0 = v4 & 0x7f;
+	int d1 = v5 & 0x7f;
+
+	// get hold of the number of bits in 'd0' and 'd1'
+	static const int dbits_tab[8] = { 7, 6, 7, 6, 5, 6, 5, 6 };
+	int dbits = dbits_tab[modeval];
+
+	// extract six variable-placement bits
+	int bit0 = (v2 >> 6) & 1;
+	int bit1 = (v3 >> 6) & 1;
+
+	int bit2 = (v4 >> 6) & 1;
+	int bit3 = (v5 >> 6) & 1;
+	int bit4 = (v4 >> 5) & 1;
+	int bit5 = (v5 >> 5) & 1;
+
+
+	// and prepend the variable-placement bits depending on mode.
+	int ohmod = 1 << modeval;	// one-hot-mode
+	if (ohmod & 0xA4)
+		a |= bit0 << 9;
+	if (ohmod & 0x8)
+		a |= bit2 << 9;
+	if (ohmod & 0x50)
+		a |= bit4 << 9;
+
+	if (ohmod & 0x50)
+		a |= bit5 << 10;
+	if (ohmod & 0xA0)
+		a |= bit1 << 10;
+
+	if (ohmod & 0xC0)
+		a |= bit2 << 11;
+
+	if (ohmod & 0x4)
+		c |= bit1 << 6;
+	if (ohmod & 0xE8)
+		c |= bit3 << 6;
+
+	if (ohmod & 0x20)
+		c |= bit2 << 7;
+
+
+	if (ohmod & 0x5B)
+		b0 |= bit0 << 6;
+	if (ohmod & 0x5B)
+		b1 |= bit1 << 6;
+
+	if (ohmod & 0x12)
+		b0 |= bit2 << 7;
+	if (ohmod & 0x12)
+		b1 |= bit3 << 7;
+
+	if (ohmod & 0xAF)
+		d0 |= bit4 << 5;
+	if (ohmod & 0xAF)
+		d1 |= bit5 << 5;
+	if (ohmod & 0x5)
+		d0 |= bit2 << 6;
+	if (ohmod & 0x5)
+		d1 |= bit3 << 6;
+
+	// sign-extend 'd0' and 'd1' from 'dbits' bits.
+	// This keeps the low 'dbits' bits and subtracts 2^dbits when the top one of
+	// them is set. It is done with mask-and-subtract rather than a shift pair
+	// because left-shifting a signed value past the sign bit is undefined before
+	// C++20, and this form does not rely on how signed right-shift fills.
+	const int d_mask = (1 << dbits) - 1;
+	const int d_sign = 1 << (dbits - 1);
+	d0 &= d_mask;
+	if (d0 & d_sign)
+		d0 -= d_mask + 1;
+	d1 &= d_mask;
+	if (d1 & d_sign)
+		d1 -= d_mask + 1;
+
+	// expand all values to 12 bits, with left-shift as needed.
+	// 'd0' and 'd1' may be negative, so they are scaled by multiplication:
+	// left-shifting a negative value is undefined before C++20.
+	int val_shamt = (modeval >> 1) ^ 3;
+	a <<= val_shamt;
+	b0 <<= val_shamt;
+	b1 <<= val_shamt;
+	c <<= val_shamt;
+	d0 *= 1 << val_shamt;
+	d1 *= 1 << val_shamt;
+
+	// then compute the actual color values.
+	int red1 = a;
+	int green1 = a - b0;
+	int blue1 = a - b1;
+	int red0 = a - c;
+	int green0 = a - b0 - c - d0;
+	int blue0 = a - b1 - c - d1;
+
+	// clamp the color components to [0,2^12 - 1]
+	if (red0 < 0)
+		red0 = 0;
+	else if (red0 > 0xFFF)
+		red0 = 0xFFF;
+
+	if (green0 < 0)
+		green0 = 0;
+	else if (green0 > 0xFFF)
+		green0 = 0xFFF;
+
+	if (blue0 < 0)
+		blue0 = 0;
+	else if (blue0 > 0xFFF)
+		blue0 = 0xFFF;
+
+	if (red1 < 0)
+		red1 = 0;
+	else if (red1 > 0xFFF)
+		red1 = 0xFFF;
+
+	if (green1 < 0)
+		green1 = 0;
+	else if (green1 > 0xFFF)
+		green1 = 0xFFF;
+
+	if (blue1 < 0)
+		blue1 = 0;
+	else if (blue1 > 0xFFF)
+		blue1 = 0xFFF;
+
+
+	// switch around the color components
+	int temp0, temp1;
+	switch (majcomp)
+	{
+	case 1:					// switch around red and green
+		temp0 = red0;
+		temp1 = red1;
+		red0 = green0;
+		red1 = green1;
+		green0 = temp0;
+		green1 = temp1;
+		break;
+	case 2:					// switch around red and blue
+		temp0 = red0;
+		temp1 = red1;
+		red0 = blue0;
+		red1 = blue1;
+		blue0 = temp0;
+		blue1 = temp1;
+		break;
+	case 0:					// no switch
+		break;
+	}
+
+	*output0 = ushort4(red0 << 4, green0 << 4, blue0 << 4, 0x7800);
+	*output1 = ushort4(red1 << 4, green1 << 4, blue1 << 4, 0x7800);
+}
+
+
+
+
+// Decode an HDR RGB endpoint pair with directly stored alpha.
+// RGB is decoded by hdr_rgb_unpack3(); input[6] and input[7] are unquantized and
+// stored unchanged as the alpha (w) of output0 and output1.
+void hdr_rgb_ldr_alpha_unpack3(const int input[8], int quantization_level, ushort4 * output0, ushort4 * output1)
+{
+	hdr_rgb_unpack3(input, quantization_level, output0, output1);
+
+	const int alpha_first = color_unquantization_tables[quantization_level][input[6]];
+	const int alpha_second = color_unquantization_tables[quantization_level][input[7]];
+
+	output0->w = alpha_first;
+	output1->w = alpha_second;
+}
+
+
+
+// Decode an HDR luminance endpoint pair in the small-range form.
+// The top bit of v0 selects one of two bit layouts; in both, a 12-bit base
+// luminance and a small unsigned delta are extracted. The second luminance is
+// base + delta, capped at 0xFFF. Outputs are shifted left by 4 and replicated to
+// r, g and b; w is 0x7800.
+void hdr_luminance_small_range_unpack(const int input[2], int quantization_level, ushort4 * output0, ushort4 * output1)
+{
+	const int v0 = color_unquantization_tables[quantization_level][input[0]];
+	const int v1 = color_unquantization_tables[quantization_level][input[1]];
+
+	// layout parameters: with the top bit of v0 set, v1 contributes 3 high bits and
+	// a 5-bit delta; otherwise 4 high bits and a 4-bit delta.
+	const bool top_bit_set = (v0 & 0x80) != 0;
+	const int high_mask = top_bit_set ? 0xE0 : 0xF0;
+	const int delta_mask = top_bit_set ? 0x1F : 0xF;
+	const int low_shift = top_bit_set ? 2 : 1;
+
+	const int luma_base = ((v1 & high_mask) << 4) | ((v0 & 0x7F) << low_shift);
+	int luma_top = ((v1 & delta_mask) << low_shift) + luma_base;
+	if (luma_top > 0xFFF)
+		luma_top = 0xFFF;
+
+	*output0 = ushort4(luma_base << 4, luma_base << 4, luma_base << 4, 0x7800);
+	*output1 = ushort4(luma_top << 4, luma_top << 4, luma_top << 4, 0x7800);
+}
+
+
+// Decode an HDR luminance endpoint pair in the large-range form.
+// If v1 >= v0 the luminances are v0 << 4 and v1 << 4; otherwise they are
+// (v1 << 4) + 8 and (v0 << 4) - 8. Outputs are shifted left by a further 4 and
+// replicated to r, g and b; w is 0x7800.
+void hdr_luminance_large_range_unpack(const int input[2], int quantization_level, ushort4 * output0, ushort4 * output1)
+{
+	const int v0 = color_unquantization_tables[quantization_level][input[0]];
+	const int v1 = color_unquantization_tables[quantization_level][input[1]];
+
+	const bool in_order = (v1 >= v0);
+	const int luma_first = in_order ? (v0 << 4) : ((v1 << 4) + 8);
+	const int luma_second = in_order ? (v1 << 4) : ((v0 << 4) - 8);
+
+	*output0 = ushort4(luma_first << 4, luma_first << 4, luma_first << 4, 0x7800);
+	*output1 = ushort4(luma_second << 4, luma_second << 4, luma_second << 4, 0x7800);
+}
+
+
+
+// Decode an HDR alpha endpoint pair from two unquantized values.
+// A 2-bit selector is formed from the top bits of the two values. Selector 3
+// stores both alphas directly (7 bits each, shifted left by 5). Otherwise the
+// first alpha takes extra high bits from the second value, and the remaining low
+// bits of the second value form a signed delta that is added to the first alpha
+// and clamped to [0,0xFFF].
+// Results are returned through 'a0' and 'a1', shifted left by a further 4.
+void hdr_alpha_unpack(const int input[2], int quantization_level, int *a0, int *a1)
+{
+
+	int v6 = color_unquantization_tables[quantization_level][input[0]];
+	int v7 = color_unquantization_tables[quantization_level][input[1]];
+
+	// selector bit 0 is the top bit of v6, selector bit 1 the top bit of v7
+	int selector = ((v6 >> 7) & 1) | ((v7 >> 6) & 2);
+	v6 &= 0x7F;
+	v7 &= 0x7F;
+	if (selector == 3)
+	{
+		*a0 = v6 << 5;
+		*a1 = v7 << 5;
+	}
+	else
+	{
+		// move the upper bits of v7 above the 7 bits of v6
+		v6 |= (v7 << (selector + 1)) & 0x780;
+		// keep the low (6 - selector) bits of v7 and sign-extend them
+		v7 &= (0x3f >> selector);
+		v7 ^= 32 >> selector;
+		v7 -= 32 >> selector;
+		v6 <<= (4 - selector);
+		// v7 may be negative here, so scale by multiplication: left-shifting a
+		// negative value is undefined before C++20.
+		v7 *= 1 << (4 - selector);
+		v7 += v6;
+
+		if (v7 < 0)
+			v7 = 0;
+		else if (v7 > 0xFFF)
+			v7 = 0xFFF;
+
+		*a0 = v6;
+		*a1 = v7;
+	}
+
+	*a0 <<= 4;
+	*a1 <<= 4;
+}
+
+
+
+// Unpack an endpoint pair with HDR RGB and HDR alpha.
+// The color channels come from hdr_rgb_unpack3(); the alpha values are decoded
+// from input[6] and input[7] by hdr_alpha_unpack().
+void hdr_rgb_hdr_alpha_unpack3(const int input[8], int quantization_level, ushort4 * output0, ushort4 * output1)
+{
+	hdr_rgb_unpack3(input, quantization_level, output0, output1);
+
+	int decoded_alpha[2];
+	hdr_alpha_unpack(&input[6], quantization_level, &decoded_alpha[0], &decoded_alpha[1]);
+
+	output0->w = decoded_alpha[0];
+	output1->w = decoded_alpha[1];
+}
+
+
+
+
+
+
+// multiply the three color channels of an endpoint by 257.
+static void scale_endpoint_rgb_by_257(ushort4 * endpoint)
+{
+	endpoint->x *= 257;
+	endpoint->y *= 257;
+	endpoint->z *= 257;
+}
+
+// multiply the alpha channel of an endpoint by 257.
+static void scale_endpoint_alpha_by_257(ushort4 * endpoint)
+{
+	endpoint->w *= 257;
+}
+
+
+// Unpack one pair of color endpoints.
+//
+//   decode_mode         DECODE_LDR_SRGB, DECODE_LDR or DECODE_HDR
+//   format              endpoint format (one of the FMT_* values)
+//   quantization_level  quantization level of the values in 'input'
+//   input               quantized endpoint values
+//   rgb_hdr             out: 1 if the color channels are HDR, else 0
+//   alpha_hdr           out: 1 if the alpha channel is HDR, else 0
+//   nan_endpoint        out: 1 if the endpoints were replaced by the LDR-mode
+//                       error value, else 0
+//   output0, output1    out: the two unpacked endpoints
+//
+// The work is done in three steps: the format-specific unpacker is run,
+// formats that carry no alpha get a constant alpha, and finally the result is
+// adapted to the requested decode mode.
+void unpack_color_endpoints(astc_decode_mode decode_mode, int format, int quantization_level, const int *input, int *rgb_hdr, int *alpha_hdr, int *nan_endpoint, ushort4 * output0, ushort4 * output1)
+{
+	*nan_endpoint = 0;
+
+	// step 1: format-specific unpacking. An alpha_hdr of -1 marks formats
+	// without an alpha channel of their own; it is resolved in step 2.
+	switch (format)
+	{
+	case FMT_LUMINANCE:
+		luminance_unpack(input, quantization_level, output0, output1);
+		*rgb_hdr = 0;
+		*alpha_hdr = 0;
+		break;
+
+	case FMT_LUMINANCE_DELTA:
+		luminance_delta_unpack(input, quantization_level, output0, output1);
+		*rgb_hdr = 0;
+		*alpha_hdr = 0;
+		break;
+
+	case FMT_HDR_LUMINANCE_SMALL_RANGE:
+		hdr_luminance_small_range_unpack(input, quantization_level, output0, output1);
+		*rgb_hdr = 1;
+		*alpha_hdr = -1;
+		break;
+
+	case FMT_HDR_LUMINANCE_LARGE_RANGE:
+		hdr_luminance_large_range_unpack(input, quantization_level, output0, output1);
+		*rgb_hdr = 1;
+		*alpha_hdr = -1;
+		break;
+
+	case FMT_LUMINANCE_ALPHA:
+		luminance_alpha_unpack(input, quantization_level, output0, output1);
+		*rgb_hdr = 0;
+		*alpha_hdr = 0;
+		break;
+
+	case FMT_LUMINANCE_ALPHA_DELTA:
+		luminance_alpha_delta_unpack(input, quantization_level, output0, output1);
+		*rgb_hdr = 0;
+		*alpha_hdr = 0;
+		break;
+
+	case FMT_RGB_SCALE:
+		rgb_scale_unpack(input, quantization_level, output0, output1);
+		*rgb_hdr = 0;
+		*alpha_hdr = 0;
+		break;
+
+	case FMT_RGB_SCALE_ALPHA:
+		rgb_scale_alpha_unpack(input, quantization_level, output0, output1);
+		*rgb_hdr = 0;
+		*alpha_hdr = 0;
+		break;
+
+	case FMT_HDR_RGB_SCALE:
+		hdr_rgbo_unpack3(input, quantization_level, output0, output1);
+		*rgb_hdr = 1;
+		*alpha_hdr = -1;
+		break;
+
+	case FMT_RGB:
+		rgb_unpack(input, quantization_level, output0, output1);
+		*rgb_hdr = 0;
+		*alpha_hdr = 0;
+		break;
+
+	case FMT_RGB_DELTA:
+		rgb_delta_unpack(input, quantization_level, output0, output1);
+		*rgb_hdr = 0;
+		*alpha_hdr = 0;
+		break;
+
+	case FMT_HDR_RGB:
+		hdr_rgb_unpack3(input, quantization_level, output0, output1);
+		*rgb_hdr = 1;
+		*alpha_hdr = -1;
+		break;
+
+	case FMT_RGBA:
+		rgba_unpack(input, quantization_level, output0, output1);
+		*rgb_hdr = 0;
+		*alpha_hdr = 0;
+		break;
+
+	case FMT_RGBA_DELTA:
+		rgba_delta_unpack(input, quantization_level, output0, output1);
+		*rgb_hdr = 0;
+		*alpha_hdr = 0;
+		break;
+
+	case FMT_HDR_RGB_LDR_ALPHA:
+		hdr_rgb_ldr_alpha_unpack3(input, quantization_level, output0, output1);
+		*rgb_hdr = 1;
+		*alpha_hdr = 0;
+		break;
+
+	case FMT_HDR_RGBA:
+		hdr_rgb_hdr_alpha_unpack3(input, quantization_level, output0, output1);
+		*rgb_hdr = 1;
+		*alpha_hdr = 1;
+		break;
+
+	default:
+		ASTC_CODEC_INTERNAL_ERROR;
+	}
+
+
+	// step 2: supply a constant alpha where the format had none; whether it
+	// is an HDR or an LDR constant is controlled by alpha_force_use_of_hdr.
+	if (*alpha_hdr == -1)
+	{
+		if (alpha_force_use_of_hdr)
+		{
+			*alpha_hdr = 1;
+			output0->w = 0x7800;
+			output1->w = 0x7800;
+		}
+		else
+		{
+			*alpha_hdr = 0;
+			output0->w = 0x00FF;
+			output1->w = 0x00FF;
+		}
+	}
+
+
+	// step 3: adapt to the decode mode.
+	if (decode_mode == DECODE_LDR_SRGB || decode_mode == DECODE_LDR)
+	{
+		if (*rgb_hdr == 1)
+		{
+			// HDR endpoints cannot be represented in an LDR decode mode.
+			if (decode_mode == DECODE_LDR_SRGB)
+			{
+				// sRGB: substitute a fixed error color.
+				const ushort4 srgb_error_color = ushort4(0xFF00, 0x0000, 0xFF00, 0xFF00);
+				*output0 = srgb_error_color;
+				*output1 = srgb_error_color;
+			}
+			else
+			{
+				// linear LDR: all-ones endpoints, flagged as NaN.
+				const ushort4 all_ones = ushort4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF);
+				*output0 = all_ones;
+				*output1 = all_ones;
+				*nan_endpoint = 1;
+			}
+		}
+		else
+		{
+			scale_endpoint_rgb_by_257(output0);
+			scale_endpoint_alpha_by_257(output0);
+			scale_endpoint_rgb_by_257(output1);
+			scale_endpoint_alpha_by_257(output1);
+		}
+
+		// in the LDR modes nothing is reported as HDR.
+		*rgb_hdr = 0;
+		*alpha_hdr = 0;
+	}
+	else if (decode_mode == DECODE_HDR)
+	{
+		// only the parts that are not HDR get rescaled.
+		if (*rgb_hdr == 0)
+		{
+			scale_endpoint_rgb_by_257(output0);
+			scale_endpoint_rgb_by_257(output1);
+		}
+		if (*alpha_hdr == 0)
+		{
+			scale_endpoint_alpha_by_257(output0);
+			scale_endpoint_alpha_by_257(output1);
+		}
+	}
+}
--- a/3rdparty/astc/astc_compress_symbolic.cpp
+++ b/3rdparty/astc/astc_compress_symbolic.cpp
--- a/3rdparty/astc/astc_compute_variance.cpp
+++ b/3rdparty/astc/astc_compute_variance.cpp
@@ -0,0 +1,524 @@
+/*----------------------------------------------------------------------------*/
+/**
+ *	This confidential and proprietary software may be used only as
+ *	authorised by a licensing agreement from ARM Limited
+ *	(C) COPYRIGHT 2011-2012 ARM Limited
+ *	ALL RIGHTS RESERVED
+ *
+ *	The entire notice above must be reproduced on all authorised
+ *	copies and copies may only be made to the extent permitted
+ *	by a licensing agreement from ARM Limited.
+ *
+ *	@brief	ASTC functions to calculate, for each pixel and each color component,
+ *			its variance within an NxN footprint; we want N to be parametric.
+ *
+ *			The routine below uses summed area tables in order to perform the
+ *			computation in O(1) time per pixel, independent of how big N is.
+ */
+/*----------------------------------------------------------------------------*/
+
+#include "astc_codec_internals.h"
+
+#include <math.h>
+#include "mathlib.h"
+#include "softfloat.h"
+
+// Per-texel statistics tables for the current input image, indexed [z][y][x].
+// They are (re)allocated by allocate_input_average_and_variance_buffers() and
+// filled in by compute_pixel_region_variance().
+float4 *** input_averages;		// RGBA box average
+float  *** input_alpha_averages;	// alpha-only box average (uses its own kernel radius)
+float4 *** input_variances;		// RGBA box variance
+
+#include <stdio.h>
+
+// Box sum over one z-slice of a summed-area table.
+// The tables hold exclusive prefix sums, so the sum over rows
+// [y_low, y_high) and columns [x_low, x_high) is obtained from four corners.
+static double4 sat_box_sum(double4 ** slice, int y_low, int y_high, int x_low, int x_high)
+{
+	return slice[y_low][x_low] - slice[y_low][x_high] - slice[y_high][x_low] + slice[y_high][x_high];
+}
+
+// Same as sat_box_sum(), but for the w (alpha) component only.
+static double sat_box_sum_w(double4 ** slice, int y_low, int y_high, int x_low, int x_high)
+{
+	return slice[y_low][x_low].w - slice[y_low][x_high].w - slice[y_high][x_low].w + slice[y_high][x_high].w;
+}
+
+
+// Routine to compute averages and variances for a pixel region; the results
+// are stored in input_averages, input_alpha_averages and input_variances.
+//
+// Both statistics are computed in a single pass: the texel values x and their
+// squares x^2 are loaded into two padded buffers, the buffers are converted
+// into summed-area tables in place, and every box sum then needs a fixed
+// number of lookups, which decouples the running time from the kernel size.
+//
+//   img                    source image; imagedata8 is used if it is non-null,
+//                          otherwise imagedata16 (FP16 values)
+//   rgb_power_to_use,
+//   alpha_power_to_use     exponents applied to the color / alpha values before
+//                          the statistics are taken (skipped if both are 1.0)
+//   swz                    swizzle: indices 0..3 select a source channel,
+//                          4 selects constant 0 and 5 selects constant 1
+//   use_z_axis             nonzero if the kernels also extend along z
+//   source_[xyz]offset     position of the upper-left pixel of the region in
+//                          the data set. Texels up to kernel_radius outside the
+//                          region are read as well, so they must be addressable.
+//   [xyz]size              size of the region to compute results for
+//   avg_var_kernel_radius  radius of the box used for RGBA average and variance
+//   alpha_kernel_radius    radius of the box used for the alpha-only average
+//   dest_[xyz]offset       position of the region in the output tables
+
+static void compute_pixel_region_variance(const astc_codec_image * img, float rgb_power_to_use, float alpha_power_to_use, swizzlepattern swz, int use_z_axis,
+										  int source_xoffset,int source_yoffset, int source_zoffset, // position of upper-left pixel in data set
+										  int xsize, int ysize, int zsize, 	// the size of the region to actually compute averages and variances for.
+										  int avg_var_kernel_radius, int alpha_kernel_radius,
+										  int dest_xoffset, int dest_yoffset, int dest_zoffset)
+{
+	int x, y, z;
+
+	// the buffers are padded for the larger of the two kernels.
+	int kernel_radius = MAX(avg_var_kernel_radius, alpha_kernel_radius);
+	int kerneldim = 2 * kernel_radius + 1;
+
+	// allocate memory: region + kernel_radius on each side + one extra
+	// zero-filled entry at the end of each axis.
+	int xpadsize = xsize + kerneldim;
+	int ypadsize = ysize + kerneldim;
+	int zpadsize = zsize + (use_z_axis ? kerneldim : 1);
+
+	// each buffer is one contiguous allocation addressed through two levels
+	// of pointer tables, so it can be indexed as buf[z][y][x].
+	double4 ***varbuf1 = new double4 **[zpadsize];
+	double4 ***varbuf2 = new double4 **[zpadsize];
+	varbuf1[0] = new double4 *[ypadsize * zpadsize];
+	varbuf2[0] = new double4 *[ypadsize * zpadsize];
+	varbuf1[0][0] = new double4[xpadsize * ypadsize * zpadsize];
+	varbuf2[0][0] = new double4[xpadsize * ypadsize * zpadsize];
+
+
+	for (z = 1; z < zpadsize; z++)
+	{
+		varbuf1[z] = varbuf1[0] + ypadsize * z;
+		varbuf2[z] = varbuf2[0] + ypadsize * z;
+		varbuf1[z][0] = varbuf1[0][0] + xpadsize * ypadsize * z;
+		varbuf2[z][0] = varbuf2[0][0] + xpadsize * ypadsize * z;
+	}
+
+	for (z = 0; z < zpadsize; z++)
+		for (y = 1; y < ypadsize; y++)
+		{
+			varbuf1[z][y] = varbuf1[z][0] + xpadsize * y;
+			varbuf2[z][y] = varbuf2[z][0] + xpadsize * y;
+		}
+
+	int powers_are_1 = (rgb_power_to_use == 1.0f) && (alpha_power_to_use == 1.0f);
+
+
+	// load x and x^2 values into the allocated buffers
+	if (img->imagedata8)
+	{
+		// entries 4 and 5 are the constants selectable through the swizzle.
+		uint8_t data[6];
+		data[4] = 0;
+		data[5] = 255;
+
+		for (z = 0; z < zpadsize - 1; z++)
+		{
+			int z_src = z + source_zoffset - (use_z_axis ? kernel_radius : 0);
+			for (y = 0; y < ypadsize - 1; y++)
+			{
+				int y_src = y + source_yoffset - kernel_radius;
+				for (x = 0; x < xpadsize - 1; x++)
+				{
+					int x_src = x + source_xoffset - kernel_radius;
+					data[0] = img->imagedata8[z_src][y_src][4 * x_src + 0];
+					data[1] = img->imagedata8[z_src][y_src][4 * x_src + 1];
+					data[2] = img->imagedata8[z_src][y_src][4 * x_src + 2];
+					data[3] = img->imagedata8[z_src][y_src][4 * x_src + 3];
+
+					uint8_t r = data[swz.r];
+					uint8_t g = data[swz.g];
+					uint8_t b = data[swz.b];
+					uint8_t a = data[swz.a];
+
+					double4 d = double4(r * (1.0 / 255.0),
+										g * (1.0 / 255.0),
+										b * (1.0 / 255.0),
+										a * (1.0 / 255.0));
+
+					if (perform_srgb_transform)
+					{
+						d.x = (d.x <= 0.04045) ? d.x * (1.0 / 12.92) : (d.x <= 1) ? pow((d.x + 0.055) * (1.0 / 1.055), 2.4) : d.x;
+						d.y = (d.y <= 0.04045) ? d.y * (1.0 / 12.92) : (d.y <= 1) ? pow((d.y + 0.055) * (1.0 / 1.055), 2.4) : d.y;
+						d.z = (d.z <= 0.04045) ? d.z * (1.0 / 12.92) : (d.z <= 1) ? pow((d.z + 0.055) * (1.0 / 1.055), 2.4) : d.z;
+					}
+
+					if (!powers_are_1)
+					{
+						d.x = pow(MAX(d.x, 1e-6), (double)rgb_power_to_use);
+						d.y = pow(MAX(d.y, 1e-6), (double)rgb_power_to_use);
+						d.z = pow(MAX(d.z, 1e-6), (double)rgb_power_to_use);
+						d.w = pow(MAX(d.w, 1e-6), (double)alpha_power_to_use);
+					}
+
+					varbuf1[z][y][x] = d;
+					varbuf2[z][y][x] = d * d;
+				}
+			}
+		}
+	}
+	else
+	{
+		// entries 4 and 5 are the constants selectable through the swizzle.
+		uint16_t data[6];
+		data[4] = 0;
+		data[5] = 0x3C00;		// 1.0 encoded as FP16.
+
+		for (z = 0; z < zpadsize - 1; z++)
+		{
+			int z_src = z + source_zoffset - (use_z_axis ? kernel_radius : 0);
+			for (y = 0; y < ypadsize - 1; y++)
+			{
+				int y_src = y + source_yoffset - kernel_radius;
+				for (x = 0; x < xpadsize - 1; x++)
+				{
+					int x_src = x + source_xoffset - kernel_radius;
+					data[0] = img->imagedata16[z_src][y_src][4 * x_src];
+					data[1] = img->imagedata16[z_src][y_src][4 * x_src + 1];
+					data[2] = img->imagedata16[z_src][y_src][4 * x_src + 2];
+					data[3] = img->imagedata16[z_src][y_src][4 * x_src + 3];
+
+					uint16_t r = data[swz.r];
+					uint16_t g = data[swz.g];
+					uint16_t b = data[swz.b];
+					uint16_t a = data[swz.a];
+
+					double4 d = double4(sf16_to_float(r),
+										sf16_to_float(g),
+										sf16_to_float(b),
+										sf16_to_float(a));
+
+					if (perform_srgb_transform)
+					{
+						d.x = (d.x <= 0.04045) ? d.x * (1.0 / 12.92) : (d.x <= 1) ? pow((d.x + 0.055) * (1.0 / 1.055), 2.4) : d.x;
+						d.y = (d.y <= 0.04045) ? d.y * (1.0 / 12.92) : (d.y <= 1) ? pow((d.y + 0.055) * (1.0 / 1.055), 2.4) : d.y;
+						d.z = (d.z <= 0.04045) ? d.z * (1.0 / 12.92) : (d.z <= 1) ? pow((d.z + 0.055) * (1.0 / 1.055), 2.4) : d.z;
+					}
+
+					if (!powers_are_1)
+					{
+						d.x = pow(MAX(d.x, 1e-6), (double)rgb_power_to_use);
+						d.y = pow(MAX(d.y, 1e-6), (double)rgb_power_to_use);
+						d.z = pow(MAX(d.z, 1e-6), (double)rgb_power_to_use);
+						d.w = pow(MAX(d.w, 1e-6), (double)alpha_power_to_use);
+					}
+
+					varbuf1[z][y][x] = d;
+					varbuf2[z][y][x] = d * d;
+				}
+			}
+		}
+	}
+
+
+
+	// pad out buffers with 0s: last column and last row of every slice ...
+	for (z = 0; z < zpadsize; z++)
+	{
+		for (y = 0; y < ypadsize; y++)
+		{
+			varbuf1[z][y][xpadsize - 1] = double4(0.0, 0.0, 0.0, 0.0);
+			varbuf2[z][y][xpadsize - 1] = double4(0.0, 0.0, 0.0, 0.0);
+		}
+		for (x = 0; x < xpadsize; x++)
+		{
+			varbuf1[z][ypadsize - 1][x] = double4(0.0, 0.0, 0.0, 0.0);
+			varbuf2[z][ypadsize - 1][x] = double4(0.0, 0.0, 0.0, 0.0);
+		}
+	}
+
+	// ... and the whole last slice. The loaders never write this slice, but the
+	// prefix-sum passes below read it, so it is cleared in 2D mode as well.
+	for (y = 0; y < ypadsize; y++)
+		for (x = 0; x < xpadsize; x++)
+		{
+			varbuf1[zpadsize - 1][y][x] = double4(0.0, 0.0, 0.0, 0.0);
+			varbuf2[zpadsize - 1][y][x] = double4(0.0, 0.0, 0.0, 0.0);
+		}
+
+
+	// generate summed-area tables for x and x2; this is done in-place.
+	// Each pass is an exclusive prefix sum along one axis: an entry receives
+	// the sum of everything before it, not including itself.
+	for (z = 0; z < zpadsize; z++)
+		for (y = 0; y < ypadsize; y++)
+		{
+			double4 summa1 = double4(0.0, 0.0, 0.0, 0.0);
+			double4 summa2 = double4(0.0, 0.0, 0.0, 0.0);
+			for (x = 0; x < xpadsize; x++)
+			{
+				double4 val1 = varbuf1[z][y][x];
+				double4 val2 = varbuf2[z][y][x];
+				varbuf1[z][y][x] = summa1;
+				varbuf2[z][y][x] = summa2;
+				summa1 = summa1 + val1;
+				summa2 = summa2 + val2;
+			}
+		}
+
+	for (z = 0; z < zpadsize; z++)
+		for (x = 0; x < xpadsize; x++)
+		{
+			double4 summa1 = double4(0.0, 0.0, 0.0, 0.0);
+			double4 summa2 = double4(0.0, 0.0, 0.0, 0.0);
+			for (y = 0; y < ypadsize; y++)
+			{
+				double4 val1 = varbuf1[z][y][x];
+				double4 val2 = varbuf2[z][y][x];
+				varbuf1[z][y][x] = summa1;
+				varbuf2[z][y][x] = summa2;
+				summa1 = summa1 + val1;
+				summa2 = summa2 + val2;
+			}
+		}
+
+	if (use_z_axis)
+		for (y = 0; y < ypadsize; y++)
+			for (x = 0; x < xpadsize; x++)
+			{
+				double4 summa1 = double4(0.0, 0.0, 0.0, 0.0);
+				double4 summa2 = double4(0.0, 0.0, 0.0, 0.0);
+				for (z = 0; z < zpadsize; z++)
+				{
+					double4 val1 = varbuf1[z][y][x];
+					double4 val2 = varbuf2[z][y][x];
+					varbuf1[z][y][x] = summa1;
+					varbuf2[z][y][x] = summa2;
+					summa1 = summa1 + val1;
+					summa2 = summa2 + val2;
+				}
+			}
+
+
+	int avg_var_kerneldim = 2 * avg_var_kernel_radius + 1;
+	int alpha_kerneldim = 2 * alpha_kernel_radius + 1;
+
+
+	// compute a few constants used in the variance-calculation.
+	double avg_var_samples;
+	double alpha_rsamples;
+	double mul1;
+
+	if (use_z_axis)
+	{
+		avg_var_samples = avg_var_kerneldim * avg_var_kerneldim * avg_var_kerneldim;
+		alpha_rsamples = 1.0 / (alpha_kerneldim * alpha_kerneldim * alpha_kerneldim);
+	}
+	else
+	{
+		avg_var_samples = avg_var_kerneldim * avg_var_kerneldim;
+		alpha_rsamples = 1.0 / (alpha_kerneldim * alpha_kerneldim);
+	}
+
+
+	// with n samples: mul1 = 1/(n*(n-1)) and mul2 = 1/(n-1), so that
+	// mul2 * sum(x^2) - mul1 * sum(x)^2 is the sample variance.
+	// n == 1 is special-cased to avoid the division by zero.
+	double avg_var_rsamples = 1.0 / avg_var_samples;
+	if (avg_var_samples == 1)
+		mul1 = 1.0;
+	else
+		mul1 = 1.0 / (avg_var_samples * (avg_var_samples - 1));
+
+
+	double mul2 = avg_var_samples * mul1;
+
+
+	// use the summed-area tables to compute variance for each sample-neighborhood
+	for (z = 0; z < zsize; z++)
+	{
+		// in 2D mode the buffers carry no leading padding along z.
+		int z_src = use_z_axis ? z + kernel_radius : z;
+		int z_dst = z + dest_zoffset;
+		for (y = 0; y < ysize; y++)
+		{
+			int y_src = y + kernel_radius;
+			int y_dst = y + dest_yoffset;
+
+			// half-open box bounds [low, high) for the two kernels
+			int alpha_y_low = y_src - alpha_kernel_radius;
+			int alpha_y_high = y_src + alpha_kernel_radius + 1;
+			int var_y_low = y_src - avg_var_kernel_radius;
+			int var_y_high = y_src + avg_var_kernel_radius + 1;
+
+			for (x = 0; x < xsize; x++)
+			{
+				int x_src = x + kernel_radius;
+				int x_dst = x + dest_xoffset;
+
+				int alpha_x_low = x_src - alpha_kernel_radius;
+				int alpha_x_high = x_src + alpha_kernel_radius + 1;
+				int var_x_low = x_src - avg_var_kernel_radius;
+				int var_x_high = x_src + avg_var_kernel_radius + 1;
+
+				double vasum;	// sum of alpha over the alpha kernel
+				double4 v1sum;	// sum of x over the avg/var kernel
+				double4 v2sum;	// sum of x^2 over the avg/var kernel
+
+				if (use_z_axis)
+				{
+					// The box has to span the whole kernel along z as well,
+					// since the sums are normalized by kerneldim^3 samples.
+					int alpha_z_low = z_src - alpha_kernel_radius;
+					int alpha_z_high = z_src + alpha_kernel_radius + 1;
+					int var_z_low = z_src - avg_var_kernel_radius;
+					int var_z_high = z_src + avg_var_kernel_radius + 1;
+
+					vasum = sat_box_sum_w(varbuf1[alpha_z_high], alpha_y_low, alpha_y_high, alpha_x_low, alpha_x_high)
+						- sat_box_sum_w(varbuf1[alpha_z_low], alpha_y_low, alpha_y_high, alpha_x_low, alpha_x_high);
+
+					v1sum = sat_box_sum(varbuf1[var_z_high], var_y_low, var_y_high, var_x_low, var_x_high)
+						- sat_box_sum(varbuf1[var_z_low], var_y_low, var_y_high, var_x_low, var_x_high);
+
+					v2sum = sat_box_sum(varbuf2[var_z_high], var_y_low, var_y_high, var_x_low, var_x_high)
+						- sat_box_sum(varbuf2[var_z_low], var_y_low, var_y_high, var_x_low, var_x_high);
+				}
+				else
+				{
+					vasum = sat_box_sum_w(varbuf1[z_src], alpha_y_low, alpha_y_high, alpha_x_low, alpha_x_high);
+					v1sum = sat_box_sum(varbuf1[z_src], var_y_low, var_y_high, var_x_low, var_x_high);
+					v2sum = sat_box_sum(varbuf2[z_src], var_y_low, var_y_high, var_x_low, var_x_high);
+				}
+
+				// alpha average
+				input_alpha_averages[z_dst][y_dst][x_dst] = static_cast < float >(vasum * alpha_rsamples);
+
+
+				// RGBA average; uses the same box sum as the variance below.
+				double4 avg = v1sum * avg_var_rsamples;
+
+				float4 favg = float4(static_cast < float >(avg.x),
+									 static_cast < float >(avg.y),
+									 static_cast < float >(avg.z),
+									 static_cast < float >(avg.w));
+				input_averages[z_dst][y_dst][x_dst] = favg;
+
+
+				// the actual variance
+				double4 variance = mul2 * v2sum - mul1 * (v1sum * v1sum);
+
+				float4 fvar = float4(static_cast < float >(variance.x),
+									 static_cast < float >(variance.y),
+									 static_cast < float >(variance.z),
+									 static_cast < float >(variance.w));
+				input_variances[z_dst][y_dst][x_dst] = fvar;
+			}
+		}
+	}
+
+	// release the buffers: data first, then the two levels of pointer tables.
+	delete[]varbuf2[0][0];
+	delete[]varbuf1[0][0];
+	delete[]varbuf2[0];
+	delete[]varbuf1[0];
+	delete[]varbuf2;
+	delete[]varbuf1;
+}
+
+
+// (Re)allocate the three global statistics tables for an image of the given
+// size. Tables left over from a previous call are freed first. Each table is
+// one contiguous block of texels addressed through two levels of pointer
+// tables, so that it can be indexed as table[z][y][x].
+static void allocate_input_average_and_variance_buffers(int xsize, int ysize, int zsize)
+{
+	// drop whatever an earlier call left behind: data, rows, slices.
+	if (input_averages)
+	{
+		delete[]input_averages[0][0];
+		delete[]input_averages[0];
+		delete[]input_averages;
+	}
+	if (input_variances)
+	{
+		delete[]input_variances[0][0];
+		delete[]input_variances[0];
+		delete[]input_variances;
+	}
+	if (input_alpha_averages)
+	{
+		delete[]input_alpha_averages[0][0];
+		delete[]input_alpha_averages[0];
+		delete[]input_alpha_averages;
+	}
+
+	const int total_rows = ysize * zsize;
+	const int total_texels = xsize * ysize * zsize;
+	const int texels_per_slice = ysize * xsize;
+
+	// slice pointers
+	input_averages = new float4 **[zsize];
+	input_variances = new float4 **[zsize];
+	input_alpha_averages = new float **[zsize];
+
+	// row pointers
+	input_averages[0] = new float4 *[total_rows];
+	input_variances[0] = new float4 *[total_rows];
+	input_alpha_averages[0] = new float *[total_rows];
+
+	// texel storage
+	input_averages[0][0] = new float4[total_texels];
+	input_variances[0][0] = new float4[total_texels];
+	input_alpha_averages[0][0] = new float[total_texels];
+
+	// wire up the pointer tables.
+	for (int slice = 0; slice < zsize; slice++)
+	{
+		if (slice > 0)
+		{
+			// slice 0 already points at the start of the allocations.
+			input_averages[slice] = input_averages[0] + slice * ysize;
+			input_variances[slice] = input_variances[0] + slice * ysize;
+			input_alpha_averages[slice] = input_alpha_averages[0] + slice * ysize;
+
+			input_averages[slice][0] = input_averages[0][0] + slice * texels_per_slice;
+			input_variances[slice][0] = input_variances[0][0] + slice * texels_per_slice;
+			input_alpha_averages[slice][0] = input_alpha_averages[0][0] + slice * texels_per_slice;
+		}
+
+		for (int row = 1; row < ysize; row++)
+		{
+			input_averages[slice][row] = input_averages[slice][0] + row * xsize;
+			input_variances[slice][row] = input_variances[slice][0] + row * xsize;
+			input_alpha_averages[slice][row] = input_alpha_averages[slice][0] + row * xsize;
+		}
+	}
+}
+
+
+// Compute the average and variance tables for the current input image.
+// The output tables are allocated for the full image size, and the image is
+// then processed in tiles of at most 32 texels per axis. The z axis takes
+// part in the kernels only if the image has more than one slice.
+void compute_averages_and_variances(const astc_codec_image * img, float rgb_power_to_use, float alpha_power_to_use, int avg_var_kernel_radius, int alpha_kernel_radius, swizzlepattern swz)
+{
+	const int width = img->xsize;
+	const int height = img->ysize;
+	const int depth = img->zsize;
+	allocate_input_average_and_variance_buffers(width, height, depth);
+
+	const int is_volume = (depth > 1);
+	// the source offsets skip the image padding; z is only padded for volumes.
+	const int xy_padding = img->padding;
+	const int z_padding = is_volume ? img->padding : 0;
+
+	for (int tile_z = 0; tile_z < depth; tile_z += 32)
+	{
+		const int tile_depth = MIN(32, depth - tile_z);
+		for (int tile_y = 0; tile_y < height; tile_y += 32)
+		{
+			const int tile_height = MIN(32, height - tile_y);
+			for (int tile_x = 0; tile_x < width; tile_x += 32)
+			{
+				const int tile_width = MIN(32, width - tile_x);
+				compute_pixel_region_variance(img,
+											  rgb_power_to_use, alpha_power_to_use,
+											  swz,
+											  is_volume,
+											  tile_x + xy_padding, tile_y + xy_padding, tile_z + z_padding,
+											  tile_width, tile_height, tile_depth,
+											  avg_var_kernel_radius, alpha_kernel_radius,
+											  tile_x, tile_y, tile_z);
+			}
+		}
+	}
+}
--- a/3rdparty/astc/astc_decompress_symbolic.cpp
+++ b/3rdparty/astc/astc_decompress_symbolic.cpp
@@ -0,0 +1,317 @@
+/*----------------------------------------------------------------------------*/
+/**
+ *	This confidential and proprietary software may be used only as
+ *	authorised by a licensing agreement from ARM Limited
+ *	(C) COPYRIGHT 2011-2012 ARM Limited
+ *	ALL RIGHTS RESERVED
+ *
+ *	The entire notice above must be reproduced on all authorised
+ *	copies and copies may only be made to the extent permitted
+ *	by a licensing agreement from ARM Limited.
+ *
+ *	@brief	Decompress a block of colors, expressed as a symbolic block,
+ *			for ASTC.
+ */
+/*----------------------------------------------------------------------------*/
+
+#include <math.h>
+
+#include "astc_codec_internals.h"
+
+#include "softfloat.h"
+#include <stdio.h>
+
+// Compute the weight of one texel from the stored weight grid.
+// The result is the sum of the contributing grid weights, each multiplied by
+// its integer factor from the decimation table, divided by 16 with rounding.
+int compute_value_of_texel_int(int texel_to_get, const decimation_table * it, const int *weights)
+{
+	const int contributor_count = it->texel_num_weights[texel_to_get];
+
+	// start at 8 so that the final >> 4 rounds to nearest.
+	int accumulator = 8;
+	int k = 0;
+	while (k < contributor_count)
+	{
+		const int grid_index = it->texel_weights[texel_to_get][k];
+		accumulator += weights[grid_index] * it->texel_weights_int[texel_to_get][k];
+		k++;
+	}
+
+	return accumulator >> 4;
+}
+
+
+// Interpolate between two endpoint colors with integer weights in 0..64.
+// All channels use 'weight', except that in dual-plane mode the channel
+// selected by plane2_color_component (0..3) uses 'plane2_weight' instead.
+// In sRGB mode only the top 8 bits of each endpoint channel take part, and the
+// 8-bit result is replicated into both bytes of the output.
+ushort4 lerp_color_int(astc_decode_mode decode_mode, ushort4 color0, ushort4 color1, int weight, int plane2_weight, int plane2_color_component	// -1 in 1-plane mode
+	)
+{
+	const int srgb_mode = (decode_mode == DECODE_LDR_SRGB);
+
+	int4 endpoint0 = int4(color0.x, color0.y, color0.z, color0.w);
+	int4 endpoint1 = int4(color1.x, color1.y, color1.z, color1.w);
+
+	// per-channel weight of endpoint 1
+	int4 weight1 = int4(weight, weight, weight, weight);
+	if (plane2_color_component == 0)
+		weight1.x = plane2_weight;
+	else if (plane2_color_component == 1)
+		weight1.y = plane2_weight;
+	else if (plane2_color_component == 2)
+		weight1.z = plane2_weight;
+	else if (plane2_color_component == 3)
+		weight1.w = plane2_weight;
+
+	// endpoint 0 gets the remainder
+	int4 weight0 = int4(64, 64, 64, 64) - weight1;
+
+	if (srgb_mode)
+	{
+		endpoint0 = endpoint0 >> 8;
+		endpoint1 = endpoint1 >> 8;
+	}
+
+	// weighted sum, with 32 added so that the division by 64 rounds.
+	int4 blended = (endpoint0 * weight0) + (endpoint1 * weight1) + int4(32, 32, 32, 32);
+	blended = blended >> 6;
+
+	if (srgb_mode)
+		blended = blended | (blended << 8);
+
+	return ushort4(blended.x, blended.y, blended.z, blended.w);
+}
+
+
+// Fill the first texel_count texels of blk->orig_data with a single RGBA
+// value and set the per-texel LNS and NaN flags to the given values.
+static void fill_block_with_constant_color(imageblock * blk, int texel_count, float red, float green, float blue, float alpha, int use_lns, int use_nan)
+{
+	int i;
+	for (i = 0; i < texel_count; i++)
+	{
+		blk->orig_data[4 * i] = red;
+		blk->orig_data[4 * i + 1] = green;
+		blk->orig_data[4 * i + 2] = blue;
+		blk->orig_data[4 * i + 3] = alpha;
+		blk->rgb_lns[i] = use_lns;
+		blk->alpha_lns[i] = use_lns;
+		blk->nan_texel[i] = use_nan;
+	}
+}
+
+
+// Decompress a symbolic block into an image block.
+//
+//   decode_mode       DECODE_LDR_SRGB, DECODE_LDR or DECODE_HDR
+//   xdim, ydim, zdim  dimensions of block
+//   xpos, ypos, zpos  position of block; stored into blk
+//   scb               the symbolic block to decode
+//   blk               receives the decoded texels and per-texel flags
+//
+// Three kinds of block are handled:
+//   * error blocks: every texel becomes the error value of the decode mode
+//     (a fixed color for sRGB, otherwise zero with the NaN flag set);
+//   * constant-color blocks (block_mode < 0): block_mode -2 holds a UNORM16
+//     color; any other negative mode holds an FP16 color, which is only
+//     decoded in HDR mode and gives the error value in the LDR modes;
+//   * regular blocks: endpoints and weights are unpacked and every texel is
+//     interpolated between the endpoints of its partition.
+void decompress_symbolic_block(astc_decode_mode decode_mode,
+							   int xdim, int ydim, int zdim,   // dimensions of block
+							   int xpos, int ypos, int zpos,   // position of block
+							   const symbolic_compressed_block * scb, imageblock * blk)
+{
+	blk->xpos = xpos;
+	blk->ypos = ypos;
+	blk->zpos = zpos;
+
+	int i;
+	int texels_per_block = xdim * ydim * zdim;
+
+	// if we detected an error-block, emit the error value and stop here.
+	if (scb->error_block)
+	{
+		if (decode_mode == DECODE_LDR_SRGB)
+			fill_block_with_constant_color(blk, texels_per_block, 1.0f, 0.0f, 1.0f, 1.0f, 0, 0);
+		else
+			fill_block_with_constant_color(blk, texels_per_block, 0.0f, 0.0f, 0.0f, 0.0f, 0, 1);
+
+		imageblock_initialize_work_from_orig(blk, texels_per_block);
+		update_imageblock_flags(blk, xdim, ydim, zdim);
+		return;
+	}
+
+
+	// constant-color block
+	if (scb->block_mode < 0)
+	{
+		float red = 0, green = 0, blue = 0, alpha = 0;
+		int use_lns = 0;
+		int use_nan = 0;
+
+		if (scb->block_mode == -2)
+		{
+			// For sRGB decoding, we should return only the top 8 bits.
+			int mask = (decode_mode == DECODE_LDR_SRGB) ? 0xFF00 : 0xFFFF;
+
+			red = sf16_to_float(unorm16_to_sf16(scb->constant_color[0] & mask));
+			green = sf16_to_float(unorm16_to_sf16(scb->constant_color[1] & mask));
+			blue = sf16_to_float(unorm16_to_sf16(scb->constant_color[2] & mask));
+			alpha = sf16_to_float(unorm16_to_sf16(scb->constant_color[3] & mask));
+			use_lns = 0;
+			use_nan = 0;
+		}
+		else
+		{
+			switch (decode_mode)
+			{
+			case DECODE_LDR_SRGB:
+				red = 1.0f;
+				green = 0.0f;
+				blue = 1.0f;
+				alpha = 1.0f;
+				use_lns = 0;
+				use_nan = 0;
+				break;
+			case DECODE_LDR:
+				red = 0.0f;
+				green = 0.0f;
+				blue = 0.0f;
+				alpha = 0.0f;
+				use_lns = 0;
+				use_nan = 1;
+				break;
+			case DECODE_HDR:
+				// constant-color block; unpack from FP16 to FP32.
+				red = sf16_to_float(scb->constant_color[0]);
+				green = sf16_to_float(scb->constant_color[1]);
+				blue = sf16_to_float(scb->constant_color[2]);
+				alpha = sf16_to_float(scb->constant_color[3]);
+				use_lns = 1;
+				use_nan = 0;
+				break;
+			}
+		}
+
+		fill_block_with_constant_color(blk, texels_per_block, red, green, blue, alpha, use_lns, use_nan);
+
+		imageblock_initialize_work_from_orig(blk, texels_per_block);
+		update_imageblock_flags(blk, xdim, ydim, zdim);
+		return;
+	}
+
+
+	// get the appropriate partition-table entry
+	int partition_count = scb->partition_count;
+	const partition_info *pt = get_partition_table(xdim, ydim, zdim, partition_count);
+	pt += scb->partition_index;
+
+	// get the appropriate block descriptor
+	const block_size_descriptor *bsd = get_block_size_descriptor(xdim, ydim, zdim);
+	const decimation_table *const *ixtab2 = bsd->decimation_tables;
+
+
+	const decimation_table *it = ixtab2[bsd->block_modes[scb->block_mode].decimation_mode];
+
+	int is_dual_plane = bsd->block_modes[scb->block_mode].is_dual_plane;
+
+	int weight_quantization_level = bsd->block_modes[scb->block_mode].quantization_mode;
+
+
+	// decode the color endpoints, one pair per partition
+	ushort4 color_endpoint0[4];
+	ushort4 color_endpoint1[4];
+	int rgb_hdr_endpoint[4];
+	int alpha_hdr_endpoint[4];
+	int nan_endpoint[4];
+
+	for (i = 0; i < partition_count; i++)
+		unpack_color_endpoints(decode_mode,
+							   scb->color_formats[i],
+							   scb->color_quantization_level, scb->color_values[i], &(rgb_hdr_endpoint[i]), &(alpha_hdr_endpoint[i]), &(nan_endpoint[i]), &(color_endpoint0[i]), &(color_endpoint1[i]));
+
+
+	// first unquantize the weights
+	int uq_plane1_weights[MAX_WEIGHTS_PER_BLOCK];
+	int uq_plane2_weights[MAX_WEIGHTS_PER_BLOCK];
+	int weight_count = it->num_weights;
+
+
+	const quantization_and_transfer_table *qat = &(quant_and_xfer_tables[weight_quantization_level]);
+
+	for (i = 0; i < weight_count; i++)
+	{
+		uq_plane1_weights[i] = qat->unquantized_value[scb->plane1_weights[i]];
+	}
+	if (is_dual_plane)
+	{
+		for (i = 0; i < weight_count; i++)
+			uq_plane2_weights[i] = qat->unquantized_value[scb->plane2_weights[i]];
+	}
+
+
+	// then undecimate them.
+	int weights[MAX_TEXELS_PER_BLOCK];
+	int plane2_weights[MAX_TEXELS_PER_BLOCK];	// only filled in for dual-plane blocks
+
+
+	for (i = 0; i < texels_per_block; i++)
+		weights[i] = compute_value_of_texel_int(i, it, uq_plane1_weights);
+
+	if (is_dual_plane)
+		for (i = 0; i < texels_per_block; i++)
+			plane2_weights[i] = compute_value_of_texel_int(i, it, uq_plane2_weights);
+
+
+	int plane2_color_component = scb->plane2_color_component;
+
+
+	// now that we have endpoint colors and weights, we can unpack actual colors for
+	// each texel.
+	for (i = 0; i < texels_per_block; i++)
+	{
+		int partition = pt->partition_of_texel[i];
+
+		// plane2_weights[] is uninitialized for single-plane blocks, so it must
+		// not be read in that case; the value is ignored by lerp_color_int()
+		// whenever the plane-2 component is -1.
+		ushort4 color = lerp_color_int(decode_mode,
+									   color_endpoint0[partition],
+									   color_endpoint1[partition],
+									   weights[i],
+									   is_dual_plane ? plane2_weights[i] : 0,
+									   is_dual_plane ? plane2_color_component : -1);
+
+		blk->rgb_lns[i] = rgb_hdr_endpoint[partition];
+		blk->alpha_lns[i] = alpha_hdr_endpoint[partition];
+		blk->nan_texel[i] = nan_endpoint[partition];
+
+		blk->work_data[4 * i] = color.x;
+		blk->work_data[4 * i + 1] = color.y;
+		blk->work_data[4 * i + 2] = color.z;
+		blk->work_data[4 * i + 3] = color.w;
+	}
+
+	imageblock_initialize_orig_from_work(blk, texels_per_block);
+
+	update_imageblock_flags(blk, xdim, ydim, zdim);
+}
+
+
+
+// Compute the weighted squared difference between two image blocks.
+// For every texel and channel, the absolute difference of the work_data values
+// is clamped to 1e15, squared and multiplied by the matching error weight;
+// the sum over the whole block is returned.
+float compute_imageblock_difference(int xdim, int ydim, int zdim, const imageblock * p1, const imageblock * p2, const error_weight_block * ewb)
+{
+	int i;
+	int texels_per_block = xdim * ydim * zdim;
+	float summa = 0.0f;
+	const float *f1 = p1->work_data;
+	const float *f2 = p2->work_data;
+	for (i = 0; i < texels_per_block; i++)
+	{
+		// fabsf for all four channels: the operands are floats, so there is no
+		// reason to go through the double-precision fabs.
+		float rdiff = fabsf(f1[4 * i] - f2[4 * i]);
+		float gdiff = fabsf(f1[4 * i + 1] - f2[4 * i + 1]);
+		float bdiff = fabsf(f1[4 * i + 2] - f2[4 * i + 2]);
+		float adiff = fabsf(f1[4 * i + 3] - f2[4 * i + 3]);
+
+		// clamp so that the squares below cannot overflow to infinity.
+		rdiff = MIN(rdiff, 1e15f);
+		gdiff = MIN(gdiff, 1e15f);
+		bdiff = MIN(bdiff, 1e15f);
+		adiff = MIN(adiff, 1e15f);
+
+		summa += rdiff * rdiff * ewb->error_weights[i].x + gdiff * gdiff * ewb->error_weights[i].y + bdiff * bdiff * ewb->error_weights[i].z + adiff * adiff * ewb->error_weights[i].w;
+	}
+
+	return summa;
+}
--- a/3rdparty/astc/astc_encoding_choice_error.cpp
+++ b/3rdparty/astc/astc_encoding_choice_error.cpp
@@ -0,0 +1,310 @@
+/*----------------------------------------------------------------------------*/
+/**
+ *	This confidential and proprietary software may be used only as
+ *	authorised by a licensing agreement from ARM Limited
+ *	(C) COPYRIGHT 2011-2012 ARM Limited
+ *	ALL RIGHTS RESERVED
+ *
+ *	The entire notice above must be reproduced on all authorised
+ *	copies and copies may only be made to the extent permitted
+ *	by a licensing agreement from ARM Limited.
+ *
+ *	@brief	Determine color errors for ASTC compression.
+ *
+ *			We assume that there are two independent sources of color error in
+ *			any given partition.
+ *
+ *			These are:
+ *			* quantization errors
+ *			* encoding choice errors
+ *
+ *			Encoding choice errors are errors that come due to encoding choice,
+ *			such as:
+ *			* using luminance instead of RGB
+ *			* using RGB-scale instead of two RGB endpoints.
+ *			* dropping Alpha
+ *
+ *			Quantization errors occur due to the limited precision we use for
+ *			storing numbers.
+ *
+ *			Quantization errors generally scale with quantization level, but are
+ *			not actually independent of color encoding. In particular:
+ *			* if we can use offset encoding then quantization error is halved.
+ *			* if we can use blue-contraction, quantization error for red and
+ *			  green is halved.
+ *			* quantization error is higher for the HDR endpoint modes.
+ *
+ *			Other than these errors, quantization error is assumed to be
+ *			proportional to the quantization step.
+ */
+/*----------------------------------------------------------------------------*/
+
+#include "astc_codec_internals.h"
+
+#include <math.h>
+
+#ifdef DEBUG_PRINT_DIAGNOSTICS
+	#include <stdio.h>
+#endif
+
+// Merge two endpoint sets into 'res'.
+//
+// Every partition's endpoint pair is first copied from 'ep1'; the component selected by
+// 'separate_component' (0 = x, 1 = y, 2 = z, 3 = w) is then overwritten with the value
+// from 'ep2'. Any other selector value leaves 'res' as a plain copy of 'ep1'.
+// The partition count is taken from 'ep1'; that of 'ep2' is not consulted.
+void merge_endpoints(const endpoints * ep1,	// contains three of the color components
+					 const endpoints * ep2,	// contains the remaining color component
+					 int separate_component, endpoints * res)
+{
+	const int count = ep1->partition_count;
+	int p;
+
+	res->partition_count = count;
+
+	for (p = 0; p < count; p++)
+	{
+		// start from the endpoints of ep1 ...
+		res->endpt0[p] = ep1->endpt0[p];
+		res->endpt1[p] = ep1->endpt1[p];
+
+		// ... then pull the separated component from ep2
+		switch (separate_component)
+		{
+		case 0:
+			res->endpt0[p].x = ep2->endpt0[p].x;
+			res->endpt1[p].x = ep2->endpt1[p].x;
+			break;
+		case 1:
+			res->endpt0[p].y = ep2->endpt0[p].y;
+			res->endpt1[p].y = ep2->endpt1[p].y;
+			break;
+		case 2:
+			res->endpt0[p].z = ep2->endpt0[p].z;
+			res->endpt1[p].z = ep2->endpt1[p].z;
+			break;
+		case 3:
+			res->endpt0[p].w = ep2->endpt0[p].w;
+			res->endpt1[p].w = ep2->endpt1[p].w;
+			break;
+		default:
+			// no separated component: keep the copy of ep1
+			break;
+		}
+	}
+}
+
+
+
+/*
+   For a given set of input colors and a given partitioning, determine: the color error that results
+   from RGB-scale encoding (relevant for LDR only); the color error that results from RGB-lumashift
+   encoding (relevant for HDR only); the color error that results from luminance-encoding; the color
+   error that results from dropping alpha; whether we are eligible for offset encoding; and whether
+   we are eligible for blue-contraction.
+
+   The input data are: color data, partitioning, error-weight data.
+ */
+
+
+void compute_encoding_choice_errors(int xdim, int ydim, int zdim, const imageblock * pb, const partition_info * pi, const error_weight_block * ewb,
+									int separate_component,	// component that is separated out in 2-plane mode, -1 in 1-plane mode
+									encoding_choice_errors * eci)
+{
+	int i;
+	// Fills eci[0 .. partition_count-1]: one encoding_choice_errors record per partition of 'pi'.
+	int partition_count = pi->partition_count;
+
+	int texels_per_block = xdim * ydim * zdim;
+
+	#ifdef DEBUG_PRINT_DIAGNOSTICS
+		if (print_diagnostics)
+		{
+			printf("%s : texels-per-block=%dx%dx%d, separate_component=%d, partition-count=%d\n", __func__, xdim, ydim, zdim, separate_component, partition_count);
+		}
+	#endif
+
+	float3 averages[4];
+	float3 directions_rgb[4];
+	float2 directions_rg[4];
+	float2 directions_rb[4];
+	float2 directions_gb[4];
+
+	float4 error_weightings[4];
+	float4 color_scalefactors[4];
+	float4 inverse_color_scalefactors[4];
+	// per-partition error weightings and their per-channel square roots (color_scalefactors)
+	compute_partition_error_color_weightings(xdim, ydim, zdim, ewb, pi, error_weightings, color_scalefactors);
+	// per-partition averages and dominant directions of the RGB data
+	compute_averages_and_directions_rgb(pi, pb, ewb, color_scalefactors, averages, directions_rgb, directions_rg, directions_rb, directions_gb);
+
+	line3 uncorr_rgb_lines[4];
+	line3 samechroma_rgb_lines[4];	// for LDR-RGB-scale
+	line3 rgb_luma_lines[4];	// for HDR-RGB-scale
+	line3 luminance_lines[4];
+
+	processed_line3 proc_uncorr_rgb_lines[4];
+	processed_line3 proc_samechroma_rgb_lines[4];	// for LDR-RGB-scale
+	processed_line3 proc_rgb_luma_lines[4];	// for HDR-RGB-scale
+	processed_line3 proc_luminance_lines[4];
+
+	// Build the four candidate lines of every partition, plus their pre-processed forms.
+	for (i = 0; i < partition_count; i++)
+	{
+		inverse_color_scalefactors[i].x = 1.0f / MAX(color_scalefactors[i].x, 1e-7f);	// the 1e-7f floor avoids dividing by a (near-)zero scale factor
+		inverse_color_scalefactors[i].y = 1.0f / MAX(color_scalefactors[i].y, 1e-7f);
+		inverse_color_scalefactors[i].z = 1.0f / MAX(color_scalefactors[i].z, 1e-7f);
+		inverse_color_scalefactors[i].w = 1.0f / MAX(color_scalefactors[i].w, 1e-7f);
+		// uncorrelated line: through the average, along the dominant RGB direction;
+		// a zero direction falls back to the direction of the scale factors
+		uncorr_rgb_lines[i].a = averages[i];
+		if (dot(directions_rgb[i], directions_rgb[i]) == 0.0f)
+			uncorr_rgb_lines[i].b = normalize(float3(color_scalefactors[i].xyz));
+		else
+			uncorr_rgb_lines[i].b = normalize(directions_rgb[i]);
+		// same-chroma line: through the origin, along the average (scale-factor direction if the average is ~zero)
+		samechroma_rgb_lines[i].a = float3(0, 0, 0);
+		if (dot(averages[i], averages[i]) < 1e-20)
+			samechroma_rgb_lines[i].b = normalize(float3(color_scalefactors[i].xyz));
+		else
+			samechroma_rgb_lines[i].b = normalize(averages[i]);
+		// RGB-luma line: through the average, along the scale-factor direction
+		rgb_luma_lines[i].a = averages[i];
+		rgb_luma_lines[i].b = normalize(color_scalefactors[i].xyz);
+		// luminance line: through the origin, along the scale-factor direction
+		luminance_lines[i].a = float3(0, 0, 0);
+		luminance_lines[i].b = normalize(color_scalefactors[i].xyz);
+
+		#ifdef DEBUG_PRINT_DIAGNOSTICS
+			if (print_diagnostics)
+			{
+				printf("Partition %d\n", i);
+				printf("Average = <%g %g %g>\n", averages[i].x, averages[i].y, averages[i].z);
+				printf("Uncorr-rgb-line = <%g %g %g> + t<%g %g %g>\n",
+					uncorr_rgb_lines[i].a.x, uncorr_rgb_lines[i].a.y, uncorr_rgb_lines[i].a.z, uncorr_rgb_lines[i].b.x, uncorr_rgb_lines[i].b.y, uncorr_rgb_lines[i].b.z);
+				printf("Samechroma-line = t<%g %g %g>\n", samechroma_rgb_lines[i].b.x, samechroma_rgb_lines[i].b.y, samechroma_rgb_lines[i].b.z);
+			}
+		#endif
+		// processed form of a line: amod = (a minus its projection onto b) * inverse scale factors, bs = b * scale factors, bis = b * inverse scale factors
+		proc_uncorr_rgb_lines[i].amod = (uncorr_rgb_lines[i].a - uncorr_rgb_lines[i].b * dot(uncorr_rgb_lines[i].a, uncorr_rgb_lines[i].b)) * inverse_color_scalefactors[i].xyz;
+		proc_uncorr_rgb_lines[i].bs = uncorr_rgb_lines[i].b * color_scalefactors[i].xyz;
+		proc_uncorr_rgb_lines[i].bis = uncorr_rgb_lines[i].b * inverse_color_scalefactors[i].xyz;
+
+		proc_samechroma_rgb_lines[i].amod = (samechroma_rgb_lines[i].a - samechroma_rgb_lines[i].b * dot(samechroma_rgb_lines[i].a, samechroma_rgb_lines[i].b)) * inverse_color_scalefactors[i].xyz;
+		proc_samechroma_rgb_lines[i].bs = samechroma_rgb_lines[i].b * color_scalefactors[i].xyz;
+		proc_samechroma_rgb_lines[i].bis = samechroma_rgb_lines[i].b * inverse_color_scalefactors[i].xyz;
+
+		proc_rgb_luma_lines[i].amod = (rgb_luma_lines[i].a - rgb_luma_lines[i].b * dot(rgb_luma_lines[i].a, rgb_luma_lines[i].b)) * inverse_color_scalefactors[i].xyz;
+		proc_rgb_luma_lines[i].bs = rgb_luma_lines[i].b * color_scalefactors[i].xyz;
+		proc_rgb_luma_lines[i].bis = rgb_luma_lines[i].b * inverse_color_scalefactors[i].xyz;
+
+		proc_luminance_lines[i].amod = (luminance_lines[i].a - luminance_lines[i].b * dot(luminance_lines[i].a, luminance_lines[i].b)) * inverse_color_scalefactors[i].xyz;
+		proc_luminance_lines[i].bs = luminance_lines[i].b * color_scalefactors[i].xyz;
+		proc_luminance_lines[i].bis = luminance_lines[i].b * inverse_color_scalefactors[i].xyz;
+
+	}
+
+	// per-partition result of compute_error_squared_rgb_single_partition for each candidate line
+
+	float uncorr_rgb_error[4];
+	float samechroma_rgb_error[4];
+	float rgb_luma_error[4];
+	float luminance_rgb_error[4];
+
+
+	for (i = 0; i < partition_count; i++)
+	{
+
+		uncorr_rgb_error[i] = compute_error_squared_rgb_single_partition(i, xdim, ydim, zdim, pi, pb, ewb, &(proc_uncorr_rgb_lines[i]));
+
+		samechroma_rgb_error[i] = compute_error_squared_rgb_single_partition(i, xdim, ydim, zdim, pi, pb, ewb, &(proc_samechroma_rgb_lines[i]));
+
+		rgb_luma_error[i] = compute_error_squared_rgb_single_partition(i, xdim, ydim, zdim, pi, pb, ewb, &(proc_rgb_luma_lines[i]));
+
+		luminance_rgb_error[i] = compute_error_squared_rgb_single_partition(i, xdim, ydim, zdim, pi, pb, ewb, &(proc_luminance_lines[i]));
+
+		#ifdef DEBUG_PRINT_DIAGNOSTICS
+			if (print_diagnostics)
+			{
+				printf("Partition %d : uncorr-error=%g  samechroma-error=%g  rgb-luma-error=%g  lum-error=%g\n",
+					i, uncorr_rgb_error[i], samechroma_rgb_error[i], rgb_luma_error[i], luminance_rgb_error[i]);
+			}
+		#endif
+	}
+
+	// compute the error that arises from just ditching alpha and RGB
+	float alpha_drop_error[4];
+	float rgb_drop_error[4];
+	for (i = 0; i < partition_count; i++)
+	{
+		alpha_drop_error[i] = 0;
+		rgb_drop_error[i] = 0;
+	}
+	for (i = 0; i < texels_per_block; i++)
+	{
+		int partition = pi->partition_of_texel[i];
+		float alpha = pb->work_data[4 * i + 3];
+		float default_alpha = pb->alpha_lns[i] ? (float)0x7800 : (float)0xFFFF;	// default alpha is 0x7800 when alpha_lns[i] is set, 0xFFFF otherwise
+
+		float omalpha = alpha - default_alpha;
+		alpha_drop_error[partition] += omalpha * omalpha * ewb->error_weights[i].w;	// weighted squared distance of alpha from the default
+		float red = pb->work_data[4 * i];
+		float green = pb->work_data[4 * i + 1];
+		float blue = pb->work_data[4 * i + 2];
+		rgb_drop_error[partition] += red * red * ewb->error_weights[i].x + green * green * ewb->error_weights[i].y + blue * blue * ewb->error_weights[i].z;	// weighted squared magnitude of RGB
+	}
+
+	// check if we are eligible for blue-contraction and offset-encoding
+
+	endpoints ep;
+	if (separate_component == -1)
+	{
+		endpoints_and_weights ei;
+		compute_endpoints_and_ideal_weights_1_plane(xdim, ydim, zdim, pi, pb, ewb, &ei);
+		ep = ei.ep;
+	}
+	else
+	{
+		endpoints_and_weights ei1, ei2;
+		compute_endpoints_and_ideal_weights_2_planes(xdim, ydim, zdim, pi, pb, ewb, separate_component, &ei1, &ei2);
+		// ep takes the separated component from ei2 and the remaining components from ei1
+		merge_endpoints(&(ei1.ep), &(ei2.ep), separate_component, &ep);
+	}
+
+	int eligible_for_offset_encode[4];
+	int eligible_for_blue_contraction[4];
+	// offset encoding: all three RGB endpoint differences must be smaller in magnitude than 12% of 65535
+	for (i = 0; i < partition_count; i++)
+	{
+		float4 endpt0 = ep.endpt0[i];
+		float4 endpt1 = ep.endpt1[i];
+		float4 endpt_dif = endpt1 - endpt0;
+		if (fabs(endpt_dif.x) < (0.12 * 65535.0f) && fabs(endpt_dif.y) < (0.12 * 65535.0f) && fabs(endpt_dif.z) < (0.12 * 65535.0f))
+			eligible_for_offset_encode[i] = 1;
+		else
+			eligible_for_offset_encode[i] = 0;
+		endpt0.x += (endpt0.x - endpt0.z);	// blue contraction: apply x += x - z and y += y - z to local copies of both endpoints;
+		endpt0.y += (endpt0.y - endpt0.z);	// all four results must lie strictly between 1% and 99% of 65535
+		endpt1.x += (endpt1.x - endpt1.z);
+		endpt1.y += (endpt1.y - endpt1.z);
+		if (endpt0.x > (0.01f * 65535.0f) && endpt0.x < (0.99f * 65535.0f)
+			&& endpt1.x > (0.01f * 65535.0f) && endpt1.x < (0.99f * 65535.0f)
+			&& endpt0.y > (0.01f * 65535.0f) && endpt0.y < (0.99f * 65535.0f) && endpt1.y > (0.01f * 65535.0f) && endpt1.y < (0.99f * 65535.0f))
+			eligible_for_blue_contraction[i] = 1;
+		else
+			eligible_for_blue_contraction[i] = 0;
+	}
+
+
+	// finally, gather up our results
+	for (i = 0; i < partition_count; i++)
+	{
+		eci[i].rgb_scale_error = (samechroma_rgb_error[i] - uncorr_rgb_error[i]) * 0.7f;	// empirical
+		eci[i].rgb_luma_error = (rgb_luma_error[i] - uncorr_rgb_error[i]) * 1.5f;	// wild guess
+		eci[i].luminance_error = (luminance_rgb_error[i] - uncorr_rgb_error[i]) * 3.0f;	// empirical
+		eci[i].alpha_drop_error = alpha_drop_error[i] * 3.0f;
+		eci[i].rgb_drop_error = rgb_drop_error[i] * 3.0f;
+		eci[i].can_offset_encode = eligible_for_offset_encode[i];
+		eci[i].can_blue_contract = eligible_for_blue_contraction[i];
+	}
+}
--- a/3rdparty/astc/astc_find_best_partitioning.cpp
+++ b/3rdparty/astc/astc_find_best_partitioning.cpp
@@ -0,0 +1,865 @@
+/*----------------------------------------------------------------------------*/
+/**
+ *	This confidential and proprietary software may be used only as
+ *	authorised by a licensing agreement from ARM Limited
+ *	(C) COPYRIGHT 2011-2012 ARM Limited
+ *	ALL RIGHTS RESERVED
+ *
+ *	The entire notice above must be reproduced on all authorised
+ *	copies and copies may only be made to the extent permitted
+ *	by a licensing agreement from ARM Limited.
+ *
+ *	@brief	ASTC encoding of texture
+ *
+ *			major step 1:
+ *			* find best partitioning assuming uncorrelated colors
+ *			* find best partitioning assuming RGBS color representation
+ *
+ *			finding best partitioning for a block:
+ *			* for each available partitioning:
+ *			* compute mean-color-value and dominant direction.
+ *			* this defines two lines, both of which go through the
+ *			  mean-color-value:
+ *			* one line has a direction defined by the dominant direction;
+ *			  this line is used to assess the error from using an uncorrelated
+ *			  color representation.
+ *			* the other line goes through (0,0,0,1) and is used to assess the
+ *			  error from using an RGBS color representation.
+ *			* we then compute, as a sum across the block, the squared-errors
+ *			  that result from using the dominant-direction-lines and the
+ *			  squared-errors that result from using the 0001-lines.
+ */
+/*----------------------------------------------------------------------------*/
+
+/*
+ *	Partition table representation:
+ *	We have 3 tables, each with 1024 partitionings
+ *	(these correspond to the 3x128 hardware partitionings crossed with all the
+ *	partition-transform modes in the hardware.)
+ *
+ *	For each partitioning, we have:
+ *	* a 4-entry table indicating how many texels there are in each of the 4
+ *	  partitions. this may be from 2 to about 60 or so.
+ *	* a 64-entry table indicating the partition index of each of the 64 texels
+ *	  in the block. each index may be 0, 1, 2 or 3.
+ *
+ * Each element in the table is a uint8_t indicating the partition index (0, 1, 2 or 3).
+ */
+
+#include <math.h>
+
+#include "astc_codec_internals.h"
+
+#ifdef DEBUG_PRINT_DIAGNOSTICS
+	#include <stdio.h>
+#endif
+
+#include "mathlib.h"
+
+// Returns 1 when the block's alpha_max and alpha_min differ, 0 otherwise; the dimensions are unused.
+int imageblock_uses_alpha(int xdim, int ydim, int zdim, const imageblock * pb)
+{
+	IGNORE(zdim);
+	IGNORE(ydim);
+	IGNORE(xdim);
+	return (pb->alpha_max == pb->alpha_min) ? 0 : 1;
+}
+
+// Per-partition minimum and maximum of the alpha channel of blk->work_data, taken over texels
+// whose texel_weight exceeds 1e-10. An empty or degenerate range (min >= max) becomes [0, 1e-10].
+static void compute_alpha_minmax(int xdim, int ydim, int zdim, const partition_info * pt, const imageblock * blk, const error_weight_block * ewb, float *alpha_min, float *alpha_max)
+{
+	const int texel_total = xdim * ydim * zdim;
+	const int parts = pt->partition_count;
+	int p, t;
+
+	for (p = 0; p < parts; p++)
+	{
+		alpha_max[p] = -1e38f;
+		alpha_min[p] = 1e38f;
+	}
+
+	for (t = 0; t < texel_total; t++)
+	{
+		// texels with (near-)zero weight do not contribute to the range
+		if (!(ewb->texel_weight[t] > 1e-10))
+			continue;
+		const int part = pt->partition_of_texel[t];
+		const float a = blk->work_data[4 * t + 3];
+		if (a < alpha_min[part])
+			alpha_min[part] = a;
+		if (a > alpha_max[part])
+			alpha_max[part] = a;
+	}
+
+	for (p = 0; p < parts; p++)
+	{
+		if (!(alpha_min[p] >= alpha_max[p]))
+			continue;
+		alpha_min[p] = 0;
+		alpha_max[p] = 1e-10f;
+	}
+}
+
+
+// Per-partition minimum and maximum of the red, green and blue channels of blk->work_data,
+// taken over the xdim * ydim * zdim texels whose texel_weight exceeds 1e-10f; 'pt' supplies
+// the partition of every texel. Each output array is indexed by partition.
+// A channel whose range ends up empty or degenerate (min >= max) is reported as [0, 1e-10].
+static void compute_rgb_minmax(int xdim,
+							   int ydim,
+							   int zdim,
+							   const partition_info * pt,
+							   const imageblock * blk, const error_weight_block * ewb, float *red_min, float *red_max, float *green_min, float *green_max, float *blue_min, float *blue_max)
+{
+	// channel-indexed views of the output arrays (0 = red, 1 = green, 2 = blue)
+	float *const lo[3] = { red_min, green_min, blue_min };
+	float *const hi[3] = { red_max, green_max, blue_max };
+
+	const int texel_total = xdim * ydim * zdim;
+	const int parts = pt->partition_count;
+	int p, t, c;
+
+	// start from an inverted range (min = 1e38, max = -1e38)
+	for (p = 0; p < parts; p++)
+	{
+		for (c = 0; c < 3; c++)
+		{
+			lo[c][p] = 1e38f;
+			hi[c][p] = -1e38f;
+		}
+	}
+
+	for (t = 0; t < texel_total; t++)
+	{
+		// texels with (near-)zero weight do not contribute to the range
+		if (!(ewb->texel_weight[t] > 1e-10f))
+			continue;
+
+		// widen the range of each channel of the owning partition
+		const int part = pt->partition_of_texel[t];
+		for (c = 0; c < 3; c++)
+		{
+			// work_data holds 4 floats per texel; element 4 * t + c is channel c of texel t
+			const float v = blk->work_data[4 * t + c];
+			if (v > hi[c][part])
+				hi[c][part] = v;
+			if (v < lo[c][part])
+				lo[c][part] = v;
+		}
+	}
+
+	// replace empty or zero-width ranges by a tiny positive-width one,
+	// channel by channel and independently for every partition
+	for (p = 0; p < parts; p++)
+	{
+		for (c = 0; c < 3; c++)
+		{
+			if (lo[c][p] >= hi[c][p])
+			{
+				lo[c][p] = 0.0f;
+				hi[c][p] = 1e-10f;
+			}
+		}
+	}
+}
+
+
+
+void compute_partition_error_color_weightings(int xdim, int ydim, int zdim, const error_weight_block * ewb, const partition_info * pi, float4 error_weightings[4], float4 color_scalefactors[4])
+{
+	int i;
+	int texels_per_block = xdim * ydim * zdim;
+	int pcnt = pi->partition_count;
+	for (i = 0; i < pcnt; i++)
+		error_weightings[i] = float4(1e-12f, 1e-12f, 1e-12f, 1e-12f);
+	for (i = 0; i < texels_per_block; i++)
+	{
+		int part = pi->partition_of_texel[i];
+		error_weightings[part] = error_weightings[part] + ewb->error_weights[i];
+	}
+	for (i = 0; i < pcnt; i++)
+	{
+		error_weightings[i] = error_weightings[i] * (1.0f / pi->texels_per_partition[i]);
+	}
+	for (i = 0; i < pcnt; i++)
+	{
+		color_scalefactors[i].x = sqrt(error_weightings[i].x);
+		color_scalefactors[i].y = sqrt(error_weightings[i].y);
+		color_scalefactors[i].z = sqrt(error_weightings[i].z);
+		color_scalefactors[i].w = sqrt(error_weightings[i].w);
+	}
+
+}
+
+
+/*
+   main function to identify the best partitioning for a given number of texels */
+
+
+void find_best_partitionings(int partition_search_limit, int xdim, int ydim, int zdim, int partition_count,
+							 const imageblock * pb, const error_weight_block * ewb, int candidates_to_return,
+							 // best partitionings to use if the endpoint colors are assumed to be uncorrelated
+							 int *best_partitions_uncorrellated,
+							 // best partitionings to use if the endpoint colors have the same chroma
+							 int *best_partitions_samechroma,
+							 // best partitionings to use if using dual plane of weights
+							 int *best_partitions_dual_weight_planes)
+{
+
+
+	int i, j;
+
+	int texels_per_block = xdim * ydim * zdim;
+
+	// constant used to estimate quantization error for a given partitioning;
+	// the optimal value for this constant depends on bitrate.
+	// These constants have been determined empirically.
+
+	float weight_imprecision_estim = 100;
+
+	if (texels_per_block <= 20)
+		weight_imprecision_estim = 0.03f;
+	else if (texels_per_block <= 31)
+		weight_imprecision_estim = 0.04f;
+	else if (texels_per_block <= 41)
+		weight_imprecision_estim = 0.05f;
+	else
+		weight_imprecision_estim = 0.055f;
+
+
+	int partition_sequence[PARTITION_COUNT];
+
+	kmeans_compute_partition_ordering(xdim, ydim, zdim, partition_count, pb, partition_sequence);
+
+
+	float weight_imprecision_estim_squared = weight_imprecision_estim * weight_imprecision_estim;
+
+#ifdef DEBUG_PRINT_DIAGNOSTICS
+	if (print_diagnostics)
+		printf("weight_imprecision_estim = %g\n", weight_imprecision_estim);
+#endif
+
+	int uses_alpha = imageblock_uses_alpha(xdim, ydim, zdim, pb);
+
+	const partition_info *ptab = get_partition_table(xdim, ydim, zdim, partition_count);
+
+	// partitioning errors assuming uncorrelated-chrominance endpoints
+	float uncorr_errors[PARTITION_COUNT];
+	// partitioning errors assuming same-chrominance endpoints
+	float samechroma_errors[PARTITION_COUNT];
+
+	// partitioning errors assuming that one of the color channels
+	// is uncorrelated from all the other ones
+	float separate_errors[4 * PARTITION_COUNT];
+
+
+	float *separate_red_errors = separate_errors;
+	float *separate_green_errors = separate_errors + PARTITION_COUNT;
+	float *separate_blue_errors = separate_errors + 2 * PARTITION_COUNT;
+	float *separate_alpha_errors = separate_errors + 3 * PARTITION_COUNT;
+
+	int defacto_search_limit = PARTITION_COUNT - 1;
+
+	if (uses_alpha)
+	{
+
+		#ifdef DEBUG_PRINT_DIAGNOSTICS
+			if (print_diagnostics)
+				printf("Partition testing with alpha, %d partitions\n\n", partition_count);
+		#endif
+
+		for (i = 0; i < PARTITION_COUNT; i++)
+		{
+			int partition = partition_sequence[i];
+			int bk_partition_count = ptab[partition].partition_count;
+
+			if (bk_partition_count < partition_count)
+			{
+				#ifdef DEBUG_PRINT_DIAGNOSTICS
+					if (print_diagnostics)
+						printf("Partitioning %d-%d: invalid\n", partition_count, partition);
+				#endif
+
+				uncorr_errors[i] = 1e35f;
+				samechroma_errors[i] = 1e35f;
+				separate_red_errors[i] = 1e35f;
+				separate_green_errors[i] = 1e35f;
+				separate_blue_errors[i] = 1e35f;
+				separate_alpha_errors[i] = 1e35f;
+				continue;
+			}
+			// the sentinel value for partitions above the search limit must be smaller
+			// than the sentinel value for invalid partitions
+			if (i >= partition_search_limit)
+			{
+				#ifdef DEBUG_PRINT_DIAGNOSTICS
+					if (print_diagnostics)
+						printf("Partitioning %d-%d: excluded from testing\n", partition_count, partition);
+				#endif
+
+				defacto_search_limit = i;
+
+				uncorr_errors[i] = 1e34f;
+				samechroma_errors[i] = 1e34f;
+				separate_red_errors[i] = 1e34f;
+				separate_green_errors[i] = 1e34f;
+				separate_blue_errors[i] = 1e34f;
+				separate_alpha_errors[i] = 1e34f;
+				break;
+			}
+
+			// compute the weighting to give to each color channel
+			// in each partition.
+			float4 error_weightings[4];
+			float4 color_scalefactors[4];
+			float4 inverse_color_scalefactors[4];
+			compute_partition_error_color_weightings(xdim, ydim, zdim, ewb, ptab + partition, error_weightings, color_scalefactors);
+
+			for (j = 0; j < partition_count; j++)
+			{
+				inverse_color_scalefactors[j].x = 1.0f / MAX(color_scalefactors[j].x, 1e-7f);
+				inverse_color_scalefactors[j].y = 1.0f / MAX(color_scalefactors[j].y, 1e-7f);
+				inverse_color_scalefactors[j].z = 1.0f / MAX(color_scalefactors[j].z, 1e-7f);
+				inverse_color_scalefactors[j].w = 1.0f / MAX(color_scalefactors[j].w, 1e-7f);
+			}
+
+			float4 averages[4];
+			float4 directions_rgba[4];
+			float3 directions_gba[4];
+			float3 directions_rba[4];
+			float3 directions_rga[4];
+			float3 directions_rgb[4];
+
+			compute_averages_and_directions_rgba(ptab + partition, pb, ewb, color_scalefactors, averages, directions_rgba, directions_gba, directions_rba, directions_rga, directions_rgb);
+
+			line4 uncorr_lines[4];
+			line4 samechroma_lines[4];
+			line3 separate_red_lines[4];
+			line3 separate_green_lines[4];
+			line3 separate_blue_lines[4];
+			line3 separate_alpha_lines[4];
+
+			processed_line4 proc_uncorr_lines[4];
+			processed_line4 proc_samechroma_lines[4];
+			processed_line3 proc_separate_red_lines[4];
+			processed_line3 proc_separate_green_lines[4];
+			processed_line3 proc_separate_blue_lines[4];
+			processed_line3 proc_separate_alpha_lines[4];
+
+			float uncorr_linelengths[4];
+			float samechroma_linelengths[4];
+			float separate_red_linelengths[4];
+			float separate_green_linelengths[4];
+			float separate_blue_linelengths[4];
+			float separate_alpha_linelengths[4];
+
+
+
+			for (j = 0; j < partition_count; j++)
+			{
+				uncorr_lines[j].a = averages[j];
+				if (dot(directions_rgba[j], directions_rgba[j]) == 0.0f)
+					uncorr_lines[j].b = normalize(float4(1, 1, 1, 1));
+				else
+					uncorr_lines[j].b = normalize(directions_rgba[j]);
+
+				proc_uncorr_lines[j].amod = (uncorr_lines[j].a - uncorr_lines[j].b * dot(uncorr_lines[j].a, uncorr_lines[j].b)) * inverse_color_scalefactors[j];
+				proc_uncorr_lines[j].bs = (uncorr_lines[j].b * color_scalefactors[j]);
+				proc_uncorr_lines[j].bis = (uncorr_lines[j].b * inverse_color_scalefactors[j]);
+
+
+				samechroma_lines[j].a = float4(0, 0, 0, 0);
+				if (dot(averages[j], averages[j]) == 0)
+					samechroma_lines[j].b = normalize(float4(1, 1, 1, 1));
+				else
+					samechroma_lines[j].b = normalize(averages[j]);
+
+				proc_samechroma_lines[j].amod = (samechroma_lines[j].a - samechroma_lines[j].b * dot(samechroma_lines[j].a, samechroma_lines[j].b)) * inverse_color_scalefactors[j];
+				proc_samechroma_lines[j].bs = (samechroma_lines[j].b * color_scalefactors[j]);
+				proc_samechroma_lines[j].bis = (samechroma_lines[j].b * inverse_color_scalefactors[j]);
+
+				separate_red_lines[j].a = averages[j].yzw;
+				if (dot(directions_gba[j], directions_gba[j]) == 0.0f)
+					separate_red_lines[j].b = normalize(float3(1, 1, 1));
+				else
+					separate_red_lines[j].b = normalize(directions_gba[j]);
+
+				separate_green_lines[j].a = averages[j].xzw;
+				if (dot(directions_rba[j], directions_rba[j]) == 0.0f)
+					separate_green_lines[j].b = normalize(float3(1, 1, 1));
+				else
+					separate_green_lines[j].b = normalize(directions_rba[j]);
+
+				separate_blue_lines[j].a = averages[j].xyw;
+				if (dot(directions_rga[j], directions_rga[j]) == 0.0f)
+					separate_blue_lines[j].b = normalize(float3(1, 1, 1));
+				else
+					separate_blue_lines[j].b = normalize(directions_rga[j]);
+
+				separate_alpha_lines[j].a = averages[j].xyz;
+				if (dot(directions_rgb[j], directions_rgb[j]) == 0.0f)
+					separate_alpha_lines[j].b = normalize(float3(1, 1, 1));
+				else
+					separate_alpha_lines[j].b = normalize(directions_rgb[j]);
+
+				proc_separate_red_lines[j].amod = (separate_red_lines[j].a - separate_red_lines[j].b * dot(separate_red_lines[j].a, separate_red_lines[j].b)) * inverse_color_scalefactors[j].yzw;
+				proc_separate_red_lines[j].bs = (separate_red_lines[j].b * color_scalefactors[j].yzw);
+				proc_separate_red_lines[j].bis = (separate_red_lines[j].b * inverse_color_scalefactors[j].yzw);
+
+				proc_separate_green_lines[j].amod =
+					(separate_green_lines[j].a - separate_green_lines[j].b * dot(separate_green_lines[j].a, separate_green_lines[j].b)) * inverse_color_scalefactors[j].xzw;
+				proc_separate_green_lines[j].bs = (separate_green_lines[j].b * color_scalefactors[j].xzw);
+				proc_separate_green_lines[j].bis = (separate_green_lines[j].b * inverse_color_scalefactors[j].xzw);
+
+				proc_separate_blue_lines[j].amod = (separate_blue_lines[j].a - separate_blue_lines[j].b * dot(separate_blue_lines[j].a, separate_blue_lines[j].b)) * inverse_color_scalefactors[j].xyw;
+				proc_separate_blue_lines[j].bs = (separate_blue_lines[j].b * color_scalefactors[j].xyw);
+				proc_separate_blue_lines[j].bis = (separate_blue_lines[j].b * inverse_color_scalefactors[j].xyw);
+
+				proc_separate_alpha_lines[j].amod =
+					(separate_alpha_lines[j].a - separate_alpha_lines[j].b * dot(separate_alpha_lines[j].a, separate_alpha_lines[j].b)) * inverse_color_scalefactors[j].xyz;
+				proc_separate_alpha_lines[j].bs = (separate_alpha_lines[j].b * color_scalefactors[j].xyz);
+				proc_separate_alpha_lines[j].bis = (separate_alpha_lines[j].b * inverse_color_scalefactors[j].xyz);
+
+			}
+
+			float uncorr_error = compute_error_squared_rgba(ptab + partition,
+															pb,
+															ewb,
+															proc_uncorr_lines,
+															uncorr_linelengths);
+			float samechroma_error = compute_error_squared_rgba(ptab + partition,
+																pb,
+																ewb,
+																proc_samechroma_lines,
+																samechroma_linelengths);
+
+
+			float separate_red_error = compute_error_squared_gba(ptab + partition,
+																 pb,
+																 ewb,
+																 proc_separate_red_lines,
+																 separate_red_linelengths);
+
+			float separate_green_error = compute_error_squared_rba(ptab + partition,
+																   pb,
+																   ewb,
+																   proc_separate_green_lines,
+																   separate_green_linelengths);
+
+			float separate_blue_error = compute_error_squared_rga(ptab + partition,
+																  pb,
+																  ewb,
+																  proc_separate_blue_lines,
+																  separate_blue_linelengths);
+
+			float separate_alpha_error = compute_error_squared_rgb(ptab + partition,
+																   pb,
+																   ewb,
+																   proc_separate_alpha_lines,
+																   separate_alpha_linelengths);
+
+			// compute minimum & maximum alpha values in each partition
+			float red_min[4], red_max[4];
+			float green_min[4], green_max[4];
+			float blue_min[4], blue_max[4];
+			float alpha_min[4], alpha_max[4];
+			compute_alpha_minmax(xdim, ydim, zdim, ptab + partition, pb, ewb, alpha_min, alpha_max);
+
+			compute_rgb_minmax(xdim, ydim, zdim, ptab + partition, pb, ewb, red_min, red_max, green_min, green_max, blue_min, blue_max);
+
+			/*
+			   Compute an estimate of error introduced by weight quantization imprecision.
+			   This error is computed as follows, for each partition
+			   1: compute the principal-axis vector (full length) in error-space
+			   2: convert the principal-axis vector to regular RGB-space
+			   3: scale the vector by a constant that estimates average quantization error
+			   4: for each texel, square the vector, then do a dot-product with the texel's error weight;
+			      sum up the results across all texels.
+			   4(optimized): square the vector once, then do a dot-product with the average texel error,
+			      then multiply by the number of texels.
+			 */
+
+			for (j = 0; j < partition_count; j++)
+			{
+				float tpp = (float)(ptab[partition].texels_per_partition[j]);
+
+				float4 ics = inverse_color_scalefactors[j];
+				float4 error_weights = error_weightings[j] * (tpp * weight_imprecision_estim_squared);
+
+				float4 uncorr_vector = (uncorr_lines[j].b * uncorr_linelengths[j]) * ics;
+				float4 samechroma_vector = (samechroma_lines[j].b * samechroma_linelengths[j]) * ics;
+				float3 separate_red_vector = (separate_red_lines[j].b * separate_red_linelengths[j]) * ics.yzw;
+				float3 separate_green_vector = (separate_green_lines[j].b * separate_green_linelengths[j]) * ics.xzw;
+				float3 separate_blue_vector = (separate_blue_lines[j].b * separate_blue_linelengths[j]) * ics.xyw;
+				float3 separate_alpha_vector = (separate_alpha_lines[j].b * separate_alpha_linelengths[j]) * ics.xyz;
+
+				uncorr_vector = uncorr_vector * uncorr_vector;
+				samechroma_vector = samechroma_vector * samechroma_vector;
+				separate_red_vector = separate_red_vector * separate_red_vector;
+				separate_green_vector = separate_green_vector * separate_green_vector;
+				separate_blue_vector = separate_blue_vector * separate_blue_vector;
+				separate_alpha_vector = separate_alpha_vector * separate_alpha_vector;
+
+				uncorr_error += dot(uncorr_vector, error_weights);
+				samechroma_error += dot(samechroma_vector, error_weights);
+				separate_red_error += dot(separate_red_vector, error_weights.yzw);
+				separate_green_error += dot(separate_green_vector, error_weights.xzw);
+				separate_blue_error += dot(separate_blue_vector, error_weights.xyw);
+				separate_alpha_error += dot(separate_alpha_vector, error_weights.xyz);
+
+				float red_scalar = (red_max[j] - red_min[j]);
+				float green_scalar = (green_max[j] - green_min[j]);
+				float blue_scalar = (blue_max[j] - blue_min[j]);
+				float alpha_scalar = (alpha_max[j] - alpha_min[j]);
+				red_scalar *= red_scalar;
+				green_scalar *= green_scalar;
+				blue_scalar *= blue_scalar;
+				alpha_scalar *= alpha_scalar;
+				separate_red_error += red_scalar * error_weights.x;
+				separate_green_error += green_scalar * error_weights.y;
+				separate_blue_error += blue_scalar * error_weights.z;
+				separate_alpha_error += alpha_scalar * error_weights.w;
+			}
+
+			uncorr_errors[i] = uncorr_error;
+			samechroma_errors[i] = samechroma_error;
+			separate_red_errors[i] = separate_red_error;
+			separate_green_errors[i] = separate_green_error;
+			separate_blue_errors[i] = separate_blue_error;
+			separate_alpha_errors[i] = separate_alpha_error;
+
+			#ifdef DEBUG_PRINT_DIAGNOSTICS
+				if (print_diagnostics)
+					printf("Partitioning %d-%d errors: uncorr=%g, samechroma=%g, sep-alpha=%g\n", partition_count, i, uncorr_error, samechroma_error, separate_alpha_error);
+			#endif
+		}
+	}
+	else
+	{
+
+		#ifdef DEBUG_PRINT_DIAGNOSTICS
+			if (print_diagnostics)
+				printf("Partition testing without alpha, %d partitions\n", partition_count);
+		#endif
+
+
+
+		for (i = 0; i < PARTITION_COUNT; i++)
+		{
+
+			int partition = partition_sequence[i];
+
+			int bk_partition_count = ptab[partition].partition_count;
+			if (bk_partition_count < partition_count)
+			{
+				#ifdef DEBUG_PRINT_DIAGNOSTICS
+					if (print_diagnostics)
+						printf("Partitioning %d-%d: invalid\n", partition_count, i);
+				#endif
+
+				uncorr_errors[i] = 1e35f;
+				samechroma_errors[i] = 1e35f;
+				separate_red_errors[i] = 1e35f;
+				separate_green_errors[i] = 1e35f;
+				separate_blue_errors[i] = 1e35f;
+				continue;
+			}
+			// the sentinel value for valid partitions above the search limit must be smaller
+			// than the sentinel value for invalid partitions
+			if (i >= partition_search_limit)
+			{
+				#ifdef DEBUG_PRINT_DIAGNOSTICS
+					if (print_diagnostics)
+						printf(" Partitioning %d-%d: excluded from testing\n", partition_count, partition);
+				#endif
+
+				defacto_search_limit = i;
+				uncorr_errors[i] = 1e34f;
+				samechroma_errors[i] = 1e34f;
+				separate_red_errors[i] = 1e34f;
+				separate_green_errors[i] = 1e34f;
+				separate_blue_errors[i] = 1e34f;
+				break;
+
+			}
+
+			// compute the weighting to give to each color channel
+			// in each partition.
+			float4 error_weightings[4];
+			float4 color_scalefactors[4];
+			float4 inverse_color_scalefactors[4];
+
+			compute_partition_error_color_weightings(xdim, ydim, zdim, ewb, ptab + partition, error_weightings, color_scalefactors);
+
+			for (j = 0; j < partition_count; j++)
+			{
+				inverse_color_scalefactors[j].x = 1.0f / MAX(color_scalefactors[j].x, 1e-7f);
+				inverse_color_scalefactors[j].y = 1.0f / MAX(color_scalefactors[j].y, 1e-7f);
+				inverse_color_scalefactors[j].z = 1.0f / MAX(color_scalefactors[j].z, 1e-7f);
+				inverse_color_scalefactors[j].w = 1.0f / MAX(color_scalefactors[j].w, 1e-7f);
+			}
+
+			float3 averages[4];
+			float3 directions_rgb[4];
+			float2 directions_rg[4];
+			float2 directions_rb[4];
+			float2 directions_gb[4];
+
+			compute_averages_and_directions_rgb(ptab + partition, pb, ewb, color_scalefactors, averages, directions_rgb, directions_rg, directions_rb, directions_gb);
+
+			line3 uncorr_lines[4];
+			line3 samechroma_lines[4];
+			line2 separate_red_lines[4];
+			line2 separate_green_lines[4];
+			line2 separate_blue_lines[4];
+
+			processed_line3 proc_uncorr_lines[4];
+			processed_line3 proc_samechroma_lines[4];
+
+			processed_line2 proc_separate_red_lines[4];
+			processed_line2 proc_separate_green_lines[4];
+			processed_line2 proc_separate_blue_lines[4];
+
+			float uncorr_linelengths[4];
+			float samechroma_linelengths[4];
+			float separate_red_linelengths[4];
+			float separate_green_linelengths[4];
+			float separate_blue_linelengths[4];
+
+			for (j = 0; j < partition_count; j++)
+			{
+				uncorr_lines[j].a = averages[j];
+				if (dot(directions_rgb[j], directions_rgb[j]) == 0.0f)
+					uncorr_lines[j].b = normalize(float3(1, 1, 1));
+				else
+					uncorr_lines[j].b = normalize(directions_rgb[j]);
+
+
+				samechroma_lines[j].a = float3(0, 0, 0);
+
+				if (dot(averages[j], averages[j]) == 0.0f)
+					samechroma_lines[j].b = normalize(float3(1, 1, 1));
+				else
+					samechroma_lines[j].b = normalize(averages[j]);
+
+				proc_uncorr_lines[j].amod = (uncorr_lines[j].a - uncorr_lines[j].b * dot(uncorr_lines[j].a, uncorr_lines[j].b)) * inverse_color_scalefactors[j].xyz;
+				proc_uncorr_lines[j].bs = (uncorr_lines[j].b * color_scalefactors[j].xyz);
+				proc_uncorr_lines[j].bis = (uncorr_lines[j].b * inverse_color_scalefactors[j].xyz);
+
+				proc_samechroma_lines[j].amod = (samechroma_lines[j].a - samechroma_lines[j].b * dot(samechroma_lines[j].a, samechroma_lines[j].b)) * inverse_color_scalefactors[j].xyz;
+				proc_samechroma_lines[j].bs = (samechroma_lines[j].b * color_scalefactors[j].xyz);
+				proc_samechroma_lines[j].bis = (samechroma_lines[j].b * inverse_color_scalefactors[j].xyz);
+
+				separate_red_lines[j].a = averages[j].yz;
+				if (dot(directions_gb[j], directions_gb[j]) == 0.0f)
+					separate_red_lines[j].b = normalize(float2(1, 1));
+				else
+					separate_red_lines[j].b = normalize(directions_gb[j]);
+
+				separate_green_lines[j].a = averages[j].xz;
+				if (dot(directions_rb[j], directions_rb[j]) == 0.0f)
+					separate_green_lines[j].b = normalize(float2(1, 1));
+				else
+					separate_green_lines[j].b = normalize(directions_rb[j]);
+
+				separate_blue_lines[j].a = averages[j].xy;
+				if (dot(directions_rg[j], directions_rg[j]) == 0.0f)
+					separate_blue_lines[j].b = normalize(float2(1, 1));
+				else
+					separate_blue_lines[j].b = normalize(directions_rg[j]);
+
+				proc_separate_red_lines[j].amod = (separate_red_lines[j].a - separate_red_lines[j].b * dot(separate_red_lines[j].a, separate_red_lines[j].b)) * inverse_color_scalefactors[j].yz;
+				proc_separate_red_lines[j].bs = (separate_red_lines[j].b * color_scalefactors[j].yz);
+				proc_separate_red_lines[j].bis = (separate_red_lines[j].b * inverse_color_scalefactors[j].yz);
+
+				proc_separate_green_lines[j].amod =
+					(separate_green_lines[j].a - separate_green_lines[j].b * dot(separate_green_lines[j].a, separate_green_lines[j].b)) * inverse_color_scalefactors[j].xz;
+				proc_separate_green_lines[j].bs = (separate_green_lines[j].b * color_scalefactors[j].xz);
+				proc_separate_green_lines[j].bis = (separate_green_lines[j].b * inverse_color_scalefactors[j].xz);
+
+				proc_separate_blue_lines[j].amod = (separate_blue_lines[j].a - separate_blue_lines[j].b * dot(separate_blue_lines[j].a, separate_blue_lines[j].b)) * inverse_color_scalefactors[j].xy;
+				proc_separate_blue_lines[j].bs = (separate_blue_lines[j].b * color_scalefactors[j].xy);
+				proc_separate_blue_lines[j].bis = (separate_blue_lines[j].b * inverse_color_scalefactors[j].xy);
+
+			}
+
+			float uncorr_error = compute_error_squared_rgb(ptab + partition,
+														   pb,
+														   ewb,
+														   proc_uncorr_lines,
+														   uncorr_linelengths);
+			float samechroma_error = compute_error_squared_rgb(ptab + partition,
+															   pb,
+															   ewb,
+															   proc_samechroma_lines,
+															   samechroma_linelengths);
+
+			float separate_red_error = compute_error_squared_gb(ptab + partition,
+																pb,
+																ewb,
+																proc_separate_red_lines,
+																separate_red_linelengths);
+
+			float separate_green_error = compute_error_squared_rb(ptab + partition,
+																  pb,
+																  ewb,
+																  proc_separate_green_lines,
+																  separate_green_linelengths);
+
+			float separate_blue_error = compute_error_squared_rg(ptab + partition,
+																 pb,
+																 ewb,
+																 proc_separate_blue_lines,
+																 separate_blue_linelengths);
+
+			float red_min[4], red_max[4];
+			float green_min[4], green_max[4];
+			float blue_min[4], blue_max[4];
+
+
+			compute_rgb_minmax(xdim, ydim, zdim, ptab + partition, pb, ewb, red_min, red_max, green_min, green_max, blue_min, blue_max);
+
+
+
+			/*
+			   compute an estimate of error introduced by weight imprecision.
+			   This error is computed as follows, for each partition
+			   1: compute the principal-axis vector (full length) in error-space
+			   2: convert the principal-axis vector to regular RGB-space
+			   3: scale the vector by a constant that estimates average quantization error.
+			   4: for each texel, square the vector, then do a dot-product with the texel's error weight;
+			      sum up the results across all texels.
+			   4(optimized): square the vector once, then do a dot-product with the average texel error,
+			     then multiply by the number of texels.
+			 */
+
+
+			for (j = 0; j < partition_count; j++)
+			{
+				float tpp = (float)(ptab[partition].texels_per_partition[j]);
+
+				float3 ics = inverse_color_scalefactors[j].xyz;
+				float3 error_weights = error_weightings[j].xyz * (tpp * weight_imprecision_estim_squared);
+
+				float3 uncorr_vector = (uncorr_lines[j].b * uncorr_linelengths[j]) * ics;
+				float3 samechroma_vector = (samechroma_lines[j].b * samechroma_linelengths[j]) * ics;
+
+				float2 separate_red_vector = (separate_red_lines[j].b * separate_red_linelengths[j]) * ics.yz;
+				float2 separate_green_vector = (separate_green_lines[j].b * separate_green_linelengths[j]) * ics.xz;
+				float2 separate_blue_vector = (separate_blue_lines[j].b * separate_blue_linelengths[j]) * ics.xy;
+
+				uncorr_vector = uncorr_vector * uncorr_vector;
+				samechroma_vector = samechroma_vector * samechroma_vector;
+				separate_red_vector = separate_red_vector * separate_red_vector;
+				separate_green_vector = separate_green_vector * separate_green_vector;
+				separate_blue_vector = separate_blue_vector * separate_blue_vector;
+
+				uncorr_error += dot(uncorr_vector, error_weights);
+				samechroma_error += dot(samechroma_vector, error_weights);
+				separate_red_error += dot(separate_red_vector, error_weights.yz);
+				separate_green_error += dot(separate_green_vector, error_weights.xz);
+				separate_blue_error += dot(separate_blue_vector, error_weights.xy);
+
+				float red_scalar = (red_max[j] - red_min[j]);
+				float green_scalar = (green_max[j] - green_min[j]);
+				float blue_scalar = (blue_max[j] - blue_min[j]);
+
+				red_scalar *= red_scalar;
+				green_scalar *= green_scalar;
+				blue_scalar *= blue_scalar;
+
+				separate_red_error += red_scalar * error_weights.x;
+				separate_green_error += green_scalar * error_weights.y;
+				separate_blue_error += blue_scalar * error_weights.z;
+			}
+
+
+			uncorr_errors[i] = uncorr_error;
+			samechroma_errors[i] = samechroma_error;
+
+			separate_red_errors[i] = separate_red_error;
+			separate_green_errors[i] = separate_green_error;
+			separate_blue_errors[i] = separate_blue_error;
+
+			#ifdef DEBUG_PRINT_DIAGNOSTICS
+				if (print_diagnostics)
+					printf("Partitioning %d-%d errors: uncorr=%f, samechroma=%f, sep-red=%f, sep-green=%f, sep-blue=%f\n",
+						   partition_count, partition, uncorr_error, samechroma_error, separate_red_error, separate_green_error, separate_blue_error);
+			#endif
+		}
+	}
+
+
+	for (i = 0; i < candidates_to_return; i++)
+	{
+		int best_uncorr_partition = 0;
+		int best_samechroma_partition = 0;
+		float best_uncorr_error = 1e30f;
+		float best_samechroma_error = 1e30f;
+		for (j = 0; j <= defacto_search_limit; j++)
+		{
+			if (uncorr_errors[j] < best_uncorr_error)
+			{
+				best_uncorr_partition = j;
+				best_uncorr_error = uncorr_errors[j];
+			}
+		}
+		best_partitions_uncorrellated[i] = partition_sequence[best_uncorr_partition];
+		uncorr_errors[best_uncorr_partition] = 1e30f;
+		samechroma_errors[best_uncorr_partition] = 1e30f;
+
+		for (j = 0; j <= defacto_search_limit; j++)
+		{
+			if (samechroma_errors[j] < best_samechroma_error)
+			{
+				best_samechroma_partition = j;
+				best_samechroma_error = samechroma_errors[j];
+			}
+		}
+		best_partitions_samechroma[i] = partition_sequence[best_samechroma_partition];
+		samechroma_errors[best_samechroma_partition] = 1e30f;
+		uncorr_errors[best_samechroma_partition] = 1e30f;
+	}
+
+	for (i = 0; i < 2 * candidates_to_return; i++)
+	{
+		int best_partition = 0;
+		float best_partition_error = 1e30f;
+
+		for (j = 0; j <= defacto_search_limit; j++)
+		{
+			if (1 || !uses_alpha)
+			{
+				if (separate_errors[j] < best_partition_error)
+				{
+					best_partition = j;
+					best_partition_error = separate_errors[j];
+				}
+				if (separate_errors[j + PARTITION_COUNT] < best_partition_error)
+				{
+					best_partition = j + PARTITION_COUNT;
+					best_partition_error = separate_errors[j + PARTITION_COUNT];
+				}
+				if (separate_errors[j + 2 * PARTITION_COUNT] < best_partition_error)
+				{
+					best_partition = j + 2 * PARTITION_COUNT;
+					best_partition_error = separate_errors[j + 2 * PARTITION_COUNT];
+				}
+			}
+			if (uses_alpha)
+			{
+				if (separate_errors[j + 3 * PARTITION_COUNT] < best_partition_error)
+				{
+					best_partition = j + 3 * PARTITION_COUNT;
+					best_partition_error = separate_errors[j + 3 * PARTITION_COUNT];
+				}
+			}
+		}
+
+		separate_errors[best_partition] = 1e30f;
+		best_partition = ((best_partition >> PARTITION_BITS) << PARTITION_BITS) | partition_sequence[best_partition & (PARTITION_COUNT - 1)];
+		best_partitions_dual_weight_planes[i] = best_partition;
+	}
+
+}
--- a/3rdparty/astc/astc_ideal_endpoints_and_weights.cpp
+++ b/3rdparty/astc/astc_ideal_endpoints_and_weights.cpp
--- a/3rdparty/astc/astc_imageblock.cpp
+++ b/3rdparty/astc/astc_imageblock.cpp
@@ -0,0 +1,324 @@
+/*----------------------------------------------------------------------------*/
+/**
+ *	This confidential and proprietary software may be used only as
+ *	authorised by a licensing agreement from ARM Limited
+ *	(C) COPYRIGHT 2011-2012 ARM Limited
+ *	ALL RIGHTS RESERVED
+ *
+ *	The entire notice above must be reproduced on all authorised
+ *	copies and copies may only be made to the extent permitted
+ *	by a licensing agreement from ARM Limited.
+ *
+ *	@brief	Functions for managing ASTC codec images.
+ */
+/*----------------------------------------------------------------------------*/
+
+#include <math.h>
+
+#include "astc_codec_internals.h"
+
+#include "softfloat.h"
+#include <stdint.h>
+#include <stdio.h>
+
+// conversion functions between the LNS representation and the FP16 representation.
+
+// Convert a linear float to its LNS representation, returned as a float.
+// NaN and inputs at or below 2^-26 give 0; magnitudes of 65536 or more give 65535.
+float float_to_lns(float p)
+{
+	// underflow or NaN: the cut-off 1/67108864 is 2^-26.
+	const bool underflow_or_nan = astc_isnan(p) || p <= 1.0f / 67108864.0f;
+	if (underflow_or_nan)
+		return 0;
+
+	// overflow: saturate at the largest code.
+	if (fabs(p) >= 65536.0f)
+		return 65535;
+
+	int exponent;
+	float fraction = frexp(p, &exponent);
+	float mantissa;
+	if (exponent >= -13)
+	{
+		// regular case: rebias the exponent and stretch the fraction,
+		// which frexp() returns in [0.5, 1), by a factor of 4096.
+		exponent += 14;
+		mantissa = (fraction - 0.5f) * 4096.0f;
+	}
+	else
+	{
+		// input number is smaller than 2^-14: multiply by 2^25 and use exponent 0.
+		mantissa = p * 33554432.0f;
+		exponent = 0;
+	}
+
+	// three-segment piecewise-linear remapping of the mantissa.
+	if (mantissa < 384.0f)
+		mantissa *= 4.0f / 3.0f;
+	else if (mantissa <= 1408.0f)
+		mantissa += 128.0f;
+	else
+		mantissa = (mantissa + 512.0f) * (4.0f / 5.0f);
+
+	// every exponent step is worth 2048 codes; the result carries an offset of 1.
+	mantissa += exponent * 2048.0f;
+	return mantissa + 1.0f;
+}
+
+
+
+// Expand a 16-bit LNS code (5-bit exponent, 11-bit mantissa) into FP16 bits.
+uint16_t lns_to_sf16(uint16_t p)
+{
+	const uint16_t mantissa = p & 0x7FF;	// low 11 bits
+	const uint16_t exponent = p >> 11;	// high 5 bits
+
+	// piecewise-linear expansion of the mantissa, one slope per segment.
+	uint16_t expanded;
+	if (mantissa >= 1536)
+		expanded = 5 * mantissa - 2048;
+	else if (mantissa >= 512)
+		expanded = 4 * mantissa - 512;
+	else
+		expanded = 3 * mantissa;
+
+	// drop the low 3 expanded bits, put the exponent above them, cap at 0x7BFF.
+	const uint16_t packed = (exponent << 10) | (expanded >> 3);
+	return packed >= 0x7BFF ? 0x7BFF : packed;
+}
+
+
+// Convert a 16-bit UNORM value to FP16 bits.
+// 0xFFFF maps to exactly 1.0. Values below 4 are emitted as (p << 8) with a
+// zero exponent field, i.e. as FP16 denormals; such very small inputs cannot
+// come out of LDR interpolation but can arrive through the constant-block.
+uint16_t unorm16_to_sf16(uint16_t p)
+{
+	if (p == 0xFFFF)
+		return 0x3C00;			// value of 1.0 .
+	if (p < 4)
+		return p << 8;
+
+	// shift that moves the leading 1 of p just past bit 15
+	const int shift = clz32(p) - 15;
+
+	// the truncation to 16 bits discards the leading 1; the top 10 remaining
+	// bits become the mantissa field.
+	uint16_t mantissa = p << shift;
+	mantissa >>= 6;
+
+	// the exponent field shrinks by one for every extra leading zero.
+	return mantissa | ((15 - shift) << 10);
+}
+
+
+
+
+
+// Estimate the slope of float_to_lns() at v with a forward difference over a
+// 5% step. The input is floored at 6e-5 first, and the result is clamped.
+static float lns_derivative_at(float v)
+{
+	float x = MAX(v, 6e-5f);
+	float deriv = (float_to_lns(x * 1.05f) - float_to_lns(x)) / (x * 0.05f);
+
+	// the derivative may not actually take values smaller than 1/32 or larger than 2^25;
+	// if it does, we clamp it.
+	if (deriv < (1.0f / 32.0f))
+		deriv = (1.0f / 32.0f);
+	else if (deriv > 33554432.0f)
+		deriv = 33554432.0f;
+
+	return deriv;
+}
+
+// Fill pb->deriv_data for the first `pixelcount` texels (4 floats per texel).
+//
+// For a texel whose rgb_lns flag is set, the three colour entries receive the
+// clamped derivative of float_to_lns() at the matching orig_data value;
+// otherwise they are set to 65535. Alpha is handled the same way under its
+// own alpha_lns flag.
+//
+// Despite the function name, only orig_data is read; work_data is not used.
+void imageblock_initialize_deriv_from_work_and_orig(imageblock * pb, int pixelcount)
+{
+	const float *src = pb->orig_data;
+	float *dst = pb->deriv_data;
+
+	for (int i = 0; i < pixelcount; i++, src += 4, dst += 4)
+	{
+		// compute derivatives for RGB first
+		if (pb->rgb_lns[i])
+		{
+			dst[0] = lns_derivative_at(src[0]);
+			dst[1] = lns_derivative_at(src[1]);
+			dst[2] = lns_derivative_at(src[2]);
+		}
+		else
+		{
+			dst[0] = 65535.0f;
+			dst[1] = 65535.0f;
+			dst[2] = 65535.0f;
+		}
+
+		// then compute derivatives for Alpha
+		if (pb->alpha_lns[i])
+			dst[3] = lns_derivative_at(src[3]);
+		else
+			dst[3] = 65535.0f;
+	}
+}
+
+
+
+
+// Populate work_data from orig_data for the first `pixelcount` texels
+// (4 floats per texel), then rebuild deriv_data.
+// Channels flagged as LNS are passed through float_to_lns(); all others are
+// multiplied by 65535.
+void imageblock_initialize_work_from_orig(imageblock * pb, int pixelcount)
+{
+	const float *src = pb->orig_data;
+	float *dst = pb->work_data;
+
+	for (int texel = 0; texel < pixelcount; texel++, src += 4, dst += 4)
+	{
+		// the three colour channels share one LNS flag per texel.
+		for (int c = 0; c < 3; c++)
+		{
+			if (pb->rgb_lns[texel])
+				dst[c] = float_to_lns(src[c]);
+			else
+				dst[c] = src[c] * 65535.0f;
+		}
+
+		// alpha has a flag of its own.
+		if (pb->alpha_lns[texel])
+			dst[3] = float_to_lns(src[3]);
+		else
+			dst[3] = src[3] * 65535.0f;
+	}
+
+	imageblock_initialize_deriv_from_work_and_orig(pb, pixelcount);
+}
+
+
+
+
+// Rebuild orig_data from work_data for the first `pixelcount` texels
+// (4 floats per texel), then rebuild deriv_data.
+// Each work value is converted to a 16-bit integer, turned into FP16 bits
+// (lns_to_sf16() for LNS-flagged channels, unorm16_to_sf16() otherwise) and
+// finally widened with sf16_to_float().
+void imageblock_initialize_orig_from_work(imageblock * pb, int pixelcount)
+{
+	float *dst = pb->orig_data;
+	const float *src = pb->work_data;
+
+	for (int texel = 0; texel < pixelcount; texel++, src += 4, dst += 4)
+	{
+		for (int c = 0; c < 4; c++)
+		{
+			// colour channels follow rgb_lns, the fourth channel follows alpha_lns.
+			const bool is_lns = (c < 3) ? (pb->rgb_lns[texel] != 0) : (pb->alpha_lns[texel] != 0);
+			const uint16_t code = (uint16_t) src[c];
+
+			uint16_t half_bits;
+			if (is_lns)
+				half_bits = lns_to_sf16(code);
+			else
+				half_bits = unorm16_to_sf16(code);
+
+			dst[c] = sf16_to_float(half_bits);
+		}
+	}
+
+	imageblock_initialize_deriv_from_work_and_orig(pb, pixelcount);
+}
+
+
+/*
+   Recompute the cached per-channel extents and the grayscale flag of an
+   imageblock.
+
+   All xdim * ydim * zdim texels of work_data (not orig_data) are scanned.
+   grayscale ends up 1 only if red, green and blue are equal in every texel.
+*/
+void update_imageblock_flags(imageblock * pb, int xdim, int ydim, int zdim)
+{
+	// running minimum / maximum per channel, in R, G, B, A order
+	float lo[4] = { 1e38f, 1e38f, 1e38f, 1e38f };
+	float hi[4] = { -1e38f, -1e38f, -1e38f, -1e38f };
+	int is_gray = 1;
+
+	const int texel_count = xdim * ydim * zdim;
+	const float *texel = pb->work_data;
+
+	for (int t = 0; t < texel_count; t++, texel += 4)
+	{
+		for (int c = 0; c < 4; c++)
+		{
+			if (texel[c] < lo[c])
+				lo[c] = texel[c];
+			if (texel[c] > hi[c])
+				hi[c] = texel[c];
+		}
+
+		// one texel with differing colour channels is enough to clear the flag.
+		if (texel[0] != texel[1] || texel[0] != texel[2])
+			is_gray = 0;
+	}
+
+	pb->red_min = lo[0];
+	pb->red_max = hi[0];
+	pb->green_min = lo[1];
+	pb->green_max = hi[1];
+	pb->blue_min = lo[2];
+	pb->blue_max = hi[2];
+	pb->alpha_min = lo[3];
+	pb->alpha_max = hi[3];
+	pb->grayscale = is_gray;
+}
+
--- a/3rdparty/astc/astc_integer_sequence.cpp
+++ b/3rdparty/astc/astc_integer_sequence.cpp
@@ -0,0 +1,649 @@
+/*----------------------------------------------------------------------------*/
+/**
+ *	This confidential and proprietary software may be used only as
+ *	authorised by a licensing agreement from ARM Limited
+ *	(C) COPYRIGHT 2011-2012 ARM Limited
+ *	ALL RIGHTS RESERVED
+ *
+ *	The entire notice above must be reproduced on all authorised
+ *	copies and copies may only be made to the extent permitted
+ *	by a licensing agreement from ARM Limited.
+ *
+ *	@brief	Functions to encode/decode data using Bounded Integer Sequence
+ *			Encoding.
+ */
+/*----------------------------------------------------------------------------*/
+#include "astc_codec_internals.h"
+	// unpacked quint triplets <low,middle,high> for each packed-quint value
+static const uint8_t quints_of_integer[128][3] = {
+	{0, 0, 0},	{1, 0, 0},	{2, 0, 0},	{3, 0, 0},
+	{4, 0, 0},	{0, 4, 0},	{4, 4, 0},	{4, 4, 4},
+	{0, 1, 0},	{1, 1, 0},	{2, 1, 0},	{3, 1, 0},
+	{4, 1, 0},	{1, 4, 0},	{4, 4, 1},	{4, 4, 4},
+	{0, 2, 0},	{1, 2, 0},	{2, 2, 0},	{3, 2, 0},
+	{4, 2, 0},	{2, 4, 0},	{4, 4, 2},	{4, 4, 4},
+	{0, 3, 0},	{1, 3, 0},	{2, 3, 0},	{3, 3, 0},
+	{4, 3, 0},	{3, 4, 0},	{4, 4, 3},	{4, 4, 4},
+	{0, 0, 1},	{1, 0, 1},	{2, 0, 1},	{3, 0, 1},
+	{4, 0, 1},	{0, 4, 1},	{4, 0, 4},	{0, 4, 4},
+	{0, 1, 1},	{1, 1, 1},	{2, 1, 1},	{3, 1, 1},
+	{4, 1, 1},	{1, 4, 1},	{4, 1, 4},	{1, 4, 4},
+	{0, 2, 1},	{1, 2, 1},	{2, 2, 1},	{3, 2, 1},
+	{4, 2, 1},	{2, 4, 1},	{4, 2, 4},	{2, 4, 4},
+	{0, 3, 1},	{1, 3, 1},	{2, 3, 1},	{3, 3, 1},
+	{4, 3, 1},	{3, 4, 1},	{4, 3, 4},	{3, 4, 4},
+	{0, 0, 2},	{1, 0, 2},	{2, 0, 2},	{3, 0, 2},
+	{4, 0, 2},	{0, 4, 2},	{2, 0, 4},	{3, 0, 4},
+	{0, 1, 2},	{1, 1, 2},	{2, 1, 2},	{3, 1, 2},
+	{4, 1, 2},	{1, 4, 2},	{2, 1, 4},	{3, 1, 4},
+	{0, 2, 2},	{1, 2, 2},	{2, 2, 2},	{3, 2, 2},
+	{4, 2, 2},	{2, 4, 2},	{2, 2, 4},	{3, 2, 4},
+	{0, 3, 2},	{1, 3, 2},	{2, 3, 2},	{3, 3, 2},
+	{4, 3, 2},	{3, 4, 2},	{2, 3, 4},	{3, 3, 4},
+	{0, 0, 3},	{1, 0, 3},	{2, 0, 3},	{3, 0, 3},
+	{4, 0, 3},	{0, 4, 3},	{0, 0, 4},	{1, 0, 4},
+	{0, 1, 3},	{1, 1, 3},	{2, 1, 3},	{3, 1, 3},
+	{4, 1, 3},	{1, 4, 3},	{0, 1, 4},	{1, 1, 4},
+	{0, 2, 3},	{1, 2, 3},	{2, 2, 3},	{3, 2, 3},
+	{4, 2, 3},	{2, 4, 3},	{0, 2, 4},	{1, 2, 4},
+	{0, 3, 3},	{1, 3, 3},	{2, 3, 3},	{3, 3, 3},
+	{4, 3, 3},	{3, 4, 3},	{0, 3, 4},	{1, 3, 4},
+};
+
+// packed quint-value for every unpacked quint-triplet
+// indexed by [high][middle][low]
+static const uint8_t integer_of_quints[5][5][5] = {
+	{
+	 {0, 1, 2, 3, 4,},
+	 {8, 9, 10, 11, 12,},
+	 {16, 17, 18, 19, 20,},
+	 {24, 25, 26, 27, 28,},
+	 {5, 13, 21, 29, 6,},
+	 },
+	{
+	 {32, 33, 34, 35, 36,},
+	 {40, 41, 42, 43, 44,},
+	 {48, 49, 50, 51, 52,},
+	 {56, 57, 58, 59, 60,},
+	 {37, 45, 53, 61, 14,},
+	 },
+	{
+	 {64, 65, 66, 67, 68,},
+	 {72, 73, 74, 75, 76,},
+	 {80, 81, 82, 83, 84,},
+	 {88, 89, 90, 91, 92,},
+	 {69, 77, 85, 93, 22,},
+	 },
+	{
+	 {96, 97, 98, 99, 100,},
+	 {104, 105, 106, 107, 108,},
+	 {112, 113, 114, 115, 116,},
+	 {120, 121, 122, 123, 124,},
+	 {101, 109, 117, 125, 30,},
+	 },
+	{
+	 {102, 103, 70, 71, 38,},
+	 {110, 111, 78, 79, 46,},
+	 {118, 119, 86, 87, 54,},
+	 {126, 127, 94, 95, 62,},
+	 {39, 47, 55, 63, 31,},
+	 },
+};
+
+// unpacked trit quintuplets <low,_,_,_,high> for each packed-trit value
+static const uint8_t trits_of_integer[256][5] = {
+	{0, 0, 0, 0, 0},	{1, 0, 0, 0, 0},	{2, 0, 0, 0, 0},	{0, 0, 2, 0, 0},
+	{0, 1, 0, 0, 0},	{1, 1, 0, 0, 0},	{2, 1, 0, 0, 0},	{1, 0, 2, 0, 0},
+	{0, 2, 0, 0, 0},	{1, 2, 0, 0, 0},	{2, 2, 0, 0, 0},	{2, 0, 2, 0, 0},
+	{0, 2, 2, 0, 0},	{1, 2, 2, 0, 0},	{2, 2, 2, 0, 0},	{2, 0, 2, 0, 0},
+	{0, 0, 1, 0, 0},	{1, 0, 1, 0, 0},	{2, 0, 1, 0, 0},	{0, 1, 2, 0, 0},
+	{0, 1, 1, 0, 0},	{1, 1, 1, 0, 0},	{2, 1, 1, 0, 0},	{1, 1, 2, 0, 0},
+	{0, 2, 1, 0, 0},	{1, 2, 1, 0, 0},	{2, 2, 1, 0, 0},	{2, 1, 2, 0, 0},
+	{0, 0, 0, 2, 2},	{1, 0, 0, 2, 2},	{2, 0, 0, 2, 2},	{0, 0, 2, 2, 2},
+	{0, 0, 0, 1, 0},	{1, 0, 0, 1, 0},	{2, 0, 0, 1, 0},	{0, 0, 2, 1, 0},
+	{0, 1, 0, 1, 0},	{1, 1, 0, 1, 0},	{2, 1, 0, 1, 0},	{1, 0, 2, 1, 0},
+	{0, 2, 0, 1, 0},	{1, 2, 0, 1, 0},	{2, 2, 0, 1, 0},	{2, 0, 2, 1, 0},
+	{0, 2, 2, 1, 0},	{1, 2, 2, 1, 0},	{2, 2, 2, 1, 0},	{2, 0, 2, 1, 0},
+	{0, 0, 1, 1, 0},	{1, 0, 1, 1, 0},	{2, 0, 1, 1, 0},	{0, 1, 2, 1, 0},
+	{0, 1, 1, 1, 0},	{1, 1, 1, 1, 0},	{2, 1, 1, 1, 0},	{1, 1, 2, 1, 0},
+	{0, 2, 1, 1, 0},	{1, 2, 1, 1, 0},	{2, 2, 1, 1, 0},	{2, 1, 2, 1, 0},
+	{0, 1, 0, 2, 2},	{1, 1, 0, 2, 2},	{2, 1, 0, 2, 2},	{1, 0, 2, 2, 2},
+	{0, 0, 0, 2, 0},	{1, 0, 0, 2, 0},	{2, 0, 0, 2, 0},	{0, 0, 2, 2, 0},
+	{0, 1, 0, 2, 0},	{1, 1, 0, 2, 0},	{2, 1, 0, 2, 0},	{1, 0, 2, 2, 0},
+	{0, 2, 0, 2, 0},	{1, 2, 0, 2, 0},	{2, 2, 0, 2, 0},	{2, 0, 2, 2, 0},
+	{0, 2, 2, 2, 0},	{1, 2, 2, 2, 0},	{2, 2, 2, 2, 0},	{2, 0, 2, 2, 0},
+	{0, 0, 1, 2, 0},	{1, 0, 1, 2, 0},	{2, 0, 1, 2, 0},	{0, 1, 2, 2, 0},
+	{0, 1, 1, 2, 0},	{1, 1, 1, 2, 0},	{2, 1, 1, 2, 0},	{1, 1, 2, 2, 0},
+	{0, 2, 1, 2, 0},	{1, 2, 1, 2, 0},	{2, 2, 1, 2, 0},	{2, 1, 2, 2, 0},
+	{0, 2, 0, 2, 2},	{1, 2, 0, 2, 2},	{2, 2, 0, 2, 2},	{2, 0, 2, 2, 2},
+	{0, 0, 0, 0, 2},	{1, 0, 0, 0, 2},	{2, 0, 0, 0, 2},	{0, 0, 2, 0, 2},
+	{0, 1, 0, 0, 2},	{1, 1, 0, 0, 2},	{2, 1, 0, 0, 2},	{1, 0, 2, 0, 2},
+	{0, 2, 0, 0, 2},	{1, 2, 0, 0, 2},	{2, 2, 0, 0, 2},	{2, 0, 2, 0, 2},
+	{0, 2, 2, 0, 2},	{1, 2, 2, 0, 2},	{2, 2, 2, 0, 2},	{2, 0, 2, 0, 2},
+	{0, 0, 1, 0, 2},	{1, 0, 1, 0, 2},	{2, 0, 1, 0, 2},	{0, 1, 2, 0, 2},
+	{0, 1, 1, 0, 2},	{1, 1, 1, 0, 2},	{2, 1, 1, 0, 2},	{1, 1, 2, 0, 2},
+	{0, 2, 1, 0, 2},	{1, 2, 1, 0, 2},	{2, 2, 1, 0, 2},	{2, 1, 2, 0, 2},
+	{0, 2, 2, 2, 2},	{1, 2, 2, 2, 2},	{2, 2, 2, 2, 2},	{2, 0, 2, 2, 2},
+	{0, 0, 0, 0, 1},	{1, 0, 0, 0, 1},	{2, 0, 0, 0, 1},	{0, 0, 2, 0, 1},
+	{0, 1, 0, 0, 1},	{1, 1, 0, 0, 1},	{2, 1, 0, 0, 1},	{1, 0, 2, 0, 1},
+	{0, 2, 0, 0, 1},	{1, 2, 0, 0, 1},	{2, 2, 0, 0, 1},	{2, 0, 2, 0, 1},
+	{0, 2, 2, 0, 1},	{1, 2, 2, 0, 1},	{2, 2, 2, 0, 1},	{2, 0, 2, 0, 1},
+	{0, 0, 1, 0, 1},	{1, 0, 1, 0, 1},	{2, 0, 1, 0, 1},	{0, 1, 2, 0, 1},
+	{0, 1, 1, 0, 1},	{1, 1, 1, 0, 1},	{2, 1, 1, 0, 1},	{1, 1, 2, 0, 1},
+	{0, 2, 1, 0, 1},	{1, 2, 1, 0, 1},	{2, 2, 1, 0, 1},	{2, 1, 2, 0, 1},
+	{0, 0, 1, 2, 2},	{1, 0, 1, 2, 2},	{2, 0, 1, 2, 2},	{0, 1, 2, 2, 2},
+	{0, 0, 0, 1, 1},	{1, 0, 0, 1, 1},	{2, 0, 0, 1, 1},	{0, 0, 2, 1, 1},
+	{0, 1, 0, 1, 1},	{1, 1, 0, 1, 1},	{2, 1, 0, 1, 1},	{1, 0, 2, 1, 1},
+	{0, 2, 0, 1, 1},	{1, 2, 0, 1, 1},	{2, 2, 0, 1, 1},	{2, 0, 2, 1, 1},
+	{0, 2, 2, 1, 1},	{1, 2, 2, 1, 1},	{2, 2, 2, 1, 1},	{2, 0, 2, 1, 1},
+	{0, 0, 1, 1, 1},	{1, 0, 1, 1, 1},	{2, 0, 1, 1, 1},	{0, 1, 2, 1, 1},
+	{0, 1, 1, 1, 1},	{1, 1, 1, 1, 1},	{2, 1, 1, 1, 1},	{1, 1, 2, 1, 1},
+	{0, 2, 1, 1, 1},	{1, 2, 1, 1, 1},	{2, 2, 1, 1, 1},	{2, 1, 2, 1, 1},
+	{0, 1, 1, 2, 2},	{1, 1, 1, 2, 2},	{2, 1, 1, 2, 2},	{1, 1, 2, 2, 2},
+	{0, 0, 0, 2, 1},	{1, 0, 0, 2, 1},	{2, 0, 0, 2, 1},	{0, 0, 2, 2, 1},
+	{0, 1, 0, 2, 1},	{1, 1, 0, 2, 1},	{2, 1, 0, 2, 1},	{1, 0, 2, 2, 1},
+	{0, 2, 0, 2, 1},	{1, 2, 0, 2, 1},	{2, 2, 0, 2, 1},	{2, 0, 2, 2, 1},
+	{0, 2, 2, 2, 1},	{1, 2, 2, 2, 1},	{2, 2, 2, 2, 1},	{2, 0, 2, 2, 1},
+	{0, 0, 1, 2, 1},	{1, 0, 1, 2, 1},	{2, 0, 1, 2, 1},	{0, 1, 2, 2, 1},
+	{0, 1, 1, 2, 1},	{1, 1, 1, 2, 1},	{2, 1, 1, 2, 1},	{1, 1, 2, 2, 1},
+	{0, 2, 1, 2, 1},	{1, 2, 1, 2, 1},	{2, 2, 1, 2, 1},	{2, 1, 2, 2, 1},
+	{0, 2, 1, 2, 2},	{1, 2, 1, 2, 2},	{2, 2, 1, 2, 2},	{2, 1, 2, 2, 2},
+	{0, 0, 0, 1, 2},	{1, 0, 0, 1, 2},	{2, 0, 0, 1, 2},	{0, 0, 2, 1, 2},
+	{0, 1, 0, 1, 2},	{1, 1, 0, 1, 2},	{2, 1, 0, 1, 2},	{1, 0, 2, 1, 2},
+	{0, 2, 0, 1, 2},	{1, 2, 0, 1, 2},	{2, 2, 0, 1, 2},	{2, 0, 2, 1, 2},
+	{0, 2, 2, 1, 2},	{1, 2, 2, 1, 2},	{2, 2, 2, 1, 2},	{2, 0, 2, 1, 2},
+	{0, 0, 1, 1, 2},	{1, 0, 1, 1, 2},	{2, 0, 1, 1, 2},	{0, 1, 2, 1, 2},
+	{0, 1, 1, 1, 2},	{1, 1, 1, 1, 2},	{2, 1, 1, 1, 2},	{1, 1, 2, 1, 2},
+	{0, 2, 1, 1, 2},	{1, 2, 1, 1, 2},	{2, 2, 1, 1, 2},	{2, 1, 2, 1, 2},
+	{0, 2, 2, 2, 2},	{1, 2, 2, 2, 2},	{2, 2, 2, 2, 2},	{2, 1, 2, 2, 2},
+};
+
+// packed trit-value for every unpacked trit-quintuplet
+// indexed by [high][][][][low]
+static const uint8_t integer_of_trits[3][3][3][3][3] = {
+	{
+	 {
+	  {
+	   {0, 1, 2,},
+	   {4, 5, 6,},
+	   {8, 9, 10,},
+	   },
+	  {
+	   {16, 17, 18,},
+	   {20, 21, 22,},
+	   {24, 25, 26,},
+	   },
+	  {
+	   {3, 7, 15,},
+	   {19, 23, 27,},
+	   {12, 13, 14,},
+	   },
+	  },
+	 {
+	  {
+	   {32, 33, 34,},
+	   {36, 37, 38,},
+	   {40, 41, 42,},
+	   },
+	  {
+	   {48, 49, 50,},
+	   {52, 53, 54,},
+	   {56, 57, 58,},
+	   },
+	  {
+	   {35, 39, 47,},
+	   {51, 55, 59,},
+	   {44, 45, 46,},
+	   },
+	  },
+	 {
+	  {
+	   {64, 65, 66,},
+	   {68, 69, 70,},
+	   {72, 73, 74,},
+	   },
+	  {
+	   {80, 81, 82,},
+	   {84, 85, 86,},
+	   {88, 89, 90,},
+	   },
+	  {
+	   {67, 71, 79,},
+	   {83, 87, 91,},
+	   {76, 77, 78,},
+	   },
+	  },
+	 },
+	{
+	 {
+	  {
+	   {128, 129, 130,},
+	   {132, 133, 134,},
+	   {136, 137, 138,},
+	   },
+	  {
+	   {144, 145, 146,},
+	   {148, 149, 150,},
+	   {152, 153, 154,},
+	   },
+	  {
+	   {131, 135, 143,},
+	   {147, 151, 155,},
+	   {140, 141, 142,},
+	   },
+	  },
+	 {
+	  {
+	   {160, 161, 162,},
+	   {164, 165, 166,},
+	   {168, 169, 170,},
+	   },
+	  {
+	   {176, 177, 178,},
+	   {180, 181, 182,},
+	   {184, 185, 186,},
+	   },
+	  {
+	   {163, 167, 175,},
+	   {179, 183, 187,},
+	   {172, 173, 174,},
+	   },
+	  },
+	 {
+	  {
+	   {192, 193, 194,},
+	   {196, 197, 198,},
+	   {200, 201, 202,},
+	   },
+	  {
+	   {208, 209, 210,},
+	   {212, 213, 214,},
+	   {216, 217, 218,},
+	   },
+	  {
+	   {195, 199, 207,},
+	   {211, 215, 219,},
+	   {204, 205, 206,},
+	   },
+	  },
+	 },
+	{
+	 {
+	  {
+	   {96, 97, 98,},
+	   {100, 101, 102,},
+	   {104, 105, 106,},
+	   },
+	  {
+	   {112, 113, 114,},
+	   {116, 117, 118,},
+	   {120, 121, 122,},
+	   },
+	  {
+	   {99, 103, 111,},
+	   {115, 119, 123,},
+	   {108, 109, 110,},
+	   },
+	  },
+	 {
+	  {
+	   {224, 225, 226,},
+	   {228, 229, 230,},
+	   {232, 233, 234,},
+	   },
+	  {
+	   {240, 241, 242,},
+	   {244, 245, 246,},
+	   {248, 249, 250,},
+	   },
+	  {
+	   {227, 231, 239,},
+	   {243, 247, 251,},
+	   {236, 237, 238,},
+	   },
+	  },
+	 {
+	  {
+	   {28, 29, 30,},
+	   {60, 61, 62,},
+	   {92, 93, 94,},
+	   },
+	  {
+	   {156, 157, 158,},
+	   {188, 189, 190,},
+	   {220, 221, 222,},
+	   },
+	  {
+	   {31, 63, 127,},
+	   {159, 191, 255,},
+	   {252, 253, 254,},
+	   },
+	  },
+	 },
+};
+
+
+
+// Split a quantization level into its building blocks: a number of plain
+// bits plus at most one trit or one quint. Going by the enumerator names,
+// every QUANT_n handled below satisfies n == 2^bits * 3^trits * 5^quints.
+// All three outputs are zeroed first, so a level that matches no case
+// reports 0 bits, 0 trits and 0 quints.
+void find_number_of_bits_trits_quints(int quantization_level, int *bits, int *trits, int *quints)
+{
+	*bits = 0;
+	*trits = 0;
+	*quints = 0;
+
+	switch (quantization_level)
+	{
+	// powers of two: bits only
+	case QUANT_2:	*bits = 1;	break;
+	case QUANT_4:	*bits = 2;	break;
+	case QUANT_8:	*bits = 3;	break;
+	case QUANT_16:	*bits = 4;	break;
+	case QUANT_32:	*bits = 5;	break;
+	case QUANT_64:	*bits = 6;	break;
+	case QUANT_128:	*bits = 7;	break;
+	case QUANT_256:	*bits = 8;	break;
+
+	// three times a power of two: one trit on top of the bits
+	case QUANT_3:	*trits = 1;	break;
+	case QUANT_6:	*bits = 1;	*trits = 1;	break;
+	case QUANT_12:	*bits = 2;	*trits = 1;	break;
+	case QUANT_24:	*bits = 3;	*trits = 1;	break;
+	case QUANT_48:	*bits = 4;	*trits = 1;	break;
+	case QUANT_96:	*bits = 5;	*trits = 1;	break;
+	case QUANT_192:	*bits = 6;	*trits = 1;	break;
+
+	// five times a power of two: one quint on top of the bits
+	case QUANT_5:	*quints = 1;	break;
+	case QUANT_10:	*bits = 1;	*quints = 1;	break;
+	case QUANT_20:	*bits = 2;	*quints = 1;	break;
+	case QUANT_40:	*bits = 3;	*quints = 1;	break;
+	case QUANT_80:	*bits = 4;	*quints = 1;	break;
+	case QUANT_160:	*bits = 5;	*quints = 1;	break;
+	}
+}
+
+
+// Write the low `bitcount` bits of `value` (0..8 bits) into the bitstream at `ptr`,
+// starting `bitoffset` bits from its beginning. Bits are packed LSB-first; all other
+// bits of the stream are left unchanged.
+// Only the bytes the field actually occupies are accessed, so a field that ends inside
+// the last byte of a buffer never touches the byte after it.
+static inline void write_bits(int value, int bitcount, int bitoffset, uint8_t * ptr)
+{
+	int mask = (1 << bitcount) - 1;
+	value &= mask;
+	ptr += bitoffset >> 3;
+	bitoffset &= 7;
+	value <<= bitoffset;
+	mask <<= bitoffset;
+	mask = ~mask;				// now 1 for every bit that must be preserved
+
+	ptr[0] &= mask;
+	ptr[0] |= value;
+
+	// the field spills into the following byte only when it crosses the byte boundary
+	if (bitoffset + bitcount > 8)
+	{
+		ptr[1] &= mask >> 8;
+		ptr[1] |= value >> 8;
+	}
+}
+
+
+// Read a `bitcount`-bit field (0..8 bits) from the bitstream at `ptr`, starting
+// `bitoffset` bits from its beginning. Bits are packed LSB-first.
+// Only the bytes the field actually occupies are accessed, so a field that ends inside
+// the last byte of a buffer never reads the byte after it.
+static inline int read_bits(int bitcount, int bitoffset, const uint8_t * ptr)
+{
+	int mask = (1 << bitcount) - 1;
+	ptr += bitoffset >> 3;
+	bitoffset &= 7;
+
+	int value = ptr[0];
+	// pull in the following byte only when the field crosses the byte boundary
+	if (bitoffset + bitcount > 8)
+		value |= ptr[1] << 8;
+
+	value >>= bitoffset;
+	value &= mask;
+	return value;
+}
+
+
+
+
+// Pack `elements` values from `input_data` into the bitstream `output_data`, starting
+// at bit position `bit_offset`. Each value is split into its low plain bits and, for
+// trit/quint quantization levels, a trit or quint; trits are combined five at a time
+// (quints three at a time) into one block value whose bits are interleaved with the
+// plain bits of the elements it covers.
+void encode_ise(int quantization_level, int elements, const uint8_t * input_data, uint8_t * output_data, int bit_offset)
+{
+	// per-element pieces of a trit-block / quint-block: width of the piece and where it
+	// sits inside the block value
+	static const int trit_piece_width[5] = { 2, 2, 1, 2, 1 };
+	static const int trit_piece_shift[5] = { 0, 2, 4, 5, 7 };
+	static const int quint_piece_width[3] = { 3, 2, 2 };
+	static const int quint_piece_shift[3] = { 0, 3, 5 };
+
+	uint8_t lowparts[64];
+	uint8_t highparts[69];		// 64 elements + 5 elements for padding
+	uint8_t tq_blocks[22];		// trit-blocks or quint-blocks
+
+	int bits, trits, quints;
+	find_number_of_bits_trits_quints(quantization_level, &bits, &trits, &quints);
+
+	// separate the plain low bits from the trit/quint stored above them
+	const int low_mask = (1 << bits) - 1;
+	for (int e = 0; e < elements; e++)
+	{
+		const int v = input_data[e];
+		lowparts[e] = v & low_mask;
+		highparts[e] = v >> bits;
+	}
+
+	// zero the padding so that a partially filled final block is well defined
+	for (int pad = elements; pad < elements + 5; pad++)
+		highparts[pad] = 0;
+
+	// fold the trits (groups of 5) or quints (groups of 3) into block values
+	if (trits)
+	{
+		const int trit_blocks = (elements + 4) / 5;
+		for (int b = 0; b < trit_blocks; b++)
+		{
+			const uint8_t *t = highparts + 5 * b;
+			tq_blocks[b] = integer_of_trits[t[4]][t[3]][t[2]][t[1]][t[0]];
+		}
+	}
+	if (quints)
+	{
+		const int quint_blocks = (elements + 2) / 3;
+		for (int b = 0; b < quint_blocks; b++)
+		{
+			const uint8_t *q = highparts + 3 * b;
+			tq_blocks[b] = integer_of_quints[q[2]][q[1]][q[0]];
+		}
+	}
+
+	// emit the stream: for every element its plain bits, followed by that element's
+	// piece of the current trit-/quint-block
+	int piece = 0;				// position of the element inside its block
+	int block = 0;				// index of the block currently being emitted
+	for (int e = 0; e < elements; e++)
+	{
+		write_bits(lowparts[e], bits, bit_offset, output_data);
+		bit_offset += bits;
+
+		if (trits)
+		{
+			const int width = trit_piece_width[piece];
+			write_bits(tq_blocks[block] >> trit_piece_shift[piece], width, bit_offset, output_data);
+			bit_offset += width;
+			if (++piece == 5)
+			{
+				piece = 0;
+				block++;
+			}
+		}
+		if (quints)
+		{
+			const int width = quint_piece_width[piece];
+			write_bits(tq_blocks[block] >> quint_piece_shift[piece], width, bit_offset, output_data);
+			bit_offset += width;
+			if (++piece == 3)
+			{
+				piece = 0;
+				block++;
+			}
+		}
+	}
+}
+
+
+
+
+// Unpack `elements` values from the bitstream `input_data`, starting at bit position
+// `bit_offset`, and store them in `output_data`. This is the inverse of encode_ise():
+// the plain bits of each element are read directly, while the interleaved pieces of
+// each trit-/quint-block are gathered first and then expanded through the
+// trits_of_integer / quints_of_integer tables.
+void decode_ise(int quantization_level, int elements, const uint8_t * input_data, uint8_t * output_data, int bit_offset)
+{
+	int i;
+	// note: trit-/quint-blocks are always unpacked whole, so the unpacking below may
+	// touch more temporary results than there are outputs. The maximum actual number
+	// of results is 64, and we keep 4 additional elements of padding.
+	// The array is zero-initialised because the unpacking ORs into every entry of a
+	// block, including padding entries the collection loop never assigned.
+	uint8_t results[68] = { 0 };
+	// trit-blocks or quint-blocks; must start out zeroed since their bits are ORed in
+	// piece by piece in the loop below.
+	uint8_t tq_blocks[22] = { 0 };
+
+	int bits, trits, quints;
+	find_number_of_bits_trits_quints(quantization_level, &bits, &trits, &quints);
+
+	int lcounter = 0;			// position of the current element inside its block
+	int hcounter = 0;			// index of the block currently being collected
+
+	// collect bits for each element, as well as bits for any trit-blocks and quint-blocks.
+	for (i = 0; i < elements; i++)
+	{
+		results[i] = read_bits(bits, bit_offset, input_data);
+		bit_offset += bits;
+		if (trits)
+		{
+			static const int bits_to_read[5] = { 2, 2, 1, 2, 1 };
+			static const int block_shift[5] = { 0, 2, 4, 5, 7 };
+			static const int next_lcounter[5] = { 1, 2, 3, 4, 0 };
+			static const int hcounter_incr[5] = { 0, 0, 0, 0, 1 };
+			int tdata = read_bits(bits_to_read[lcounter], bit_offset, input_data);
+			bit_offset += bits_to_read[lcounter];
+			tq_blocks[hcounter] |= tdata << block_shift[lcounter];
+			hcounter += hcounter_incr[lcounter];
+			lcounter = next_lcounter[lcounter];
+		}
+		if (quints)
+		{
+			static const int bits_to_read[3] = { 3, 2, 2 };
+			static const int block_shift[3] = { 0, 3, 5 };
+			static const int next_lcounter[3] = { 1, 2, 0 };
+			static const int hcounter_incr[3] = { 0, 0, 1 };
+			int tdata = read_bits(bits_to_read[lcounter], bit_offset, input_data);
+			bit_offset += bits_to_read[lcounter];
+			tq_blocks[hcounter] |= tdata << block_shift[lcounter];
+			hcounter += hcounter_incr[lcounter];
+			lcounter = next_lcounter[lcounter];
+		}
+	}
+
+
+	// unpack trit-blocks or quint-blocks as needed; each trit/quint lands just above
+	// the plain bits of its element.
+	if (trits)
+	{
+		int trit_blocks = (elements + 4) / 5;
+		for (i = 0; i < trit_blocks; i++)
+		{
+			const uint8_t *tritptr = trits_of_integer[tq_blocks[i]];
+			results[5 * i] |= tritptr[0] << bits;
+			results[5 * i + 1] |= tritptr[1] << bits;
+			results[5 * i + 2] |= tritptr[2] << bits;
+			results[5 * i + 3] |= tritptr[3] << bits;
+			results[5 * i + 4] |= tritptr[4] << bits;
+		}
+	}
+
+	if (quints)
+	{
+		int quint_blocks = (elements + 2) / 3;
+		for (i = 0; i < quint_blocks; i++)
+		{
+			const uint8_t *quintptr = quints_of_integer[tq_blocks[i]];
+			results[3 * i] |= quintptr[0] << bits;
+			results[3 * i + 1] |= quintptr[1] << bits;
+			results[3 * i + 2] |= quintptr[2] << bits;
+		}
+	}
+
+	// only the first `elements` entries are real outputs; the padding is discarded.
+	for (i = 0; i < elements; i++)
+		output_data[i] = results[i];
+}
+
+
+
+
+// Number of bits needed to store `items` values at quantization level `quant`.
+// Returns 100000 for a level that is not recognised.
+int compute_ise_bitcount(int items, quantization_method quant)
+{
+	switch (quant)
+	{
+	// plain binary levels: a whole number of bits per item
+	case QUANT_2:
+		return items;
+	case QUANT_4:
+		return 2 * items;
+	case QUANT_8:
+		return 3 * items;
+	case QUANT_16:
+		return items * 4;
+	case QUANT_32:
+		return 5 * items;
+	case QUANT_64:
+		return 6 * items;
+	case QUANT_128:
+		return 7 * items;
+	case QUANT_256:
+		return 8 * items;
+
+	// trit levels: rounded up over groups of 5 items
+	case QUANT_3:
+		return (8 * items + 4) / 5;
+	case QUANT_6:
+		return (13 * items + 4) / 5;
+	case QUANT_12:
+		return (18 * items + 4) / 5;
+	case QUANT_24:
+		return (23 * items + 4) / 5;
+	case QUANT_48:
+		return (28 * items + 4) / 5;
+	case QUANT_96:
+		return (33 * items + 4) / 5;
+	case QUANT_192:
+		return (38 * items + 4) / 5;
+
+	// quint levels: rounded up over groups of 3 items
+	case QUANT_5:
+		return (7 * items + 2) / 3;
+	case QUANT_10:
+		return (10 * items + 2) / 3;
+	case QUANT_20:
+		return (13 * items + 2) / 3;
+	case QUANT_40:
+		return (16 * items + 2) / 3;
+	case QUANT_80:
+		return (19 * items + 2) / 3;
+	case QUANT_160:
+		return (22 * items + 2) / 3;
+
+	default:
+		break;
+	}
+
+	// unknown level: report a deliberately huge bit count
+	return 100000;
+}
--- a/3rdparty/astc/astc_kmeans_partitioning.cpp
+++ b/3rdparty/astc/astc_kmeans_partitioning.cpp
@@ -0,0 +1,520 @@
+/*----------------------------------------------------------------------------*/
+/**
+ *	This confidential and proprietary software may be used only as
+ *	authorised by a licensing agreement from ARM Limited
+ *	(C) COPYRIGHT 2011-2012 ARM Limited
+ *	ALL RIGHTS RESERVED
+ *
+ *	The entire notice above must be reproduced on all authorised
+ *	copies and copies may only be made to the extent permitted
+ *	by a licensing agreement from ARM Limited.
+ *
+ *	@brief	approximate k-means cluster partitioning. Do this in 2 stages
+ *
+ *			1: basic clustering, a couple of passes just to get a few clusters
+ *			2: clustering based on line, a few passes until it seems to
+ *			   stabilize.
+ *
+ *			After clustering is done, we use the clustering result to construct
+ *			one bitmap for each partition. We then scan through the partition table,
+ *			counting how well the bitmaps matched.
+ */
+/*----------------------------------------------------------------------------*/
+
+#include "astc_codec_internals.h"
+
+// k-means++ style seeding of the cluster centers.
+// The algorithm wants pseudo-random numbers, but drawing them at run time would make
+// encoding results irreproducible. Instead a handful of fixed numbers taken from
+// random.org are used, in the spirit of XKCD #221 (http://xkcd.com/221/).
+
+// fetch the 4-component working color of one texel of the block.
+static inline float4 kpp_texel_color(const imageblock * blk, int texel)
+{
+	return float4(blk->work_data[4 * texel],
+				  blk->work_data[4 * texel + 1],
+				  blk->work_data[4 * texel + 2],
+				  blk->work_data[4 * texel + 3]);
+}
+
+// Choose `partition_count` texels of the block as initial cluster centers and write
+// their colors to cluster_centers[0 .. partition_count-1]. After the fixed first pick,
+// every further center is drawn with probability weighted by each texel's squared
+// distance to the nearest center chosen so far.
+void kpp_initialize(int xdim, int ydim, int zdim, int partition_count, const imageblock * blk, float4 * cluster_centers)
+{
+	// more numbers from random.org; indexed by (picks made so far + 5 * partition_count)
+	static const float cluster_cutoffs[25] = {
+		0.952312f, 0.206893f, 0.835984f, 0.507813f, 0.466170f,
+		0.872331f, 0.488028f, 0.866394f, 0.363093f, 0.467905f,
+		0.812967f, 0.626220f, 0.932770f, 0.275454f, 0.832020f,
+		0.362217f, 0.318558f, 0.240113f, 0.009190f, 0.983995f,
+		0.566812f, 0.347661f, 0.731960f, 0.156391f, 0.297786f
+	};
+
+	const int texels_per_block = xdim * ydim * zdim;
+
+	int chosen_texels[4];
+	float distances[MAX_TEXELS_PER_BLOCK];
+
+	// the first center is a fixed "random" texel (number from random.org).
+	chosen_texels[0] = 145897 % texels_per_block;
+	int chosen_count = 1;
+
+	// squared distance of every texel to that first center.
+	float4 center_color = kpp_texel_color(blk, chosen_texels[0]);
+	float distance_sum = 0.0f;
+	for (int t = 0; t < texels_per_block; t++)
+	{
+		float4 diff = kpp_texel_color(blk, t) - center_color;
+		float distance = dot(diff, diff);
+		distance_sum += distance;
+		distances[t] = distance;
+	}
+
+	for (;;)
+	{
+		// weighted-random pick: walk the texels until the running distance total
+		// reaches the cutoff fraction of the overall total.
+		float distance_cutoff = distance_sum * cluster_cutoffs[chosen_count + 5 * partition_count];
+		float running = 0.0f;
+		int pick = 0;
+		while (pick < texels_per_block)
+		{
+			running += distances[pick];
+			if (running >= distance_cutoff)
+				break;
+			pick++;
+		}
+		// the cutoff was never reached: fall back to the last texel.
+		if (pick >= texels_per_block)
+			pick = texels_per_block - 1;
+
+		chosen_texels[chosen_count] = pick;
+		chosen_count++;
+		if (chosen_count >= partition_count)
+			break;
+
+		// fold the new center in: each texel keeps the distance to its nearest center.
+		center_color = kpp_texel_color(blk, pick);
+		distance_sum = 0.0f;
+		for (int t = 0; t < texels_per_block; t++)
+		{
+			float4 diff = kpp_texel_color(blk, t) - center_color;
+			float distance = dot(diff, diff);
+			distance = MIN(distance, distances[t]);
+			distance_sum += distance;
+			distances[t] = distance;
+		}
+	}
+
+	// finally, hand back the colors of the chosen texels.
+	for (int c = 0; c < partition_count; c++)
+		cluster_centers[c] = kpp_texel_color(blk, chosen_texels[c]);
+}
+
+
+// K-means assignment step: every texel of the block is assigned to the cluster center
+// closest to it (squared distance over all 4 components; on a tie the lower-numbered
+// center wins). The result is written to partition_of_texel[].
+void basic_kmeans_assign_pass(int xdim, int ydim, int zdim, int partition_count, const imageblock * blk, const float4 * cluster_centers, int *partition_of_texel)
+{
+	const int texels_per_block = xdim * ydim * zdim;
+
+	// population of each partition, maintained alongside the assignment
+	int texels_per_partition[4];
+	texels_per_partition[0] = 0;
+	for (int p = 1; p < partition_count; p++)
+		texels_per_partition[p] = 0;
+
+	for (int t = 0; t < texels_per_block; t++)
+	{
+		float4 color = float4(blk->work_data[4 * t],
+							  blk->work_data[4 * t + 1],
+							  blk->work_data[4 * t + 2],
+							  blk->work_data[4 * t + 3]);
+
+		// start with center 0, then let any strictly closer center take over
+		float4 center_color = cluster_centers[0];
+		float4 diff = color - center_color;
+		float best_distance = dot(diff, diff);
+		int best_partition = 0;
+
+		for (int p = 1; p < partition_count; p++)
+		{
+			center_color = cluster_centers[p];
+			diff = color - center_color;
+			float distance = dot(diff, diff);
+			if (distance < best_distance)
+			{
+				best_distance = distance;
+				best_partition = p;
+			}
+		}
+
+		partition_of_texel[t] = best_partition;
+		texels_per_partition[best_partition]++;
+	}
+
+	// A partition can end up with no texels at all. If partition N is empty, texel N
+	// is forced into it; this is silly, but guarantees that every partition keeps at
+	// least one texel. Moving a texel may in turn empty another partition, so the
+	// scan is repeated until a full pass makes no reassignment.
+	bool reassigned;
+	do
+	{
+		reassigned = false;
+		for (int p = 0; p < partition_count; p++)
+		{
+			if (texels_per_partition[p] != 0)
+				continue;
+
+			texels_per_partition[partition_of_texel[p]]--;
+			texels_per_partition[p]++;
+			partition_of_texel[p] = p;
+			reassigned = true;
+		}
+	}
+	while (reassigned);
+}
+
+
+// K-means update step: given the partition each texel belongs to, move every cluster
+// center to the mean color (center of gravity) of the texels assigned to it.
+// Every partition is expected to own at least one texel.
+void basic_kmeans_update(int xdim, int ydim, int zdim, int partition_count, const imageblock * blk, const int *partition_of_texel, float4 * cluster_centers)
+{
+	const int texels_per_block = xdim * ydim * zdim;
+
+	float4 color_sum[4];
+	int weight_sum[4];
+	for (int p = 0; p < partition_count; p++)
+	{
+		color_sum[p] = float4(0, 0, 0, 0);
+		weight_sum[p] = 0;
+	}
+
+	// accumulate the color total and the texel count of each partition
+	for (int t = 0; t < texels_per_block; t++)
+	{
+		const int part = partition_of_texel[t];
+		float4 color = float4(blk->work_data[4 * t],
+							  blk->work_data[4 * t + 1],
+							  blk->work_data[4 * t + 2],
+							  blk->work_data[4 * t + 3]);
+		color_sum[part] = color_sum[part] + color;
+		weight_sum[part]++;
+	}
+
+	// mean = total * (1 / count)
+	for (int p = 0; p < partition_count; p++)
+		cluster_centers[p] = color_sum[p] * (1.0f / weight_sum[p]);
+}
+
+
+
+
+// after a few rounds of k-means-clustering, we should have a set of 2, 3 or 4 partitions;
+// we then turn this set into 2, 3 or 4 bitmaps. Then, for each of the 1024 partitions,
+// we try to match the bitmaps as well as possible.
+
+
+
+
+// Count the bits set in a 64-bit word (population count), using the classic
+// parallel-sum technique: 2-bit sums, then 4-bit sums, then per-byte sums, and finally
+// a multiply that adds all the bytes together into the top byte.
+static inline int bitcount(uint64_t p)
+{
+	if (sizeof(void *) <= 4)
+	{
+		// 32-bit target: count each 32-bit half on its own and merge the per-byte
+		// sums before the final multiply.
+		const uint32_t pairs = 0x55555555U;
+		const uint32_t nibbles = 0x33333333U;
+		const uint32_t bytes = 0x0F0F0F0FU;
+
+		uint32_t lo = (uint32_t) p;
+		uint32_t hi = (uint32_t) (p >> 32);
+
+		lo -= (lo >> 1) & pairs;
+		hi -= (hi >> 1) & pairs;
+		lo = (lo & nibbles) + ((lo >> 2) & nibbles);
+		hi = (hi & nibbles) + ((hi >> 2) & nibbles);
+		lo = (lo + (lo >> 4)) & bytes;
+		hi = (hi + (hi >> 4)) & bytes;
+
+		const uint32_t merged = lo + hi;
+		return (int)((merged * 0x01010101U) >> 24);
+	}
+
+	// 64-bit target: handle the whole word at once.
+	// (could be specialised for processors with a POPCNT instruction; left for later.)
+	const uint64_t pairs = 0x5555555555555555ULL;
+	const uint64_t nibbles = 0x3333333333333333ULL;
+	const uint64_t bytes = 0x0F0F0F0F0F0F0F0FULL;
+
+	p = p - ((p >> 1) & pairs);
+	p = (p & nibbles) + ((p >> 2) & nibbles);
+	p = (p + (p >> 4)) & bytes;
+	return (int)((p * 0x0101010101010101ULL) >> 56);
+}
+
+
+// Bit-mismatch between two 2-partition bitmap sets: the bitmaps of `a` may be paired
+// with those of `b` either straight or swapped, and the cheaper pairing is returned.
+static inline int partition_mismatch2(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
+{
+	int straight = bitcount(a0 ^ b0) + bitcount(a1 ^ b1);
+	int swapped = bitcount(a0 ^ b1) + bitcount(a1 ^ b0);
+	return MIN(straight, swapped);
+}
+
+
+// Bit-mismatch between two 3-partition bitmap sets: the smallest total number of
+// differing bits over all six ways of pairing the bitmaps of `a` with those of `b`.
+static inline int partition_mismatch3(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t b0, uint64_t b1, uint64_t b2)
+{
+	// cost[i][j]: differing bits when a<i> is paired with b<j>
+	const int c00 = bitcount(a0 ^ b0), c01 = bitcount(a0 ^ b1), c02 = bitcount(a0 ^ b2);
+	const int c10 = bitcount(a1 ^ b0), c11 = bitcount(a1 ^ b1), c12 = bitcount(a1 ^ b2);
+	const int c20 = bitcount(a2 ^ b0), c21 = bitcount(a2 ^ b1), c22 = bitcount(a2 ^ b2);
+
+	// the six possible pairings, grouped by the partner chosen for a0
+	const int candidates[6] = {
+		c00 + c11 + c22, c00 + c12 + c21,
+		c01 + c10 + c22, c01 + c12 + c20,
+		c02 + c10 + c21, c02 + c11 + c20
+	};
+
+	int best = candidates[0];
+	for (int k = 1; k < 6; k++)
+	{
+		if (candidates[k] < best)
+			best = candidates[k];
+	}
+	return best;
+}
+
+// smallest of three integers.
+static inline int MIN3(int a, int b, int c)
+{
+	int lowest = a;
+	if (b < lowest)
+		lowest = b;
+	if (c < lowest)
+		lowest = c;
+	return lowest;
+}
+
+// compute the bit-mismatch for a partitioning in 4-partition mode
+// Returns the smallest total number of differing bits over the ways of pairing the
+// four bitmaps a0..a3 one-to-one with the four bitmaps b0..b3.
+static inline int partition_mismatch4(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3, uint64_t b0, uint64_t b1, uint64_t b2, uint64_t b3)
+{
+	// pIJ: number of differing bits when bitmap aI is paired with bitmap bJ
+	int p00 = bitcount(a0 ^ b0);
+	int p01 = bitcount(a0 ^ b1);
+	int p02 = bitcount(a0 ^ b2);
+	int p03 = bitcount(a0 ^ b3);
+
+	int p10 = bitcount(a1 ^ b0);
+	int p11 = bitcount(a1 ^ b1);
+	int p12 = bitcount(a1 ^ b2);
+	int p13 = bitcount(a1 ^ b3);
+
+	int p20 = bitcount(a2 ^ b0);
+	int p21 = bitcount(a2 ^ b1);
+	int p22 = bitcount(a2 ^ b2);
+	int p23 = bitcount(a2 ^ b3);
+
+	int p30 = bitcount(a3 ^ b0);
+	int p31 = bitcount(a3 ^ b1);
+	int p32 = bitcount(a3 ^ b2);
+	int p33 = bitcount(a3 ^ b3);
+
+	// mxJK: cheapest way of pairing {a2, a3} with {bJ, bK}
+	int mx23 = MIN(p22 + p33, p23 + p32);
+	int mx13 = MIN(p21 + p33, p23 + p31);
+	int mx12 = MIN(p21 + p32, p22 + p31);
+	int mx03 = MIN(p20 + p33, p23 + p30);
+	int mx02 = MIN(p20 + p32, p22 + p30);
+	int mx01 = MIN(p21 + p30, p20 + p31);
+
+	// vJ: best total when a0 is paired with bJ; a1 then takes one of the three
+	// remaining b bitmaps and {a2, a3} share the last two via the matching mx term
+	int v0 = p00 + MIN3(p11 + mx23, p12 + mx13, p13 + mx12);
+	int v1 = p01 + MIN3(p10 + mx23, p12 + mx03, p13 + mx02);
+	int v2 = p02 + MIN3(p11 + mx03, p10 + mx13, p13 + mx01);
+	int v3 = p03 + MIN3(p11 + mx02, p12 + mx01, p10 + mx12);
+
+	int x0 = MIN(v0, v1);
+	int x1 = MIN(v2, v3);
+	return MIN(x0, x1);
+
+	// 16 bitcount, 17 MIN, 28 ADD
+}
+
+
+
+// For every entry of the partition table for this block size and partition count,
+// compute how many bits of its coverage bitmaps disagree with `bitmaps` and store the
+// result in bitcounts[]. Table entries whose own partition count differs from the
+// requested one get the value 255.
+// Only 2, 3 and 4 partitions are handled; for any other count bitcounts[] is left
+// untouched.
+void count_partition_mismatch_bits(int xdim, int ydim, int zdim, int partition_count, const uint64_t bitmaps[4], int bitcounts[PARTITION_COUNT])
+{
+	const partition_info *pi = get_partition_table(xdim, ydim, zdim, partition_count);
+
+	if (partition_count < 2 || partition_count > 4)
+		return;
+
+	for (int i = 0; i < PARTITION_COUNT; i++, pi++)
+	{
+		if (pi->partition_count != partition_count)
+		{
+			bitcounts[i] = 255;
+			continue;
+		}
+
+		switch (partition_count)
+		{
+		case 2:
+			bitcounts[i] = partition_mismatch2(bitmaps[0], bitmaps[1],
+											   pi->coverage_bitmaps[0], pi->coverage_bitmaps[1]);
+			break;
+		case 3:
+			bitcounts[i] = partition_mismatch3(bitmaps[0], bitmaps[1], bitmaps[2],
+											   pi->coverage_bitmaps[0], pi->coverage_bitmaps[1], pi->coverage_bitmaps[2]);
+			break;
+		default:				// 4 partitions
+			bitcounts[i] = partition_mismatch4(bitmaps[0], bitmaps[1], bitmaps[2], bitmaps[3],
+											   pi->coverage_bitmaps[0], pi->coverage_bitmaps[1], pi->coverage_bitmaps[2], pi->coverage_bitmaps[3]);
+			break;
+		}
+	}
+}
+
+
+// Order the partition indices by ascending mismatch-bit count using a counting sort.
+// Partitions with equal counts keep their original relative order. Every entry of
+// mismatch_bits[] must lie in 0..255.
+
+void get_partition_ordering_by_mismatch_bits(const int mismatch_bits[PARTITION_COUNT], int partition_ordering[PARTITION_COUNT])
+{
+	// pass 1: histogram of the mismatch values
+	int slot[256] = { 0 };
+	for (int p = 0; p < PARTITION_COUNT; p++)
+		slot[mismatch_bits[p]]++;
+
+	// pass 2: turn the histogram into the first output position of each value
+	int next_free = 0;
+	for (int v = 0; v < 256; v++)
+	{
+		const int population = slot[v];
+		slot[v] = next_free;
+		next_free += population;
+	}
+
+	// pass 3: drop every partition index into the next free position of its value
+	for (int p = 0; p < PARTITION_COUNT; p++)
+	{
+		const int v = mismatch_bits[p];
+		partition_ordering[slot[v]] = p;
+		slot[v]++;
+	}
+}
+
+
+
+
+// Rank all partition-table entries by how well they match a k-means clustering of the
+// block: cluster the texels, turn the clustering into one bitmap per partition, count
+// the mismatching bits against every table entry and write the entries, best match
+// first, to `ordering`.
+void kmeans_compute_partition_ordering(int xdim, int ydim, int zdim, int partition_count, const imageblock * blk, int *ordering)
+{
+	const block_size_descriptor *bsd = get_block_size_descriptor(xdim, ydim, zdim);
+
+	float4 cluster_centers[4];
+	int partition_of_texel[MAX_TEXELS_PER_BLOCK];
+
+	// plain k-means: seed the centers, assign, then two rounds of update + assign
+	// (three assignment passes in total).
+	kpp_initialize(xdim, ydim, zdim, partition_count, blk, cluster_centers);
+	basic_kmeans_assign_pass(xdim, ydim, zdim, partition_count, blk, cluster_centers, partition_of_texel);
+	for (int round = 0; round < 2; round++)
+	{
+		basic_kmeans_update(xdim, ydim, zdim, partition_count, blk, partition_of_texel, cluster_centers);
+		basic_kmeans_assign_pass(xdim, ydim, zdim, partition_count, blk, cluster_centers, partition_of_texel);
+	}
+
+	// at this point, we have a near-ideal partitioning.
+
+	// build one bitmap per partition: bit k is set in the bitmap of the partition that
+	// owns the k-th texel selected for bitmap partitioning.
+	uint64_t bitmaps[4] = { 0ULL, 0ULL, 0ULL, 0ULL };
+	const int texels_to_process = bsd->texelcount_for_bitmap_partitioning;
+	for (int k = 0; k < texels_to_process; k++)
+	{
+		const int texel = bsd->texels_for_bitmap_partitioning[k];
+		bitmaps[partition_of_texel[texel]] |= 1ULL << k;
+	}
+
+	// count the bits of partition-mismatch for each entry in the partition table ...
+	int bitcounts[PARTITION_COUNT];
+	count_partition_mismatch_bits(xdim, ydim, zdim, partition_count, bitmaps, bitcounts);
+
+	// ... and sort the partitions by that count.
+	get_partition_ordering_by_mismatch_bits(bitcounts, ordering);
+}
--- a/3rdparty/astc/astc_lib.cpp
+++ b/3rdparty/astc/astc_lib.cpp
@@ -0,0 +1,681 @@
+/*----------------------------------------------------------------------------*/
+/**
+ *  @author Andrew Willmott
+ *
+ *  @brief  Library api for astc codec, to be used as an alternative to astc_toplevel.cpp
+ */
+/*----------------------------------------------------------------------------*/
+
+
+#include "astc_lib.h"
+
+#include "astc_codec_internals.h"
+
+#include <math.h>
+#include <stdio.h>
+
+// Globals declared in astc_codec_internals.h
+// Their definitions live here, each with a fixed initial value.
+// NOTE(review): presumably astc_toplevel.cpp supplies these when the command-line
+// tool is built instead of this library - confirm before linking both.
+int perform_srgb_transform = 0;
+int alpha_force_use_of_hdr = 0;
+int rgb_force_use_of_hdr = 0;
+int print_tile_errors = 0;
+
+// Diagnostic switches, only compiled in when DEBUG_PRINT_DIAGNOSTICS is defined.
+#ifdef DEBUG_PRINT_DIAGNOSTICS
+    int print_diagnostics = 0;
+    int diagnostics_tile = -1;
+#endif
+
+// ASTC code expects this to be defined
+// Reports an internal codec error by printing the source location (file name and
+// line number) to stderr.
+// NOTE(review): this version returns to the caller after printing; confirm that
+// the call sites in the codec can safely continue after reporting an error.
+void astc_codec_internal_error(const char* filename, int line)
+{
+    fprintf(stderr, "ASTC encode error @ %s:%d\n", filename, line);
+}
+
+// @todo add HDR variants
+
+namespace
+{
+    // Set once the codec's lookup tables have been built.
+    static bool s_tables_initialised = false;
+
+    // Build the codec's angular and quantization-mode tables on first use; later
+    // calls return immediately. The flag is a plain bool with no locking.
+    inline void init_tables()
+    {
+        if (s_tables_initialised)
+            return;
+
+        prepare_angular_tables();
+        build_quantization_mode_table();
+        s_tables_initialised = true;
+    }
+
+    // Swizzle pattern for each supported source channel order, one entry per order
+    // named in the trailing comments. The four values of an entry are source channel
+    // indices (0 = first byte of a texel ... 3 = fourth byte).
+    // NOTE(review): presumably indexed by the channel-order enum from astc_lib.h -
+    // confirm the enum values match this order.
+    const swizzlepattern k_swizzles[] =
+    {
+        { 0, 1, 2, 3 }, // ASTC_RGBA
+        { 2, 1, 0, 3 }, // ASTC_BGRA
+    };
+
+    // Allocate every scratch buffer referenced by a compress_symbolic_block_buffers.
+    // A single compress_fixed_partition_buffers set is created and shared by both
+    // `planes2` and `plane1`. Release everything with free_temp_buffers().
+    void alloc_temp_buffers(compress_symbolic_block_buffers* temp_buffers)
+    {
+        temp_buffers->ewb        = new error_weight_block;
+        temp_buffers->ewbo       = new error_weight_block_orig;
+        temp_buffers->tempblocks = new symbolic_compressed_block[4];
+        temp_buffers->temp       = new imageblock;
+
+        compress_fixed_partition_buffers* planes = new compress_fixed_partition_buffers;
+        temp_buffers->planes2 = planes;
+
+        planes->ei1  = new endpoints_and_weights;
+        planes->ei2  = new endpoints_and_weights;
+        planes->eix1 = new endpoints_and_weights[MAX_DECIMATION_MODES];
+        planes->eix2 = new endpoints_and_weights[MAX_DECIMATION_MODES];
+
+        planes->decimated_quantized_weights = new float[2 * MAX_DECIMATION_MODES * MAX_WEIGHTS_PER_BLOCK];
+        planes->decimated_weights           = new float[2 * MAX_DECIMATION_MODES * MAX_WEIGHTS_PER_BLOCK];
+        planes->flt_quantized_decimated_quantized_weights = new float[2 * MAX_WEIGHT_MODES * MAX_WEIGHTS_PER_BLOCK];
+        planes->u8_quantized_decimated_quantized_weights  = new uint8_t[2 * MAX_WEIGHT_MODES * MAX_WEIGHTS_PER_BLOCK];
+
+        // the one-plane path reuses the two-plane buffers rather than owning its own
+        temp_buffers->plane1 = planes;
+    }
+
+    // Release everything allocated by alloc_temp_buffers(). `plane1` is not freed
+    // separately because it aliases `planes2`.
+    void free_temp_buffers(compress_symbolic_block_buffers* temp_buffers)
+    {
+        compress_fixed_partition_buffers* planes = temp_buffers->planes2;
+
+        // arrays owned by the shared partition buffers
+        delete[] planes->decimated_quantized_weights;
+        delete[] planes->decimated_weights;
+        delete[] planes->flt_quantized_decimated_quantized_weights;
+        delete[] planes->u8_quantized_decimated_quantized_weights;
+        delete[] planes->eix1;
+        delete[] planes->eix2;
+
+        // single objects owned by the shared partition buffers, then the set itself
+        delete planes->ei1;
+        delete planes->ei2;
+        delete planes;
+
+        // top-level scratch buffers
+        delete[] temp_buffers->tempblocks;
+        delete temp_buffers->temp;
+        delete temp_buffers->ewbo;
+        delete temp_buffers->ewb;
+    }
+
+
+    // Fill an imageblock straight from a conventional 2D image of 8-bit, 4-channel
+    // texels (`src_stride` bytes per row). This is a more direct version of the
+    // astc_codec_image routine; it doesn't support padding, so mean_stdev_radius,
+    // alpha_radius etc. must be zero.
+    //
+    // The block covers xdim x ydim texels starting at (xpos, ypos). Texels beyond the
+    // right or bottom image edge repeat the last column / row. Channels are scaled to
+    // 0..1 and reordered through `swz`; if `srgb` is set, R, G and B are then
+    // converted from sRGB to linear.
+    void to_imageblock
+    (
+        imageblock*    pb,
+        const uint8_t* src_data,
+        int            src_stride,
+        int            xpos,
+        int            ypos,
+        int            xsize,
+        int            ysize,
+        int            xdim,
+        int            ydim,
+        swizzlepattern swz,
+        bool           srgb
+    )
+    {
+        const int pixelcount = xdim * ydim;
+
+        pb->xpos = xpos;
+        pb->ypos = ypos;
+        pb->zpos = 0;
+
+        // swizzle sources: 0..3 are the texel's four channels, 4 is a constant 0 and
+        // 5 is a constant 1.
+        float data[6];
+        data[4] = 0;
+        data[5] = 1;
+
+        float* out = pb->orig_data;
+        for (int y = 0; y < ydim; y++)
+        {
+            // clamp to the last image row
+            int yi = ypos + y;
+            if (yi >= ysize)
+                yi = ysize - 1;
+
+            for (int x = 0; x < xdim; x++, out += 4)
+            {
+                // clamp to the last image column
+                int xi = xpos + x;
+                if (xi >= xsize)
+                    xi = xsize - 1;
+
+                const uint8_t* texel = src_data + (src_stride * yi + 4 * xi);
+                for (int c = 0; c < 4; c++)
+                    data[c] = texel[c] / 255.0f;
+
+                out[0] = data[swz.r];
+                out[1] = data[swz.g];
+                out[2] = data[swz.b];
+                out[3] = data[swz.a];
+            }
+        }
+
+        // perform sRGB-to-linear transform on input data, if requested (alpha is left alone).
+        if (srgb)
+        {
+            float* px = pb->orig_data;
+            for (int i = 0; i < pixelcount; i++, px += 4)
+            {
+                for (int c = 0; c < 3; c++)
+                {
+                    float v = px[c];
+                    if (v <= 0.04045f)
+                        v = v * (1.0f / 12.92f);
+                    else if (v <= 1)
+                        v = pow((v + 0.055f) * (1.0f / 1.055f), 2.4f);
+                    px[c] = v;
+                }
+            }
+        }
+
+        // clear the per-texel LNS and NaN flags
+        for (int i = 0; i < pixelcount; i++)
+        {
+            pb->rgb_lns  [i] = 0;
+            pb->alpha_lns[i] = 0;
+            pb->nan_texel[i] = 0;
+        }
+
+        imageblock_initialize_work_from_orig(pb, pixelcount);
+
+        update_imageblock_flags(pb, xdim, ydim, 1);
+    }
+
+    // Compress a 2D image of 8-bit, 4-channel texels into ASTC blocks of xdim x ydim
+    // texels. Blocks are written to `dst` in row-major block order, 16 bytes each;
+    // `dst` must have room for ceil(xsize / xdim) * ceil(ysize / ydim) blocks.
+    void encode_astc
+    (
+        const uint8_t*                  src,
+        int                             src_stride,
+        swizzlepattern                  src_swz,
+        int                             xsize,
+        int                             ysize,
+        int                             xdim,
+        int                             ydim,
+        const error_weighting_params*   ewp,
+        astc_decode_mode                decode_mode,
+        uint8_t*                        dst
+    )
+    {
+        // number of blocks in each direction, rounding partial blocks up
+        const int xblocks = (xsize + xdim - 1) / xdim;
+        const int yblocks = (ysize + ydim - 1) / ydim;
+
+        // return values unused.
+        // NOTE(review): presumably called to build the per-block-size tables up
+        // front - confirm.
+        get_block_size_descriptor(xdim, ydim, 1);
+        get_partition_table(xdim, ydim, 1, 0);
+
+        imageblock pb;
+
+        compress_symbolic_block_buffers temp_buffers;
+        alloc_temp_buffers(&temp_buffers);
+
+        astc_codec_image image_info = { nullptr, nullptr, xsize, ysize, 1, 0 };
+
+        const bool srgb = (decode_mode == DECODE_LDR_SRGB);
+
+        uint8_t* dst_block = dst;
+        for (int by = 0; by < yblocks; by++)
+        {
+            for (int bx = 0; bx < xblocks; bx++, dst_block += 16)
+            {
+                // source texels -> imageblock -> symbolic block -> 16 physical bytes
+                to_imageblock(&pb, src, src_stride, bx * xdim, by * ydim, xsize, ysize, xdim, ydim, src_swz, srgb);
+
+                symbolic_compressed_block scb;
+                compress_symbolic_block(&image_info, decode_mode, xdim, ydim, 1, ewp, &pb, &scb, &temp_buffers);
+
+                physical_compressed_block pcb = symbolic_to_physical(xdim, ydim, 1, &scb);
+
+                *(physical_compressed_block*) dst_block = pcb;
+            }
+        }
+
+        free_temp_buffers(&temp_buffers);
+    }
+
+    // Reset the error-weighting parameters to a neutral baseline: powers, base
+    // weights and per-channel weights are 1, every other field is 0.
+    void init_ewp(error_weighting_params& ewp)
+    {
+        // powers and base weights
+        ewp.rgb_power         = 1.0f;
+        ewp.alpha_power       = 1.0f;
+        ewp.rgb_base_weight   = 1.0f;
+        ewp.alpha_base_weight = 1.0f;
+
+        // mean / standard-deviation based weighting: all off
+        ewp.rgb_mean_weight           = 0.0f;
+        ewp.rgb_stdev_weight          = 0.0f;
+        ewp.alpha_mean_weight         = 0.0f;
+        ewp.alpha_stdev_weight        = 0.0f;
+        ewp.rgb_mean_and_stdev_mixing = 0.0f;
+
+        // radii and alpha scaling: all off
+        ewp.mean_stdev_radius           = 0;
+        ewp.enable_rgb_scale_with_alpha = 0;
+        ewp.alpha_radius                = 0;
+
+        ewp.block_artifact_suppression = 0.0f;
+
+        // every channel counts equally
+        for (int c = 0; c < 4; c++)
+            ewp.rgba_weights[c] = 1.0f;
+
+        ewp.ra_normal_angular_scale = 0;
+    }
+
+    // Fill in the search-effort fields of `ewp` for the requested quality preset.
+    //
+    // mode - quality/speed preset. A value matching none of the presets keeps
+    //        the zero defaults below; the partition limit is then clamped to 1.
+    // xdim - block width in texels.
+    // ydim - block height in texels.
+    // ewp  - receives max_refinement_iters, block_mode_cutoff,
+    //        partition_1_to_2_limit, lowest_correlation_cutoff,
+    //        partition_search_limit and texel_avg_error_limit, and is then
+    //        handed to expand_block_artifact_suppression().
+    //
+    // The dimensions are taken in (width, height) order, matching the call in
+    // astc_compress(), so expand_block_artifact_suppression() sees them the
+    // right way round for non-square blocks.
+    void setup_ewp(ASTC_COMPRESS_MODE mode, int xdim, int ydim, error_weighting_params& ewp)
+    {
+        float oplimit_autoset    = 0.0f;
+        float dblimit_autoset_2d = 0.0f;
+        float bmc_autoset        = 0.0f;
+        float mincorrel_autoset  = 0.0f;
+
+        int plimit_autoset       = -1;
+        int maxiters_autoset     = 0;
+
+        const float log10_texels_2d = log((float)(xdim * ydim)) / log(10.0f);
+
+        switch (mode)
+        {
+        case ASTC_COMPRESS_VERY_FAST:
+            plimit_autoset     = 2;
+            oplimit_autoset    = 1.0f;
+            mincorrel_autoset  = 0.5f;
+            dblimit_autoset_2d = MAX(70 - 35 * log10_texels_2d, 53 - 19 * log10_texels_2d);
+            bmc_autoset        = 25;
+            maxiters_autoset   = 1;
+            break;
+
+        case ASTC_COMPRESS_FAST:
+            plimit_autoset     = 4;
+            oplimit_autoset    = 1.0f;
+            mincorrel_autoset  = 0.5f;
+            dblimit_autoset_2d = MAX(85 - 35 * log10_texels_2d, 63 - 19 * log10_texels_2d);
+            bmc_autoset        = 50;
+            maxiters_autoset   = 1;
+            break;
+
+        case ASTC_COMPRESS_MEDIUM:
+            plimit_autoset     = 25;
+            oplimit_autoset    = 1.2f;
+            mincorrel_autoset  = 0.75f;
+            dblimit_autoset_2d = MAX(95 - 35 * log10_texels_2d, 70 - 19 * log10_texels_2d);
+            bmc_autoset        = 75;
+            maxiters_autoset   = 2;
+            break;
+
+        case ASTC_COMPRESS_THOROUGH:
+            plimit_autoset     = 100;
+            oplimit_autoset    = 2.5f;
+            mincorrel_autoset  = 0.95f;
+            dblimit_autoset_2d = MAX(105 - 35 * log10_texels_2d, 77 - 19 * log10_texels_2d);
+            bmc_autoset        = 95;
+            maxiters_autoset   = 4;
+            break;
+
+        case ASTC_COMPRESS_EXHAUSTIVE:
+            plimit_autoset     = PARTITION_COUNT;
+            oplimit_autoset    = 1000.0f;
+            mincorrel_autoset  = 0.99f;
+            dblimit_autoset_2d = 999.0f;
+            bmc_autoset        = 100;
+            maxiters_autoset   = 4;
+            break;
+
+        default:
+            break;
+        }
+
+        ewp.max_refinement_iters      = maxiters_autoset;
+        ewp.block_mode_cutoff         = bmc_autoset / 100.0f;   // percentage -> fraction
+        ewp.partition_1_to_2_limit    = oplimit_autoset;
+        ewp.lowest_correlation_cutoff = mincorrel_autoset;
+
+        // Keep the partition search limit inside [1, PARTITION_COUNT].
+        int partitions_to_test = plimit_autoset;
+        if (partitions_to_test < 1)
+            partitions_to_test = 1;
+        else if (partitions_to_test > PARTITION_COUNT)
+            partitions_to_test = PARTITION_COUNT;
+        ewp.partition_search_limit = partitions_to_test;
+
+        // The dB limit becomes 10^(-dblimit/10) scaled by 65535^2; when the
+        // global rgb_force_use_of_hdr is set the limit is 0 instead.
+        float texel_avg_error_limit_2d;
+        if (rgb_force_use_of_hdr == 0)
+            texel_avg_error_limit_2d = pow(0.1f, dblimit_autoset_2d * 0.1f) * 65535.0f * 65535.0f;
+        else
+            texel_avg_error_limit_2d = 0.0f;
+        ewp.texel_avg_error_limit = texel_avg_error_limit_2d;
+
+        expand_block_artifact_suppression(xdim, ydim, 1, &ewp);
+    }
+}
+
+// Bytes needed to store a w x h image compressed with bw x bh blocks.
+//
+// w, h   - image size in pixels.
+// bw, bh - block size in texels.
+//
+// Each block occupies 16 bytes and partially covered edge blocks count as
+// whole blocks. Returns 0 when any argument is not positive. The arithmetic
+// is done in size_t so large images do not overflow int.
+size_t astc_compressed_size(int w, int h, int bw, int bh)
+{
+    if (w <= 0 || h <= 0 || bw <= 0 || bh <= 0)
+        return 0;
+
+    const size_t nx = ((size_t) w + (size_t) bw - 1) / (size_t) bw;
+    const size_t ny = ((size_t) h + (size_t) bh - 1) / (size_t) bh;
+
+    return nx * ny * 16;
+}
+
+// Compress a 4-byte-per-pixel source image into ASTC blocks in dst_data.
+// A src_stride of 0 selects tightly packed rows (src_width * 4 bytes).
+void astc_compress
+(
+    int                src_width,
+    int                src_height,
+    const uint8_t*     src_data,
+    ASTC_CHANNELS      src_channels,
+    int                src_stride,
+
+    int                block_width,
+    int                block_height,
+    ASTC_COMPRESS_MODE compress_mode,
+    ASTC_DECODE_MODE   decode_mode,
+    uint8_t*           dst_data
+)
+{
+    init_tables();
+
+    // Row pitch in bytes: the caller's value, or tightly packed when 0.
+    const int row_bytes = (src_stride != 0) ? src_stride : src_width * 4;
+
+    // Baseline weights first, then the preset-specific search limits.
+    error_weighting_params weights;
+    init_ewp(weights);
+    setup_ewp(compress_mode, block_width, block_height, weights);
+
+    const astc_decode_mode codec_mode = (astc_decode_mode) decode_mode;
+
+    encode_astc(src_data, row_bytes, k_swizzles[src_channels],
+                src_width, src_height,
+                block_width, block_height,
+                &weights, codec_mode, dst_data);
+}
+
+namespace
+{
+    // Apply the sRGB transfer curve to one linear channel value. Values above
+    // 1 are returned unchanged; the caller clamps them afterwards.
+    float srgb_encode_channel(float v)
+    {
+        if (v <= 0.0031308f)
+            return v * 12.92f;
+        if (v <= 1)
+            return 1.055f * pow(v, (1.0f / 2.4f)) - 0.055f;
+        return v;
+    }
+
+    // Clamp to [0,1] and round to the nearest 8-bit value. Negative and NaN
+    // inputs become 0, so the conversion to uint8_t is always well defined.
+    uint8_t pack_channel_unorm8(float v)
+    {
+        if (!(v > 0.0f))
+            return 0;
+        if (v > 1.0f)
+            v = 1.0f;
+        return uint8_t(floorf(v * 255.0f + 0.5f));
+    }
+
+    // More direct version of the astc_codec_image routine, which operates on a
+    // more conventional 2D image layout.
+    //
+    // xdim, ydim    - number of texels to write per row / number of rows.
+    // pb            - decoded block to read from.
+    // srgb          - apply the sRGB transfer curve to r, g, b before packing.
+    // swz           - byte offset of each channel inside a 4-byte output pixel.
+    // dst_data      - first output pixel of the block.
+    // dst_stride    - bytes between output rows.
+    // pb_row_texels - texels per row inside pb; 0 means "same as xdim". This
+    //                 must be the full block width when xdim has been clamped
+    //                 at the right image edge, otherwise every row after the
+    //                 first would be read from the wrong place.
+    void from_imageblock(int xdim, int ydim, const imageblock* pb, bool srgb, swizzlepattern swz, uint8_t* dst_data, int dst_stride, int pb_row_texels = 0)
+    {
+        const int row_texels = (pb_row_texels > 0) ? pb_row_texels : xdim;
+
+        for (int y = 0; y < ydim; y++)
+        {
+            const float*   fptr = pb->orig_data + 4 * (y * row_texels);
+            const uint8_t* nptr = pb->nan_texel + y * row_texels;
+
+            for (int x = 0; x < xdim; x++, fptr += 4, nptr++)
+            {
+                uint8_t* texel = dst_data + 4 * x;
+
+                if (*nptr)
+                {
+                    // NaN-pixel, but we can't display it. Display purple instead.
+                    texel[swz.r] = 0xFF;
+                    texel[swz.g] = 0x00;
+                    texel[swz.b] = 0xFF;
+                    texel[swz.a] = 0xFF;
+                    continue;
+                }
+
+                float r = fptr[0];
+                float g = fptr[1];
+                float b = fptr[2];
+                float a = fptr[3];
+
+                // Alpha is never sRGB-encoded.
+                if (srgb)
+                {
+                    r = srgb_encode_channel(r);
+                    g = srgb_encode_channel(g);
+                    b = srgb_encode_channel(b);
+                }
+
+                texel[swz.r] = pack_channel_unorm8(r);
+                texel[swz.g] = pack_channel_unorm8(g);
+                texel[swz.b] = pack_channel_unorm8(b);
+                texel[swz.a] = pack_channel_unorm8(a);
+            }
+
+            dst_data += dst_stride;
+        }
+    }
+}
+
+// Decompress ASTC blocks from src_data into a 4-byte-per-pixel image.
+//
+// src_data     - compressed blocks, 16 bytes each, in row-major block order.
+// xdim, ydim   - block size in texels; nothing is written if either is <= 0.
+// decode_mode  - cast to the codec's astc_decode_mode; ASTC_DECODE_LDR_SRGB
+//                additionally sRGB-encodes r, g, b on output.
+// xsize, ysize - destination image size in pixels.
+// dst_data     - destination pixels.
+// dst_channels - byte order of each destination pixel.
+// dst_stride   - bytes per destination row; 0 selects 4 * xsize.
+//
+// Blocks that overhang the right or bottom image edge are cropped to the image.
+void astc_decompress
+(
+    const uint8_t*     src_data,
+    int                xdim,
+    int                ydim,
+    ASTC_DECODE_MODE   decode_mode,
+
+    int                xsize,
+    int                ysize,
+    uint8_t*           dst_data,
+    ASTC_CHANNELS      dst_channels,
+    int                dst_stride
+)
+{
+    // A non-positive block size would divide by zero below.
+    if (xdim <= 0 || ydim <= 0)
+        return;
+
+    init_tables();
+
+    int xblocks = (xsize + xdim - 1) / xdim;
+    int yblocks = (ysize + ydim - 1) / ydim;
+
+    if (dst_stride == 0)
+        dst_stride = 4 * xsize;
+
+    imageblock pb;
+
+    for (int y = 0; y < yblocks; y++)
+    {
+        int ypos = y * ydim;
+        int clamp_ydim = MIN(ysize - ypos, ydim);
+
+        uint8_t* dst_row = dst_data + ypos * dst_stride;
+
+        for (int x = 0; x < xblocks; x++)
+        {
+            int xpos = x * xdim;
+            int clamp_xdim = MIN(xsize - xpos, xdim);
+
+            physical_compressed_block pcb = *(const physical_compressed_block *) src_data;
+            symbolic_compressed_block scb;
+
+            physical_to_symbolic(xdim, ydim, 1, pcb, &scb);
+            decompress_symbolic_block((astc_decode_mode) decode_mode, xdim, ydim, 1, xpos, ypos, 0, &scb, &pb);
+
+            // pb is decoded for the full xdim x ydim block, so its rows are
+            // taken to be xdim texels apart even when only clamp_xdim of them
+            // are copied out.
+            // NOTE(review): confirm the layout against decompress_symbolic_block.
+            from_imageblock(clamp_xdim, clamp_ydim, &pb, decode_mode == ASTC_DECODE_LDR_SRGB, k_swizzles[dst_channels], dst_row + xpos * 4, dst_stride, xdim);
+
+            src_data += 16;
+        }
+    }
+}
+
+// Relevant astc source files. These aren't set up for a bulk build yet though.
+#ifdef DISABLED
+    #include "astc_block_sizes2.cpp"
+    #include "astc_color_quantize.cpp"
+    #include "astc_color_unquantize.cpp"
+    #include "astc_compress_symbolic.cpp"
+    #include "astc_compute_variance.cpp"
+    #include "astc_decompress_symbolic.cpp"
+    #include "astc_encoding_choice_error.cpp"
+    #include "astc_find_best_partitioning.cpp"
+    #include "astc_ideal_endpoints_and_weights.cpp"
+    #include "astc_imageblock.cpp"
+    #include "astc_integer_sequence.cpp"
+    #include "astc_kmeans_partitioning.cpp"
+    #include "astc_partition_tables.cpp"
+    #include "astc_percentile_tables.cpp"
+    #include "astc_pick_best_endpoint_format.cpp"
+    #include "astc_quantization.cpp"
+    #include "astc_symbolic_physical.cpp"
+    #include "astc_weight_align.cpp"
+    #include "astc_weight_quant_xfer_tables.cpp"
+    #include "mathlib.cpp"
+    #include "softfloat.cpp"
+#endif
--- a/3rdparty/astc/astc_lib.h
+++ b/3rdparty/astc/astc_lib.h
@@ -0,0 +1,73 @@
+/*----------------------------------------------------------------------------*/
+/**
+ *	@author Andrew Willmott
+ *
+ *	@brief	Library api for astc codec, to be used as an alternative to astc_toplevel.cpp
+ */
+/*----------------------------------------------------------------------------*/
+
+#ifndef ASTC_LIB_H
+#define ASTC_LIB_H
+
+#include <stdint.h>
+#include <stdlib.h>
+
+enum ASTC_COMPRESS_MODE     // Trade-off compression quality for speed; later entries search more partitionings and refinement passes
+{
+	ASTC_COMPRESS_VERY_FAST,  // partition search limit 2, 1 refinement pass
+	ASTC_COMPRESS_FAST,       // partition search limit 4, 1 refinement pass
+	ASTC_COMPRESS_MEDIUM,     // partition search limit 25, 2 refinement passes
+	ASTC_COMPRESS_THOROUGH,   // partition search limit 100, 4 refinement passes
+	ASTC_COMPRESS_EXHAUSTIVE, // every partitioning is searched, 4 refinement passes
+};
+
+enum ASTC_DECODE_MODE       // Cast directly to the codec's internal astc_decode_mode by astc_compress/astc_decompress, so the order presumably has to match it
+{
+	ASTC_DECODE_LDR_SRGB,   // texture will be decompressed to 8-bit SRGB
+	ASTC_DECODE_LDR_LINEAR, // texture will be decompressed to 8-bit linear
+	ASTC_DECODE_HDR         // texture will be decompressed to 16-bit linear
+};
+
+enum ASTC_CHANNELS          // Byte order of a 4-byte pixel; used as an index into the library's swizzle table
+{
+    ASTC_RGBA,
+    ASTC_BGRA
+};
+
+
+size_t astc_compressed_size(int width, int height, int block_width, int block_height);
+//!< Returns size in bytes of the compressed data for a width x height source image, assuming the given block size.
+//!< Arguments are the image dimensions first, then the block dimensions, matching the definition.
+
+void astc_compress
+(
+    int                src_width,      // source image width in pixels
+    int                src_height,     // source image height in pixels
+    const uint8_t*     src_data,       // source pixels, 4 bytes per pixel
+    ASTC_CHANNELS      src_channels,   // byte order of each source pixel
+    int                src_stride,     // bytes per source row; 0 selects src_width * 4
+
+    int                block_width,    // ASTC block width in texels
+    int                block_height,   // ASTC block height in texels
+    ASTC_COMPRESS_MODE compress_mode,  // quality/speed preset
+    ASTC_DECODE_MODE   decode_mode,    // how the compressed data is meant to be decoded
+    uint8_t*           dst_data        // output buffer; 16 bytes are written per block
+);
+//!< Compress 8-bit rgba source image into dst_data (expected to be of size astc_compressed_size(...))
+
+void astc_decompress
+(
+    const uint8_t*     src_data,       // compressed blocks, 16 bytes each, read in row-major block order
+    int                block_width,    // ASTC block width in texels
+    int                block_height,   // ASTC block height in texels
+    ASTC_DECODE_MODE   decode_mode,    // ASTC_DECODE_LDR_SRGB also sRGB-encodes r, g, b on output
+
+    int                dst_width,      // destination image width in pixels
+    int                dst_height,     // destination image height in pixels
+    uint8_t*           dst_data,       // destination pixels, 4 bytes per pixel
+    ASTC_CHANNELS      dst_channels,   // byte order of each destination pixel
+    int                dst_stride      // bytes per destination row; 0 selects 4 * dst_width
+);
+//!< Decompress astc source image into 8-bit rgba destination image.
+
+#endif
+
--- a/3rdparty/astc/astc_partition_tables.cpp
+++ b/3rdparty/astc/astc_partition_tables.cpp
@@ -0,0 +1,323 @@
+/*----------------------------------------------------------------------------*/  
+/**
+ *	This confidential and proprietary software may be used only as
+ *	authorised by a licensing agreement from ARM Limited
+ *	(C) COPYRIGHT 2011-2012 ARM Limited
+ *	ALL RIGHTS RESERVED
+ *
+ *	The entire notice above must be reproduced on all authorised
+ *	copies and copies may only be made to the extent permitted
+ *	by a licensing agreement from ARM Limited.
+ *
+ *	@brief	Functions to generate partition tables for ASTC.
+ *
+ *			We generate tables only for the block sizes that have actually been
+ *			specified to the codec.
+ */ 
+/*----------------------------------------------------------------------------*/ 
+
+#include "astc_codec_internals.h"
+
+static partition_info **partition_tables[4096];	// indexed by xdim + 16 * ydim + 256 * zdim; an entry stays NULL until get_partition_table() builds it
+
+/*
+	Build the canonical form of a partition pattern.
+
+	Partition labels are renumbered in order of first appearance, then packed at
+	two bits per texel, 32 texels per 64-bit word. The largest pattern needs
+	432 bits, hence the 7 uint64_t output words; unused bits are left at zero.
+*/
+static void gen_canonicalized_partition_table(int texel_count, const uint8_t * partition_table, uint64_t canonicalized[7])
+{
+	for (int word = 0; word < 7; word++)
+		canonicalized[word] = 0;
+
+	// original label -> canonical label, -1 while the label has not been seen
+	int relabel[4] = { -1, -1, -1, -1 };
+	int labels_seen = 0;
+
+	for (int texel = 0; texel < texel_count; texel++)
+	{
+		const int label = partition_table[texel];
+		if (relabel[label] == -1)
+		{
+			relabel[label] = labels_seen;
+			labels_seen++;
+		}
+
+		const uint64_t canonical_label = relabel[label];
+		const int word = texel >> 5;
+		const int shift = 2 * (texel & 0x1F);
+		canonicalized[word] |= canonical_label << shift;
+	}
+}
+
+
+/* Returns 1 when all seven words of the two canonical patterns are equal, 0 otherwise. */
+static int compare_canonicalized_partition_tables(const uint64_t part1[7], const uint64_t part2[7])
+{
+	for (int word = 0; word < 7; word++)
+	{
+		if (part1[word] != part2[word])
+			return 0;
+	}
+	return 1;
+}
+
+
+/*
+   For a partition table, detect partitionings that are equivalent to an earlier entry, then mark
+   them as invalid by setting their partition_count to 0. This reduces the number of partitions
+   that the codec has to consider and thus improves encode performance.
+
+   pi must point at PARTITION_COUNT entries generated for an xdim x ydim x zdim block.
+ */
+static void partition_table_zap_equal_elements(int xdim, int ydim, int zdim, partition_info * pi)
+{
+	const int texel_count = xdim * ydim * zdim;
+
+	// one 7-word canonical pattern per partitioning
+	uint64_t *canonicalizeds = new uint64_t[PARTITION_COUNT * 7];
+
+	for (int i = 0; i < PARTITION_COUNT; i++)
+		gen_canonicalized_partition_table(texel_count, pi[i].partition_of_texel, canonicalizeds + i * 7);
+
+	// an entry is a duplicate if any earlier entry has the same canonical pattern
+	for (int i = 0; i < PARTITION_COUNT; i++)
+	{
+		for (int j = 0; j < i; j++)
+		{
+			if (compare_canonicalized_partition_tables(canonicalizeds + 7 * i, canonicalizeds + 7 * j))
+			{
+				pi[i].partition_count = 0;
+				break;
+			}
+		}
+	}
+
+	delete[] canonicalizeds;
+}
+
+
+/*
+	32-bit integer mixing function used to derive the partition seeds.
+	All arithmetic wraps modulo 2^32; an input of 0 hashes to 0.
+*/
+uint32_t hash52(uint32_t inp)
+{
+	uint32_t h = inp;
+
+	h = h ^ (h >> 15);
+	h = h * 0xEEDE0891;			// equals -(2^4+1)*(2^7+1)*(2^17-1) modulo 2^32
+	h = h ^ (h >> 5);
+	h = h + (h << 16);
+	h = h ^ (h >> 7);
+	h = h ^ (h >> 3);
+	h = h ^ (h << 6);
+	h = h ^ (h >> 17);
+
+	return h;
+}
+
+
+/*
+	Returns the partition index (0..3) of texel (x, y, z) for partitioning number `seed`.
+
+	seed           - partitioning index; offset by (partitioncount - 1) * 1024 before hashing.
+	x, y, z        - texel position inside the block; doubled when small_block is non-zero.
+	partitioncount - number of partitions wanted; the scores of partitions at or beyond this
+	                 count are forced to 0.
+	small_block    - non-zero to double the coordinates before they are used.
+
+	The statement order and the uint8_t truncations below are part of the result; do not "tidy" them.
+*/
+int select_partition(int seed, int x, int y, int z, int partitioncount, int small_block)
+{
+	if (small_block)
+	{
+		x <<= 1;
+		y <<= 1;
+		z <<= 1;
+	}
+
+	seed += (partitioncount - 1) * 1024;
+
+	uint32_t rnum = hash52(seed);
+	// carve twelve 4-bit seeds out of the hash; seed9..seed12 overlap the bits of seed5..seed8
+	uint8_t seed1 = rnum & 0xF;
+	uint8_t seed2 = (rnum >> 4) & 0xF;
+	uint8_t seed3 = (rnum >> 8) & 0xF;
+	uint8_t seed4 = (rnum >> 12) & 0xF;
+	uint8_t seed5 = (rnum >> 16) & 0xF;
+	uint8_t seed6 = (rnum >> 20) & 0xF;
+	uint8_t seed7 = (rnum >> 24) & 0xF;
+	uint8_t seed8 = (rnum >> 28) & 0xF;
+	uint8_t seed9 = (rnum >> 18) & 0xF;
+	uint8_t seed10 = (rnum >> 22) & 0xF;
+	uint8_t seed11 = (rnum >> 26) & 0xF;
+	uint8_t seed12 = ((rnum >> 30) | (rnum << 2)) & 0xF;
+
+	// squaring all the seeds in order to bias their distribution
+	// towards lower values. The largest square, 15 * 15 = 225, still fits in a uint8_t.
+	seed1 *= seed1;
+	seed2 *= seed2;
+	seed3 *= seed3;
+	seed4 *= seed4;
+	seed5 *= seed5;
+	seed6 *= seed6;
+	seed7 *= seed7;
+	seed8 *= seed8;
+	seed9 *= seed9;
+	seed10 *= seed10;
+	seed11 *= seed11;
+	seed12 *= seed12;
+
+	// choose the right-shift amounts from bits 0, 1 and 4 of the (offset) seed and from partitioncount
+	int sh1, sh2, sh3;
+	if (seed & 1)
+	{
+		sh1 = (seed & 2 ? 4 : 5);
+		sh2 = (partitioncount == 3 ? 6 : 5);
+	}
+	else
+	{
+		sh1 = (partitioncount == 3 ? 6 : 5);
+		sh2 = (seed & 2 ? 4 : 5);
+	}
+	sh3 = (seed & 0x10) ? sh1 : sh2;
+
+	seed1 >>= sh1;
+	seed2 >>= sh2;
+	seed3 >>= sh1;
+	seed4 >>= sh2;
+	seed5 >>= sh1;
+	seed6 >>= sh2;
+	seed7 >>= sh1;
+	seed8 >>= sh2;
+
+	seed9 >>= sh3;
+	seed10 >>= sh3;
+	seed11 >>= sh3;
+	seed12 >>= sh3;
+
+
+	// one score per partition: a linear function of the texel position plus a hash-derived offset
+	int a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14);
+	int b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
+	int c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
+	int d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);
+
+
+	// apply the saw: keep only the low 6 bits of each score
+	a &= 0x3F;
+	b &= 0x3F;
+	c &= 0x3F;
+	d &= 0x3F;
+
+	// remove some of the components if we are to output < 4 partitions.
+	if (partitioncount <= 3)
+		d = 0;
+	if (partitioncount <= 2)
+		c = 0;
+	if (partitioncount <= 1)
+		b = 0;
+	// highest score wins; on a tie the lowest partition index is chosen
+	int partition;
+	if (a >= b && a >= c && a >= d)
+		partition = 0;
+	else if (b >= c && b >= d)
+		partition = 1;
+	else if (c >= d)
+		partition = 2;
+	else
+		partition = 3;
+	return partition;
+}
+
+
+
+/*
+	Fill *pt with partitioning number `partition_index` for a block of xdim x ydim x zdim texels
+	split into `partition_count` partitions.
+
+	Written fields: partition_of_texel, texels_of_partition, texels_per_partition,
+	partition_count (the number of leading non-empty partitions) and coverage_bitmaps.
+*/
+void generate_one_partition_table(int xdim, int ydim, int zdim, int partition_count, int partition_index, partition_info * pt)
+{
+	const int texels_per_block = xdim * ydim * zdim;
+	const int small_block = texels_per_block < 32;
+
+	// assign each texel to a partition, x fastest, then y, then z
+	int texel = 0;
+	for (int tz = 0; tz < zdim; tz++)
+	{
+		for (int ty = 0; ty < ydim; ty++)
+		{
+			for (int tx = 0; tx < xdim; tx++)
+			{
+				const uint8_t part = select_partition(partition_index, tx, ty, tz, partition_count, small_block);
+				pt->partition_of_texel[texel] = part;
+				texel++;
+			}
+		}
+	}
+
+	// list the texels belonging to each partition
+	int members[4] = { 0, 0, 0, 0 };
+	for (int i = 0; i < texels_per_block; i++)
+	{
+		const int part = pt->partition_of_texel[i];
+		pt->texels_of_partition[part][members[part]] = i;
+		members[part]++;
+	}
+
+	for (int part = 0; part < 4; part++)
+		pt->texels_per_partition[part] = members[part];
+
+	// the effective partition count is the index of the first empty partition
+	int used = 0;
+	while (used < 4 && members[used] != 0)
+		used++;
+	pt->partition_count = used;
+
+	// one bit per sampled texel, set in the bitmap of the partition that owns it
+	for (int part = 0; part < 4; part++)
+		pt->coverage_bitmaps[part] = 0ULL;
+
+	const block_size_descriptor *bsd = get_block_size_descriptor(xdim, ydim, zdim);
+	const int sample_count = bsd->texelcount_for_bitmap_partitioning;
+	for (int i = 0; i < sample_count; i++)
+	{
+		const int sampled_texel = bsd->texels_for_bitmap_partitioning[i];
+		const int owner = pt->partition_of_texel[sampled_texel];
+		pt->coverage_bitmaps[owner] |= 1ULL << i;
+	}
+}
+
+/*
+	Build every partition table for one block size and publish it in partition_tables[].
+
+	Slot n of the published array holds the tables for n partitions: a single entry for n = 1 and
+	1024 entries each for n = 2, 3 and 4. Slot 0 is NULL. Duplicate partitionings are marked
+	invalid afterwards. The allocations are never freed here.
+*/
+static void generate_partition_tables(int xdim, int ydim, int zdim)
+{
+	partition_info *single = new partition_info;
+	partition_info *pairs = new partition_info[1024];
+	partition_info *triples = new partition_info[1024];
+	partition_info *quads = new partition_info[1024];
+
+	partition_info **by_count = new partition_info *[5];
+	by_count[0] = NULL;
+	by_count[1] = single;
+	by_count[2] = pairs;
+	by_count[3] = triples;
+	by_count[4] = quads;
+
+	generate_one_partition_table(xdim, ydim, zdim, 1, 0, by_count[1]);
+
+	for (int index = 0; index < 1024; index++)
+	{
+		for (int count = 2; count <= 4; count++)
+			generate_one_partition_table(xdim, ydim, zdim, count, index, by_count[count] + index);
+	}
+
+	for (int count = 2; count <= 4; count++)
+		partition_table_zap_equal_elements(xdim, ydim, zdim, by_count[count]);
+
+	partition_tables[xdim + 16 * ydim + 256 * zdim] = by_count;
+}
+
+
+/*
+	Returns the partition table(s) for the given block size and partition count, generating the
+	tables for that block size on first use.
+*/
+const partition_info *get_partition_table(int xdim, int ydim, int zdim, int partition_count)
+{
+	const int key = xdim + 16 * ydim + 256 * zdim;
+
+	// built lazily, once per block size
+	if (partition_tables[key] == NULL)
+		generate_partition_tables(xdim, ydim, zdim);
+
+	partition_info **by_count = partition_tables[key];
+	return by_count[partition_count];
+}
--- a/3rdparty/astc/astc_percentile_tables.cpp
+++ b/3rdparty/astc/astc_percentile_tables.cpp
--- a/3rdparty/astc/astc_pick_best_endpoint_format.cpp
+++ b/3rdparty/astc/astc_pick_best_endpoint_format.cpp
@@ -0,0 +1,938 @@
+/*----------------------------------------------------------------------------*/
+/**
+ *	This confidential and proprietary software may be used only as
+ *	authorised by a licensing agreement from ARM Limited
+ *	(C) COPYRIGHT 2011-2012 ARM Limited
+ *	ALL RIGHTS RESERVED
+ *
+ *	The entire notice above must be reproduced on all authorised
+ *	copies and copies may only be made to the extent permitted
+ *	by a licensing agreement from ARM Limited.
+ *
+ *	@brief	Functions to pick the best ASTC endpoint format for a given block.
+ */
+/*----------------------------------------------------------------------------*/
+#include "astc_codec_internals.h"
+
+#ifdef DEBUG_PRINT_DIAGNOSTICS
+	#include <stdio.h>
+#endif
+
+#include <math.h>
+
+/*
+   functions to determine, for a given partitioning, which color endpoint formats are the best to use.
+
+ */
+
+
+// for a given partition, compute for every (integer-component-count, quantization-level)
+// the color error.
+
+
+static void compute_color_error_for_every_integer_count_and_quantization_level(int encode_hdr_rgb,	// 1 = perform HDR encoding, 0 = perform LDR encoding.
+																			   int encode_hdr_alpha, int partition_index, const partition_info * pi,
+																				const encoding_choice_errors * eci,	// pointer to the structure for the CURRENT partition.
+																			   const endpoints * ep, float4 error_weightings[4],
+																			   // arrays to return results back through.
+																			   float best_error[21][4], int format_of_choice[21][4])
+{
+	int i, j;
+	int partition_size = pi->texels_per_partition[partition_index];
+
+	static const float baseline_quant_error[21] = {
+		(65536.0f * 65536.0f / 18.0f),				// 2 values, 1 step
+		(65536.0f * 65536.0f / 18.0f) / (2 * 2),	// 3 values, 2 steps
+		(65536.0f * 65536.0f / 18.0f) / (3 * 3),	// 4 values, 3 steps
+		(65536.0f * 65536.0f / 18.0f) / (4 * 4),	// 5 values
+		(65536.0f * 65536.0f / 18.0f) / (5 * 5),
+		(65536.0f * 65536.0f / 18.0f) / (7 * 7),
+		(65536.0f * 65536.0f / 18.0f) / (9 * 9),
+		(65536.0f * 65536.0f / 18.0f) / (11 * 11),
+		(65536.0f * 65536.0f / 18.0f) / (15 * 15),
+		(65536.0f * 65536.0f / 18.0f) / (19 * 19),
+		(65536.0f * 65536.0f / 18.0f) / (23 * 23),
+		(65536.0f * 65536.0f / 18.0f) / (31 * 31),
+		(65536.0f * 65536.0f / 18.0f) / (39 * 39),
+		(65536.0f * 65536.0f / 18.0f) / (47 * 47),
+		(65536.0f * 65536.0f / 18.0f) / (63 * 63),
+		(65536.0f * 65536.0f / 18.0f) / (79 * 79),
+		(65536.0f * 65536.0f / 18.0f) / (95 * 95),
+		(65536.0f * 65536.0f / 18.0f) / (127 * 127),
+		(65536.0f * 65536.0f / 18.0f) / (159 * 159),
+		(65536.0f * 65536.0f / 18.0f) / (191 * 191),
+		(65536.0f * 65536.0f / 18.0f) / (255 * 255)
+	};
+
+	float4 ep0 = ep->endpt0[partition_index];
+	float4 ep1 = ep->endpt1[partition_index];
+
+	float ep0_max = MAX(MAX(ep0.x, ep0.y), ep0.z);
+	float ep0_min = MIN(MIN(ep0.x, ep0.y), ep0.z);
+	float ep1_max = MAX(MAX(ep1.x, ep1.y), ep1.z);
+	float ep1_min = MIN(MIN(ep1.x, ep1.y), ep1.z);
+
+	ep0_min = MAX(ep0_min, 0.0f);
+	ep1_min = MAX(ep1_min, 0.0f);
+	ep0_max = MAX(ep0_max, 1e-10f);
+	ep1_max = MAX(ep1_max, 1e-10f);
+
+	float4 error_weight = error_weightings[partition_index];
+
+	float error_weight_rgbsum = error_weight.x + error_weight.y + error_weight.z;
+
+	float range_upper_limit_rgb = encode_hdr_rgb ? 61440.0f : 65535.0f;
+	float range_upper_limit_alpha = encode_hdr_alpha ? 61440.0f : 65535.0f;
+
+	// it is possible to get endpoint colors significantly outside [0,upper-limit]
+	// even if the input data are safely contained in [0,upper-limit];
+	// we need to add an error term for this situation,
+	float4 ep0_range_error_high;
+	float4 ep1_range_error_high;
+	float4 ep0_range_error_low;
+	float4 ep1_range_error_low;
+
+	ep0_range_error_high.x = MAX(0.0f, ep0.x - range_upper_limit_rgb);
+	ep0_range_error_high.y = MAX(0.0f, ep0.y - range_upper_limit_rgb);
+	ep0_range_error_high.z = MAX(0.0f, ep0.z - range_upper_limit_rgb);
+	ep0_range_error_high.w = MAX(0.0f, ep0.w - range_upper_limit_alpha);
+	ep1_range_error_high.x = MAX(0.0f, ep1.x - range_upper_limit_rgb);
+	ep1_range_error_high.y = MAX(0.0f, ep1.y - range_upper_limit_rgb);
+	ep1_range_error_high.z = MAX(0.0f, ep1.z - range_upper_limit_rgb);
+	ep1_range_error_high.w = MAX(0.0f, ep1.w - range_upper_limit_alpha);
+
+	ep0_range_error_low.x = MIN(0.0f, ep0.x);
+	ep0_range_error_low.y = MIN(0.0f, ep0.y);
+	ep0_range_error_low.z = MIN(0.0f, ep0.z);
+	ep0_range_error_low.w = MIN(0.0f, ep0.w);
+	ep1_range_error_low.x = MIN(0.0f, ep1.x);
+	ep1_range_error_low.y = MIN(0.0f, ep1.y);
+	ep1_range_error_low.z = MIN(0.0f, ep1.z);
+	ep1_range_error_low.w = MIN(0.0f, ep1.w);
+
+	float4 sum_range_error =
+		(ep0_range_error_low * ep0_range_error_low) + (ep1_range_error_low * ep1_range_error_low) + (ep0_range_error_high * ep0_range_error_high) + (ep1_range_error_high * ep1_range_error_high);
+	float rgb_range_error = dot(sum_range_error.xyz, error_weight.xyz) * 0.5f * partition_size;
+	float alpha_range_error = sum_range_error.w * error_weight.w * 0.5f * partition_size;
+
+
+	#ifdef DEBUG_PRINT_DIAGNOSTICS
+		if (print_diagnostics)
+		{
+			printf("%s : partition=%d\nrgb-error_wt=%f  alpha_error_wt=%f\n", __func__, partition_index, error_weight_rgbsum, error_weight.w);
+
+			printf("ep0 = %f %f %f %f\n", ep0.x, ep0.y, ep0.z, ep0.w);
+			printf("ep1 = %f %f %f %f\n", ep1.x, ep1.y, ep1.z, ep1.w);
+
+
+			printf("rgb_range_error = %f, alpha_range_error = %f\n", rgb_range_error, alpha_range_error);
+
+			printf("rgb-luma-error: %f\n", eci->rgb_luma_error);
+		}
+	#endif
+
+	if (encode_hdr_rgb)
+	{
+
+		// collect some statistics
+		float af, cf;
+		if (ep1.x > ep1.y && ep1.x > ep1.z)
+		{
+			af = ep1.x;
+			cf = ep1.x - ep0.x;
+		}
+		else if (ep1.y > ep1.z)
+		{
+			af = ep1.y;
+			cf = ep1.y - ep0.y;
+		}
+		else
+		{
+			af = ep1.z;
+			cf = ep1.z - ep0.z;
+		}
+
+		float bf = af - ep1_min;	// estimate of color-component spread in high endpoint color
+		float3 prd = ep1.xyz - float3(cf, cf, cf);
+		float3 pdif = prd - ep0.xyz;
+		// estimate of color-component spread in low endpoint color
+		float df = MAX(MAX(fabs(pdif.x), fabs(pdif.y)), fabs(pdif.z));
+
+		int b = (int)bf;
+		int c = (int)cf;
+		int d = (int)df;
+
+
+		// determine which one of the 6 submodes is likely to be used in
+		// case of an RGBO-mode
+		int rgbo_mode = 5;		// 7 bits per component
+		// mode 4: 8 7 6
+		if (b < 32768 && c < 16384)
+			rgbo_mode = 4;
+		// mode 3: 9 6 7
+		if (b < 8192 && c < 16384)
+			rgbo_mode = 3;
+		// mode 2: 10 5 8
+		if (b < 2048 && c < 16384)
+			rgbo_mode = 2;
+		// mode 1: 11 6 5
+		if (b < 2048 && c < 1024)
+			rgbo_mode = 1;
+		// mode 0: 11 5 7
+		if (b < 1024 && c < 4096)
+			rgbo_mode = 0;
+
+		// determine which one of the 9 submodes is likely to be used in
+		// case of an RGB-mode.
+		int rgb_mode = 8;		// 8 bits per component, except 7 bits for blue
+
+		// mode 0: 9 7 6 7
+		if (b < 16384 && c < 8192 && d < 8192)
+			rgb_mode = 0;
+		// mode 1: 9 8 6 6
+		if (b < 32768 && c < 8192 && d < 4096)
+			rgb_mode = 1;
+		// mode 2: 10 6 7 7
+		if (b < 4096 && c < 8192 && d < 4096)
+			rgb_mode = 2;
+		// mode 3: 10 7 7 6
+		if (b < 8192 && c < 8192 && d < 2048)
+			rgb_mode = 3;
+		// mode 4: 11 8 6 5
+		if (b < 8192 && c < 2048 && d < 512)
+			rgb_mode = 4;
+		// mode 5: 11 6 8 6
+		if (b < 2048 && c < 8192 && d < 1024)
+			rgb_mode = 5;
+		// mode 6: 12 7 7 5
+		if (b < 2048 && c < 2048 && d < 256)
+			rgb_mode = 6;
+		// mode 7: 12 6 7 6
+		if (b < 1024 && c < 2048 && d < 512)
+			rgb_mode = 7;
+
+
+		static const float rgbo_error_scales[6] = { 4.0f, 4.0f, 16.0f, 64.0f, 256.0f, 1024.0f };
+		static const float rgb_error_scales[9] = { 64.0f, 64.0f, 16.0f, 16.0f, 4.0f, 4.0f, 1.0f, 1.0f, 384.0f };
+
+		float mode7mult = rgbo_error_scales[rgbo_mode] * 0.0015f;	// empirically determined ....
+		float mode11mult = rgb_error_scales[rgb_mode] * 0.010f;	// empirically determined ....
+
+
+		float lum_high = (ep1.x + ep1.y + ep1.z) * (1.0f / 3.0f);
+		float lum_low = (ep0.x + ep0.y + ep0.z) * (1.0f / 3.0f);
+		float lumdif = lum_high - lum_low;
+		float mode23mult = lumdif < 960 ? 4.0f : lumdif < 3968 ? 16.0f : 128.0f;
+
+		mode23mult *= 0.0005f;	// empirically determined ....
+
+
+
+		// pick among the available HDR endpoint modes
+		for (i = 0; i < 8; i++)
+		{
+			best_error[i][3] = 1e30f;
+			format_of_choice[i][3] = encode_hdr_alpha ? FMT_HDR_RGBA : FMT_HDR_RGB_LDR_ALPHA;
+			best_error[i][2] = 1e30f;
+			format_of_choice[i][2] = FMT_HDR_RGB;
+			best_error[i][1] = 1e30f;
+			format_of_choice[i][1] = FMT_HDR_RGB_SCALE;
+			best_error[i][0] = 1e30f;
+			format_of_choice[i][0] = FMT_HDR_LUMINANCE_LARGE_RANGE;
+		}
+
+
+		for (i = 8; i < 21; i++)
+		{
+			// base_quant_error should depend on the scale-factor that would be used
+			// during actual encode of the color value.
+
+			float base_quant_error = baseline_quant_error[i] * partition_size * 1.0f;
+			float rgb_quantization_error = error_weight_rgbsum * base_quant_error * 2.0f;
+			float alpha_quantization_error = error_weight.w * base_quant_error * 2.0f;
+			float rgba_quantization_error = rgb_quantization_error + alpha_quantization_error;
+
+			#ifdef DEBUG_PRINT_DIAGNOSTICS
+				if (print_diagnostics)
+					printf("rgba-quant = %f can_offset_encode=%d\n", rgba_quantization_error, eci->can_offset_encode);
+			#endif
+
+			// for 8 integers, we have two encodings: one with HDR alpha and another one
+			// with LDR alpha.
+
+			float full_hdr_rgba_error = rgba_quantization_error + rgb_range_error + alpha_range_error;
+			best_error[i][3] = full_hdr_rgba_error;
+			format_of_choice[i][3] = encode_hdr_alpha ? FMT_HDR_RGBA : FMT_HDR_RGB_LDR_ALPHA;
+
+			// for 6 integers, we have one HDR-RGB encoding
+			float full_hdr_rgb_error = (rgb_quantization_error * mode11mult) + rgb_range_error + eci->alpha_drop_error;
+			best_error[i][2] = full_hdr_rgb_error;
+			format_of_choice[i][2] = FMT_HDR_RGB;
+
+			// for 4 integers, we have one HDR-RGB-Scale encoding
+			float hdr_rgb_scale_error = (rgb_quantization_error * mode7mult) + rgb_range_error + eci->alpha_drop_error + eci->rgb_luma_error;
+
+			best_error[i][1] = hdr_rgb_scale_error;
+			format_of_choice[i][1] = FMT_HDR_RGB_SCALE;
+
+			// for 2 integers, we assume luminance-with-large-range
+			float hdr_luminance_error = (rgb_quantization_error * mode23mult) + rgb_range_error + eci->alpha_drop_error + eci->luminance_error;
+			best_error[i][0] = hdr_luminance_error;
+			format_of_choice[i][0] = FMT_HDR_LUMINANCE_LARGE_RANGE;
+
+			#ifdef DEBUG_PRINT_DIAGNOSTICS
+				if (print_diagnostics)
+				{
+					for (j = 0; j < 4; j++)
+					{
+						printf("(hdr) quant-level=%d ints=%d format=%d error=%f\n", i, j, format_of_choice[i][j], best_error[i][j]);
+					}
+				}
+			#endif
+		}
+	}
+
+
+	else
+	{
+		for (i = 0; i < 4; i++)
+		{
+			best_error[i][3] = 1e30f;
+			best_error[i][2] = 1e30f;
+			best_error[i][1] = 1e30f;
+			best_error[i][0] = 1e30f;
+
+			format_of_choice[i][3] = FMT_RGBA;
+			format_of_choice[i][2] = FMT_RGB;
+			format_of_choice[i][1] = FMT_RGB_SCALE;
+			format_of_choice[i][0] = FMT_LUMINANCE;
+		}
+
+
+		// pick among the available LDR endpoint modes
+		for (i = 4; i < 21; i++)
+		{
+			float base_quant_error = baseline_quant_error[i] * partition_size * 1.0f;
+			float rgb_quantization_error = error_weight_rgbsum * base_quant_error;
+			float alpha_quantization_error = error_weight.w * base_quant_error;
+			float rgba_quantization_error = rgb_quantization_error + alpha_quantization_error;
+
+			#ifdef DEBUG_PRINT_DIAGNOSTICS
+				if (print_diagnostics)
+					printf("rgba-quant = %f can_offset_encode=%d\n", rgba_quantization_error, eci->can_offset_encode);
+			#endif
+
+			// for 8 integers, the available encodings are:
+			// full LDR RGB-Alpha
+			float full_ldr_rgba_error = rgba_quantization_error;
+			if (eci->can_blue_contract)
+				full_ldr_rgba_error *= 0.625f;
+			if (eci->can_offset_encode && i <= 18)
+				full_ldr_rgba_error *= 0.5f;
+			full_ldr_rgba_error += rgb_range_error + alpha_range_error;
+
+			best_error[i][3] = full_ldr_rgba_error;
+			format_of_choice[i][3] = FMT_RGBA;
+
+			// for 6 integers, we have:
+			// - an LDR-RGB encoding
+			// - an RGBS + Alpha encoding (LDR)
+
+			float full_ldr_rgb_error = rgb_quantization_error;
+			if (eci->can_blue_contract)
+				full_ldr_rgb_error *= 0.5f;
+			if (eci->can_offset_encode && i <= 18)
+				full_ldr_rgb_error *= 0.25f;
+			full_ldr_rgb_error += eci->alpha_drop_error + rgb_range_error;
+
+			float rgbs_alpha_error = rgba_quantization_error + eci->rgb_scale_error + rgb_range_error + alpha_range_error;
+
+			if (rgbs_alpha_error < full_ldr_rgb_error)
+			{
+				best_error[i][2] = rgbs_alpha_error;
+				format_of_choice[i][2] = FMT_RGB_SCALE_ALPHA;
+			}
+			else
+			{
+				best_error[i][2] = full_ldr_rgb_error;
+				format_of_choice[i][2] = FMT_RGB;
+			}
+
+
+			// for 4 integers, we have a Luminance-Alpha encoding and the RGBS encoding
+			float ldr_rgbs_error = rgb_quantization_error + eci->alpha_drop_error + eci->rgb_scale_error + rgb_range_error;
+
+			float lum_alpha_error = rgba_quantization_error + eci->luminance_error + rgb_range_error + alpha_range_error;
+
+			if (ldr_rgbs_error < lum_alpha_error)
+			{
+				best_error[i][1] = ldr_rgbs_error;
+				format_of_choice[i][1] = FMT_RGB_SCALE;
+			}
+			else
+			{
+				best_error[i][1] = lum_alpha_error;
+				format_of_choice[i][1] = FMT_LUMINANCE_ALPHA;
+			}
+
+
+			// for 2 integers, we have a Luminance-encoding and an Alpha-encoding.
+			float luminance_error = rgb_quantization_error + eci->alpha_drop_error + eci->luminance_error + rgb_range_error;
+
+			best_error[i][0] = luminance_error;
+			format_of_choice[i][0] = FMT_LUMINANCE;
+
+			#ifdef DEBUG_PRINT_DIAGNOSTICS
+				if (print_diagnostics)
+				{
+					for (j = 0; j < 4; j++)
+					{
+						printf(" (ldr) quant-level=%d ints=%d format=%d error=%f\n", i, j, format_of_choice[i][j], best_error[i][j]);
+					}
+				}
+			#endif
+		}
+	}
+}
+
+
+
+// Single-partition case: for a given number of bits available for color endpoints,
+// choose the integer count (and with it the endpoint format and quantization level) with the lowest error.
+
+static void one_partition_find_best_combination_for_bitcount(float combined_best_error[21][4],
+															 int formats_of_choice[21][4], int bits_available, int *best_quantization_level, int *best_formats, float *error_of_best_combination)
+{
+	int winner = -1;				// index of the best integer count so far; -1 while none was accepted
+	float winner_error = 1e20f;
+
+	for (int idx = 0; idx < 4; idx++)
+	{
+		// row idx + 1 of the mode table gives the quantization level reachable with this many bits.
+		const int level = quantization_mode_table[idx + 1][bits_available];
+		if (level != -1)			// -1: not enough bits to represent this endpoint format at all.
+		{
+			const float candidate = combined_best_error[level][idx];
+			if (candidate < winner_error)
+			{
+				winner_error = candidate;
+				winner = idx;
+			}
+		}
+	}
+
+	// look the level up again for the winner; with winner == -1 this consults row 0 of the table.
+	const int chosen_level = quantization_mode_table[winner + 1][bits_available];
+
+	*best_quantization_level = chosen_level;
+	*error_of_best_combination = winner_error;
+	*best_formats = (chosen_level >= 0) ? formats_of_choice[chosen_level][winner] : FMT_LUMINANCE;
+}
+
+
+
+// Two-partition case: for each quantization level and each total integer count,
+// record the cheapest pairing of per-partition endpoint formats.
+
+static void two_partitions_find_best_combination_for_every_quantization_and_integer_count(float best_error[2][21][4],	// indexed by (partition, quant-level, integer-pair-count-minus-1)
+																						  int format_of_choice[2][21][4],
+																						  float combined_best_error[21][7],	// indexed by (quant-level, integer-pair-count-minus-2)
+																						  int formats_of_choice[21][7][2])
+{
+	// every slot starts at a sentinel error; quantization levels 0..4 keep it.
+	for (int level = 0; level < 21; level++)
+		for (int slot = 0; slot < 7; slot++)
+			combined_best_error[level][slot] = 1e30f;
+
+	for (int level = 5; level < 21; level++)
+	{
+		for (int first = 0; first < 4; first++)	// integer-count for first endpoint-pair
+		{
+			for (int second = 0; second < 4; second++)	// integer-count for second endpoint-pair
+			{
+				// the two integer counts may be at most one step apart.
+				const int gap = first - second;
+				if (gap > 1 || gap < -1)
+					continue;
+
+				const int slot = first + second;
+				const float pair_error = MIN(best_error[0][level][first] + best_error[1][level][second], 1e10f);
+				float *incumbent = &combined_best_error[level][slot];
+				if (pair_error <= *incumbent)	// '<=': on a tie the later combination replaces the earlier one
+				{
+					*incumbent = pair_error;
+					formats_of_choice[level][slot][0] = format_of_choice[0][level][first];
+					formats_of_choice[level][slot][1] = format_of_choice[1][level][second];
+				}
+			}
+		}
+	}
+}
+
+
+// Two-partition case: given the bits available for color endpoints, pick the total
+// integer count with the lowest combined error and report the matching levels and formats.
+
+static void two_partitions_find_best_combination_for_bitcount(float combined_best_error[21][7],
+															  int formats_of_choice[21][7][2],
+															  int bits_available, int *best_quantization_level, int *best_quantization_level_mod, int *best_formats, float *error_of_best_combination)
+{
+	int chosen_count = 0;			// remains 0 if no integer count is accepted below
+	float chosen_error = 1e20f;
+
+	for (int count = 2; count <= 8; count++)
+	{
+		// quantization level that this integer count gets with the given number of bits.
+		const int level = quantization_mode_table[count][bits_available];
+		if (level == -1)
+			break;				// -1: not enough bits to represent this endpoint format at all; stop scanning.
+
+		const float candidate = combined_best_error[level][count - 2];
+		if (candidate < chosen_error)
+		{
+			chosen_error = candidate;
+			chosen_count = count;
+		}
+	}
+
+	// levels for the chosen count: one at the given bit budget, one at the budget plus 2 bits.
+	const int ql = quantization_mode_table[chosen_count][bits_available];
+	const int ql_mod = quantization_mode_table[chosen_count][bits_available + 2];
+
+	*best_quantization_level = ql;
+	*best_quantization_level_mod = ql_mod;
+	*error_of_best_combination = chosen_error;
+
+	if (ql < 0)
+	{
+		best_formats[0] = FMT_LUMINANCE;
+		best_formats[1] = FMT_LUMINANCE;
+		return;
+	}
+
+	best_formats[0] = formats_of_choice[ql][chosen_count - 2][0];
+	best_formats[1] = formats_of_choice[ql][chosen_count - 2][1];
+}
+
+
+
+
+// Three-partition case: for each quantization level and each total integer count,
+// record the cheapest combination of per-partition endpoint formats.
+
+static void three_partitions_find_best_combination_for_every_quantization_and_integer_count(float best_error[3][21][4],	// indexed by (partition, quant-level, integer-count)
+																							int format_of_choice[3][21][4], float combined_best_error[21][10], int formats_of_choice[21][10][3])
+{
+	// every slot starts at a sentinel error; quantization levels 0..4 keep it.
+	for (int level = 0; level < 21; level++)
+		for (int slot = 0; slot < 10; slot++)
+			combined_best_error[level][slot] = 1e30f;
+
+	for (int level = 5; level < 21; level++)
+	{
+		for (int first = 0; first < 4; first++)	// integer-count for first endpoint-pair
+		{
+			for (int second = 0; second < 4; second++)	// integer-count for second endpoint-pair
+			{
+				const int lo2 = (first < second) ? first : second;
+				const int hi2 = (first < second) ? second : first;
+				if (hi2 - lo2 > 1)
+					continue;	// integer counts more than one step apart are not combined
+				for (int third = 0; third < 4; third++)	// integer-count for third endpoint-pair
+				{
+					const int lo3 = (third < lo2) ? third : lo2;
+					const int hi3 = (third > hi2) ? third : hi2;
+					if (hi3 - lo3 > 1)
+						continue;
+
+					const int slot = first + second + third;
+					const float triple_error = MIN(best_error[0][level][first] + best_error[1][level][second] + best_error[2][level][third], 1e10f);
+					float *incumbent = &combined_best_error[level][slot];
+					if (triple_error <= *incumbent)	// '<=': on a tie the later combination replaces the earlier one
+					{
+						*incumbent = triple_error;
+						formats_of_choice[level][slot][0] = format_of_choice[0][level][first];
+						formats_of_choice[level][slot][1] = format_of_choice[1][level][second];
+						formats_of_choice[level][slot][2] = format_of_choice[2][level][third];
+					}
+				}
+			}
+		}
+	}
+}
+
+
+// Three-partition case: given the bits available for color endpoints, pick the total
+// integer count with the lowest combined error and report the matching levels and formats.
+
+static void three_partitions_find_best_combination_for_bitcount(float combined_best_error[21][10],
+																int formats_of_choice[21][10][3],
+																int bits_available, int *best_quantization_level, int *best_quantization_level_mod, int *best_formats, float *error_of_best_combination)
+{
+	int chosen_count = 0;			// remains 0 if no integer count is accepted below
+	float chosen_error = 1e20f;
+
+	for (int count = 3; count <= 9; count++)
+	{
+		// quantization level that this integer count gets with the given number of bits.
+		const int level = quantization_mode_table[count][bits_available];
+		if (level == -1)
+			break;				// -1: not enough bits to represent this endpoint format at all; stop scanning.
+
+		const float candidate = combined_best_error[level][count - 3];
+		if (candidate < chosen_error)
+		{
+			chosen_error = candidate;
+			chosen_count = count;
+		}
+	}
+
+	// levels for the chosen count: one at the given bit budget, one at the budget plus 5 bits.
+	const int ql = quantization_mode_table[chosen_count][bits_available];
+	const int ql_mod = quantization_mode_table[chosen_count][bits_available + 5];
+
+	*best_quantization_level = ql;
+	*best_quantization_level_mod = ql_mod;
+	*error_of_best_combination = chosen_error;
+
+	if (ql < 0)
+	{
+		for (int part = 0; part < 3; part++)
+			best_formats[part] = FMT_LUMINANCE;
+		return;
+	}
+
+	for (int part = 0; part < 3; part++)
+		best_formats[part] = formats_of_choice[ql][chosen_count - 3][part];
+}
+
+
+
+
+// Four-partition case: for each quantization level and each total integer count,
+// record the cheapest combination of per-partition endpoint formats.
+
+static void four_partitions_find_best_combination_for_every_quantization_and_integer_count(float best_error[4][21][4],	// indexed by (partition, quant-level, integer-count)
+																						   int format_of_choice[4][21][4], float combined_best_error[21][13], int formats_of_choice[21][13][4])
+{
+	// every slot starts at a sentinel error; quantization levels 0..4 keep it.
+	for (int level = 0; level < 21; level++)
+		for (int slot = 0; slot < 13; slot++)
+			combined_best_error[level][slot] = 1e30f;
+
+	for (int level = 5; level < 21; level++)
+	{
+		for (int first = 0; first < 4; first++)	// integer-count for first endpoint-pair
+		{
+			for (int second = 0; second < 4; second++)	// integer-count for second endpoint-pair
+			{
+				const int lo2 = (first < second) ? first : second;
+				const int hi2 = (first < second) ? second : first;
+				if (hi2 - lo2 > 1)
+					continue;	// integer counts more than one step apart are not combined
+				for (int third = 0; third < 4; third++)	// integer-count for third endpoint-pair
+				{
+					const int lo3 = (third < lo2) ? third : lo2;
+					const int hi3 = (third > hi2) ? third : hi2;
+					if (hi3 - lo3 > 1)
+						continue;
+					for (int fourth = 0; fourth < 4; fourth++)	// integer-count for fourth endpoint-pair
+					{
+						const int lo4 = (fourth < lo3) ? fourth : lo3;
+						const int hi4 = (fourth > hi3) ? fourth : hi3;
+						if (hi4 - lo4 > 1)
+							continue;
+
+						const int slot = first + second + third + fourth;
+						const float quad_error = MIN(best_error[0][level][first] + best_error[1][level][second] + best_error[2][level][third] + best_error[3][level][fourth], 1e10f);
+						float *incumbent = &combined_best_error[level][slot];
+						if (quad_error <= *incumbent)	// '<=': on a tie the later combination replaces the earlier one
+						{
+							*incumbent = quad_error;
+							formats_of_choice[level][slot][0] = format_of_choice[0][level][first];
+							formats_of_choice[level][slot][1] = format_of_choice[1][level][second];
+							formats_of_choice[level][slot][2] = format_of_choice[2][level][third];
+							formats_of_choice[level][slot][3] = format_of_choice[3][level][fourth];
+						}
+					}
+				}
+			}
+		}
+	}
+}
+
+
+
+
+
+
+// Four-partition case: given the bits available for color endpoints, pick the total
+// integer count with the lowest combined error and report the matching levels and formats.
+
+static void four_partitions_find_best_combination_for_bitcount(float combined_best_error[21][13],
+															   int formats_of_choice[21][13][4],
+															   int bits_available, int *best_quantization_level, int *best_quantization_level_mod, int *best_formats, float *error_of_best_combination)
+{
+	int chosen_count = 0;			// remains 0 if no integer count is accepted below
+	float chosen_error = 1e20f;
+
+	for (int count = 4; count <= 9; count++)
+	{
+		// quantization level that this integer count gets with the given number of bits.
+		const int level = quantization_mode_table[count][bits_available];
+		if (level == -1)
+			break;				// -1: not enough bits to represent this endpoint format at all; stop scanning.
+
+		const float candidate = combined_best_error[level][count - 4];
+		if (candidate < chosen_error)
+		{
+			chosen_error = candidate;
+			chosen_count = count;
+		}
+	}
+
+	// levels for the chosen count: one at the given bit budget, one at the budget plus 8 bits.
+	const int ql = quantization_mode_table[chosen_count][bits_available];
+	const int ql_mod = quantization_mode_table[chosen_count][bits_available + 8];
+
+	*best_quantization_level = ql;
+	*best_quantization_level_mod = ql_mod;
+	*error_of_best_combination = chosen_error;
+
+	if (ql < 0)
+	{
+		for (int part = 0; part < 4; part++)
+			best_formats[part] = FMT_LUMINANCE;
+		return;
+	}
+	for (int part = 0; part < 4; part++)
+		best_formats[part] = formats_of_choice[ql][chosen_count - 4][part];
+}
+
+
+
+/*
+	The determine_optimal_set_of_endpoint_formats_to_use() function.
+
+	For each weight mode, it identifies which set of color endpoint encodings
+	and which quantization level give the lowest estimated error. It then
+	reports back the 4 modes that look best, best first, along with the color
+	encoding combination chosen for each.
+
+	It takes as input:
+		the x/y/z dimensions (xdim, ydim, zdim),
+		a partitioning, an imageblock and its error weights,
+		a set of color endpoints,
+		in case of 2 planes of weights, a specifier for which color component to
+		use for the second plane of weights (-1 for single-plane mode),
+		for each mode, the number of bits available for color encoding and the
+		error incurred by quantization.
+
+	It delivers as output for each of the 4 selected modes:
+		index of the mode, or -1 when no further mode could be selected
+		format specifier for each partition
+		quantization level to use
+		modified quantization level to use
+		(when all format specifiers are equal)
+	When the index is -1, the remaining outputs of that entry are not written.
+ */
+
+void determine_optimal_set_of_endpoint_formats_to_use(int xdim, int ydim, int zdim,
+													  const partition_info * pt, const imageblock * blk, const error_weight_block * ewb,
+													  const endpoints * ep,
+													  int separate_component,	// separate color component for 2-plane mode; -1 for single-plane mode
+													  // bitcounts and errors computed for the various quantization methods
+													  const int *qwt_bitcounts, const float *qwt_errors,
+													  // output data
+													  int partition_format_specifiers[4][4], int quantized_weight[4],
+													  int quantization_level[4], int quantization_level_mod[4])
+{
+	const int partition_count = pt->partition_count;
+
+	// the HDR flags are taken from the first entry of the block's rgb_lns / alpha_lns arrays.
+	const int encode_hdr_rgb = blk->rgb_lns[0];
+	const int encode_hdr_alpha = blk->alpha_lns[0];
+
+	// helper call: the errors that result from various encoding choices (luminance
+	// instead of RGB, discarding alpha, RGB-scale in place of two RGB endpoints, ...).
+	encoding_choice_errors eci[4];
+	compute_encoding_choice_errors(xdim, ydim, zdim, blk, pt, ewb, separate_component, eci);
+
+	// the error weights to apply for each partition.
+	float4 error_weightings[4];
+	float4 dummied_color_scalefactors[4];	// only used to receive data
+	compute_partition_error_color_weightings(xdim, ydim, zdim, ewb, pt, error_weightings, dummied_color_scalefactors);
+
+	// per partition: error and format for every (quantization level, integer count) pair.
+	float best_error[4][21][4];
+	int format_of_choice[4][21][4];
+	for (int part = 0; part < partition_count; part++)
+		compute_color_error_for_every_integer_count_and_quantization_level(encode_hdr_rgb, encode_hdr_alpha, part, pt, &(eci[part]), ep, error_weightings, best_error[part], format_of_choice[part]);
+
+	// per weight mode: the lowest error found and the settings that produced it.
+	float errors_of_best_combination[MAX_WEIGHT_MODES];
+	int best_quantization_levels[MAX_WEIGHT_MODES];
+	int best_quantization_levels_mod[MAX_WEIGHT_MODES];
+	int best_ep_formats[MAX_WEIGHT_MODES][4];
+
+	switch (partition_count)
+	{
+	case 1:
+	{
+		// one partition: the per-partition tables are searched directly.
+		for (int mode = 0; mode < MAX_WEIGHT_MODES; mode++)
+		{
+			if (qwt_errors[mode] >= 1e29f)
+			{
+				// weight error at or above the cutoff: store the sentinel and skip the search.
+				errors_of_best_combination[mode] = 1e30f;
+				continue;
+			}
+
+			int level;
+			int format;
+			float combo_error;
+			one_partition_find_best_combination_for_bitcount(best_error[0], format_of_choice[0], qwt_bitcounts[mode], &level, &format, &combo_error);
+
+			errors_of_best_combination[mode] = combo_error + qwt_errors[mode];
+			best_quantization_levels[mode] = level;
+			best_quantization_levels_mod[mode] = level;	// the same level is stored as the modified level
+			best_ep_formats[mode][0] = format;
+		}
+		break;
+	}
+
+	case 2:
+	{
+		// two partitions: combine the per-partition tables first, then search per mode.
+		float combined_best_error[21][7];
+		int formats_of_choice[21][7][2];
+
+		two_partitions_find_best_combination_for_every_quantization_and_integer_count(best_error, format_of_choice, combined_best_error, formats_of_choice);
+
+		for (int mode = 0; mode < MAX_WEIGHT_MODES; mode++)
+		{
+			if (qwt_errors[mode] >= 1e29f)
+			{
+				// weight error at or above the cutoff: store the sentinel and skip the search.
+				errors_of_best_combination[mode] = 1e30f;
+				continue;
+			}
+
+			int level;
+			int level_mod;
+			int formats[2];
+			float combo_error;
+			two_partitions_find_best_combination_for_bitcount(combined_best_error, formats_of_choice, qwt_bitcounts[mode],
+															  &level, &level_mod, formats, &combo_error);
+
+			errors_of_best_combination[mode] = combo_error + qwt_errors[mode];
+			best_quantization_levels[mode] = level;
+			best_quantization_levels_mod[mode] = level_mod;
+			for (int part = 0; part < 2; part++)
+				best_ep_formats[mode][part] = formats[part];
+		}
+		break;
+	}
+
+	case 3:
+	{
+		// three partitions: combine the per-partition tables first, then search per mode.
+		float combined_best_error[21][10];
+		int formats_of_choice[21][10][3];
+
+		three_partitions_find_best_combination_for_every_quantization_and_integer_count(best_error, format_of_choice, combined_best_error, formats_of_choice);
+
+		for (int mode = 0; mode < MAX_WEIGHT_MODES; mode++)
+		{
+			if (qwt_errors[mode] >= 1e29f)
+			{
+				// weight error at or above the cutoff: store the sentinel and skip the search.
+				errors_of_best_combination[mode] = 1e30f;
+				continue;
+			}
+
+			int level;
+			int level_mod;
+			int formats[3];
+			float combo_error;
+			three_partitions_find_best_combination_for_bitcount(combined_best_error,
+																formats_of_choice, qwt_bitcounts[mode], &level, &level_mod, formats, &combo_error);
+
+			errors_of_best_combination[mode] = combo_error + qwt_errors[mode];
+			best_quantization_levels[mode] = level;
+			best_quantization_levels_mod[mode] = level_mod;
+			for (int part = 0; part < 3; part++)
+				best_ep_formats[mode][part] = formats[part];
+		}
+		break;
+	}
+
+	case 4:
+	{
+		// four partitions: combine the per-partition tables first, then search per mode.
+		float combined_best_error[21][13];
+		int formats_of_choice[21][13][4];
+
+		four_partitions_find_best_combination_for_every_quantization_and_integer_count(best_error, format_of_choice, combined_best_error, formats_of_choice);
+
+		for (int mode = 0; mode < MAX_WEIGHT_MODES; mode++)
+		{
+			if (qwt_errors[mode] >= 1e29f)
+			{
+				// weight error at or above the cutoff: store the sentinel and skip the search.
+				errors_of_best_combination[mode] = 1e30f;
+				continue;
+			}
+
+			int level;
+			int level_mod;
+			int formats[4];
+			float combo_error;
+			four_partitions_find_best_combination_for_bitcount(combined_best_error,
+															   formats_of_choice, qwt_bitcounts[mode], &level, &level_mod, formats, &combo_error);
+
+			errors_of_best_combination[mode] = combo_error + qwt_errors[mode];
+			best_quantization_levels[mode] = level;
+			best_quantization_levels_mod[mode] = level_mod;
+			for (int part = 0; part < 4; part++)
+				best_ep_formats[mode][part] = formats[part];
+		}
+		break;
+	}
+
+	default:
+		// any other partition count: the per-mode tables are left unfilled.
+		break;
+	}
+
+	// finally, pick up to 4 modes in order of increasing error. A picked mode has its
+	// error overwritten with the sentinel so that the next round skips it, and modes whose
+	// quantization level is below 5 are never picked.
+	for (int pick = 0; pick < 4; pick++)
+	{
+		float lowest_error = 1e30f;
+		int lowest_index = -1;
+		for (int mode = 0; mode < MAX_WEIGHT_MODES; mode++)
+		{
+			if (errors_of_best_combination[mode] < lowest_error && best_quantization_levels[mode] >= 5)
+			{
+				lowest_error = errors_of_best_combination[mode];
+				lowest_index = mode;
+			}
+		}
+
+		// the mode index is always reported, even when it is -1.
+		quantized_weight[pick] = lowest_index;
+		if (lowest_index < 0)
+			continue;	// nothing qualified in this round
+
+		// take the pick out of the pool for the following rounds.
+		errors_of_best_combination[lowest_index] = 1e30f;
+
+		quantization_level[pick] = best_quantization_levels[lowest_index];
+		quantization_level_mod[pick] = best_quantization_levels_mod[lowest_index];
+		for (int part = 0; part < partition_count; part++)
+			partition_format_specifiers[pick][part] = best_ep_formats[lowest_index][part];
+	}
+}
--- a/3rdparty/astc/astc_quantization.cpp
+++ b/3rdparty/astc/astc_quantization.cpp
@@ -0,0 +1,558 @@
+/*----------------------------------------------------------------------------*/
+/**
+ *	This confidential and proprietary software may be used only as
+ *	authorised by a licensing agreement from ARM Limited
+ *	(C) COPYRIGHT 2011-2012 ARM Limited
+ *	ALL RIGHTS RESERVED
+ *
+ *	The entire notice above must be reproduced on all authorised
+ *	copies and copies may only be made to the extent permitted
+ *	by a licensing agreement from ARM Limited.
+ *
+ *	@brief	Functions and data table related to data quantization in ASTC.
+ */
+/*----------------------------------------------------------------------------*/
+
+#include "astc_codec_internals.h"
+/*
+	Color quantization lookup tables.
+
+	color_quantization_tables[level][value] is the quantized code stored for the
+	8-bit input 'value' (0..255) at quantization level 'level' (0..20). The 21
+	levels provide 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32, 40, 48, 64, 80, 96,
+	128, 160, 192 and 256 distinct codes respectively.
+
+	The codes are not always monotonic in 'value' (see e.g. the 6-code table).
+	Each code appears to be the index, in color_unquantization_tables[level], of
+	the entry nearest to 'value' -- TODO confirm against the ASTC specification.
+	NOTE(review): 'level' is presumably a quantization_method value; verify
+	against the callers.
+*/
+const uint8_t color_quantization_tables[21][256] = {
+	{
+	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 },
+	{
+	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	 },
+	{
+	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
+	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+	 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+	 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+	 },
+	{
+	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+	 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+	 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+	 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+	 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+	 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+	 },
+	{
+	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2,
+	 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4,
+	 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+	 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+	 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+	 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+	 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+	 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+	 5, 5, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+	 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+	 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+	 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 },
+	{
+	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3,
+	 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+	 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+	 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+	 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+	 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+	 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+	 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6,
+	 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+	 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7,
+	 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+	 },
+	{
+	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
+	 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4,
+	 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+	 4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+	 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+	 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+	 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+	 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+	 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7,
+	 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+	 7, 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, 5,
+	 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+	 5, 5, 5, 5, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+	 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+	 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 },
+	{
+	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4,
+	 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+	 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+	 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 2, 2, 2, 2, 2, 2,
+	 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	 2, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+	 6, 6, 6, 6, 6, 6, 6, 6, 6, 10, 10, 10, 10, 10, 10, 10,
+	 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+	 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+	 11, 11, 11, 11, 11, 11, 11, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+	 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3,
+	 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+	 3, 3, 3, 3, 3, 3, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+	 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 5, 5, 5,
+	 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+	 5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	 },
+	{
+	 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
+	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
+	 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3,
+	 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
+	 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5,
+	 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6,
+	 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7,
+	 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+	 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+	 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+	 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+	 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+	 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+	 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+	 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+	 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+	 },
+	{
+	 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+	 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+	 8, 8, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+	 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 2, 2, 2,
+	 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6,
+	 6, 6, 6, 6, 6, 6, 6, 6, 10, 10, 10, 10, 10, 10, 10, 10,
+	 10, 10, 10, 10, 10, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+	 14, 14, 14, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
+	 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 15, 15, 15,
+	 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 11, 11, 11, 11, 11,
+	 11, 11, 11, 11, 11, 11, 11, 11, 7, 7, 7, 7, 7, 7, 7, 7,
+	 7, 7, 7, 7, 7, 7, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+	 3, 3, 3, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
+	 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 9, 9,
+	 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 5, 5, 5, 5, 5,
+	 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1,
+	 },
+	{
+	 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+	 8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 2, 2, 2, 2,
+	 2, 2, 2, 2, 2, 2, 2, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+	 10, 10, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 4, 4, 4,
+	 4, 4, 4, 4, 4, 4, 4, 4, 12, 12, 12, 12, 12, 12, 12, 12,
+	 12, 12, 12, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 6, 6,
+	 6, 6, 6, 6, 6, 6, 6, 6, 6, 14, 14, 14, 14, 14, 14, 14,
+	 14, 14, 14, 14, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+	 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 15, 15, 15, 15,
+	 15, 15, 15, 15, 15, 15, 15, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+	 7, 7, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 13, 13, 13,
+	 13, 13, 13, 13, 13, 13, 13, 13, 5, 5, 5, 5, 5, 5, 5, 5,
+	 5, 5, 5, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 11, 11,
+	 11, 11, 11, 11, 11, 11, 11, 11, 11, 3, 3, 3, 3, 3, 3, 3,
+	 3, 3, 3, 3, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 9,
+	 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1, 1,
+	 },
+	{
+	 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
+	 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4,
+	 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6,
+	 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8,
+	 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10,
+	 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12,
+	 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13,
+	 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15,
+	 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17,
+	 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
+	 19, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21,
+	 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23,
+	 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25,
+	 25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27,
+	 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29,
+	 29, 29, 29, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31,
+	 },
+	{
+	 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16,
+	 16, 24, 24, 24, 24, 24, 24, 32, 32, 32, 32, 32, 32, 32, 2, 2,
+	 2, 2, 2, 2, 10, 10, 10, 10, 10, 10, 10, 18, 18, 18, 18, 18,
+	 18, 26, 26, 26, 26, 26, 26, 26, 34, 34, 34, 34, 34, 34, 4, 4,
+	 4, 4, 4, 4, 4, 12, 12, 12, 12, 12, 12, 20, 20, 20, 20, 20,
+	 20, 20, 28, 28, 28, 28, 28, 28, 36, 36, 36, 36, 36, 36, 36, 6,
+	 6, 6, 6, 6, 6, 14, 14, 14, 14, 14, 14, 14, 22, 22, 22, 22,
+	 22, 22, 30, 30, 30, 30, 30, 30, 30, 38, 38, 38, 38, 38, 38, 38,
+	 39, 39, 39, 39, 39, 39, 39, 31, 31, 31, 31, 31, 31, 31, 23, 23,
+	 23, 23, 23, 23, 15, 15, 15, 15, 15, 15, 15, 7, 7, 7, 7, 7,
+	 7, 37, 37, 37, 37, 37, 37, 37, 29, 29, 29, 29, 29, 29, 21, 21,
+	 21, 21, 21, 21, 21, 13, 13, 13, 13, 13, 13, 5, 5, 5, 5, 5,
+	 5, 5, 35, 35, 35, 35, 35, 35, 27, 27, 27, 27, 27, 27, 27, 19,
+	 19, 19, 19, 19, 19, 11, 11, 11, 11, 11, 11, 11, 3, 3, 3, 3,
+	 3, 3, 33, 33, 33, 33, 33, 33, 33, 25, 25, 25, 25, 25, 25, 17,
+	 17, 17, 17, 17, 17, 17, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1,
+	 },
+	{
+	 0, 0, 0, 16, 16, 16, 16, 16, 16, 32, 32, 32, 32, 32, 2, 2,
+	 2, 2, 2, 18, 18, 18, 18, 18, 18, 34, 34, 34, 34, 34, 4, 4,
+	 4, 4, 4, 4, 20, 20, 20, 20, 20, 36, 36, 36, 36, 36, 6, 6,
+	 6, 6, 6, 6, 22, 22, 22, 22, 22, 38, 38, 38, 38, 38, 38, 8,
+	 8, 8, 8, 8, 24, 24, 24, 24, 24, 24, 40, 40, 40, 40, 40, 10,
+	 10, 10, 10, 10, 26, 26, 26, 26, 26, 26, 42, 42, 42, 42, 42, 12,
+	 12, 12, 12, 12, 12, 28, 28, 28, 28, 28, 44, 44, 44, 44, 44, 14,
+	 14, 14, 14, 14, 14, 30, 30, 30, 30, 30, 46, 46, 46, 46, 46, 46,
+	 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 15, 15, 15, 15, 15,
+	 15, 45, 45, 45, 45, 45, 29, 29, 29, 29, 29, 13, 13, 13, 13, 13,
+	 13, 43, 43, 43, 43, 43, 27, 27, 27, 27, 27, 27, 11, 11, 11, 11,
+	 11, 41, 41, 41, 41, 41, 25, 25, 25, 25, 25, 25, 9, 9, 9, 9,
+	 9, 39, 39, 39, 39, 39, 39, 23, 23, 23, 23, 23, 7, 7, 7, 7,
+	 7, 7, 37, 37, 37, 37, 37, 21, 21, 21, 21, 21, 5, 5, 5, 5,
+	 5, 5, 35, 35, 35, 35, 35, 19, 19, 19, 19, 19, 19, 3, 3, 3,
+	 3, 3, 33, 33, 33, 33, 33, 17, 17, 17, 17, 17, 17, 1, 1, 1,
+	 },
+	{
+	 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4,
+	 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8,
+	 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12,
+	 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16,
+	 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19,
+	 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23,
+	 24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27,
+	 28, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31,
+	 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35,
+	 36, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 39, 39, 39,
+	 40, 40, 40, 40, 41, 41, 41, 41, 42, 42, 42, 42, 43, 43, 43, 43,
+	 44, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47,
+	 47, 48, 48, 48, 48, 49, 49, 49, 49, 50, 50, 50, 50, 51, 51, 51,
+	 51, 52, 52, 52, 52, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 55,
+	 55, 56, 56, 56, 56, 57, 57, 57, 57, 58, 58, 58, 58, 59, 59, 59,
+	 59, 60, 60, 60, 60, 61, 61, 61, 61, 62, 62, 62, 62, 63, 63, 63,
+	 },
+	{
+	 0, 0, 16, 16, 16, 32, 32, 32, 48, 48, 48, 48, 64, 64, 64, 2,
+	 2, 2, 18, 18, 18, 34, 34, 34, 50, 50, 50, 50, 66, 66, 66, 4,
+	 4, 4, 20, 20, 20, 36, 36, 36, 36, 52, 52, 52, 68, 68, 68, 6,
+	 6, 6, 22, 22, 22, 38, 38, 38, 38, 54, 54, 54, 70, 70, 70, 8,
+	 8, 8, 24, 24, 24, 24, 40, 40, 40, 56, 56, 56, 72, 72, 72, 10,
+	 10, 10, 26, 26, 26, 26, 42, 42, 42, 58, 58, 58, 74, 74, 74, 12,
+	 12, 12, 12, 28, 28, 28, 44, 44, 44, 60, 60, 60, 76, 76, 76, 14,
+	 14, 14, 14, 30, 30, 30, 46, 46, 46, 62, 62, 62, 78, 78, 78, 78,
+	 79, 79, 79, 79, 63, 63, 63, 47, 47, 47, 31, 31, 31, 15, 15, 15,
+	 15, 77, 77, 77, 61, 61, 61, 45, 45, 45, 29, 29, 29, 13, 13, 13,
+	 13, 75, 75, 75, 59, 59, 59, 43, 43, 43, 27, 27, 27, 27, 11, 11,
+	 11, 73, 73, 73, 57, 57, 57, 41, 41, 41, 25, 25, 25, 25, 9, 9,
+	 9, 71, 71, 71, 55, 55, 55, 39, 39, 39, 39, 23, 23, 23, 7, 7,
+	 7, 69, 69, 69, 53, 53, 53, 37, 37, 37, 37, 21, 21, 21, 5, 5,
+	 5, 67, 67, 67, 51, 51, 51, 51, 35, 35, 35, 19, 19, 19, 3, 3,
+	 3, 65, 65, 65, 49, 49, 49, 49, 33, 33, 33, 17, 17, 17, 1, 1,
+	 },
+	{
+	 0, 0, 32, 32, 64, 64, 64, 2, 2, 2, 34, 34, 66, 66, 66, 4,
+	 4, 4, 36, 36, 68, 68, 68, 6, 6, 6, 38, 38, 70, 70, 70, 8,
+	 8, 8, 40, 40, 40, 72, 72, 10, 10, 10, 42, 42, 42, 74, 74, 12,
+	 12, 12, 44, 44, 44, 76, 76, 14, 14, 14, 46, 46, 46, 78, 78, 16,
+	 16, 16, 48, 48, 48, 80, 80, 80, 18, 18, 50, 50, 50, 82, 82, 82,
+	 20, 20, 52, 52, 52, 84, 84, 84, 22, 22, 54, 54, 54, 86, 86, 86,
+	 24, 24, 56, 56, 56, 88, 88, 88, 26, 26, 58, 58, 58, 90, 90, 90,
+	 28, 28, 60, 60, 60, 92, 92, 92, 30, 30, 62, 62, 62, 94, 94, 94,
+	 95, 95, 95, 63, 63, 63, 31, 31, 93, 93, 93, 61, 61, 61, 29, 29,
+	 91, 91, 91, 59, 59, 59, 27, 27, 89, 89, 89, 57, 57, 57, 25, 25,
+	 87, 87, 87, 55, 55, 55, 23, 23, 85, 85, 85, 53, 53, 53, 21, 21,
+	 83, 83, 83, 51, 51, 51, 19, 19, 81, 81, 81, 49, 49, 49, 17, 17,
+	 17, 79, 79, 47, 47, 47, 15, 15, 15, 77, 77, 45, 45, 45, 13, 13,
+	 13, 75, 75, 43, 43, 43, 11, 11, 11, 73, 73, 41, 41, 41, 9, 9,
+	 9, 71, 71, 71, 39, 39, 7, 7, 7, 69, 69, 69, 37, 37, 5, 5,
+	 5, 67, 67, 67, 35, 35, 3, 3, 3, 65, 65, 65, 33, 33, 1, 1,
+	 },
+	{
+	 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,
+	 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15,
+	 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23,
+	 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31,
+	 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39,
+	 40, 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47,
+	 48, 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55,
+	 56, 56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63,
+	 64, 64, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71,
+	 72, 72, 73, 73, 74, 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79,
+	 80, 80, 81, 81, 82, 82, 83, 83, 84, 84, 85, 85, 86, 86, 87, 87,
+	 88, 88, 89, 89, 90, 90, 91, 91, 92, 92, 93, 93, 94, 94, 95, 95,
+	 96, 96, 97, 97, 98, 98, 99, 99, 100, 100, 101, 101, 102, 102, 103, 103,
+	 104, 104, 105, 105, 106, 106, 107, 107, 108, 108, 109, 109, 110, 110, 111, 111,
+	 112, 112, 113, 113, 114, 114, 115, 115, 116, 116, 117, 117, 118, 118, 119, 119,
+	 120, 120, 121, 121, 122, 122, 123, 123, 124, 124, 125, 125, 126, 126, 127, 127,
+	 },
+	{
+	 0, 32, 32, 64, 96, 96, 128, 128, 2, 34, 34, 66, 98, 98, 130, 130,
+	 4, 36, 36, 68, 100, 100, 132, 132, 6, 38, 38, 70, 102, 102, 134, 134,
+	 8, 40, 40, 72, 104, 104, 136, 136, 10, 42, 42, 74, 106, 106, 138, 138,
+	 12, 44, 44, 76, 108, 108, 140, 140, 14, 46, 46, 78, 110, 110, 142, 142,
+	 16, 48, 48, 80, 112, 112, 144, 144, 18, 50, 50, 82, 114, 114, 146, 146,
+	 20, 52, 52, 84, 116, 116, 148, 148, 22, 54, 54, 86, 118, 118, 150, 150,
+	 24, 56, 56, 88, 120, 120, 152, 152, 26, 58, 58, 90, 122, 122, 154, 154,
+	 28, 60, 60, 92, 124, 124, 156, 156, 30, 62, 62, 94, 126, 126, 158, 158,
+	 159, 159, 127, 127, 95, 63, 63, 31, 157, 157, 125, 125, 93, 61, 61, 29,
+	 155, 155, 123, 123, 91, 59, 59, 27, 153, 153, 121, 121, 89, 57, 57, 25,
+	 151, 151, 119, 119, 87, 55, 55, 23, 149, 149, 117, 117, 85, 53, 53, 21,
+	 147, 147, 115, 115, 83, 51, 51, 19, 145, 145, 113, 113, 81, 49, 49, 17,
+	 143, 143, 111, 111, 79, 47, 47, 15, 141, 141, 109, 109, 77, 45, 45, 13,
+	 139, 139, 107, 107, 75, 43, 43, 11, 137, 137, 105, 105, 73, 41, 41, 9,
+	 135, 135, 103, 103, 71, 39, 39, 7, 133, 133, 101, 101, 69, 37, 37, 5,
+	 131, 131, 99, 99, 67, 35, 35, 3, 129, 129, 97, 97, 65, 33, 33, 1,
+	 },
+	{
+	 0, 64, 128, 128, 2, 66, 130, 130, 4, 68, 132, 132, 6, 70, 134, 134,
+	 8, 72, 136, 136, 10, 74, 138, 138, 12, 76, 140, 140, 14, 78, 142, 142,
+	 16, 80, 144, 144, 18, 82, 146, 146, 20, 84, 148, 148, 22, 86, 150, 150,
+	 24, 88, 152, 152, 26, 90, 154, 154, 28, 92, 156, 156, 30, 94, 158, 158,
+	 32, 96, 160, 160, 34, 98, 162, 162, 36, 100, 164, 164, 38, 102, 166, 166,
+	 40, 104, 168, 168, 42, 106, 170, 170, 44, 108, 172, 172, 46, 110, 174, 174,
+	 48, 112, 176, 176, 50, 114, 178, 178, 52, 116, 180, 180, 54, 118, 182, 182,
+	 56, 120, 184, 184, 58, 122, 186, 186, 60, 124, 188, 188, 62, 126, 190, 190,
+	 191, 191, 127, 63, 189, 189, 125, 61, 187, 187, 123, 59, 185, 185, 121, 57,
+	 183, 183, 119, 55, 181, 181, 117, 53, 179, 179, 115, 51, 177, 177, 113, 49,
+	 175, 175, 111, 47, 173, 173, 109, 45, 171, 171, 107, 43, 169, 169, 105, 41,
+	 167, 167, 103, 39, 165, 165, 101, 37, 163, 163, 99, 35, 161, 161, 97, 33,
+	 159, 159, 95, 31, 157, 157, 93, 29, 155, 155, 91, 27, 153, 153, 89, 25,
+	 151, 151, 87, 23, 149, 149, 85, 21, 147, 147, 83, 19, 145, 145, 81, 17,
+	 143, 143, 79, 15, 141, 141, 77, 13, 139, 139, 75, 11, 137, 137, 73, 9,
+	 135, 135, 71, 7, 133, 133, 69, 5, 131, 131, 67, 3, 129, 129, 65, 1,
+	 },
+	{
+	 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+	 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+	 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+	 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+	 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+	 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+	 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+	 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
+	 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
+	 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
+	 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
+	 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
+	 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
+	 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
+	 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+	 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
+	 },
+};
+
+/*
+	Color unquantization lookup tables.
+
+	color_unquantization_tables[level][code] is the 8-bit value that the
+	quantized 'code' expands to at quantization level 'level' (0..20). Only the
+	first N entries of each row are listed, where N is the number of codes of
+	that level (2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32, 40, 48, 64, 80, 96, 128,
+	160, 192, 256); the remaining entries of each 256-wide row are
+	zero-initialized and do not correspond to any code.
+*/
+const uint8_t color_unquantization_tables[21][256] = {
+	{
+	 0, 255,
+	 },
+	{
+	 0, 128, 255,
+	 },
+	{
+	 0, 85, 170, 255,
+	 },
+	{
+	 0, 64, 128, 192, 255,
+	 },
+	{
+	 0, 255, 51, 204, 102, 153,
+	 },
+	{
+	 0, 36, 73, 109, 146, 182, 219, 255,
+	 },
+	{
+	 0, 255, 28, 227, 56, 199, 84, 171, 113, 142,
+	 },
+	{
+	 0, 255, 69, 186, 23, 232, 92, 163, 46, 209, 116, 139,
+	 },
+	{
+	 0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255,
+	 },
+	{
+	 0, 255, 67, 188, 13, 242, 80, 175, 27, 228, 94, 161, 40, 215, 107, 148,
+	 54, 201, 121, 134,
+	 },
+	{
+	 0, 255, 33, 222, 66, 189, 99, 156, 11, 244, 44, 211, 77, 178, 110, 145,
+	 22, 233, 55, 200, 88, 167, 121, 134,
+	 },
+	{
+	 0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123,
+	 132, 140, 148, 156, 165, 173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255,
+	 },
+	{
+	 0, 255, 32, 223, 65, 190, 97, 158, 6, 249, 39, 216, 71, 184, 104, 151,
+	 13, 242, 45, 210, 78, 177, 110, 145, 19, 236, 52, 203, 84, 171, 117, 138,
+	 26, 229, 58, 197, 91, 164, 123, 132,
+	 },
+	{
+	 0, 255, 16, 239, 32, 223, 48, 207, 65, 190, 81, 174, 97, 158, 113, 142,
+	 5, 250, 21, 234, 38, 217, 54, 201, 70, 185, 86, 169, 103, 152, 119, 136,
+	 11, 244, 27, 228, 43, 212, 59, 196, 76, 179, 92, 163, 108, 147, 124, 131,
+	 },
+	{
+	 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
+	 65, 69, 73, 77, 81, 85, 89, 93, 97, 101, 105, 109, 113, 117, 121, 125,
+	 130, 134, 138, 142, 146, 150, 154, 158, 162, 166, 170, 174, 178, 182, 186, 190,
+	 195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, 239, 243, 247, 251, 255,
+	 },
+	{
+	 0, 255, 16, 239, 32, 223, 48, 207, 64, 191, 80, 175, 96, 159, 112, 143,
+	 3, 252, 19, 236, 35, 220, 51, 204, 67, 188, 83, 172, 100, 155, 116, 139,
+	 6, 249, 22, 233, 38, 217, 54, 201, 71, 184, 87, 168, 103, 152, 119, 136,
+	 9, 246, 25, 230, 42, 213, 58, 197, 74, 181, 90, 165, 106, 149, 122, 133,
+	 13, 242, 29, 226, 45, 210, 61, 194, 77, 178, 93, 162, 109, 146, 125, 130,
+	 },
+	{
+	 0, 255, 8, 247, 16, 239, 24, 231, 32, 223, 40, 215, 48, 207, 56, 199,
+	 64, 191, 72, 183, 80, 175, 88, 167, 96, 159, 104, 151, 112, 143, 120, 135,
+	 2, 253, 10, 245, 18, 237, 26, 229, 35, 220, 43, 212, 51, 204, 59, 196,
+	 67, 188, 75, 180, 83, 172, 91, 164, 99, 156, 107, 148, 115, 140, 123, 132,
+	 5, 250, 13, 242, 21, 234, 29, 226, 37, 218, 45, 210, 53, 202, 61, 194,
+	 70, 185, 78, 177, 86, 169, 94, 161, 102, 153, 110, 145, 118, 137, 126, 129,
+	 },
+	{
+	 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+	 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+	 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94,
+	 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126,
+	 129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149, 151, 153, 155, 157, 159,
+	 161, 163, 165, 167, 169, 171, 173, 175, 177, 179, 181, 183, 185, 187, 189, 191,
+	 193, 195, 197, 199, 201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223,
+	 225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255,
+	 },
+	{
+	 0, 255, 8, 247, 16, 239, 24, 231, 32, 223, 40, 215, 48, 207, 56, 199,
+	 64, 191, 72, 183, 80, 175, 88, 167, 96, 159, 104, 151, 112, 143, 120, 135,
+	 1, 254, 9, 246, 17, 238, 25, 230, 33, 222, 41, 214, 49, 206, 57, 198,
+	 65, 190, 73, 182, 81, 174, 89, 166, 97, 158, 105, 150, 113, 142, 121, 134,
+	 3, 252, 11, 244, 19, 236, 27, 228, 35, 220, 43, 212, 51, 204, 59, 196,
+	 67, 188, 75, 180, 83, 172, 91, 164, 99, 156, 107, 148, 115, 140, 123, 132,
+	 4, 251, 12, 243, 20, 235, 28, 227, 36, 219, 44, 211, 52, 203, 60, 195,
+	 68, 187, 76, 179, 84, 171, 92, 163, 100, 155, 108, 147, 116, 139, 124, 131,
+	 6, 249, 14, 241, 22, 233, 30, 225, 38, 217, 46, 209, 54, 201, 62, 193,
+	 70, 185, 78, 177, 86, 169, 94, 161, 102, 153, 110, 145, 118, 137, 126, 129,
+	 },
+	{
+	 0, 255, 4, 251, 8, 247, 12, 243, 16, 239, 20, 235, 24, 231, 28, 227,
+	 32, 223, 36, 219, 40, 215, 44, 211, 48, 207, 52, 203, 56, 199, 60, 195,
+	 64, 191, 68, 187, 72, 183, 76, 179, 80, 175, 84, 171, 88, 167, 92, 163,
+	 96, 159, 100, 155, 104, 151, 108, 147, 112, 143, 116, 139, 120, 135, 124, 131,
+	 1, 254, 5, 250, 9, 246, 13, 242, 17, 238, 21, 234, 25, 230, 29, 226,
+	 33, 222, 37, 218, 41, 214, 45, 210, 49, 206, 53, 202, 57, 198, 61, 194,
+	 65, 190, 69, 186, 73, 182, 77, 178, 81, 174, 85, 170, 89, 166, 93, 162,
+	 97, 158, 101, 154, 105, 150, 109, 146, 113, 142, 117, 138, 121, 134, 125, 130,
+	 2, 253, 6, 249, 10, 245, 14, 241, 18, 237, 22, 233, 26, 229, 30, 225,
+	 34, 221, 38, 217, 42, 213, 46, 209, 50, 205, 54, 201, 58, 197, 62, 193,
+	 66, 189, 70, 185, 74, 181, 78, 177, 82, 173, 86, 169, 90, 165, 94, 161,
+	 98, 157, 102, 153, 106, 149, 110, 145, 114, 141, 118, 137, 122, 133, 126, 129,
+	 },
+	{
+	 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+	 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+	 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+	 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+	 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+	 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+	 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+	 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
+	 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
+	 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
+	 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
+	 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
+	 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
+	 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
+	 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+	 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
+	 },
+};
+
+// quantization_mode_table[integercount/2][bits] holds the highest quantization
+// level for which compute_ise_bitcount() reports that 'integercount' integers need
+// at most 'bits' bits, or -1 if no level fits (row 0 is always -1). It is needed for
+// color decoding and color encoding; build_quantization_mode_table() populates it.
+int quantization_mode_table[17][128];
+
+// Populate quantization_mode_table: for every row (integer count / 2) and every
+// bit budget below 128, store the highest quantization level that fits, or -1.
+void build_quantization_mode_table(void)
+{
+	// Start with every slot marked as "no level fits".
+	for (int row = 0; row < 17; row++)
+	{
+		int *slots = quantization_mode_table[row];
+		for (int bits = 0; bits < 128; bits++)
+		{
+			slots[bits] = -1;
+		}
+	}
+
+	// Record each level at the exact bit count it needs for 2, 4, ..., 32 integers.
+	// Levels are visited in ascending order, so on a tie the higher level wins.
+	for (int level = 0; level < 21; level++)
+	{
+		for (int pairs = 1; pairs < 17; pairs++)
+		{
+			const int needed_bits = compute_ise_bitcount(pairs * 2, (quantization_method) level);
+			if (needed_bits >= 128)
+			{
+				continue;
+			}
+			quantization_mode_table[pairs][needed_bits] = level;
+		}
+	}
+
+	// Turn each row into a running maximum, so that a larger bit budget never
+	// yields a lower level than a smaller one.
+	for (int row = 0; row < 17; row++)
+	{
+		int best_level = -1;
+		for (int bits = 0; bits < 128; bits++)
+		{
+			int &slot = quantization_mode_table[row][bits];
+			if (slot > best_level)
+			{
+				best_level = slot;
+			}
+			slot = best_level;
+		}
+	}
+}
--- a/3rdparty/astc/astc_symbolic_physical.cpp
+++ b/3rdparty/astc/astc_symbolic_physical.cpp
@@ -0,0 +1,431 @@
+/*----------------------------------------------------------------------------*/
+/**
+ *	This confidential and proprietary software may be used only as
+ *	authorised by a licensing agreement from ARM Limited
+ *	(C) COPYRIGHT 2011-2012 ARM Limited
+ *	ALL RIGHTS RESERVED
+ *
+ *	The entire notice above must be reproduced on all authorised
+ *	copies and copies may only be made to the extent permitted
+ *	by a licensing agreement from ARM Limited.
+ *
+ *	@brief	Functions to convert a compressed block between the symbolic and
+ *			the physical representation.
+ */
+/*----------------------------------------------------------------------------*/
+
+#include "astc_codec_internals.h"
+
+// Write the low 'bitcount' bits of 'value' into the byte buffer 'ptr',
+// starting 'bitoffset' bits from its beginning (bit 0 is the least
+// significant bit of ptr[0]).
+//
+// The field may straddle at most two bytes, i.e. (bitoffset & 7) + bitcount
+// must not exceed 16. The second byte is only touched when the field really
+// extends into it, so a field that ends in the last byte of a buffer never
+// accesses memory past that buffer.
+static inline void write_bits(int value, int bitcount, int bitoffset, uint8_t * ptr)
+{
+	ptr += bitoffset >> 3;
+	bitoffset &= 7;
+
+	// position of the field inside the 16-bit window formed by ptr[0] and ptr[1].
+	// The mask is kept non-negative so that no negative value is ever shifted.
+	int field_mask = ((1 << bitcount) - 1) << bitoffset;
+	int field_value = (value & ((1 << bitcount) - 1)) << bitoffset;
+
+	ptr[0] = static_cast < uint8_t >((ptr[0] & ~field_mask) | (field_value & 0xFF));
+
+	// only touch the following byte if the field spills over into it.
+	if (bitoffset + bitcount > 8)
+		ptr[1] = static_cast < uint8_t >((ptr[1] & ~(field_mask >> 8)) | ((field_value >> 8) & 0xFF));
+}
+
+
+// Read a 'bitcount'-bit field from the byte buffer 'ptr', starting
+// 'bitoffset' bits from its beginning (bit 0 is the least significant bit
+// of ptr[0]), and return it as a non-negative integer.
+//
+// The field may straddle at most two bytes, i.e. (bitoffset & 7) + bitcount
+// must not exceed 16. The second byte is only read when the field really
+// extends into it, so a field that ends in the last byte of a buffer never
+// reads memory past that buffer.
+static inline int read_bits(int bitcount, int bitoffset, const uint8_t * ptr)
+{
+	ptr += bitoffset >> 3;
+	bitoffset &= 7;
+
+	int window = ptr[0];
+	if (bitoffset + bitcount > 8)
+		window |= ptr[1] << 8;
+
+	return (window >> bitoffset) & ((1 << bitcount) - 1);
+}
+
+
+// Reverse the order of the lowest 8 bits of 'p'. Any higher bits of the
+// input are ignored; the result is always in the range 0..255.
+int bitrev8(int p)
+{
+	int v = p & 0xFF;
+	v = ((v & 0x55) << 1) | ((v & 0xAA) >> 1);	// swap neighbouring bits
+	v = ((v & 0x33) << 2) | ((v & 0xCC) >> 2);	// swap neighbouring bit pairs
+	v = ((v & 0x0F) << 4) | ((v & 0xF0) >> 4);	// swap the two nibbles
+	return v;
+}
+
+
+
+
+// Pack a symbolic compressed block into its 16-byte physical representation.
+//
+// xdim, ydim, zdim : block dimensions; used to look up the block-size descriptor.
+// sc               : symbolic block to pack. A block_mode of -2 or -1 selects a
+//                    constant-color block (UNORM16 or FP16 respectively); any
+//                    other value is used as an index into the descriptor's
+//                    block_modes table.
+//
+// Returns the packed block by value.
+physical_compressed_block symbolic_to_physical(int xdim, int ydim, int zdim, const symbolic_compressed_block * sc)
+{
+	int i, j;
+	physical_compressed_block res;
+
+
+	if (sc->block_mode == -2)
+	{
+		// UNORM16 constant-color block.
+		// This encodes separate constant-color blocks. There is currently
+		// no attempt to coalesce them into larger void-extents.
+
+		// fixed first 8 bytes of the block; differs from the FP16 case below
+		// only in the second byte.
+		static const uint8_t cbytes[8] = { 0xFC, 0xFD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
+		for (i = 0; i < 8; i++)
+			res.data[i] = cbytes[i];
+
+		// the four 16-bit color values follow, low byte first.
+		for (i = 0; i < 4; i++)
+		{
+			res.data[2 * i + 8] = sc->constant_color[i] & 0xFF;
+			res.data[2 * i + 9] = (sc->constant_color[i] >> 8) & 0xFF;
+		}
+		return res;
+	}
+
+
+	if (sc->block_mode == -1)
+	{
+		// FP16 constant-color block.
+		// This encodes separate constant-color blocks. There is currently
+		// no attempt to coalesce them into larger void-extents.
+
+		static const uint8_t cbytes[8] = { 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
+		for (i = 0; i < 8; i++)
+			res.data[i] = cbytes[i];
+
+		// the four 16-bit color values follow, low byte first.
+		for (i = 0; i < 4; i++)
+		{
+			res.data[2 * i + 8] = sc->constant_color[i] & 0xFF;
+			res.data[2 * i + 9] = (sc->constant_color[i] >> 8) & 0xFF;
+		}
+		return res;
+	}
+
+
+
+	int partition_count = sc->partition_count;
+
+	// first, compress the weights. They are encoded as an ordinary
+	// integer-sequence, then bit-reversed
+	uint8_t weightbuf[16];
+	for (i = 0; i < 16; i++)
+		weightbuf[i] = 0;
+
+	const block_size_descriptor *bsd = get_block_size_descriptor(xdim, ydim, zdim);
+	const decimation_table *const *ixtab2 = bsd->decimation_tables;
+
+
+	int weight_count = ixtab2[bsd->block_modes[sc->block_mode].decimation_mode]->num_weights;
+	int weight_quantization_method = bsd->block_modes[sc->block_mode].quantization_mode;
+	int is_dual_plane = bsd->block_modes[sc->block_mode].is_dual_plane;
+
+	// with two planes, each weight position carries one weight per plane.
+	int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count;
+
+	int bits_for_weights = compute_ise_bitcount(real_weight_count,
+												(quantization_method) weight_quantization_method);
+
+
+	if (is_dual_plane)
+	{
+		// interleave the two planes: plane 1 at even positions, plane 2 at odd positions.
+		uint8_t weights[64];
+		for (i = 0; i < weight_count; i++)
+		{
+			weights[2 * i] = sc->plane1_weights[i];
+			weights[2 * i + 1] = sc->plane2_weights[i];
+		}
+		encode_ise(weight_quantization_method, real_weight_count, weights, weightbuf, 0);
+	}
+	else
+	{
+		encode_ise(weight_quantization_method, weight_count, sc->plane1_weights, weightbuf, 0);
+	}
+
+	// reverse both the byte order and the bit order within each byte, so that
+	// the weight bits end up at the top of the block, running downwards.
+	// This must happen before the header writes below, which overwrite the
+	// low bits of the block.
+	for (i = 0; i < 16; i++)
+		res.data[i] = bitrev8(weightbuf[15 - i]);
+
+	// header: 11 bits of block mode, then 2 bits of (partition count - 1).
+	write_bits(sc->block_mode, 11, 0, res.data);
+	write_bits(partition_count - 1, 2, 11, res.data);
+
+	// bit position just below the weight data; lowered further if extra
+	// color-format bits are placed there.
+	int below_weights_pos = 128 - bits_for_weights;
+
+	// encode partition index and color endpoint types for blocks with
+	// 2 or more partitions.
+	if (partition_count > 1)
+	{
+		// the partition index is written in two pieces: 6 bits, then the rest.
+		write_bits(sc->partition_index, 6, 13, res.data);
+		write_bits(sc->partition_index >> 6, PARTITION_BITS - 6, 19, res.data);
+
+		if (sc->color_formats_matched)
+		{
+			// all partitions share one format: the two low bits are zero and
+			// the format itself occupies the next four bits.
+			write_bits(sc->color_formats[0] << 2, 6, 13 + PARTITION_BITS, res.data);
+		}
+		else
+		{
+			// go through the selected endpoint type classes for each partition
+			// in order to determine the lowest class present.
+			int low_class = 4;
+			for (i = 0; i < partition_count; i++)
+			{
+				int class_of_format = sc->color_formats[i] >> 2;
+				if (class_of_format < low_class)
+					low_class = class_of_format;
+			}
+			if (low_class == 3)
+				low_class = 2;
+			// layout of encoded_type, from bit 0 upwards:
+			//   2 bits                 : low_class + 1
+			//   1 bit per partition    : class of the partition relative to low_class
+			//   2 bits per partition   : low two bits of the partition's format
+			int encoded_type = low_class + 1;
+			int bitpos = 2;
+			for (i = 0; i < partition_count; i++)
+			{
+				int classbit_of_format = (sc->color_formats[i] >> 2) - low_class;
+
+				encoded_type |= classbit_of_format << bitpos;
+				bitpos++;
+			}
+			for (i = 0; i < partition_count; i++)
+			{
+				int lowbits_of_format = sc->color_formats[i] & 3;
+				encoded_type |= lowbits_of_format << bitpos;
+				bitpos += 2;
+			}
+			// the low 6 bits go next to the partition index; the remaining
+			// bits are stored directly below the weight data.
+			int encoded_type_lowpart = encoded_type & 0x3F;
+			int encoded_type_highpart = encoded_type >> 6;
+			int encoded_type_highpart_size = (3 * partition_count) - 4;
+			int encoded_type_highpart_pos = 128 - bits_for_weights - encoded_type_highpart_size;
+			write_bits(encoded_type_lowpart, 6, 13 + PARTITION_BITS, res.data);
+			write_bits(encoded_type_highpart, encoded_type_highpart_size, encoded_type_highpart_pos, res.data);
+
+			below_weights_pos -= encoded_type_highpart_size;
+		}
+	}
+
+	else
+		// single partition: the 4-bit color format directly follows the partition count.
+		write_bits(sc->color_formats[0], 4, 13, res.data);
+
+	// in dual-plane mode, encode the color component of the second plane of weights
+	if (is_dual_plane)
+		write_bits(sc->plane2_color_component, 2, below_weights_pos - 2, res.data);
+
+	// finally, encode the color bits
+	// first, get hold of all the color components to encode
+	uint8_t values_to_encode[32];
+	int valuecount_to_encode = 0;
+	for (i = 0; i < sc->partition_count; i++)
+	{
+		// number of color values for this partition, derived from its format class.
+		int vals = 2 * (sc->color_formats[i] >> 2) + 2;
+		for (j = 0; j < vals; j++)
+			values_to_encode[j + valuecount_to_encode] = sc->color_values[i][j];
+		valuecount_to_encode += vals;
+	}
+	// then, encode an ISE based on them.
+	// the start bit depends on whether a partition index is present.
+	encode_ise(sc->color_quantization_level, valuecount_to_encode, values_to_encode, res.data, (sc->partition_count == 1 ? 17 : 19 + PARTITION_BITS));
+
+	return res;
+}
+
+
+// Unpack a 16-byte physical block into its symbolic representation.
+//
+// xdim, ydim, zdim : block dimensions; used to look up the block-size
+//                    descriptor. zdim == 1 selects the 2D void-extent checks,
+//                    any other value the 3D ones.
+// pb               : physical block to unpack.
+// res              : receives the symbolic block. res->error_block is set to 1
+//                    when the block is found to be invalid and to 0 otherwise;
+//                    on some error paths the remaining fields are still filled
+//                    in, so error_block has to be checked before they are used.
+void physical_to_symbolic(int xdim, int ydim, int zdim, physical_compressed_block pb, symbolic_compressed_block * res)
+{
+	uint8_t bswapped[16];
+	int i, j;
+
+	res->error_block = 0;
+
+	// get hold of the block-size descriptor and the decimation tables.
+	const block_size_descriptor *bsd = get_block_size_descriptor(xdim, ydim, zdim);
+	const decimation_table *const *ixtab2 = bsd->decimation_tables;
+
+	// extract header fields
+	int block_mode = read_bits(11, 0, pb.data);
+
+
+	if ((block_mode & 0x1FF) == 0x1FC)
+	{
+		// void-extent block!
+
+		// check what format the data has
+		if (block_mode & 0x200)
+			res->block_mode = -1;	// floating-point
+		else
+			res->block_mode = -2;	// unorm16.
+
+		res->partition_count = 0;
+		// the four 16-bit color values are stored low byte first in bytes 8..15.
+		for (i = 0; i < 4; i++)
+		{
+			res->constant_color[i] = pb.data[2 * i + 8] | (pb.data[2 * i + 9] << 8);
+		}
+
+		// additionally, check that the void-extent coordinates are consistent:
+		// each low coordinate must be below its high coordinate, unless all
+		// coordinate fields consist of 1-bits only.
+		if (zdim == 1)
+		{
+			// 2D void-extent
+			int rsvbits = read_bits(2, 10, pb.data);
+			if (rsvbits != 3)
+				res->error_block = 1;
+
+			// four 13-bit coordinates, each read as an 8-bit and a 5-bit piece.
+			int vx_low_s = read_bits(8, 12, pb.data) | (read_bits(5, 12 + 8, pb.data) << 8);
+			int vx_high_s = read_bits(8, 25, pb.data) | (read_bits(5, 25 + 8, pb.data) << 8);
+			int vx_low_t = read_bits(8, 38, pb.data) | (read_bits(5, 38 + 8, pb.data) << 8);
+			int vx_high_t = read_bits(8, 51, pb.data) | (read_bits(5, 51 + 8, pb.data) << 8);
+
+			int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF && vx_low_t == 0x1FFF && vx_high_t == 0x1FFF;
+
+			if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t) && !all_ones)
+				res->error_block = 1;
+		}
+		else
+		{
+			// 3D void-extent
+			// six 9-bit coordinates, stored back to back starting at bit 10.
+			int vx_low_s = read_bits(9, 10, pb.data);
+			int vx_high_s = read_bits(9, 19, pb.data);
+			int vx_low_t = read_bits(9, 28, pb.data);
+			int vx_high_t = read_bits(9, 37, pb.data);
+			int vx_low_p = read_bits(9, 46, pb.data);
+			int vx_high_p = read_bits(9, 55, pb.data);
+
+			int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF && vx_low_t == 0x1FF && vx_high_t == 0x1FF && vx_low_p == 0x1FF && vx_high_p == 0x1FF;
+
+			if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t || vx_low_p >= vx_high_p) && !all_ones)
+				res->error_block = 1;
+		}
+
+		return;
+	}
+
+	// block modes that the descriptor does not permit for decoding are errors.
+	if (bsd->block_modes[block_mode].permit_decode == 0)
+	{
+		res->error_block = 1;
+		return;
+	}
+
+	int weight_count = ixtab2[bsd->block_modes[block_mode].decimation_mode]->num_weights;
+	int weight_quantization_method = bsd->block_modes[block_mode].quantization_mode;
+	int is_dual_plane = bsd->block_modes[block_mode].is_dual_plane;
+
+	// with two planes, each weight position carries one weight per plane.
+	int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count;
+
+	int partition_count = read_bits(2, 11, pb.data) + 1;
+
+	res->block_mode = block_mode;
+	res->partition_count = partition_count;
+
+	// the weights are stored from the top of the block downwards with their
+	// bits reversed; undo both so that they can be decoded as an ordinary
+	// integer sequence starting at bit 0.
+	for (i = 0; i < 16; i++)
+		bswapped[i] = bitrev8(pb.data[15 - i]);
+
+	int bits_for_weights = compute_ise_bitcount(real_weight_count,
+												(quantization_method) weight_quantization_method);
+
+	// bit position just below the weight data.
+	int below_weights_pos = 128 - bits_for_weights;
+
+	if (is_dual_plane)
+	{
+		// the two planes are interleaved: plane 1 at even positions, plane 2 at odd positions.
+		uint8_t indices[64];
+		decode_ise(weight_quantization_method, real_weight_count, bswapped, indices, 0);
+		for (i = 0; i < weight_count; i++)
+		{
+			res->plane1_weights[i] = indices[2 * i];
+			res->plane2_weights[i] = indices[2 * i + 1];
+		}
+	}
+	else
+	{
+		decode_ise(weight_quantization_method, weight_count, bswapped, res->plane1_weights, 0);
+	}
+
+	// two weight planes combined with four partitions is flagged as an error;
+	// decoding continues regardless.
+	if (is_dual_plane && partition_count == 4)
+		res->error_block = 1;
+
+
+
+	res->color_formats_matched = 0;
+
+	// then, determine the format of each endpoint pair
+	int color_formats[4];
+	int encoded_type_highpart_size = 0;
+	if (partition_count == 1)
+	{
+		color_formats[0] = read_bits(4, 13, pb.data);
+		res->partition_index = 0;
+	}
+	else
+	{
+		// the encoded type is split: 6 bits next to the partition index and
+		// the remainder directly below the weight data.
+		encoded_type_highpart_size = (3 * partition_count) - 4;
+		below_weights_pos -= encoded_type_highpart_size;
+		int encoded_type = read_bits(6, 13 + PARTITION_BITS, pb.data) | (read_bits(encoded_type_highpart_size, below_weights_pos, pb.data) << 6);
+		int baseclass = encoded_type & 0x3;
+		if (baseclass == 0)
+		{
+			// all partitions share the same format; there is no high part in
+			// this case, so undo the adjustment made above.
+			for (i = 0; i < partition_count; i++)
+			{
+				color_formats[i] = (encoded_type >> 2) & 0xF;
+			}
+			below_weights_pos += encoded_type_highpart_size;
+			res->color_formats_matched = 1;
+			encoded_type_highpart_size = 0;
+		}
+		else
+		{
+			// one class bit per partition (relative to baseclass - 1),
+			// followed by two low format bits per partition.
+			int bitpos = 2;
+			baseclass--;
+			for (i = 0; i < partition_count; i++)
+			{
+				color_formats[i] = (((encoded_type >> bitpos) & 1) + baseclass) << 2;
+				bitpos++;
+			}
+			for (i = 0; i < partition_count; i++)
+			{
+				color_formats[i] |= (encoded_type >> bitpos) & 3;
+				bitpos += 2;
+			}
+		}
+		// the partition index is stored in two pieces: 6 bits, then the rest.
+		res->partition_index = read_bits(6, 13, pb.data) | (read_bits(PARTITION_BITS - 6, 19, pb.data) << 6);
+
+	}
+	for (i = 0; i < partition_count; i++)
+		res->color_formats[i] = color_formats[i];
+
+
+	// then, determine the number of integers we need to unpack for the endpoint pairs
+	int color_integer_count = 0;
+	for (i = 0; i < partition_count; i++)
+	{
+		int endpoint_class = color_formats[i] >> 2;
+		color_integer_count += (endpoint_class + 1) * 2;
+	}
+
+	// more than 18 color integers is flagged as an error; decoding continues regardless.
+	if (color_integer_count > 18)
+		res->error_block = 1;
+
+	// then, determine the color endpoint format to use for these integers
+	// color_bits_arr is indexed by partition_count (1..4); entry 0 is unused.
+	static const int color_bits_arr[5] = { -1, 115 - 4, 113 - 4 - PARTITION_BITS, 113 - 4 - PARTITION_BITS, 113 - 4 - PARTITION_BITS };
+	int color_bits = color_bits_arr[partition_count] - bits_for_weights - encoded_type_highpart_size;
+	if (is_dual_plane)
+		color_bits -= 2;
+	if (color_bits < 0)
+		color_bits = 0;
+
+	int color_quantization_level = quantization_mode_table[color_integer_count >> 1][color_bits];
+	res->color_quantization_level = color_quantization_level;
+	// quantization levels below 4 are flagged as an error.
+	if (color_quantization_level < 4)
+		res->error_block = 1;
+
+
+	// then unpack the integer-bits
+	// the start bit depends on whether a partition index is present.
+	uint8_t values_to_decode[32];
+	decode_ise(color_quantization_level, color_integer_count, pb.data, values_to_decode, (partition_count == 1 ? 17 : 19 + PARTITION_BITS));
+
+	// and distribute them over the endpoint types
+	int valuecount_to_decode = 0;
+
+	for (i = 0; i < partition_count; i++)
+	{
+		int vals = 2 * (color_formats[i] >> 2) + 2;
+		for (j = 0; j < vals; j++)
+			res->color_values[i][j] = values_to_decode[j + valuecount_to_decode];
+		valuecount_to_decode += vals;
+	}
+
+	// get hold of color component for second-plane in the case of dual plane of weights.
+	if (is_dual_plane)
+		res->plane2_color_component = read_bits(2, below_weights_pos - 2, pb.data);
+
+}
--- a/3rdparty/astc/astc_weight_align.cpp
+++ b/3rdparty/astc/astc_weight_align.cpp
@@ -0,0 +1,598 @@
+/*----------------------------------------------------------------------------*/
+/**
+ *	This confidential and proprietary software may be used only as
+ *	authorised by a licensing agreement from ARM Limited
+ *	(C) COPYRIGHT 2011-2012 ARM Limited
+ *	ALL RIGHTS RESERVED
+ *
+ *	The entire notice above must be reproduced on all authorised
+ *	copies and copies may only be made to the extent permitted
+ *	by a licensing agreement from ARM Limited.
+ *
+ *	@brief	Angular-sum algorithm for weight alignment.
+ *
+ *			This algorithm works as follows:
+ *			* we compute a complex number P as (cos s*i, sin s*i) for each
+ *			  weight, where i is the input value and s is a scaling factor
+ *			  based on the spacing between the weights.
+ *			* we then add together complex numbers for all the weights.
+ *			* we then compute the length and angle of the resulting sum.
+ *
+ *			This should produce the following results:
+ *			* perfect alignment results in a vector whose length is equal to
+ *			  the sum of lengths of all inputs
+ *			* even distribution results in a vector of length 0.
+ *			* all samples identical results in perfect alignment for every
+ *			  scaling.
+ *
+ *			For each scaling factor within a given set, we compute an alignment
+ *			factor from 0 to 1. This should then result in some scalings standing
+ *			out as having particularly good alignment factors; we can use this to
+ *			produce a set of candidate scale/shift values for various quantization
+ *			levels; we should then actually try them and see what happens.
+ *
+ *			Assuming N quantization steps, the scaling factor becomes s=2*PI*(N-1);
+ *			we should probably have about 1 scaling factor for every 1/4
+ *			quantization step (perhaps 1/8 for low levels of quantization)
+ */
+/*----------------------------------------------------------------------------*/
+
+#include <math.h>
+#include "astc_codec_internals.h"
+
+#ifdef DEBUG_PRINT_DIAGNOSTICS
+	#include <stdio.h>
+#endif
+
+// Candidate scaling factors examined by the angular algorithm, in ascending
+// order. The spacing is 1/8 from 1 to 2, 1/4 from 2 to 8 and 1/2 from 8 to
+// 35.5. prepare_angular_tables() stores the reciprocal of each entry in
+// stepsizes[].
+static const float angular_steppings[] = {
+	1.0, 1.125,
+	1.25, 1.375,
+	1.5, 1.625,
+	1.75, 1.875,
+
+	2.0, 2.25, 2.5, 2.75,
+	3.0, 3.25, 3.5, 3.75,
+	4.0, 4.25, 4.5, 4.75,
+	5.0, 5.25, 5.5, 5.75,
+	6.0, 6.25, 6.5, 6.75,
+	7.0, 7.25, 7.5, 7.75,
+
+	8.0, 8.5,
+	9.0, 9.5,
+	10.0, 10.5,
+	11.0, 11.5,
+	12.0, 12.5,
+	13.0, 13.5,
+	14.0, 14.5,
+	15.0, 15.5,
+	16.0, 16.5,
+	17.0, 17.5,
+	18.0, 18.5,
+	19.0, 19.5,
+	20.0, 20.5,
+	21.0, 21.5,
+	22.0, 22.5,
+	23.0, 23.5,
+	24.0, 24.5,
+	25.0, 25.5,
+	26.0, 26.5,
+	27.0, 27.5,
+	28.0, 28.5,
+	29.0, 29.5,
+	30.0, 30.5,
+	31.0, 31.5,
+	32.0, 32.5,
+	33.0, 33.5,
+	34.0, 34.5,
+	35.0, 35.5,
+};
+
+// number of entries in angular_steppings[].
+#define ANGULAR_STEPS ((int)(sizeof(angular_steppings)/sizeof(angular_steppings[0])))
+
+// filled in by prepare_angular_tables(): stepsizes[i] is the reciprocal of
+// angular_steppings[i], and stepsizes_sqr[i] is the square of stepsizes[i].
+static float stepsizes[ANGULAR_STEPS];
+static float stepsizes_sqr[ANGULAR_STEPS];
+
+// filled in by prepare_angular_tables(): for each of the 13 quantization
+// levels, the number of angular steps that are examined for that level.
+static int max_angular_steps_needed_for_quant_level[13];
+
+// we store sine/cosine values for 64 possible weight values; this causes
+// slight quality loss compared to using sin() and cos() directly.
+
+#define SINCOS_STEPS 64
+
+// both tables are indexed as [weight value][angular step].
+static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
+static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
+
+// Fill in the lookup tables used by the angular algorithm: the step sizes and
+// their squares, the sine/cosine tables, and the number of angular steps that
+// has to be examined for each quantization level.
+void prepare_angular_tables(void)
+{
+	// indexed by a number of quantization steps. Only the entries 2..36 are
+	// written below, which covers every value listed in steps_of_level.
+	int steps_needed_for_quant_steps[40];
+
+	for (int step = 0; step < ANGULAR_STEPS; step++)
+	{
+		const float stepping = angular_steppings[step];
+
+		stepsizes[step] = 1.0f / stepping;
+		stepsizes_sqr[step] = stepsizes[step] * stepsizes[step];
+
+		// one table column per angular step, one row per possible weight value.
+		for (int weight = 0; weight < SINCOS_STEPS; weight++)
+		{
+			sin_table[weight][step] = static_cast < float >(sin((2.0f * M_PI / (SINCOS_STEPS - 1.0f)) * stepping * weight));
+			cos_table[weight][step] = static_cast < float >(cos((2.0f * M_PI / (SINCOS_STEPS - 1.0f)) * stepping * weight));
+		}
+
+		// several steppings share the same integer part; the last one of
+		// them determines the value that is kept for that entry.
+		const int quant_steps = static_cast < int >(floor(stepping)) + 1;
+		const int steps_to_examine = MIN(step + 1, ANGULAR_STEPS - 1);
+		steps_needed_for_quant_steps[quant_steps] = steps_to_examine;
+	}
+
+
+	// yes, the next-to-last entry is supposed to have the value 33. This because under
+	// ASTC, the 32-weight mode leaves a double-sized hole in the middle of the
+	// weight space, so we are better off matching 33 weights than 32.
+	static const int steps_of_level[] = { 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 33, 36 };
+
+	for (int level = 0; level < 13; level++)
+	{
+		const int quant_steps = steps_of_level[level];
+		max_angular_steps_needed_for_quant_level[level] = steps_needed_for_quant_steps[quant_steps];
+	}
+}
+
+
+// Overlay of a 32-bit float with signed and unsigned 32-bit integers. Used in
+// this file to inspect the bit pattern of a float after a large constant has
+// been added to it.
+union if32
+{
+	float f;
+	int32_t s;
+	uint32_t u;
+};
+
+
+// For each of the first 'max_angular_steps' angular steps, add up one
+// weighted (cos, sin) vector per sample, then turn the angle of the
+// resulting vector into an offset and store it in offsets[].
+//
+// samples and sample_weights hold 'samplecount' entries each; offsets must
+// have room for 'max_angular_steps' entries.
+
+/* static inline */
+void compute_angular_offsets(int samplecount, const float *samples, const float *sample_weights, int max_angular_steps, float *offsets)
+{
+	float sum_cos[ANGULAR_STEPS];
+	float sum_sin[ANGULAR_STEPS];
+
+	for (int step = 0; step < max_angular_steps; step++)
+	{
+		sum_cos[step] = 0;
+		sum_sin[step] = 0;
+	}
+
+
+	// accumulate the weighted vectors, one sample at a time.
+	for (int s = 0; s < samplecount; s++)
+	{
+		const float weight = sample_weights[s];
+
+		// adding 12582912.0f (1.5 * 2^23) leaves the rounded sample position
+		// in the low bits of the float's bit pattern; the low 6 bits select
+		// the table row.
+		if32 rounder;
+		rounder.f = (samples[s] * (SINCOS_STEPS - 1.0f)) + 12582912.0f;
+		const unsigned int table_row = rounder.u & 0x3F;
+
+		const float *row_sin = sin_table[table_row];
+		const float *row_cos = cos_table[table_row];
+
+		for (int step = 0; step < max_angular_steps; step++)
+		{
+			const float c = row_cos[step];
+			const float sn = row_sin[step];
+
+			sum_cos[step] += c * weight;
+			sum_sin[step] += sn * weight;
+		}
+	}
+
+	// convert each accumulated vector into an offset.
+	for (int step = 0; step < max_angular_steps; step++)
+	{
+		float angle = atan2(sum_sin[step], sum_cos[step]);	// positive angle -> positive offset
+		offsets[step] = angle * (stepsizes[step] * (1.0f / (2.0f * (float)M_PI)));
+	}
+}
+
+
+
+// for a given step-size and a given offset, compute the
+// lowest and highest weight that results from quantizing using the stepsize & offset.
+// also, compute the resulting error.
+//
+// Inputs:
+//   samplecount, samples, sample_weights : the samples and their weights.
+//   max_angular_steps                    : number of angular steps to process.
+//   offsets                              : one offset per angular step.
+// Outputs, one entry per angular step:
+//   lowest_weight, highest_weight : smallest and largest quantized weight index.
+//   error                         : weighted squared rounding error.
+//   cut_low_weight_error,
+//   cut_high_weight_error         : see the comment where they are computed.
+// The three error outputs are scaled by stepsizes_sqr[] at the end.
+
+
+/* static inline */
+void compute_lowest_and_highest_weight(int samplecount, const float *samples, const float *sample_weights,
+									  int max_angular_steps, const float *offsets,
+									  int8_t * lowest_weight, int8_t * highest_weight,
+									  float *error, float *cut_low_weight_error, float *cut_high_weight_error)
+{
+	int i;
+
+	int sp;
+
+	// per-weight accumulators, indexed by (weight + 12). They are shared by
+	// all angular steps and cleared again at the end of each step.
+	float error_from_forcing_weight_down[60];
+	float error_from_forcing_weight_either_way[60];
+	for (i = 0; i < 60; i++)
+	{
+		error_from_forcing_weight_down[i] = 0;
+		error_from_forcing_weight_either_way[i] = 0;
+	}
+
+	// weight + 12
+	// the table is indexed with the low 8 bits of the bit pattern of the
+	// rounded value. It repeats after 128 entries and clamps the weight to
+	// the range -12..43, i.e. the stored values are in the range 0..55.
+	static const unsigned int idxtab[256] = {
+
+		12, 13, 14, 15, 16, 17, 18, 19,
+		20, 21, 22, 23, 24, 25, 26, 27,
+		28, 29, 30, 31, 32, 33, 34, 35,
+		36, 37, 38, 39, 40, 41, 42, 43,
+		44, 45, 46, 47, 48, 49, 50, 51,
+		52, 53, 54, 55, 55, 55, 55, 55,
+		55, 55, 55, 55, 55, 55, 55, 55,
+		55, 55, 55, 55, 55, 55, 55, 55,
+		0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 1, 2, 3,
+		4, 5, 6, 7, 8, 9, 10, 11,
+
+		12, 13, 14, 15, 16, 17, 18, 19,
+		20, 21, 22, 23, 24, 25, 26, 27,
+		28, 29, 30, 31, 32, 33, 34, 35,
+		36, 37, 38, 39, 40, 41, 42, 43,
+		44, 45, 46, 47, 48, 49, 50, 51,
+		52, 53, 54, 55, 55, 55, 55, 55,
+		55, 55, 55, 55, 55, 55, 55, 55,
+		55, 55, 55, 55, 55, 55, 55, 55,
+		0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 1, 2, 3,
+		4, 5, 6, 7, 8, 9, 10, 11
+	};
+
+
+
+	for (sp = 0; sp < max_angular_steps; sp++)
+	{
+		// start with an empty range: minimum at the top, maximum at the bottom.
+		unsigned int minidx_bias12 = 55;
+		unsigned int maxidx_bias12 = 0;
+
+		float errval = 0.0f;
+
+		float rcp_stepsize = angular_steppings[sp];
+		float offset = offsets[sp];
+
+		float scaled_offset = rcp_stepsize * offset;
+
+
+		// the samples are processed two at a time; a leftover sample is
+		// handled separately below.
+		for (i = 0; i < samplecount - 1; i += 2)
+		{
+			float wt1 = sample_weights[i];
+			float wt2 = sample_weights[i + 1];
+			if32 p1, p2;
+			float sval1 = (samples[i] * rcp_stepsize) - scaled_offset;
+			float sval2 = (samples[i + 1] * rcp_stepsize) - scaled_offset;
+			p1.f = sval1 + 12582912.0f;	// FP representation abuse to avoid floor() and float->int conversion
+			p2.f = sval2 + 12582912.0f;	// FP representation abuse to avoid floor() and float->int conversion
+			// subtracting the constant again yields the rounded value as a float.
+			float isval1 = p1.f - 12582912.0f;
+			float isval2 = p2.f - 12582912.0f;
+			// signed distance from the sample to its rounded value.
+			float dif1 = sval1 - isval1;
+			float dif2 = sval2 - isval2;
+
+			errval += (dif1 * wt1) * dif1;
+			errval += (dif2 * wt2) * dif2;
+
+			// table lookups that really perform a minmax function.
+			unsigned int idx1_bias12 = idxtab[p1.u & 0xFF];
+			unsigned int idx2_bias12 = idxtab[p2.u & 0xFF];
+
+			if (idx1_bias12 < minidx_bias12)
+				minidx_bias12 = idx1_bias12;
+			if (idx1_bias12 > maxidx_bias12)
+				maxidx_bias12 = idx1_bias12;
+			if (idx2_bias12 < minidx_bias12)
+				minidx_bias12 = idx2_bias12;
+			if (idx2_bias12 > maxidx_bias12)
+				maxidx_bias12 = idx2_bias12;
+
+			error_from_forcing_weight_either_way[idx1_bias12] += wt1;
+			error_from_forcing_weight_down[idx1_bias12] += (dif1 * wt1);
+
+			error_from_forcing_weight_either_way[idx2_bias12] += wt2;
+			error_from_forcing_weight_down[idx2_bias12] += (dif2 * wt2);
+		}
+
+		// odd sample count: process the last sample the same way.
+		if (samplecount & 1)
+		{
+			i = samplecount - 1;
+			float wt = sample_weights[i];
+			if32 p;
+			float sval = (samples[i] * rcp_stepsize) - scaled_offset;
+			p.f = sval + 12582912.0f;	// FP representation abuse to avoid floor() and float->int conversion
+			float isval = p.f - 12582912.0f;
+			float dif = sval - isval;
+
+			errval += (dif * wt) * dif;
+
+			unsigned int idx_bias12 = idxtab[p.u & 0xFF];
+
+			if (idx_bias12 < minidx_bias12)
+				minidx_bias12 = idx_bias12;
+			if (idx_bias12 > maxidx_bias12)
+				maxidx_bias12 = idx_bias12;
+
+			error_from_forcing_weight_either_way[idx_bias12] += wt;
+			error_from_forcing_weight_down[idx_bias12] += dif * wt;
+		}
+
+
+		// remove the bias of 12 again when storing the results.
+		lowest_weight[sp] = (int)minidx_bias12 - 12;
+		highest_weight[sp] = (int)maxidx_bias12 - 12;
+		error[sp] = errval;
+
+		// the cut_(lowest/highest)_weight_error indicate the error that results from
+		// forcing samples that should have had the (lowest/highest) weight value
+		// one step (up/down).
+		cut_low_weight_error[sp] = error_from_forcing_weight_either_way[minidx_bias12] - 2.0f * error_from_forcing_weight_down[minidx_bias12];
+		cut_high_weight_error[sp] = error_from_forcing_weight_either_way[maxidx_bias12] + 2.0f * error_from_forcing_weight_down[maxidx_bias12];
+
+		// clear out the error-from-forcing values we actually used in this pass
+		// so that these are clean for the next pass.
+		// the clearing is done in groups of four entries, starting at the
+		// minimum index rounded down to a multiple of four.
+		unsigned int ui;
+		for (ui = minidx_bias12 & ~0x3; ui <= maxidx_bias12; ui += 4)
+		{
+			error_from_forcing_weight_either_way[ui] = 0;
+			error_from_forcing_weight_down[ui] = 0;
+			error_from_forcing_weight_either_way[ui + 1] = 0;
+			error_from_forcing_weight_down[ui + 1] = 0;
+			error_from_forcing_weight_either_way[ui + 2] = 0;
+			error_from_forcing_weight_down[ui + 2] = 0;
+			error_from_forcing_weight_either_way[ui + 3] = 0;
+			error_from_forcing_weight_down[ui + 3] = 0;
+		}
+	}
+
+
+	// the errors above were accumulated in units of steps; scale them by the
+	// squared step size of each angular step.
+	for (sp = 0; sp < max_angular_steps; sp++)
+	{
+		float errscale = stepsizes_sqr[sp];
+		error[sp] *= errscale;
+		cut_low_weight_error[sp] *= errscale;
+		cut_high_weight_error[sp] *= errscale;
+	}
+}
+
+
+
+// main function for running the angular algorithm.
+//
+// For every quantization level from 0 up to and including
+// 'max_quantization_level', pick the angular step with the lowest error and
+// store the resulting lowest and highest weight value in low_value[] and
+// high_value[]. If no angular step is available for a level, an error
+// message is printed and the process exits.
+
+
+void compute_angular_endpoints_for_quantization_levels(int samplecount, const float *samples, const float *sample_weights, int max_quantization_level, float low_value[12], float high_value[12])
+{
+	int i;
+
+
+	// the search is run one level above the requested maximum - needs refinement
+	const int search_level = max_quantization_level + 1;
+
+	static const int quantization_steps_for_level[13] = { 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 33, 36 };
+	const int max_quantization_steps = quantization_steps_for_level[search_level];
+	const int max_angular_steps = max_angular_steps_needed_for_quant_level[search_level];
+
+	float offsets[ANGULAR_STEPS];
+	compute_angular_offsets(samplecount, samples, sample_weights, max_angular_steps, offsets);
+
+
+	// the +4 offsets are to allow for vectorization within compute_lowest_and_highest_weight().
+	int8_t lowest_weight[ANGULAR_STEPS + 4];
+	int8_t highest_weight[ANGULAR_STEPS + 4];
+	float error[ANGULAR_STEPS + 4];
+
+	float cut_low_weight_error[ANGULAR_STEPS + 4];
+	float cut_high_weight_error[ANGULAR_STEPS + 4];
+
+	compute_lowest_and_highest_weight(samplecount, samples, sample_weights, max_angular_steps, offsets, lowest_weight, highest_weight, error, cut_low_weight_error, cut_high_weight_error);
+
+
+	#ifdef DEBUG_PRINT_DIAGNOSTICS
+		if (print_diagnostics)
+		{
+			printf("%s : max-angular-steps=%d \n", __func__, max_angular_steps);
+			printf("Samplecount=%d, max_quantization_level=%d\n", samplecount, search_level);
+			for (i = 0; i < samplecount; i++)
+				printf("Sample %d : %f (weight %f)\n", i, samples[i], sample_weights[i]);
+
+			for (i = 0; i < max_angular_steps; i++)
+			{
+				printf("%d: offset=%f error=%f lowest=%d highest=%d cl=%f ch=%f\n", i, offsets[i], error[i], lowest_weight[i], highest_weight[i], cut_low_weight_error[i], cut_high_weight_error[i]);
+			}
+			printf("\n");
+		}
+	#endif
+
+	// for each number of quantization steps, the best candidate found so far.
+	float best_errors[40];
+	int best_scale[40];
+	uint8_t cut_low_weight[40];
+
+	const int candidate_limit = max_quantization_steps + 4;
+	for (i = 0; i < candidate_limit; i++)
+	{
+		best_errors[i] = 1e30f;
+		best_scale[i] = -1;	// Indicates no solution found
+		cut_low_weight[i] = 0;
+	}
+
+
+
+	for (i = 0; i < max_angular_steps; i++)
+	{
+		// number of weight values that this angular step spans.
+		int steps_used = highest_weight[i] - lowest_weight[i] + 1;
+		if (steps_used >= candidate_limit)
+			continue;
+		if (steps_used < 2)
+			steps_used = 2;
+
+		const float error_cut_low = error[i] + cut_low_weight_error[i];
+		const float error_cut_high = error[i] + cut_high_weight_error[i];
+		const float error_cut_low_high = error[i] + cut_low_weight_error[i] + cut_high_weight_error[i];
+
+		// candidate that keeps the full range.
+		if (error[i] < best_errors[steps_used])
+		{
+			best_errors[steps_used] = error[i];
+			best_scale[steps_used] = i;
+			cut_low_weight[steps_used] = 0;
+		}
+
+		// candidates that drop one weight value, either the lowest or the highest.
+		const int one_less = steps_used - 1;
+		if (error_cut_low < best_errors[one_less])
+		{
+			best_errors[one_less] = error_cut_low;
+			best_scale[one_less] = i;
+			cut_low_weight[one_less] = 1;
+		}
+		if (error_cut_high < best_errors[one_less])
+		{
+			best_errors[one_less] = error_cut_high;
+			best_scale[one_less] = i;
+			cut_low_weight[one_less] = 0;
+		}
+
+		// candidate that drops both the lowest and the highest weight value.
+		const int two_less = steps_used - 2;
+		if (error_cut_low_high < best_errors[two_less])
+		{
+			best_errors[two_less] = error_cut_low_high;
+			best_scale[two_less] = i;
+			cut_low_weight[two_less] = 1;
+		}
+	}
+
+	// if we got a better error-value for a low sample count than for a high one,
+	// use the low sample count error value for the higher sample count as well.
+	for (i = 3; i <= max_quantization_steps; i++)
+	{
+		if (best_errors[i - 1] < best_errors[i])
+		{
+			best_errors[i] = best_errors[i - 1];
+			best_scale[i] = best_scale[i - 1];
+			cut_low_weight[i] = cut_low_weight[i - 1];
+		}
+	}
+
+
+	// results are only reported for the levels that were actually requested.
+	static const int ql_weights[12] = { 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 33 };
+	for (i = 0; i <= max_quantization_level; i++)
+	{
+		const int q = ql_weights[i];
+		const int bsi = best_scale[q];
+
+		// Did we find anything?
+		if (bsi < 0)
+		{
+			printf("ERROR: Unable to find an encoding within the specified error limits. Please revise the error limit values and try again.\n");
+			exit(1);
+		}
+
+		const float stepsize = stepsizes[bsi];
+		const float offset = offsets[bsi];
+		const int lwi = lowest_weight[bsi] + cut_low_weight[q];
+		const int hwi = lwi + q - 1;
+
+		low_value[i] = offset + lwi * stepsize;
+		high_value[i] = offset + hwi * stepsize;
+	}
+
+}
+
+
+// helper functions that will compute ideal angular-endpoints
+// for a given set of weights and a given block size descriptors
+//
+// Single-plane variant: the angular endpoints are computed once per
+// decimation mode that passes the filters below, and are then copied to
+// every single-plane block mode whose percentile does not exceed mode_cutoff.
+
+void compute_angular_endpoints_1plane(float mode_cutoff, const block_size_descriptor * bsd,
+									  const float *decimated_quantized_weights, const float *decimated_weights,
+									  float low_value[MAX_WEIGHT_MODES], float high_value[MAX_WEIGHT_MODES])
+{
+	// results per decimation mode and quantization level.
+	float low_values[MAX_DECIMATION_MODES][12];
+	float high_values[MAX_DECIMATION_MODES][12];
+
+	for (int mode = 0; mode < MAX_DECIMATION_MODES; mode++)
+	{
+		int samplecount = bsd->decimation_mode_samples[mode];
+		int quant_mode = bsd->decimation_mode_maxprec_1plane[mode];
+		float percentile = bsd->decimation_mode_percentile[mode];
+		int permit_encode = bsd->permit_encode[mode];
+
+		const bool usable = permit_encode != 0 && samplecount >= 1 && quant_mode >= 0 && !(percentile > mode_cutoff);
+		if (!usable)
+			continue;
+
+		// each decimation mode owns one MAX_WEIGHTS_PER_BLOCK-sized slice of the inputs.
+		const int slice = mode * MAX_WEIGHTS_PER_BLOCK;
+		compute_angular_endpoints_for_quantization_levels(samplecount,
+														  decimated_quantized_weights + slice,
+														  decimated_weights + slice, quant_mode, low_values[mode], high_values[mode]);
+	}
+
+	for (int mode = 0; mode < MAX_WEIGHT_MODES; mode++)
+	{
+		if (bsd->block_modes[mode].is_dual_plane == 0 && !(bsd->block_modes[mode].percentile > mode_cutoff))
+		{
+			int quant_mode = bsd->block_modes[mode].quantization_mode;
+			int decim_mode = bsd->block_modes[mode].decimation_mode;
+
+			low_value[mode] = low_values[decim_mode][quant_mode];
+			high_value[mode] = high_values[decim_mode][quant_mode];
+		}
+	}
+
+}
+
+
+
+// Dual-plane variant: for every decimation mode that passes the filters
+// below, the angular endpoints are computed separately for the two weight
+// planes; the results are then copied to every block mode whose
+// is_dual_plane field is 1 and whose percentile does not exceed mode_cutoff.
+void compute_angular_endpoints_2planes(float mode_cutoff,
+									   const block_size_descriptor * bsd,
+									   const float *decimated_quantized_weights,
+									   const float *decimated_weights,
+									   float low_value1[MAX_WEIGHT_MODES], float high_value1[MAX_WEIGHT_MODES], float low_value2[MAX_WEIGHT_MODES], float high_value2[MAX_WEIGHT_MODES])
+{
+	// results per decimation mode and quantization level, one set per plane.
+	float low_values1[MAX_DECIMATION_MODES][12];
+	float high_values1[MAX_DECIMATION_MODES][12];
+	float low_values2[MAX_DECIMATION_MODES][12];
+	float high_values2[MAX_DECIMATION_MODES][12];
+
+	for (int mode = 0; mode < MAX_DECIMATION_MODES; mode++)
+	{
+		int samplecount = bsd->decimation_mode_samples[mode];
+		int quant_mode = bsd->decimation_mode_maxprec_2planes[mode];
+		float percentile = bsd->decimation_mode_percentile[mode];
+		int permit_encode = bsd->permit_encode[mode];
+
+		const bool usable = permit_encode != 0 && samplecount >= 1 && quant_mode >= 0 && !(percentile > mode_cutoff);
+		if (!usable)
+			continue;
+
+		// each decimation mode owns two consecutive MAX_WEIGHTS_PER_BLOCK-sized
+		// slices of the inputs: the first for plane 1, the second for plane 2.
+		const int plane1_slice = 2 * mode * MAX_WEIGHTS_PER_BLOCK;
+		const int plane2_slice = (2 * mode + 1) * MAX_WEIGHTS_PER_BLOCK;
+
+		compute_angular_endpoints_for_quantization_levels(samplecount,
+														  decimated_quantized_weights + plane1_slice,
+														  decimated_weights + plane1_slice, quant_mode, low_values1[mode], high_values1[mode]);
+
+		compute_angular_endpoints_for_quantization_levels(samplecount,
+														  decimated_quantized_weights + plane2_slice,
+														  decimated_weights + plane2_slice, quant_mode, low_values2[mode], high_values2[mode]);
+	}
+
+	for (int mode = 0; mode < MAX_WEIGHT_MODES; mode++)
+	{
+		if (bsd->block_modes[mode].is_dual_plane == 1 && !(bsd->block_modes[mode].percentile > mode_cutoff))
+		{
+			int quant_mode = bsd->block_modes[mode].quantization_mode;
+			int decim_mode = bsd->block_modes[mode].decimation_mode;
+
+			low_value1[mode] = low_values1[decim_mode][quant_mode];
+			high_value1[mode] = high_values1[decim_mode][quant_mode];
+			low_value2[mode] = low_values2[decim_mode][quant_mode];
+			high_value2[mode] = high_values2[decim_mode][quant_mode];
+		}
+	}
+}
--- a/3rdparty/astc/astc_weight_quant_xfer_tables.cpp
+++ b/3rdparty/astc/astc_weight_quant_xfer_tables.cpp
--- a/3rdparty/astc/license.txt
+++ b/3rdparty/astc/license.txt
@@ -0,0 +1,137 @@
+END USER LICENCE AGREEMENT FOR THE MALI ASTC SPECIFICATION AND SOFTWARE CODEC,
+VERSION: 1.3
+
+THIS END USER LICENCE AGREEMENT ("LICENCE") IS A LEGAL AGREEMENT BETWEEN YOU
+(EITHER A SINGLE INDIVIDUAL, OR SINGLE LEGAL ENTITY) AND ARM LIMITED ("ARM")
+FOR THE USE OF THE SOFTWARE ACCOMPANYING THIS LICENCE. ARM IS ONLY WILLING
+TO LICENSE THE SOFTWARE TO YOU ON CONDITION THAT YOU ACCEPT ALL OF THE TERMS
+IN THIS LICENCE. BY CLICKING "I AGREE" OR BY INSTALLING OR OTHERWISE USING
+OR COPYING THE SOFTWARE YOU INDICATE THAT YOU AGREE TO BE BOUND BY ALL THE
+TERMS OF THIS LICENCE.
+
+IF YOU DO NOT AGREE TO THE TERMS OF THIS LICENCE, ARM IS UNWILLING TO LICENSE
+THE SOFTWARE TO YOU AND YOU MAY NOT INSTALL, USE OR COPY THE SOFTWARE.
+
+1.  DEFINITIONS.
+
+"Authorised Purpose" means the use of the Software solely to develop products
+and tools which implement the Khronos ASTC specification to;
+(i) compress texture images into ASTC format ("Compression Results"); 
+(ii) distribute such Compression Results to third parties; and 
+(iii) decompress texture images stored in ASTC format.
+
+"Software" means the source code and Software binaries accompanying this
+Licence, and any printed, electronic or online documentation supplied with it,
+in all cases relating to the MALI ASTC SPECIFICATION AND SOFTWARE CODEC.
+
+2. LICENCE GRANT.
+
+ARM hereby grants to you, subject to the terms and conditions of this Licence,
+a nonexclusive, nontransferable, free of charge, royalty free, worldwide
+licence to use, copy, modify and (subject to Clause 3 below) distribute the
+Software solely for the Authorised Purpose.
+
+No right is granted to use the Software to develop hardware.
+
+Notwithstanding the foregoing, nothing in this Licence prevents you from
+using the Software to develop products that conform to an application
+programming interface specification issued by The Khronos Group Inc.
+("Khronos"), provided that you have licences to develop such products
+under the relevant Khronos agreements.
+
+ 3. RESTRICTIONS ON USE OF THE SOFTWARE.
+
+RESTRICTIONS ON TRANSFER OF LICENSED RIGHTS: The rights granted to you under
+this Licence may not be assigned by you to any third party without the prior
+written consent of ARM.
+
+TITLE AND RESERVATION OF RIGHTS: You acquire no rights to the Software other
+than as expressly provided by this Licence. The Software is licensed not sold.
+ARM does not transfer title to the Software to you. In no event shall the
+licences granted in Clause 2 be construed as granting you expressly or by
+implication, estoppel or otherwise, licences to any ARM technology other than
+the Software.
+
+NOTICES: You shall not remove from the Software any copyright notice or other
+notice (whether ARM's or its licensor's), and you shall ensure that any such
+notice is reproduced in any copies of the whole or any part of the Software
+made by you.  You shall not use ARM's or its licensor's name, logo or
+trademarks to market Compression Results. If you distribute the Software to a
+third party, you agree to include a copy of this Licence with such
+distribution.
+
+4. NO SUPPORT.
+
+ARM has no obligation to support or to continue providing or updating any of
+the Software.
+
+5. NO WARRANTIES.
+
+YOU AGREE THAT THE SOFTWARE IS LICENSED "AS IS", AND THAT ARM EXPRESSLY
+DISCLAIMS ALL REPRESENTATIONS, WARRANTIES, CONDITIONS OR OTHER TERMS, EXPRESS,
+IMPLIED OR STATUTORY, TO THE FULLEST EXTENT PERMITTED BY LAW. YOU EXPRESSLY
+ASSUME ALL LIABILITIES AND RISKS, FOR USE OR OPERATION OF ANY APPLICATION
+PROGRAMS YOU CREATE WITH THE SOFTWARE, AND YOU ASSUME THE ENTIRE COST OF ALL
+NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+6. LIMITATION OF LIABILITY.
+
+TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL ARM BE
+LIABLE FOR ANY INDIRECT, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES
+(INCLUDING LOSS OF PROFITS) ARISING OUT OF THE USE OR INABILITY TO USE THE
+SOFTWARE WHETHER BASED ON A CLAIM UNDER CONTRACT, TORT OR OTHER LEGAL THEORY,
+EVEN IF ARM WAS ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+
+ARM does not seek to limit or exclude liability for death or personal injury
+arising from ARM's negligence and because some jurisdictions do not permit the
+exclusion or limitation of liability for consequential or incidental damages
+the above limitation relating to liability for consequential damages may not
+apply to you.
+
+NOTWITHSTANDING ANYTHING TO THE CONTRARY CONTAINED IN THIS LICENCE, THE
+MAXIMUM LIABILITY OF ARM TO YOU IN AGGREGATE FOR ALL CLAIMS MADE AGAINST ARM
+IN CONTRACT TORT OR OTHERWISE UNDER OR IN CONNECTION WITH THE SUBJECT MATTER
+OF THIS LICENCE SHALL NOT EXCEED THE GREATER OF THE TOTAL OF SUMS PAID BY YOU
+TO ARM (IF ANY) FOR THIS LICENCE AND US$5.00.
+
+7. U.S. GOVERNMENT END USERS.
+
+US Government Restrictions: Use, duplication, reproduction, release,
+modification, disclosure or transfer of this commercial product and
+accompanying documentation is restricted in accordance with the terms
+of this Licence.
+
+8. TERM AND TERMINATION.
+
+This Licence shall remain in force until terminated by you or by ARM. Without
+prejudice to any of its other rights if you are in breach of any of the terms
+and conditions of this Licence then ARM may terminate this Licence immediately
+upon giving written notice to you. You may terminate this Licence at any time.
+
+Upon termination of this Licence by you or by ARM you shall stop using the
+Software and destroy all copies of the Software in your possession together
+with all documentation and related materials. The provisions of Clauses 1, 3,
+4, 5, 6, 7, 8 and 9  shall survive termination of this Licence.
+
+9. GENERAL.
+
+This Licence is governed by English Law. Except where ARM agrees otherwise in
+a written contract signed by you and ARM, this is the only agreement between
+you and ARM relating to the Software and it may only be modified by written
+agreement between you and ARM. Except as expressly agreed in writing, this
+Licence may not be modified by purchase orders, advertising or other
+representation by any person. If any clause in this Licence is held by a court
+of law to be illegal or unenforceable the remaining provisions of this Licence
+shall not be affected thereby. The failure by ARM to enforce any of the
+provisions of this Licence, unless waived in writing, shall not constitute a
+waiver of ARM's rights to enforce such provision or any other provision of
+this Licence in the future.
+
+You agree to comply fully with all laws and regulations of the United States
+and other countries ("Export Laws") to assure that the Software is not;
+(1) exported, directly or indirectly, in violation of Export Laws, either to
+any countries that are subject to U.S.A. export restrictions or to any end
+user who has been prohibited from participating in the U.S.A. export
+transactions by any federal agency of the U.S.A. government; or 
+(2) intended to be used for any purpose prohibited by Export Laws, including,
+without limitation, nuclear, chemical, or biological weapons proliferation.
--- a/3rdparty/astc/mathlib.cpp
+++ b/3rdparty/astc/mathlib.cpp
@@ -0,0 +1,772 @@
+/*----------------------------------------------------------------------------*/  
+/**
+ *	This confidential and proprietary software may be used only as
+ *	authorised by a licensing agreement from ARM Limited
+ *	(C) COPYRIGHT 2011-2012 ARM Limited
+ *	ALL RIGHTS RESERVED
+ *
+ *	The entire notice above must be reproduced on all authorised
+ *	copies and copies may only be made to the extent permitted
+ *	by a licensing agreement from ARM Limited.
+ *
+ *	@brief	Library of math functions.
+ */ 
+/*----------------------------------------------------------------------------*/ 
+
+#include <time.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "mathlib.h"
+
+/**************************
+  basic OpenCL functions
+**************************/
+
+// Reciprocal square root: returns 1 / sqrt(p).
+float inversesqrt(float p)
+{
+	const float reciprocal = 1.0f / sqrt(p);
+	return reciprocal;
+}
+// Arc cosine of p divided by pi.
+// The scaling factor (1.0f / M_PI) is formed before the multiplication and
+// the product is narrowed to float only at the very end.
+float acospi(float p)
+{
+	return static_cast < float >(acos(p) * (1.0f / M_PI));
+}
+// Sine of (p * pi); the angle is formed with M_PI before sin() is applied.
+float sinpi(float p)
+{
+	const double angle = p * M_PI;
+	return static_cast < float >(sin(angle));
+}
+// Cosine of (p * pi); the angle is formed with M_PI before cos() is applied.
+float cospi(float p)
+{
+	const double angle = p * M_PI;
+	return static_cast < float >(cos(angle));
+}
+
+
+// Returns the float whose bit pattern is p OR-ed with 0x7FC00000.
+// The integer is reinterpreted through a union, not converted numerically.
+// (0x7FC00000 is a quiet-NaN pattern assuming float is IEEE-754 binary32.)
+float nan(int p)
+{
+	union
+	{
+		int bits;
+		float value;
+	} pun;
+	pun.bits = p | 0x7FC00000U;
+	return pun.value;
+}
+
+#if (!_MSC_VER) && (__cplusplus < 201103L)
+// Larger of two floats. An argument that is NaN (x != x) is skipped in favour
+// of the other one; when p is not strictly greater than q, q is returned.
+float fmax(float p, float q)
+{
+	if (p != p)
+		return q;
+	if (q != q)
+		return p;
+	return (p > q) ? p : q;
+}
+
+// Smaller of two floats. An argument that is NaN (x != x) is skipped in favour
+// of the other one; when p is not strictly less than q, q is returned.
+float fmin(float p, float q)
+{
+	if (p != p)
+		return q;
+	if (q != q)
+		return p;
+	return (p < q) ? p : q;
+}
+#endif  // C++11
+
+// Component-wise maximum of two 2-component vectors.
+float2 fmax(float2 p, float2 q)
+{
+	float max_x = fmax(p.x, q.x);
+	float max_y = fmax(p.y, q.y);
+	return float2(max_x, max_y);
+}
+
+// Component-wise maximum of two 3-component vectors.
+float3 fmax(float3 p, float3 q)
+{
+	float max_x = fmax(p.x, q.x);
+	float max_y = fmax(p.y, q.y);
+	float max_z = fmax(p.z, q.z);
+	return float3(max_x, max_y, max_z);
+}
+
+// Component-wise maximum of two 4-component vectors.
+float4 fmax(float4 p, float4 q)
+{
+	float max_x = fmax(p.x, q.x);
+	float max_y = fmax(p.y, q.y);
+	float max_z = fmax(p.z, q.z);
+	float max_w = fmax(p.w, q.w);
+	return float4(max_x, max_y, max_z, max_w);
+}
+
+
+// Component-wise minimum of two 2-component vectors.
+float2 fmin(float2 p, float2 q)
+{
+	float min_x = fmin(p.x, q.x);
+	float min_y = fmin(p.y, q.y);
+	return float2(min_x, min_y);
+}
+
+// Component-wise minimum of two 3-component vectors.
+float3 fmin(float3 p, float3 q)
+{
+	float min_x = fmin(p.x, q.x);
+	float min_y = fmin(p.y, q.y);
+	float min_z = fmin(p.z, q.z);
+	return float3(min_x, min_y, min_z);
+}
+
+// Component-wise minimum of two 4-component vectors.
+float4 fmin(float4 p, float4 q)
+{
+	float min_x = fmin(p.x, q.x);
+	float min_y = fmin(p.y, q.y);
+	float min_z = fmin(p.z, q.z);
+	float min_w = fmin(p.w, q.w);
+	return float4(min_x, min_y, min_z, min_w);
+}
+
+/* 
+   float dot( float2 p, float2 q ) { return p.x*q.x + p.y*q.y; } float dot( float3 p, float3 q ) { return p.x*q.x + p.y*q.y + p.z*q.z; } float dot( float4 p, float4 q ) { return p.x*q.x + p.y*q.y +
+   p.z*q.z + p.w*q.w; } */
+
+// Cross product of two 3-component vectors, expressed with rotated swizzles.
+float3 cross(float3 p, float3 q)
+{
+	float3 positive_terms = p.yzx * q.zxy;
+	float3 negative_terms = p.zxy * q.yzx;
+	return positive_terms - negative_terms;
+}
+
+// Cross product of the first three components of p and q; the fourth
+// component of the result is set to 0.
+float4 cross(float4 p, float4 q)
+{
+	float3 positive_terms = p.yzx * q.zxy;
+	float3 negative_terms = p.zxy * q.yzx;
+	return float4(positive_terms - negative_terms, 0.0f);
+}
+
+// Euclidean length of a 2-component vector.
+float length(float2 p)
+{
+	const float squared = dot(p, p);
+	return sqrt(squared);
+}
+
+// Euclidean length of a 3-component vector.
+float length(float3 p)
+{
+	const float squared = dot(p, p);
+	return sqrt(squared);
+}
+
+// Euclidean length of a 4-component vector.
+float length(float4 p)
+{
+	const float squared = dot(p, p);
+	return sqrt(squared);
+}
+
+// Squared length of a 2-component vector (dot product with itself).
+float length_sqr(float2 p)
+{
+	const float squared = dot(p, p);
+	return squared;
+}
+
+// Squared length of a 3-component vector (dot product with itself).
+float length_sqr(float3 p)
+{
+	const float squared = dot(p, p);
+	return squared;
+}
+
+// Squared length of a 4-component vector (dot product with itself).
+float length_sqr(float4 p)
+{
+	const float squared = dot(p, p);
+	return squared;
+}
+
+
+// Distance between two 2D points: length of (q - p).
+float distance(float2 p, float2 q)
+{
+	float2 delta = q - p;
+	return length(delta);
+}
+
+// Distance between two 3D points: length of (q - p).
+float distance(float3 p, float3 q)
+{
+	float3 delta = q - p;
+	return length(delta);
+}
+
+// Distance between two 4D points: length of (q - p).
+float distance(float4 p, float4 q)
+{
+	float4 delta = q - p;
+	return length(delta);
+}
+
+// Squared distance between two 2D points.
+float distance_sqr(float2 p, float2 q)
+{
+	float2 delta = q - p;
+	return length_sqr(delta);
+}
+
+// Squared distance between two 3D points.
+float distance_sqr(float3 p, float3 q)
+{
+	float3 delta = q - p;
+	return length_sqr(delta);
+}
+
+// Squared distance between two 4D points.
+float distance_sqr(float4 p, float4 q)
+{
+	float4 delta = q - p;
+	return length_sqr(delta);
+}
+
+
+// Divides p by its own length. No check is made for a zero-length input.
+float2 normalize(float2 p)
+{
+	const float len = length(p);
+	return p / len;
+}
+
+// Divides p by its own length. No check is made for a zero-length input.
+float3 normalize(float3 p)
+{
+	const float len = length(p);
+	return p / len;
+}
+
+// Divides p by its own length. No check is made for a zero-length input.
+float4 normalize(float4 p)
+{
+	const float len = length(p);
+	return p / len;
+}
+
+
+/**************************************************
+  matrix functions, for 2x2, 3x3 and 4x4 matrices:
+
+   * trace
+   * determinant
+   * transform
+   * inverse
+   * adjugate
+   * characteristic polynomial
+   * eigenvalue
+   * eigenvector
+
+  additionally, root solver
+  for 2nd, 3rd and 4th degree monic polynomials.
+
+*************************************************/
+
+/* 
+	struct mat2 { float2 v[2]; };
+	struct mat3 { float3 v[3]; };
+	struct mat4 { float4 v[4]; };
+*/
+
+// Sum of the diagonal elements of a 2x2 matrix.
+float trace(mat2 p)
+{
+	float diag_sum = p.v[0].x;
+	diag_sum += p.v[1].y;
+	return diag_sum;
+}
+
+// Sum of the diagonal elements of a 3x3 matrix.
+float trace(mat3 p)
+{
+	float diag_sum = p.v[0].x;
+	diag_sum += p.v[1].y;
+	diag_sum += p.v[2].z;
+	return diag_sum;
+}
+
+// Sum of the diagonal elements of a 4x4 matrix.
+float trace(mat4 p)
+{
+	float diag_sum = p.v[0].x;
+	diag_sum += p.v[1].y;
+	diag_sum += p.v[2].z;
+	diag_sum += p.v[3].w;
+	return diag_sum;
+}
+
+// 2x2 determinant: product of the main diagonal minus product of the
+// anti-diagonal, both obtained from a single swizzled multiply.
+float determinant(mat2 p)
+{
+	float2 diag_products = p.v[0].xy * p.v[1].yx;
+	return diag_products.x - diag_products.y;
+}
+
+// 3x3 determinant as the scalar triple product of the three rows.
+float determinant(mat3 p)
+{
+	float3 rows12 = cross(p.v[1], p.v[2]);
+	return dot(p.v[0], rows12);
+}
+
+// 4x4 determinant by expansion along row 0: each term pairs an element of
+// row 0 with a signed 3x3 determinant built from rows 1..3 with one column
+// left out.
+float determinant(mat4 p)
+{
+	float cof0 = dot(p.v[1].yzw, cross(p.v[2].yzw, p.v[3].yzw));
+	float cof1 = -dot(p.v[1].xzw, cross(p.v[2].xzw, p.v[3].xzw));
+	float cof2 = dot(p.v[1].xyw, cross(p.v[2].xyw, p.v[3].xyw));
+	float cof3 = -dot(p.v[1].xyz, cross(p.v[2].xyz, p.v[3].xyz));
+	return dot(p.v[0], float4(cof0, cof1, cof2, cof3));
+}
+
+
+/* 
+   characteristic polynomials for matrices. These polynomials are monic, meaning that the coefficient of the highest component is 1; this component is omitted. The first component is the constant
+   part. */
+
+// Characteristic polynomial of a 2x2 matrix: (constant, linear) coefficients.
+float2 characteristic_poly(mat2 p)
+{
+	float constant_term = determinant(p);
+	float linear_term = -trace(p);
+	return float2(constant_term, linear_term);
+}
+
+
+// Characteristic polynomial of a 3x3 matrix: (constant, linear, quadratic)
+// coefficients. The linear coefficient is the sum of the 2x2 principal minors.
+float3 characteristic_poly(mat3 p)
+{
+	float2 pair01 = p.v[0].xy * p.v[1].yx;
+	float2 pair02 = p.v[0].xz * p.v[2].zx;
+	float2 pair12 = p.v[1].yz * p.v[2].zy;
+	float2 v1 = pair01 + pair02 + pair12;
+
+	return float3(-determinant(p), v1.x - v1.y, -trace(p));
+}
+
+
+// Characteristic polynomial of a 4x4 matrix: (constant, linear, quadratic,
+// cubic) coefficients. The quadratic coefficient sums the 2x2 principal minors
+// and the linear coefficient is minus the sum of the 3x3 principal minors.
+float4 characteristic_poly(mat4 p)
+{
+	float2 pair01 = p.v[0].xy * p.v[1].yx;
+	float2 pair02 = p.v[0].xz * p.v[2].zx;
+	float2 pair03 = p.v[0].xw * p.v[3].wx;
+	float2 pair12 = p.v[1].yz * p.v[2].zy;
+	float2 pair13 = p.v[1].yw * p.v[3].wy;
+	float2 pair23 = p.v[2].zw * p.v[3].wz;
+	float2 v1 = pair01 + pair02 + pair03 + pair12 + pair13 + pair23;
+
+	float linear_term = -dot(p.v[1].yzw, cross(p.v[2].yzw, p.v[3].yzw))
+		- dot(p.v[0].xzw, cross(p.v[2].xzw, p.v[3].xzw))
+		- dot(p.v[0].xyw, cross(p.v[1].xyw, p.v[3].xyw))
+		- dot(p.v[0].xyz, cross(p.v[1].xyz, p.v[2].xyz));
+
+	return float4(determinant(p), linear_term, v1.x - v1.y, -trace(p));
+}
+
+
+/* 
+	Root finders for monic polynomials (highest coefficient is equal to 1)
+
+	Returns a vector with length equal to the number of roots that the polynomial has;
+	for roots that do not genuinely exist, we return NaN.
+
+	The polynomial is basically
+
+	poly(n) = p.x + p.y*n + p.z*n^2 + p.w*n^3
+
+	(including only the components of the vector that actually exist; the next coefficient
+	has the value 1, and the remaining ones have value 0. )
+ */
+
+
+// Roots of the monic quadratic n^2 + p.y*n + p.x.
+// Both roots come from the usual formula; a negative discriminant makes the
+// square root, and therefore both results, NaN.
+float2 solve_monic(float2 p)
+{
+	float disc_root = sqrt(p.y * p.y - 4 * p.x);
+	float2 signed_roots = float2(disc_root, -disc_root);
+	return (p.yy + signed_roots) * -0.5f;
+}
+
+// Roots of the monic cubic n^3 + p.z*n^2 + p.y*n + p.x.
+// When only one real root is found it is returned in the first component and
+// the other two components are NaN; otherwise all three components hold roots.
+// A polynomial whose three roots coincide yields that root in every component.
+float3 solve_monic(float3 p)
+{
+
+	p = p * (1.0f / 3.0f);
+
+	float pz = p.z;
+
+	// compute a normalization value to scale the vector by.
+	// The normalization factor is divided by 2^20.
+	// This is supposed to make internal calculations unlikely
+	// to overflow while also making underflows unlikely.
+	float cx = static_cast < float >(cbrt(fabs(p.x)));
+	float cy = static_cast < float >(cbrt(fabs(p.y)));
+	float scal = fmax(fmax(fabsf(p.z), cx), cy * cy) * (1.0f / 1048576.0f);
+
+	// scal is zero only when the coefficients are zero (or vanish after the
+	// 2^-20 scaling). Dividing by it would turn every coefficient into NaN,
+	// so report the triple root at 0 directly.
+	if (scal == 0.0f)
+		return float3(0.0f, 0.0f, 0.0f);
+
+	float rscal = 1.0f / scal;
+	p = p * float3(rscal * rscal * rscal, rscal * rscal, rscal);
+
+	float bb = p.z * p.z;		// div scal^2
+
+	float nq = bb - p.y;		// div scal^2
+	float r = 1.5f * (p.y * p.z - p.x) - p.z * bb;	// div scal^3
+	float nq3 = nq * nq * nq;	// div scal^6
+	float r2 = r * r;			// div scal^6
+
+	if (nq3 < r2)
+	{
+		// one root
+		float root = sqrt(r2 - nq3);	// div scal^3
+		float s = static_cast < float >(cbrt(r + root));	// div scal
+		float t = static_cast < float >(cbrt(r - root));	// div scal
+		return float3((s + t) * scal - pz, nan(0), nan(0));
+	}
+	else if (nq3 == 0.0f)
+	{
+		// nq3 >= r2 >= 0 with nq3 == 0 forces r == 0 as well, so the depressed
+		// cubic is t^3 = 0: a triple root at -pz. The general path below would
+		// evaluate inversesqrt(0) * 0 and return NaN.
+		return float3(-pz, -pz, -pz);
+	}
+	else
+	{
+		// three roots
+		float phi_r = inversesqrt(nq3);	// div scal ^ -3
+		float phi_root = static_cast < float >(cbrt(phi_r * nq3));	// div scal
+		float theta = acospi(r * phi_r);
+		theta *= 1.0f / 3.0f;
+		float ncprod = phi_root * cospi(theta);
+		float dev = 1.73205080756887729353f * phi_root * sinpi(theta);
+		return float3(2 * ncprod, -dev - ncprod, dev - ncprod) * scal - pz;
+	}
+}
+
+
+/*
+ * Roots of the monic quartic n^4 + p.w*n^3 + p.z*n^2 + p.y*n + p.x.
+ *
+ * The quartic is depressed, a cubic is solved for an auxiliary parameter, and
+ * the four roots are then read off two quadratics.
+ *
+ * This function is not overflow-safe. Use with care.
+ */
+float4 solve_monic(float4 p)
+{
+
+	// step 1: depress the input polynomial
+	// bias is the shift that is subtracted from every root at the end.
+	float bias = p.w * 0.25f;
+	float3 qv = float3((-3.0f / 256.0f) * p.w * p.w, (1.0f / 8.0f) * p.w, (-3.0 / 8.0f));
+	float3 rv = float3((1.0f / 16.0f) * p.z * p.w - (1.0f / 4.0f) * p.y, (-1.0f / 2.0f) * p.z, 0.0f);
+	float3 qx = float3(qv * p.w + rv) * p.w + p.xyz;
+
+	// step 2: solve a cubic equation to get hold of a parameter p.
+	float3 monicp = float3(-qx.y * qx.y, (qx.z * qx.z) - (4.0f * qx.x), 2.0f * qx.z);
+	// the three cubic roots are padded with a tiny positive value so that the
+	// maximum taken below is always at least 1e-37.
+	float4 v = float4(solve_monic(monicp), 1e-37f);
+
+	// the cubic equation may have multiple solutions; at least one of them
+	// is numerically at least nonnegative (but may have become negative as a result of
+	// a roundoff error). We use fmax() to extract this value or a very small positive value.
+	float2 v2 = fmax(v.xy, v.zw);
+	float p2 = fmax(v2.x, v2.y);	// p^2
+	float pr = inversesqrt(p2);	// 1/p
+	float pm = p2 * pr;			// p
+
+	// step 3: use the solution for the cubic equation to set up two quadratic equations;
+	// these two equations then result in the 4 possible roots.
+	float f1 = qx.z + p2;
+	float f2 = qx.y * pr;
+	float s = 0.5f * (f1 + f2);
+	float q = 0.5f * (f1 - f2);
+
+	// first quadratic: n^2 + pm*n + q; second quadratic: n^2 - pm*n + s.
+	float4 res = float4(solve_monic(float2(q, pm)),
+						solve_monic(float2(s, -pm)));
+
+	// finally, order the results and apply the bias.
+	// res.x != res.x is a NaN test: if the first quadratic produced NaN, the
+	// second quadratic's pair of roots is moved to the front.
+	if (res.x != res.x)
+		return res.zwxy - bias;
+	else
+		return res - bias;
+}
+
+
+
+// Matrix-vector product: component i is the dot product of row i with q.
+float2 transform(mat2 p, float2 q)
+{
+	float out0 = dot(p.v[0], q);
+	float out1 = dot(p.v[1], q);
+	return float2(out0, out1);
+}
+
+
+// Matrix-vector product: component i is the dot product of row i with q.
+float3 transform(mat3 p, float3 q)
+{
+	float out0 = dot(p.v[0], q);
+	float out1 = dot(p.v[1], q);
+	float out2 = dot(p.v[2], q);
+	return float3(out0, out1, out2);
+}
+
+
+// Matrix-vector product: component i is the dot product of row i with q.
+float4 transform(mat4 p, float4 q)
+{
+	float out0 = dot(p.v[0], q);
+	float out1 = dot(p.v[1], q);
+	float out2 = dot(p.v[2], q);
+	float out3 = dot(p.v[3], q);
+	return float4(out0, out1, out2, out3);
+}
+
+
+
+// Adjugate of a 2x2 matrix: the diagonal elements are swapped and the
+// off-diagonal elements are negated.
+mat2 adjugate(mat2 p)
+{
+	const float a = p.v[0].x;
+	const float b = p.v[0].y;
+	const float c = p.v[1].x;
+	const float d = p.v[1].y;
+
+	mat2 adj;
+	adj.v[0] = float2(d, -b);
+	adj.v[1] = float2(-c, a);
+	return adj;
+}
+
+
+
+// Inverse of a 2x2 matrix: the adjugate scaled by 1/determinant.
+// No check is made for a zero determinant.
+mat2 invert(mat2 p)
+{
+	const float inv_det = 1.0f / determinant(p);
+
+	float2 adj_row0 = float2(p.v[1].y, -p.v[0].y);
+	float2 adj_row1 = float2(-p.v[1].x, p.v[0].x);
+
+	mat2 inverse;
+	inverse.v[0] = adj_row0 * inv_det;
+	inverse.v[1] = adj_row1 * inv_det;
+	return inverse;
+}
+
+
+
+// Adjugate of a 3x3 matrix. The cross products of row pairs form the columns
+// of the result, so they are written out transposed.
+mat3 adjugate(mat3 p)
+{
+	float3 col0 = cross(p.v[1], p.v[2]);
+	float3 col1 = cross(p.v[2], p.v[0]);
+	float3 col2 = cross(p.v[0], p.v[1]);
+
+	mat3 adj;
+	adj.v[0] = float3(col0.x, col1.x, col2.x);
+	adj.v[1] = float3(col0.y, col1.y, col2.y);
+	adj.v[2] = float3(col0.z, col1.z, col2.z);
+	return adj;
+}
+
+
+
+// Inverse of a 3x3 matrix: row-pair cross products scaled by 1/determinant
+// and written out transposed. No check is made for a zero determinant.
+mat3 invert(mat3 p)
+{
+	float3 rows12 = cross(p.v[1], p.v[2]);
+
+	// the determinant reuses the first cross product
+	const float inv_det = 1.0f / dot(rows12, p.v[0]);
+
+	float3 col0 = rows12 * inv_det;
+	float3 col1 = cross(p.v[2], p.v[0]) * inv_det;
+	float3 col2 = cross(p.v[0], p.v[1]) * inv_det;
+
+	mat3 inverse;
+	inverse.v[0] = float3(col0.x, col1.x, col2.x);
+	inverse.v[1] = float3(col0.y, col1.y, col2.y);
+	inverse.v[2] = float3(col0.z, col1.z, col2.z);
+	return inverse;
+}
+
+
+
+// Adjugate of a 4x4 matrix. Each output row is built from two 3-component
+// cross products (bottom rows 2,3 and top rows 0,1) with one column left out,
+// dotted against the remaining rows with alternating signs.
+mat4 adjugate(mat4 p)
+{
+	// cross products of the bottom two rows, one per omitted column
+	float3 bottom0 = cross(p.v[2].yzw, p.v[3].yzw);
+	float3 bottom1 = cross(p.v[2].xzw, p.v[3].xzw);
+	float3 bottom2 = cross(p.v[2].xyw, p.v[3].xyw);
+	float3 bottom3 = cross(p.v[2].xyz, p.v[3].xyz);
+
+	// cross products of the top two rows, one per omitted column
+	float3 top0 = cross(p.v[0].yzw, p.v[1].yzw);
+	float3 top1 = cross(p.v[0].xzw, p.v[1].xzw);
+	float3 top2 = cross(p.v[0].xyw, p.v[1].xyw);
+	float3 top3 = cross(p.v[0].xyz, p.v[1].xyz);
+
+	mat4 adj;
+	adj.v[0] = float4(dot(bottom0, p.v[1].yzw), -dot(bottom0, p.v[0].yzw), dot(top0, p.v[3].yzw), -dot(top0, p.v[2].yzw));
+	adj.v[1] = float4(-dot(bottom1, p.v[1].xzw), dot(bottom1, p.v[0].xzw), -dot(top1, p.v[3].xzw), dot(top1, p.v[2].xzw));
+	adj.v[2] = float4(dot(bottom2, p.v[1].xyw), -dot(bottom2, p.v[0].xyw), dot(top2, p.v[3].xyw), -dot(top2, p.v[2].xyw));
+	adj.v[3] = float4(-dot(bottom3, p.v[1].xyz), dot(bottom3, p.v[0].xyz), -dot(top3, p.v[3].xyz), dot(top3, p.v[2].xyz));
+	return adj;
+}
+
+
+
+// Inverse of a 4x4 matrix: same construction as the 4x4 adjugate, with every
+// row scaled by 1/determinant. No check is made for a zero determinant.
+mat4 invert(mat4 p)
+{
+	// cross products of the bottom two rows, one per omitted column
+	float3 bottom0 = cross(p.v[2].yzw, p.v[3].yzw);
+	float3 bottom1 = cross(p.v[2].xzw, p.v[3].xzw);
+	float3 bottom2 = cross(p.v[2].xyw, p.v[3].xyw);
+	float3 bottom3 = cross(p.v[2].xyz, p.v[3].xyz);
+
+	// signed 3x3 determinants against row 1; these are also the first
+	// element of every output row.
+	float4 cofactors = float4(dot(bottom0, p.v[1].yzw),
+							  -dot(bottom1, p.v[1].xzw),
+							  dot(bottom2, p.v[1].xyw),
+							  -dot(bottom3, p.v[1].xyz));
+
+	const float det = dot(p.v[0], cofactors);
+	const float inv_det = 1.0f / det;
+
+	// cross products of the top two rows, one per omitted column
+	float3 top0 = cross(p.v[0].yzw, p.v[1].yzw);
+	float3 top1 = cross(p.v[0].xzw, p.v[1].xzw);
+	float3 top2 = cross(p.v[0].xyw, p.v[1].xyw);
+	float3 top3 = cross(p.v[0].xyz, p.v[1].xyz);
+
+	mat4 inverse;
+	inverse.v[0] = float4(cofactors.x, -dot(bottom0, p.v[0].yzw), dot(top0, p.v[3].yzw), -dot(top0, p.v[2].yzw)) * inv_det;
+	inverse.v[1] = float4(cofactors.y, dot(bottom1, p.v[0].xzw), -dot(top1, p.v[3].xzw), dot(top1, p.v[2].xzw)) * inv_det;
+	inverse.v[2] = float4(cofactors.z, -dot(bottom2, p.v[0].xyw), dot(top2, p.v[3].xyw), -dot(top2, p.v[2].xyw)) * inv_det;
+	inverse.v[3] = float4(cofactors.w, dot(bottom3, p.v[0].xyz), -dot(top3, p.v[3].xyz), dot(top3, p.v[2].xyz)) * inv_det;
+	return inverse;
+}
+
+
+
+// Eigenvalues of a 2x2 matrix: roots of its characteristic polynomial.
+float2 eigenvalues(mat2 p)
+{
+	float2 poly = characteristic_poly(p);
+	return solve_monic(poly);
+}
+
+// Eigenvalues of a 3x3 matrix: roots of its characteristic polynomial.
+float3 eigenvalues(mat3 p)
+{
+	float3 poly = characteristic_poly(p);
+	return solve_monic(poly);
+}
+
+// Eigenvalues of a 4x4 matrix: roots of its characteristic polynomial.
+float4 eigenvalues(mat4 p)
+{
+	float4 poly = characteristic_poly(p);
+	return solve_monic(poly);
+}
+
+// Eigenvector of a 2x2 matrix for the eigenvalue eigvl.
+// The diagonal is replaced by (eigvl - diagonal); the row with the larger
+// sum of absolute values is then returned with its two components swapped.
+float2 eigenvector(mat2 p, float eigvl)
+{
+	float4 packed = float4(p.v[0], p.v[1]);
+	packed.xw = eigvl - packed.xw;
+
+	// row 0 wins only when its sum is strictly larger
+	if (!(fabs(packed.x) + fabs(packed.y) > fabs(packed.z) + fabs(packed.w)))
+		return packed.wz;
+	return packed.yx;
+}
+
+
+// Eigenvector of a 3x3 matrix for the eigenvalue eigvl.
+// The eigenvalue is subtracted from the diagonal, two cross products of
+// adjacent rows are formed, and the longer of the two is returned.
+float3 eigenvector(mat3 p, float eigvl)
+{
+	float3 row0 = p.v[0];
+	float3 row1 = p.v[1];
+	float3 row2 = p.v[2];
+
+	row0.x -= eigvl;
+	row1.y -= eigvl;
+	row2.z -= eigvl;
+
+	float3 cand01 = cross(row0, row1);
+	float3 cand12 = cross(row1, row2);
+
+	// compare squared lengths; the second candidate wins ties
+	if (dot(cand01, cand01) > dot(cand12, cand12))
+		return cand01;
+	return cand12;
+}
+
+
+// generalized cross product: 3 vectors with 4 components each.
+// The result is a vector that is perpendicular to all the three specified vectors.
+
+// it works in the sense that it produces a perpendicular-to-everything vector,
+// but it has not been tested whether it points in the "right" direction.
+// Each component is a signed scalar triple product of p, q and r with one of
+// the four coordinates left out; the signs alternate +, -, +, -.
+float4 gcross(float4 p, float4 q, float4 r)
+{
+	float comp0 = dot(p.yzw, cross(q.yzw, r.yzw));
+	float comp1 = -dot(p.xzw, cross(q.xzw, r.xzw));
+	float comp2 = dot(p.xyw, cross(q.xyw, r.xyw));
+	float comp3 = -dot(p.xyz, cross(q.xyz, r.xyz));
+	return float4(comp0, comp1, comp2, comp3);
+}
+
+
+
+// Eigenvector of a 4x4 matrix for the eigenvalue eigvl.
+// After subtracting eigvl from the diagonal, four candidate vectors are built
+// with the generalized cross product (each leaves out one of the four rows)
+// and the one with the greatest squared length is returned.
+float4 eigenvector(mat4 p, float eigvl)
+{
+	float4 row0 = p.v[0];
+	float4 row1 = p.v[1];
+	float4 row2 = p.v[2];
+	float4 row3 = p.v[3];
+
+	row0.x -= eigvl;
+	row1.y -= eigvl;
+	row2.z -= eigvl;
+	row3.w -= eigvl;
+
+	// cross products of rows 0 and 1, one per omitted column
+	float3 top0 = cross(row0.yzw, row1.yzw);
+	float3 top1 = cross(row0.xzw, row1.xzw);
+	float3 top2 = cross(row0.xyw, row1.xyw);
+	float3 top3 = cross(row0.xyz, row1.xyz);
+
+	// cross products of rows 2 and 3, one per omitted column
+	float3 bottom0 = cross(row2.yzw, row3.yzw);
+	float3 bottom1 = cross(row2.xzw, row3.xzw);
+	float3 bottom2 = cross(row2.xyw, row3.xyw);
+	float3 bottom3 = cross(row2.xyz, row3.xyz);
+
+	// candidates from rows {0,1,2} and {0,1,3}
+	float4 cand1 = float4(dot(row2.yzw, top0), -dot(row2.xzw, top1), dot(row2.xyw, top2), -dot(row2.xyz, top3));
+	float4 cand2 = float4(dot(row3.yzw, top0), -dot(row3.xzw, top1), dot(row3.xyw, top2), -dot(row3.xyz, top3));
+
+	// candidates from rows {0,2,3} and {1,2,3}
+	float4 cand3 = float4(dot(row0.yzw, bottom0), -dot(row0.xzw, bottom1), dot(row0.xyw, bottom2), -dot(row0.xyz, bottom3));
+	float4 cand4 = float4(dot(row1.yzw, bottom0), -dot(row1.xzw, bottom1), dot(row1.xyw, bottom2), -dot(row1.xyz, bottom3));
+
+	float sq1 = dot(cand1, cand1);
+	float sq2 = dot(cand2, cand2);
+	float sq3 = dot(cand3, cand3);
+	float sq4 = dot(cand4, cand4);
+
+	if (fmax(sq1, sq2) > fmax(sq3, sq4))
+	{
+		if (sq1 > sq2)
+			return cand1;
+		return cand2;
+	}
+
+	if (sq3 > sq4)
+		return cand3;
+	return cand4;
+}
+
+
+// matrix multiply
+
+// 2x2 matrix product: row i of the result is the combination of b's rows
+// weighted by the components of a's row i.
+mat2 operator *(mat2 a, mat2 b)
+{
+	mat2 product;
+	for (int row = 0; row < 2; row++)
+		product.v[row] = a.v[row].x * b.v[0] + a.v[row].y * b.v[1];
+	return product;
+}
+
+// 3x3 matrix product: row i of the result is the combination of b's rows
+// weighted by the components of a's row i.
+mat3 operator *(mat3 a, mat3 b)
+{
+	mat3 product;
+	for (int row = 0; row < 3; row++)
+		product.v[row] = a.v[row].x * b.v[0] + a.v[row].y * b.v[1] + a.v[row].z * b.v[2];
+	return product;
+}
+
+// 4x4 matrix product: row i of the result is the combination of b's rows
+// weighted by the components of a's row i.
+mat4 operator *(mat4 a, mat4 b)
+{
+	mat4 product;
+	for (int row = 0; row < 4; row++)
+		product.v[row] = a.v[row].x * b.v[0] + a.v[row].y * b.v[1] + a.v[row].z * b.v[2] + a.v[row].w * b.v[3];
+	return product;
+}
+
+
+
+/*************************
+
+simple geometric functions
+
+*************************/
+
+
+// return parameter value for the point on the line closest to the specified point
+// Projects (point - line.a) onto line.b and divides by the squared length of
+// line.b. No check is made for a zero-length line.b.
+float param_nearest_on_line(float2 point, line2 line)
+{
+	float2 offset = point - line.a;
+	return dot(offset, line.b) / dot(line.b, line.b);
+}
+
+// 3D variant: same projection of (point - line.a) onto line.b.
+float param_nearest_on_line(float3 point, line3 line)
+{
+	float3 offset = point - line.a;
+	return dot(offset, line.b) / dot(line.b, line.b);
+}
+
+// 4D variant: same projection of (point - line.a) onto line.b.
+float param_nearest_on_line(float4 point, line4 line)
+{
+	float4 offset = point - line.a;
+	return dot(offset, line.b) / dot(line.b, line.b);
+}
+
+
+// return distance between point and line
+// Distance from point to the nearest position on the 2D line.
+float point_line_distance(float2 point, line2 line)
+{
+	const float param = param_nearest_on_line(point, line);
+	float2 nearest = line.a + line.b * param;
+	return distance(point, nearest);
+}
+
+// Distance from point to the nearest position on the 3D line.
+float point_line_distance(float3 point, line3 line)
+{
+	const float param = param_nearest_on_line(point, line);
+	float3 nearest = line.a + line.b * param;
+	return distance(point, nearest);
+}
+
+// Distance from point to the nearest position on the 4D line.
+float point_line_distance(float4 point, line4 line)
+{
+	const float param = param_nearest_on_line(point, line);
+	float4 nearest = line.a + line.b * param;
+	return distance(point, nearest);
+}
+
+
+// Squared distance from point to the nearest position on the 2D line.
+float point_line_distance_sqr(float2 point, line2 line)
+{
+	const float param = param_nearest_on_line(point, line);
+	float2 nearest = line.a + line.b * param;
+	return distance_sqr(point, nearest);
+}
+
+// Squared distance from point to the nearest position on the 3D line.
+float point_line_distance_sqr(float3 point, line3 line)
+{
+	const float param = param_nearest_on_line(point, line);
+	float3 nearest = line.a + line.b * param;
+	return distance_sqr(point, nearest);
+}
+
+// Squared distance from point to the nearest position on the 4D line.
+float point_line_distance_sqr(float4 point, line4 line)
+{
+	const float param = param_nearest_on_line(point, line);
+	float4 nearest = line.a + line.b * param;
+	return distance_sqr(point, nearest);
+}
+
+
+
+// distance between plane/hyperplane in 3D and 4D
+// Signed offset of point from the plane: (point - root_point) dotted with the
+// plane normal.
+float point_plane_3d_distance(float3 point, plane_3d plane)
+{
+	float3 offset = point - plane.root_point;
+	return dot(offset, plane.normal);
+}
+
+
+// Signed offset of point from the hyperplane: (point - root_point) dotted
+// with the hyperplane normal.
+float point_hyperplane_4d_distance(float4 point, hyperplane_4d plane)
+{
+	float4 offset = point - plane.root_point;
+	return dot(offset, plane.normal);
+}
+
+
+// helper functions to produce a 3D plane from three points and a 4D hyperplane from four points.
+// Plane through three points: rooted at point0, with the normalized cross
+// product of the two edges leaving point0 as its normal.
+plane_3d generate_plane_from_points(float3 point0, float3 point1, float3 point2)
+{
+	float3 edge1 = point1 - point0;
+	float3 edge2 = point2 - point0;
+
+	plane_3d plane;
+	plane.root_point = point0;
+	plane.normal = normalize(cross(edge1, edge2));
+	return plane;
+}
+
+// Hyperplane through four points: rooted at point0, with the normalized
+// generalized cross product of the three edges leaving point0 as its normal.
+hyperplane_4d generate_hyperplane_from_points(float4 point0, float4 point1, float4 point2, float4 point3)
+{
+	float4 edge1 = point1 - point0;
+	float4 edge2 = point2 - point0;
+	float4 edge3 = point3 - point0;
+
+	hyperplane_4d plane;
+	plane.root_point = point0;
+	plane.normal = normalize(gcross(edge1, edge2, edge3));
+	return plane;
+}
+
+
--- a/3rdparty/astc/mathlib.h
+++ b/3rdparty/astc/mathlib.h
@@ -0,0 +1,200 @@
+/*----------------------------------------------------------------------------*/
+/**
+ *	This confidential and proprietary software may be used only as
+ *	authorised by a licensing agreement from ARM Limited
+ *	(C) COPYRIGHT 2011-2012, 2018 ARM Limited
+ *	ALL RIGHTS RESERVED
+ *
+ *	The entire notice above must be reproduced on all authorised
+ *	copies and copies may only be made to the extent permitted
+ *	by a licensing agreement from ARM Limited.
+ *
+ *	@brief	Internal math library declarations for ASTC codec.
+ */
+/*----------------------------------------------------------------------------*/
+
+#ifndef MATHLIB_H_INCLUDED
+
+#define MATHLIB_H_INCLUDED
+
+#include "vectypes.h"
+
+// basic OpenCL functions
+// inversesqrt: 1 / sqrt(p).
+// acospi / sinpi / cospi: acos(p) / pi, sin(p * pi), cos(p * pi).
+float inversesqrt(float p);
+float acospi(float p);
+float sinpi(float p);
+float cospi(float p);
+
+// float whose bit pattern is (p | 0x7FC00000).
+float nan(int p);
+
+// scalar fmax/fmin that skip a NaN argument in favour of the other one.
+// NOTE(review): mathlib.cpp guards the definitions with
+// (!_MSC_VER) && (__cplusplus < 201103L), which is not the same condition as
+// the one used here - confirm the mismatch is intended for MSVC builds.
+#if __cplusplus < 201103L
+float fmax(float p, float q);
+float fmin(float p, float q);
+#endif  // C++11
+
+// component-wise maximum
+float2 fmax(float2 p, float2 q);
+
+float3 fmax(float3 p, float3 q);
+
+float4 fmax(float4 p, float4 q);
+// component-wise minimum
+float2 fmin(float2 p, float2 q);
+float3 fmin(float3 p, float3 q);
+float4 fmin(float4 p, float4 q);
+
+/*
+	float dot( float2 p, float2 q );
+	float dot( float3 p, float3 q );
+	float dot( float4 p, float4 q );
+*/
+
+// Dot product of two 2-component vectors.
+static inline float dot(float2 p, float2 q)
+{
+	float sum = p.x * q.x;
+	sum += p.y * q.y;
+	return sum;
+}
+// Dot product of two 3-component vectors.
+static inline float dot(float3 p, float3 q)
+{
+	float sum = p.x * q.x;
+	sum += p.y * q.y;
+	sum += p.z * q.z;
+	return sum;
+}
+// Dot product of two 4-component vectors.
+static inline float dot(float4 p, float4 q)
+{
+	float sum = p.x * q.x;
+	sum += p.y * q.y;
+	sum += p.z * q.z;
+	sum += p.w * q.w;
+	return sum;
+}
+
+
+// cross product; the float4 overload crosses the first three components and
+// sets the fourth component of the result to 0.
+float3 cross(float3 p, float3 q);
+float4 cross(float4 p, float4 q);
+
+// Euclidean length: sqrt(dot(p, p)).
+float length(float2 p);
+float length(float3 p);
+float length(float4 p);
+
+// squared length: dot(p, p).
+float length_sqr(float2 p);
+float length_sqr(float3 p);
+float length_sqr(float4 p);
+
+// length of (q - p).
+float distance(float2 p, float2 q);
+float distance(float3 p, float3 q);
+float distance(float4 p, float4 q);
+
+// squared length of (q - p).
+float distance_sqr(float2 p, float2 q);
+float distance_sqr(float3 p, float3 q);
+float distance_sqr(float4 p, float4 q);
+
+// p divided by its length; a zero-length input is not checked for.
+float2 normalize(float2 p);
+float3 normalize(float3 p);
+float4 normalize(float4 p);
+
+
+
+// functions other than just basic OpenCL functions
+
+// generalized cross product of three 4-component vectors.
+float4 gcross(float4 p, float4 q, float4 r);
+
+// Square matrices stored as an array of vectors; the functions in mathlib.cpp
+// treat each v[i] as row i (transform() dots every v[i] with the input vector).
+
+// 2x2 matrix.
+struct mat2
+{
+	float2 v[2];
+};
+// 3x3 matrix.
+struct mat3
+{
+	float3 v[3];
+};
+// 4x4 matrix.
+struct mat4
+{
+	float4 v[4];
+};
+
+// sum of the diagonal elements.
+float trace(mat2 p);
+float trace(mat3 p);
+float trace(mat4 p);
+
+float determinant(mat2 p);
+float determinant(mat3 p);
+float determinant(mat4 p);
+
+// coefficients of the monic characteristic polynomial, constant term first;
+// the leading coefficient 1 is omitted.
+float2 characteristic_poly(mat2 p);
+float3 characteristic_poly(mat3 p);
+float4 characteristic_poly(mat4 p);
+
+// roots of a monic polynomial whose coefficients are given constant term
+// first; roots that do not exist are returned as NaN.
+float2 solve_monic(float2 p);
+float3 solve_monic(float3 p);
+float4 solve_monic(float4 p);
+
+// matrix-vector product: component i is dot(p.v[i], q).
+float2 transform(mat2 p, float2 q);
+float3 transform(mat3 p, float3 q);
+float4 transform(mat4 p, float4 q);
+
+mat2 adjugate(mat2 p);
+mat3 adjugate(mat3 p);
+mat4 adjugate(mat4 p);
+
+// inverse; the determinant is not checked for zero.
+mat2 invert(mat2 p);
+mat3 invert(mat3 p);
+mat4 invert(mat4 p);
+
+// solve_monic() applied to characteristic_poly().
+float2 eigenvalues(mat2 p);
+float3 eigenvalues(mat3 p);
+float4 eigenvalues(mat4 p);
+
+// eigenvector belonging to the eigenvalue eigvl; the result is not normalized.
+float2 eigenvector(mat2 p, float eigvl);
+float3 eigenvector(mat3 p, float eigvl);
+float4 eigenvector(mat4 p, float eigvl);
+
+// matrix product.
+mat2 operator *(mat2 a, mat2 b);
+mat3 operator *(mat3 a, mat3 b);
+mat4 operator *(mat4 a, mat4 b);
+
+
+
+// parametric line, 2D: The line is given by line = a + b*t.
+struct line2
+{
+	float2 a;					// point on the line (t = 0)
+	float2 b;					// direction; not required to be normalized
+};
+
+// parametric line, 3D
+struct line3
+{
+	float3 a;					// point on the line (t = 0)
+	float3 b;					// direction; not required to be normalized
+};
+
+// parametric line, 4D
+struct line4
+{
+	float4 a;					// point on the line (t = 0)
+	float4 b;					// direction; not required to be normalized
+};
+
+// plane/hyperplane defined by a point and a normal vector
+struct plane_3d
+{
+	float3 root_point;
+	float3 normal;				// normalized
+};
+
+struct hyperplane_4d
+{
+	float4 root_point;
+	float4 normal;				// normalized
+};
+
+// parameter t of the position on the line (a + b*t) closest to the point.
+float param_nearest_on_line(float2 point, line2 line);
+float param_nearest_on_line(float3 point, line3 line);
+float param_nearest_on_line(float4 point, line4 line);
+
+// distance between the point and the line.
+float point_line_distance(float2 point, line2 line);
+float point_line_distance(float3 point, line3 line);
+float point_line_distance(float4 point, line4 line);
+
+// squared distance between the point and the line.
+float point_line_distance_sqr(float2 point, line2 line);
+float point_line_distance_sqr(float3 point, line3 line);
+float point_line_distance_sqr(float4 point, line4 line);
+
+// dot((point - root_point), normal); the sign tells the side of the plane.
+float point_plane_3d_distance(float3 point, plane_3d plane);
+float point_hyperplane_4d_distance(float4 point, hyperplane_4d plane);
+
+// plane/hyperplane rooted at point0 whose normal is the normalized (generalized)
+// cross product of the edges leaving point0.
+plane_3d generate_plane_from_points(float3 point0, float3 point1, float3 point2);
+hyperplane_4d generate_hyperplane_from_points(float4 point0, float4 point1, float4 point2, float4 point3);
+
+
+#endif
--- a/3rdparty/astc/readme.txt
+++ b/3rdparty/astc/readme.txt
@@ -0,0 +1 @@
+Library version of astc-encoder, from https://github.com/andrewwillmott/astc-encoder.
--- a/3rdparty/astc/softfloat.cpp
+++ b/3rdparty/astc/softfloat.cpp
@@ -0,0 +1,398 @@
+/*----------------------------------------------------------------------------*/
+/**
+ *	This confidential and proprietary software may be used only as
+ *	authorised by a licensing agreement from ARM Limited
+ *	(C) COPYRIGHT 2011-2012 ARM Limited
+ *	ALL RIGHTS RESERVED
+ *
+ *	The entire notice above must be reproduced on all authorised
+ *	copies and copies may only be made to the extent permitted
+ *	by a licensing agreement from ARM Limited.
+ *
+ *	@brief	Soft IEEE-754 floating point library.
+ */
+/*----------------------------------------------------------------------------*/
+
+#include "softfloat.h"
+
+#define SOFTFLOAT_INLINE
+
+/******************************************
+  helper functions and their lookup tables
+ ******************************************/
+/* count leading zeros functions. Only used when the input is nonzero. */
+
+#if defined(__GNUC__) && (defined(__i386) || defined(__amd64))
+#elif defined(__arm__) && defined(__ARMCC_VERSION)
+#elif defined(__arm__) && defined(__GNUC__)
+#else
+	/* clz_table[b] is the number of leading zero bits of the 8-bit value b (8 for b == 0). Only compiled for targets where clz32 takes its table-driven fallback path. */
+	static const uint8_t clz_table[256] =
+	{
+		8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+		3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+		2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+		2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+	};
+#endif
+
+
+/*
+   32-bit count-leading-zeros: uses a CPU instruction on x86/ARM builds, otherwise a byte-table fallback. Intended for nonzero input only; for inp == 0 the x86 path yields 31 while the table fallback yields 32. */
+SOFTFLOAT_INLINE uint32_t clz32(uint32_t inp)
+{
+	#if defined(__GNUC__) && (defined(__i386) || defined(__amd64))
+		uint32_t bsr;
+	__asm__("bsrl %1, %0": "=r"(bsr):"r"(inp | 1));	/* '| 1' keeps the operand nonzero; it only changes the result for inp == 0 */
+		return 31 - bsr;			/* bsr is the index of the top set bit */
+	#else
+		#if defined(__arm__) && defined(__ARMCC_VERSION)
+			return __clz(inp);			/* armcc builtin */
+		#else
+			#if defined(__arm__) && defined(__GNUC__)
+				uint32_t lz;
+			__asm__("clz %0, %1": "=r"(lz):"r"(inp));
+				return lz;
+			#else
+				/* slow default version: narrow the value down to its top nonzero byte, then finish with clz_table */
+				uint32_t summa = 24;	/* zero bits contributed by the three bytes above the one looked up */
+				if (inp >= UINT32_C(0x10000))
+				{
+					inp >>= 16;
+					summa -= 16;
+				}
+				if (inp >= UINT32_C(0x100))
+				{
+					inp >>= 8;
+					summa -= 8;
+				}
+				return summa + clz_table[inp];	/* inp is below 0x100 here */
+			#endif
+		#endif
+	#endif
+}
+
+static SOFTFLOAT_INLINE uint32_t rtne_shift32(uint32_t inp, uint32_t shamt)
+{
+	/* right shift by 'shamt' with round-to-nearest; exact ties go to the even result. */
+	const uint32_t unit = UINT32_C(1) << shamt;	/* weight of the lowest bit that survives the shift */
+	/* 1 when the surviving low bit is clear (truncated result is even), else 0. '| 1' makes shamt == 0 count as odd. */
+	const uint32_t is_even = (((inp | UINT32_C(1)) & unit) - 1) >> 31;
+	/* add 0.5 ULP, minus one epsilon in the even case so that an exact tie falls back to the even value. */
+	const uint32_t biased = inp + (unit >> 1) - is_even;
+	return biased >> shamt;
+}
+
+static SOFTFLOAT_INLINE uint32_t rtna_shift32(uint32_t inp, uint32_t shamt)
+{
+	/* right shift by 'shamt' with round-to-nearest, exact ties rounded upward: */
+	/* add half the weight of the lowest surviving bit (zero when shamt == 0), then shift. */
+	const uint32_t half_unit = (UINT32_C(1) << shamt) >> 1;
+	return (inp + half_unit) >> shamt;
+}
+
+
+static SOFTFLOAT_INLINE uint32_t rtup_shift32(uint32_t inp, uint32_t shamt)
+{
+	/* right shift by 'shamt' that rounds upward: any nonzero bit among the */
+	/* discarded low bits bumps the result by one. The sum wraps modulo 2^32, */
+	/* exactly as the add-then-decrement sequence it replaces. */
+	const uint32_t all_low_bits = (UINT32_C(1) << shamt) - 1;
+	return (inp + all_low_bits) >> shamt;
+}
+
+
+
+
+/* convert from FP16 to FP32: takes a raw FP16 bit pattern and returns the raw FP32 bit pattern (no rounding-mode parameter). */
+sf32 sf16_to_sf32(sf16 inp)
+{
+	uint32_t inpx = inp;
+
+	/*
+		This table contains, for every FP16 sign/exponent value combination,
+		the difference between the input FP16 value and the value obtained
+		by shifting the correct FP32 result right by 13 bits.
+		This table allows us to handle every case except denormals and NaN
+		with just 1 table lookup, 2 shifts and 1 add.
+	*/
+
+	#define WITH_MB(a) INT32_C((a) | (1 << 31))	/* entry with the MSB set: marks sign/exponent combinations that need the special-case checks below */
+	static const int32_t tbl[64] =
+	{
+		WITH_MB(0x00000), INT32_C(0x1C000), INT32_C(0x1C000), INT32_C(0x1C000), INT32_C(0x1C000), INT32_C(0x1C000), INT32_C(0x1C000), INT32_C(0x1C000),
+		INT32_C(0x1C000), INT32_C(0x1C000), INT32_C(0x1C000), INT32_C(0x1C000), INT32_C(0x1C000), INT32_C(0x1C000), INT32_C(0x1C000), INT32_C(0x1C000),
+		INT32_C(0x1C000), INT32_C(0x1C000), INT32_C(0x1C000), INT32_C(0x1C000), INT32_C(0x1C000), INT32_C(0x1C000), INT32_C(0x1C000), INT32_C(0x1C000),
+		INT32_C(0x1C000), INT32_C(0x1C000), INT32_C(0x1C000), INT32_C(0x1C000), INT32_C(0x1C000), INT32_C(0x1C000), INT32_C(0x1C000), WITH_MB(0x38000),
+		WITH_MB(0x38000), INT32_C(0x54000), INT32_C(0x54000), INT32_C(0x54000), INT32_C(0x54000), INT32_C(0x54000), INT32_C(0x54000), INT32_C(0x54000),
+		INT32_C(0x54000), INT32_C(0x54000), INT32_C(0x54000), INT32_C(0x54000), INT32_C(0x54000), INT32_C(0x54000), INT32_C(0x54000), INT32_C(0x54000),
+		INT32_C(0x54000), INT32_C(0x54000), INT32_C(0x54000), INT32_C(0x54000), INT32_C(0x54000), INT32_C(0x54000), INT32_C(0x54000), INT32_C(0x54000),
+		INT32_C(0x54000), INT32_C(0x54000), INT32_C(0x54000), INT32_C(0x54000), INT32_C(0x54000), INT32_C(0x54000), INT32_C(0x54000), WITH_MB(0x70000)
+	};
+
+	int32_t res = tbl[inpx >> 10];	/* index = top 6 bits of the input: sign bit plus 5-bit exponent */
+	res += inpx;
+
+	/* the normal cases: the MSB of 'res' is not set. */
+	if (res >= 0)				/* signed compare */
+		return res << 13;
+
+	/* Infinity and Zero: the bottom 10 bits of 'res' are clear. */
+	if ((res & UINT32_C(0x3FF)) == 0)
+		return res << 13;		/* the marker MSB is shifted out of the 32-bit result */
+
+	/* NaN: the exponent field of 'inp' is not zero; NaNs must be quietened. */
+	if ((inpx & 0x7C00) != 0)
+		return (res << 13) | UINT32_C(0x400000);	/* 0x400000 is the top mantissa bit of the FP32 result */
+
+	/* the remaining cases are Denormals. */
+	{
+		uint32_t sign = (inpx & UINT32_C(0x8000)) << 16;	/* FP16 sign bit moved to bit 31 */
+		uint32_t mskval = inpx & UINT32_C(0x7FFF);			/* nonzero here: zero was handled above */
+		uint32_t leadingzeroes = clz32(mskval);
+		mskval <<= leadingzeroes;							/* top set bit is now bit 31 */
+		return (mskval >> 8) + ((0x85 - leadingzeroes) << 23) + sign;	/* after '>> 8' the top set bit sits on bit 23 and adds one to the exponent field */
+	}
+}
+
+/* Converts a raw FP32 bit pattern to a raw FP16 bit pattern under rounding mode 'rmode'. It supports denormals and all rounding modes. If a NaN is given as input, it is quietened. */
+
+sf16 sf32_to_sf16(sf32 inp, roundmode rmode)
+{
+	/* for each possible sign/exponent combination, store a case index. This gives a 512-byte table */
+	static const uint8_t tab[512] = {
+		0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+		10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+		10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+		10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+		10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+		10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+		10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+		20, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+		30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 40,
+		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 50,
+
+		5, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+		15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+		15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+		15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+		15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+		15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+		15, 15, 15, 15, 15, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+		25, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
+		35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 45,
+		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 55,
+	};
+
+	/* many of the cases below use a case-dependent magic constant. So we look up a magic constant before actually performing the switch. This table allows us to group cases, thereby minimizing code
+	   size. */
+	static const uint32_t tabx[60] = {
+		UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x80000000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
+		UINT32_C(1), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8001), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
+		UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
+		UINT32_C(0xC8001FFF), UINT32_C(0xC8000000), UINT32_C(0xC8000000), UINT32_C(0xC8000FFF), UINT32_C(0xC8001000),	/* 30..34: positive normal; base 0xC8000000 = -0x38000000 (exponent rebias 112 << 23) */
+		UINT32_C(0x58000000), UINT32_C(0x58001FFF), UINT32_C(0x58000000), UINT32_C(0x58000FFF), UINT32_C(0x58001000),	/* 35..39: negative normal; base 0x58000000 = 0xC8000000 - 0x80000000 + 0x10000000 (sign moved from bit 31 to bit 15 << 13). Entry 36 (SF_DOWN) rounds away from zero, hence + 0x1FFF. */
+		UINT32_C(0x7C00), UINT32_C(0x7BFF), UINT32_C(0x7BFF), UINT32_C(0x7C00), UINT32_C(0x7C00),
+		UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFC00),
+		UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000),
+		UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000)
+	};
+
+	uint32_t p;
+	uint32_t idx = rmode + tab[inp >> 23];	/* inp >> 23 = sign bit plus 8-bit exponent (0..511); rmode (0..4) selects the entry within the group of five */
+	uint32_t vlx = tabx[idx];
+	switch (idx)
+	{
+		/*
+		  	Number of either sign whose exponent field is all ones, i.e. Infinity or NaN.
+			We need to check whether it is NaN; if it is, quieten it by setting the top bit of the mantissa.
+			(If we don't do this quieting, then a NaN  that is distinguished only by having
+			its low-order bits set, would be turned into an INF. */
+	case 50:
+	case 51:
+	case 52:
+	case 53:
+	case 54:
+	case 55:
+	case 56:
+	case 57:
+	case 58:
+	case 59:
+		/*
+			the input value is 0x7F800000 or 0xFF800000 if it is INF.
+			By subtracting 1, we get 7F7FFFFF or FF7FFFFF, that is, bit 23 becomes zero.
+			For NaNs, however, this operation will keep bit 23 with the value 1.
+			We can then extract bit 23, and logical-OR bit 9 of the result with this
+			bit in order to quieten the NaN (a Quiet NaN is a NaN where the top bit
+			of the mantissa is set.)
+		*/
+		p = (inp - 1) & UINT32_C(0x800000);	/* zero if INF, nonzero if NaN. */
+		return ((inp + vlx) >> 13) | (p >> 14);
+		/*
+			positive, exponent = 0, round-mode == UP; need to check whether number actually is 0.
+			If it is, then return 0, else return 1 (the smallest representable nonzero number)
+		*/
+	case 0:
+		/*
+			-inp will set the MSB if the input number is nonzero.
+			Thus (-inp) >> 31 will turn into 0 if the input number is 0 and 1 otherwise.
+		*/
+		return (uint32_t) (-(int32_t) inp) >> 31;
+
+		/*
+			negative, exponent = 0, round-mode == DOWN, need to check whether number is
+			actually 0. If it is, return 0x8000 ( float -0.0 )
+			Else return the smallest negative number ( 0x8001 ) */
+	case 6:
+		/*
+			in this case 'vlx' is 0x80000000. By subtracting the input value from it,
+			we obtain a value that is 0 if the input value is in fact zero and has
+			the MSB set if it isn't. We then right-shift the value by 31 places to
+			get a value that is 0 if the input is -0.0 and 1 otherwise.
+		*/
+		return ((vlx - inp) >> 31) + UINT32_C(0x8000);
+
+		/*
+			for all other cases involving underflow/overflow, we don't need to
+			do actual tests; we just return 'vlx'.
+		*/
+	case 1:
+	case 2:
+	case 3:
+	case 4:
+	case 5:
+	case 7:
+	case 8:
+	case 9:
+	case 10:
+	case 11:
+	case 12:
+	case 13:
+	case 14:
+	case 15:
+	case 16:
+	case 17:
+	case 18:
+	case 19:
+	case 40:
+	case 41:
+	case 42:
+	case 43:
+	case 44:
+	case 45:
+	case 46:
+	case 47:
+	case 48:
+	case 49:
+		return vlx;
+
+		/*
+			for normal numbers, 'vlx' is the difference between the FP32 value of a number and the
+			FP16 representation of the same number left-shifted by 13 places. In addition, a rounding constant is
+			baked into 'vlx': for rounding-away-from zero, the constant is 2^13 - 1, causing roundoff away
+			from zero. for round-to-nearest away, the constant is 2^12, causing roundoff away from zero.
+			for round-to-nearest-even, the constant is 2^12 - 1. This causes correct round-to-nearest-even
+			except for odd input numbers. For odd input numbers, we need to add 1 to the constant. */
+
+		/* normal number, all rounding modes except round-to-nearest-even: */
+	case 30:
+	case 31:
+	case 32:
+	case 34:
+	case 35:
+	case 36:
+	case 37:
+	case 39:
+		return (inp + vlx) >> 13;
+
+		/* normal number, round-to-nearest-even. */
+	case 33:
+	case 38:
+		p = inp + vlx;
+		p += (inp >> 13) & 1;
+		return p >> 13;
+
+		/*
+			the various denormal cases. These are not expected to be common, so their performance is a bit
+			less important. For each of these cases, we need to extract an exponent and a mantissa
+			(including the implicit '1'!), and then right-shift the mantissa by a shift-amount that
+			depends on the exponent. The shift must apply the correct rounding mode. 'vlx' is used to supply the
+			sign of the resulting denormal number.
+		*/
+	case 21:
+	case 22:
+	case 25:
+	case 27:
+		/* denormal, round towards zero. */
+		p = 126 - ((inp >> 23) & 0xFF);
+		return (((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000)) >> p) | vlx;
+	case 20:
+	case 26:
+		/* denormal, round away from zero. */
+		p = 126 - ((inp >> 23) & 0xFF);
+		return rtup_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx;
+	case 24:
+	case 29:
+		/* denormal, round to nearest-away */
+		p = 126 - ((inp >> 23) & 0xFF);
+		return rtna_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx;
+	case 23:
+	case 28:
+		/* denormal, round to nearest-even. */
+		p = 126 - ((inp >> 23) & 0xFF);
+		return rtne_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx;
+	}
+
+	return 0;
+}
+
+
+
+typedef union if32_	/* overlays 32-bit integers and a native float on the same storage, so a float's bit pattern can be read or written as an integer */
+{
+	uint32_t u;			/* unsigned view of the bits */
+	int32_t s;			/* signed view of the bits */
+	float f;			/* native float view */
+} if32;
+
+/* widen a raw FP16 bit pattern and return it as a native float */
+
+float sf16_to_float(sf16 p)
+{
+	if32 pun;
+	pun.u = sf16_to_sf32(p);	/* store the FP32 bit pattern ... */
+	return pun.f;				/* ... and read the same bits back as a float */
+}
+
+/* narrow a native float to a raw FP16 bit pattern, rounding according to 'rm' */
+
+sf16 float_to_sf16(float p, roundmode rm)
+{
+	if32 pun;
+	pun.f = p;					/* store the float, then hand its bit pattern to the integer-only converter */
+	return sf32_to_sf16(pun.u, rm);
+}
--- a/3rdparty/astc/softfloat.h
+++ b/3rdparty/astc/softfloat.h
@@ -0,0 +1,95 @@
+/*----------------------------------------------------------------------------*/  
+/**
+ *	This confidential and proprietary software may be used only as
+ *	authorised by a licensing agreement from ARM Limited
+ *	(C) COPYRIGHT 2011-2012 ARM Limited
+ *	ALL RIGHTS RESERVED
+ *
+ *	The entire notice above must be reproduced on all authorised
+ *	copies and copies may only be made to the extent permitted
+ *	by a licensing agreement from ARM Limited.
+ *
+ *	@brief	Soft IEEE-754 floating point library.
+ */ 
+/*----------------------------------------------------------------------------*/ 
+
+#ifndef SOFTFLOAT_H_INCLUDED
+
+#define SOFTFLOAT_H_INCLUDED
+
+#if defined __cplusplus
+extern "C"
+{
+#endif
+
+#if defined __cplusplus && !defined(_MSC_VER)
+
+	/* if compiling as C++, we need to define these macros in order to obtain all the macros in stdint.h . */
+	#define __STDC_LIMIT_MACROS
+	#define __STDC_CONSTANT_MACROS
+	#include <stdint.h>
+
+#else
+
+	typedef unsigned char uint8_t;
+	typedef signed char int8_t;
+	typedef unsigned short uint16_t;
+	typedef signed short int16_t;
+	typedef unsigned int uint32_t;
+	typedef signed int int32_t;
+
+#endif
+
+
+uint32_t clz32(uint32_t p);
+
+
+/* targets that don't have UINT32_C probably don't have the rest of C99s stdint.h */
+#ifndef UINT32_C
+
+	#define PASTE(a) a
+	#define UINT64_C(a) PASTE(a##ULL)
+	#define UINT32_C(a) PASTE(a##U)
+	#define INT64_C(a) PASTE(a##LL)
+	#define INT32_C(a) a
+	
+	#define PRIX32 "X"
+	#define PRId32 "d"
+	#define PRIu32 "u"
+	#define PRIX64 "LX"
+	#define PRId64 "Ld"
+	#define PRIu64 "Lu"
+
+#endif
+
+	/*	sized soft-float types. These are mapped to the sized integer types of C99, instead of C's
+		floating-point types; this is because the library needs to maintain exact, bit-level control on all
+		operations on these data types. */
+	typedef uint16_t sf16;
+	typedef uint32_t sf32;
+
+	/* the five rounding modes that IEEE-754r defines. sf32_to_sf16 adds the numeric value to a table-derived case index, so the values 0..4 must not be renumbered. */
+	typedef enum
+	{
+		SF_UP = 0,				/* round towards positive infinity */
+		SF_DOWN = 1,			/* round towards negative infinity */
+		SF_TOZERO = 2,			/* round towards zero */
+		SF_NEARESTEVEN = 3,		/* round toward nearest value; if mid-between, round to even value */
+		SF_NEARESTAWAY = 4		/* round toward nearest value; if mid-between, round away from zero */
+	} roundmode;
+
+	/* narrowing float->float conversions */
+	sf16 sf32_to_sf16(sf32, roundmode);
+
+	/* widening float->float conversions */
+	sf32 sf16_to_sf32(sf16);
+
+	sf16 float_to_sf16(float, roundmode);
+	float sf16_to_float(sf16);
+
+
+#if defined __cplusplus
+}
+#endif
+
+#endif
--- a/3rdparty/astc/vectypes.h
+++ b/3rdparty/astc/vectypes.h
--- a/scripts/bimg_encode.lua
+++ b/scripts/bimg_encode.lua
@@ -30,6 +30,8 @@ project "bimg_encode"
 		path.join(BIMG_DIR, "3rdparty/nvtt/**.h"),
 		path.join(BIMG_DIR, "3rdparty/pvrtc/**.cpp"),
 		path.join(BIMG_DIR, "3rdparty/pvrtc/**.h"),
+		path.join(BIMG_DIR, "3rdparty/astc/**.cpp"),
+		path.join(BIMG_DIR, "3rdparty/astc/**.h"),
 		path.join(BIMG_DIR, "3rdparty/tinyexr/**.h"),
 		path.join(BIMG_DIR, "3rdparty/iqa/include/**.h"),
 		path.join(BIMG_DIR, "3rdparty/iqa/source/**.c"),
--- a/scripts/texturec.lua
+++ b/scripts/texturec.lua
@@ -9,27 +9,9 @@ project "texturec"
 	includedirs {
 		path.join(BX_DIR,   "include"),
 		path.join(BIMG_DIR, "include"),
-		path.join(BIMG_DIR, "3rdparty"),
-		path.join(BIMG_DIR, "3rdparty/nvtt"),
-		path.join(BIMG_DIR, "3rdparty/iqa/include"),
 	}

 	files {
-		path.join(BIMG_DIR, "3rdparty/libsquish/**.cpp"),
-		path.join(BIMG_DIR, "3rdparty/libsquish/**.h"),
-		path.join(BIMG_DIR, "3rdparty/edtaa3/**.cpp"),
-		path.join(BIMG_DIR, "3rdparty/edtaa3/**.h"),
-		path.join(BIMG_DIR, "3rdparty/etc1/**.cpp"),
-		path.join(BIMG_DIR, "3rdparty/etc1/**.h"),
-		path.join(BIMG_DIR, "3rdparty/etc2/**.cpp"),
-		path.join(BIMG_DIR, "3rdparty/etc2/**.hpp"),
-		path.join(BIMG_DIR, "3rdparty/nvtt/**.cpp"),
-		path.join(BIMG_DIR, "3rdparty/nvtt/**.h"),
-		path.join(BIMG_DIR, "3rdparty/pvrtc/**.cpp"),
-		path.join(BIMG_DIR, "3rdparty/pvrtc/**.h"),
-		path.join(BIMG_DIR, "3rdparty/tinyexr/**.h"),
-		path.join(BIMG_DIR, "3rdparty/iqa/include/**.h"),
-		path.join(BIMG_DIR, "3rdparty/iqa/source/**.c"),
 		path.join(BIMG_DIR, "tools/texturec/**.cpp"),
 		path.join(BIMG_DIR, "tools/texturec/**.h"),
 	}
--- a/src/bimg_p.h
+++ b/src/bimg_p.h
@@ -19,6 +19,10 @@

 BX_ERROR_RESULT(BIMG_ERROR, BX_MAKEFOURCC('b', 'i', 'm', 'g') );

+#ifndef BIMG_CONFIG_ASTC_DECODE
+    #define BIMG_CONFIG_ASTC_DECODE 0
+#endif
+
 namespace bimg
 {
 	struct Memory
--- a/src/image.cpp
+++ b/src/image.cpp
@@ -3,9 +3,15 @@
 * License: https://github.com/bkaradzic/bimg#license-bsd-2-clause
 */

+#define BIMG_CONFIG_ASTC_DECODE 1
+
 #include "bimg_p.h"
 #include <bx/hash.h>

+#if BIMG_CONFIG_ASTC_DECODE
+    #include "../3rdparty/astc/astc_lib.h"
+#endif
+
 namespace bimg
 {
 	static const ImageBlockInfo s_imageBlockInfo[] =
@@ -4476,8 +4482,24 @@ namespace bimg
 		case TextureFormat::ASTC8x5:
 		case TextureFormat::ASTC8x6:
 		case TextureFormat::ASTC10x5:
-			BX_WARN(false, "ASTC decoder is not implemented.");
+#       if BIMG_CONFIG_ASTC_DECODE
+            astc_decompress
+            (
+                (const uint8_t*) _src,
+                s_imageBlockInfo[_srcFormat].blockWidth,
+                s_imageBlockInfo[_srcFormat].blockHeight,
+                ASTC_DECODE_LDR_LINEAR,
+
+                _width,
+                _height,
+                (uint8_t*) _dst,
+                ASTC_BGRA,
+                _dstPitch
+            );
+#       else
+            BX_WARN(false, "ASTC decoder is not implemented.");
 			imageCheckerboard(_dst, _width, _height, 16, UINT32_C(0xff000000), UINT32_C(0xffffff00) );
+#       endif
 			break;

 		case TextureFormat::RGBA8:
@@ -5179,8 +5201,9 @@ namespace bimg
 	{
 		BX_ERROR_SCOPE(_err);

-		uint32_t ddspf      = UINT32_MAX;
-		uint32_t dxgiFormat = UINT32_MAX;
+		uint32_t ddspf        = UINT32_MAX;
+		uint32_t dxgiFormat   = UINT32_MAX;
+        uint32_t fourccFormat = UINT32_MAX;

 		for (uint32_t ii = 0; ii < BX_COUNTOF(s_translateDdsPixelFormat); ++ii)
 		{
@@ -5201,14 +5224,26 @@ namespace bimg
 					break;
 				}
 			}
-
-			if (UINT32_MAX == dxgiFormat)
-			{
-				BX_ERROR_SET(_err, BIMG_ERROR, "DDS: DXGI format not supported.");
-				return 0;
-			}
 		}

+        if (UINT32_MAX == ddspf && UINT32_MAX == dxgiFormat)
+        {
+            for (uint32_t ii = 0; ii < BX_COUNTOF(s_translateDdsFourccFormat); ++ii)
+            {
+                if (s_translateDdsFourccFormat[ii].m_textureFormat == _format)
+                {
+                    fourccFormat = s_translateDdsFourccFormat[ii].m_format;
+                    break;
+                }
+            }
+        }
+
+        if (UINT32_MAX == ddspf && UINT32_MAX == dxgiFormat && UINT32_MAX == fourccFormat)
+        {
+            BX_ERROR_SET(_err, BIMG_ERROR, "DDS: output format not supported.");
+            return 0;
+        }
+
 		const uint32_t bpp = getBitsPerPixel(_format);

 		uint32_t total = 0;
@@ -5254,9 +5289,14 @@ namespace bimg
 		{
 			total += bx::write(_writer, uint32_t(8*sizeof(uint32_t) ), _err); // pixelFormatSize
 			total += bx::write(_writer, uint32_t(DDPF_FOURCC), _err);
-			total += bx::write(_writer, uint32_t(DDS_DX10), _err);
-			total += bx::write(_writer, uint32_t(0), _err); // bitCount
-			total += bx::writeRep(_writer, 0, 4*sizeof(uint32_t), _err); // bitmask
+
+            if (UINT32_MAX != fourccFormat)
+                total += bx::write(_writer, fourccFormat, _err);
+            else
+                total += bx::write(_writer, uint32_t(DDS_DX10), _err);
+
+            total += bx::write(_writer, uint32_t(0), _err); // bitCount
+            total += bx::writeRep(_writer, 0, 4*sizeof(uint32_t), _err); // bitmask
 		}

 		uint32_t caps[4] =
--- a/src/image_encode.cpp
+++ b/src/image_encode.cpp
@@ -12,6 +12,7 @@
 #include <nvtt/nvtt.h>
 #include <pvrtc/PvrTcEncoder.h>
 #include <edtaa3/edtaa3func.h>
+#include <astc/astc_lib.h>

 BX_PRAGMA_DIAGNOSTIC_PUSH();
 BX_PRAGMA_DIAGNOSTIC_IGNORED_MSVC(4100) // warning C4100: 'alloc_context': unreferenced formal parameter
@@ -35,6 +36,14 @@ namespace bimg
 	};
 	BX_STATIC_ASSERT(Quality::Count == BX_COUNTOF(s_squishQuality) );

+    static const ASTC_COMPRESS_MODE s_astcQuality[] = // ASTC compression preset per quality level; indexed by the Quality::Enum value passed to imageEncodeFromRgba8
+    {
+        ASTC_COMPRESS_MEDIUM,       // Default
+        ASTC_COMPRESS_THOROUGH,     // Highest
+        ASTC_COMPRESS_FAST,         // Fastest
+    };
+    BX_STATIC_ASSERT(Quality::Count == BX_COUNTOF(s_astcQuality)); // one entry per Quality value
+
 	void imageEncodeFromRgba8(bx::AllocatorI* _allocator, void* _dst, const void* _src, uint32_t _width, uint32_t _height, uint32_t _depth, TextureFormat::Enum _format, Quality::Enum _quality, bx::Error* _err)
 	{
 		const uint8_t* src = (const uint8_t*)_src;
@@ -122,6 +131,22 @@ namespace bimg
 				}
 				break;

+			case TextureFormat::ASTC4x4:
+			case TextureFormat::ASTC5x5:
+			case TextureFormat::ASTC6x6:
+			case TextureFormat::ASTC8x5:
+			case TextureFormat::ASTC8x6:
+			case TextureFormat::ASTC10x5:
+				{
+                    const bimg::ImageBlockInfo& astcBlockInfo = bimg::getBlockInfo(_format);
+
+                    ASTC_COMPRESS_MODE  compress_mode = s_astcQuality[_quality];
+					ASTC_DECODE_MODE    decode_mode   = ASTC_DECODE_LDR_LINEAR;
+
+                    astc_compress(_width, _height, src, ASTC_RGBA, srcPitch, astcBlockInfo.blockWidth, astcBlockInfo.blockHeight, compress_mode, decode_mode, dst);
+				}
+				break;
+
 			case TextureFormat::BGRA8:
 				imageSwizzleBgra8(dst, dstPitch, _width, _height, src, srcPitch);
 				break;
@@ -200,15 +225,21 @@ namespace bimg
 	{
 		switch (_dstFormat)
 		{
-			case bimg::TextureFormat::BC1:
-			case bimg::TextureFormat::BC2:
-			case bimg::TextureFormat::BC3:
-			case bimg::TextureFormat::BC4:
-			case bimg::TextureFormat::BC5:
-			case bimg::TextureFormat::ETC1:
-			case bimg::TextureFormat::ETC2:
-			case bimg::TextureFormat::PTC14:
-			case bimg::TextureFormat::PTC14A:
+			case TextureFormat::BC1:
+			case TextureFormat::BC2:
+			case TextureFormat::BC3:
+			case TextureFormat::BC4:
+			case TextureFormat::BC5:
+			case TextureFormat::ETC1:
+			case TextureFormat::ETC2:
+			case TextureFormat::PTC14:
+			case TextureFormat::PTC14A:
+			case TextureFormat::ASTC4x4:
+			case TextureFormat::ASTC5x5:
+			case TextureFormat::ASTC6x6:
+			case TextureFormat::ASTC8x5:
+			case TextureFormat::ASTC8x6:
+			case TextureFormat::ASTC10x5:
 				{
 					uint8_t* temp = (uint8_t*)BX_ALLOC(_allocator, _width*_height*_depth*4);
 					imageDecodeToRgba8(_allocator, temp, _src, _width, _height, _width*4, _srcFormat);
--- a/tools/texturec/texturec.cpp
+++ b/tools/texturec/texturec.cpp
@@ -156,12 +156,12 @@ bimg::ImageContainer* convert(bx::AllocatorI* _allocator, const void* _inputData
 		const bimg::ImageBlockInfo&  inputBlockInfo  = bimg::getBlockInfo(inputFormat);
 		const bimg::ImageBlockInfo&  outputBlockInfo = bimg::getBlockInfo(outputFormat);
 		const uint32_t blockWidth  = outputBlockInfo.blockWidth;
-		const uint32_t blockHeight = outputBlockInfo.blockHeight;
-		const uint32_t minBlockX   = outputBlockInfo.minBlockX;
-		const uint32_t minBlockY   = outputBlockInfo.minBlockY;
-		uint32_t outputWidth  = bx::uint32_max(blockWidth  * minBlockX, ( (input->m_width  + blockWidth  - 1) / blockWidth )*blockWidth);
-		uint32_t outputHeight = bx::uint32_max(blockHeight * minBlockY, ( (input->m_height + blockHeight - 1) / blockHeight)*blockHeight);
-		uint32_t outputDepth  = input->m_depth;
+        const uint32_t blockHeight = outputBlockInfo.blockHeight;
+        const uint32_t minBlockX   = outputBlockInfo.minBlockX;
+        const uint32_t minBlockY   = outputBlockInfo.minBlockY;
+        uint32_t outputWidth  = bx::uint32_max(blockWidth  * minBlockX, ( (input->m_width  + blockWidth  - 1) / blockWidth )*blockWidth);
+        uint32_t outputHeight = bx::uint32_max(blockHeight * minBlockY, ( (input->m_height + blockHeight - 1) / blockHeight)*blockHeight);
+        uint32_t outputDepth  = input->m_depth;

 		if (_options.equirect)
 		{
@@ -842,10 +842,11 @@ void help(const char* _error = NULL, bool _showHelp = true)
 		  "                           aspect ratio will be preserved.\n"
 		  "      --radiance <model>   Radiance cubemap filter. (Lighting model: Phong, PhongBrdf, Blinn, BlinnBrdf, GGX)\n"
 		  "      --as <extension>     Save as.\n"
+          "      --formats            List all supported formats.\n"
 		  "      --validate           *DEBUG* Validate that output image produced matches after loading.\n"

 		  "\n"
-		  "For additional information, see https://github.com/bkaradzic/bgfx\n"
+		  "For additional information, see https://github.com/bkaradzic/bimg\n"
 		);
 }

@@ -909,6 +910,24 @@ int main(int _argc, const char* _argv[])
 		return bx::kExitFailure;
 	}

+    if (cmdLine.hasArg("formats"))
+    {
+        printf("Uncompressed formats:\n");
+
+        for (int format = bimg::TextureFormat::Unknown + 1; format < bimg::TextureFormat::UnknownDepth; format++)
+            printf("  %s\n", bimg::getName((bimg::TextureFormat::Enum) format));
+
+        for (int format = bimg::TextureFormat::UnknownDepth + 1; format < bimg::TextureFormat::Count; format++)
+            printf("  %s\n", bimg::getName((bimg::TextureFormat::Enum) format));
+
+        printf("Compressed formats:\n");
+
+        for (int format = 0; format < bimg::TextureFormat::Unknown; format++)
+            printf("  %s\n", bimg::getName((bimg::TextureFormat::Enum) format));
+
+        return bx::kExitSuccess;
+    }
+
 	const char* inputFileName = cmdLine.findOption('f');
 	if (NULL == inputFileName)
 	{
				`@@ -0,0 +1 @@`
				`Library version of astc-encoder, from https://github.com/andrewwillmott/astc-encoder.`