Moved image code from bgfx into bimg library.

Branimir Karadžić
2017-04-02 19:26:02 -07:00
parent 369198a1d4
commit f475cf0623
151 changed files with 70010 additions and 22 deletions

34
3rdparty/edtaa3/LICENSE.md vendored Normal file

@@ -0,0 +1,34 @@
https://github.com/OpenGLInsights/OpenGLInsightsCode/blob/master/Chapter%2012%202D%20Shape%20Rendering%20by%20Distance%20Fields/LICENSE.txt
The C code and the GLSL code for the OpenGL demo is public
domain code. The distance transform code in the console
application to create distance field textures, located in
the file "edtaa3func.c", is MIT licensed, and free to use
under the following conditions.
https://github.com/OpenGLInsights/OpenGLInsightsCode/issues/6#issuecomment-67829157
----
Copyright (C) 2011 by Stefan Gustavson
(stefan.gustavson@liu.se)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
----

580
3rdparty/edtaa3/edtaa3func.cpp vendored Normal file

@@ -0,0 +1,580 @@
/*
* edtaa3()
*
* Sweep-and-update Euclidean distance transform of an
* image. Positive pixels are treated as object pixels,
* zero or negative pixels are treated as background.
* An attempt is made to treat antialiased edges correctly.
* The input image must have pixels in the range [0,1],
* and the antialiased image should be a box-filter
* sampling of the ideal, crisp edge.
* If the antialias region is more than 1 pixel wide,
* the result from this transform will be inaccurate.
*
* By Stefan Gustavson (stefan.gustavson@gmail.com).
*
* Originally written in 1994, based on a verbal
* description of Per-Erik Danielsson's SSED8 algorithm
* as presented in the PhD dissertation of Ingemar
* Ragnemalm. This is Per-Erik Danielsson's scanline
* scheme from 1979 - I only implemented it in C.
*
* Updated in 2004 to treat border pixels correctly,
* and cleaned up the code to improve readability.
*
* Updated in 2009 to handle anti-aliased edges,
* as published in the article "Anti-aliased Euclidean
* distance transform" by Stefan Gustavson and Robin Strand,
* Pattern Recognition Letters 32 (2011) 252–257.
*
* Updated in 2011 to avoid a corner case causing an
* infinite loop for some input data.
*
*/
/*
Copyright (C) 2011 by Stefan Gustavson
(stefan.gustavson@liu.se)
This code is distributed under the permissive "MIT license":
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <math.h>
/*
* Compute the local gradient at edge pixels using convolution filters.
* The gradient is computed only at edge pixels. At other places in the
* image, it is never used, and it's mostly zero anyway.
*/
void computegradient(double *img, int w, int h, double *gx, double *gy)
{
int i,j,k;
double glength;
#define SQRT2 1.4142136
for(i = 1; i < h-1; i++) { // Avoid edges where the kernels would spill over
for(j = 1; j < w-1; j++) {
k = i*w + j;
if((img[k]>0.0) && (img[k]<1.0)) { // Compute gradient for edge pixels only
gx[k] = -img[k-w-1] - SQRT2*img[k-1] - img[k+w-1] + img[k-w+1] + SQRT2*img[k+1] + img[k+w+1];
gy[k] = -img[k-w-1] - SQRT2*img[k-w] - img[k-w+1] + img[k+w-1] + SQRT2*img[k+w] + img[k+w+1];
glength = gx[k]*gx[k] + gy[k]*gy[k];
if(glength > 0.0) { // Avoid division by zero
glength = sqrt(glength);
gx[k]=gx[k]/glength;
gy[k]=gy[k]/glength;
}
}
}
}
// TODO: Compute reasonable values for gx, gy also around the image edges.
// (These are zero now, which reduces the accuracy for a 1-pixel wide region
// around the image edge.) 2x2 kernels would be suitable for this.
}
/*
* A somewhat tricky function to approximate the distance to an edge in a
* certain pixel, with consideration to either the local gradient (gx,gy)
* or the direction to the pixel (dx,dy) and the pixel greyscale value a.
* The latter alternative, using (dx,dy), is the metric used by edtaa2().
* Using a local estimate of the edge gradient (gx,gy) yields much better
* accuracy at and near edges, and reduces the error even at distant pixels
* provided that the gradient direction is accurately estimated.
*/
double edgedf(double gx, double gy, double a)
{
double df, glength, temp, a1;
if ((gx == 0) || (gy == 0)) { // Either A) gx or gy is zero, or B) both
df = 0.5-a; // Linear approximation is A) correct or B) a fair guess
} else {
glength = sqrt(gx*gx + gy*gy);
if(glength>0) {
gx = gx/glength;
gy = gy/glength;
}
/* Everything is symmetric wrt sign and transposition,
* so move to first octant (gx>=0, gy>=0, gx>=gy) to
* avoid handling all possible edge directions.
*/
gx = fabs(gx);
gy = fabs(gy);
if(gx<gy) {
temp = gx;
gx = gy;
gy = temp;
}
a1 = 0.5*gy/gx;
if (a < a1) { // 0 <= a < a1
df = 0.5*(gx + gy) - sqrt(2.0*gx*gy*a);
} else if (a < (1.0-a1)) { // a1 <= a <= 1-a1
df = (0.5-a)*gx;
} else { // 1-a1 < a <= 1
df = -0.5*(gx + gy) + sqrt(2.0*gx*gy*(1.0-a));
}
}
return df;
}
double distaa3(double *img, double *gximg, double *gyimg, int w, int c, int xc, int yc, int xi, int yi)
{
double di, df, dx, dy, gx, gy, a;
int closest;
closest = c-xc-yc*w; // Index to the edge pixel pointed to from c
a = img[closest]; // Grayscale value at the edge pixel
gx = gximg[closest]; // X gradient component at the edge pixel
gy = gyimg[closest]; // Y gradient component at the edge pixel
if(a > 1.0) a = 1.0;
if(a < 0.0) a = 0.0; // Clip grayscale values outside the range [0,1]
if(a == 0.0) return 1000000.0; // Not an object pixel, return "very far" ("don't know yet")
dx = (double)xi;
dy = (double)yi;
di = sqrt(dx*dx + dy*dy); // Length of integer vector, like a traditional EDT
if(di==0) { // Use local gradient only at edges
// Estimate based on local gradient only
df = edgedf(gx, gy, a);
} else {
// Estimate gradient based on direction to edge (accurate for large di)
df = edgedf(dx, dy, a);
}
return di + df; // Same metric as edtaa2, except at edges (where di=0)
}
// Shorthand macro: add ubiquitous parameters img, gx, gy and w and call distaa3()
#define DISTAA(c,xc,yc,xi,yi) (distaa3(img, gx, gy, w, c, xc, yc, xi, yi))
void edtaa3(double *img, double *gx, double *gy, int w, int h, short *distx, short *disty, double *dist)
{
int x, y, i, c;
int offset_u, offset_ur, offset_r, offset_rd,
offset_d, offset_dl, offset_l, offset_lu;
double olddist, newdist;
int cdistx, cdisty, newdistx, newdisty;
int changed;
double epsilon = 1e-3; // Safeguard against errors due to limited precision
/* Initialize index offsets for the current image width */
offset_u = -w;
offset_ur = -w+1;
offset_r = 1;
offset_rd = w+1;
offset_d = w;
offset_dl = w-1;
offset_l = -1;
offset_lu = -w-1;
/* Initialize the distance images */
for(i=0; i<w*h; i++) {
distx[i] = 0; // At first, all pixels point to
disty[i] = 0; // themselves as the closest known.
if(img[i] <= 0.0)
{
dist[i]= 1000000.0; // Big value, means "not set yet"
}
else if (img[i]<1.0) {
dist[i] = edgedf(gx[i], gy[i], img[i]); // Gradient-assisted estimate
}
else {
dist[i]= 0.0; // Inside the object
}
}
/* Perform the transformation */
do
{
changed = 0;
/* Scan rows, except first row */
for(y=1; y<h; y++)
{
/* move index to leftmost pixel of current row */
i = y*w;
/* scan right, propagate distances from above & left */
/* Leftmost pixel is special, has no left neighbors */
olddist = dist[i];
if(olddist > 0) // If non-zero distance or not set yet
{
c = i + offset_u; // Index of candidate for testing
cdistx = distx[c];
cdisty = disty[c];
newdistx = cdistx;
newdisty = cdisty+1;
newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
if(newdist < olddist-epsilon)
{
distx[i]=newdistx;
disty[i]=newdisty;
dist[i]=newdist;
olddist=newdist;
changed = 1;
}
c = i+offset_ur;
cdistx = distx[c];
cdisty = disty[c];
newdistx = cdistx-1;
newdisty = cdisty+1;
newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
if(newdist < olddist-epsilon)
{
distx[i]=newdistx;
disty[i]=newdisty;
dist[i]=newdist;
changed = 1;
}
}
i++;
/* Middle pixels have all neighbors */
for(x=1; x<w-1; x++, i++)
{
olddist = dist[i];
if(olddist <= 0) continue; // No need to update further
c = i+offset_l;
cdistx = distx[c];
cdisty = disty[c];
newdistx = cdistx+1;
newdisty = cdisty;
newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
if(newdist < olddist-epsilon)
{
distx[i]=newdistx;
disty[i]=newdisty;
dist[i]=newdist;
olddist=newdist;
changed = 1;
}
c = i+offset_lu;
cdistx = distx[c];
cdisty = disty[c];
newdistx = cdistx+1;
newdisty = cdisty+1;
newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
if(newdist < olddist-epsilon)
{
distx[i]=newdistx;
disty[i]=newdisty;
dist[i]=newdist;
olddist=newdist;
changed = 1;
}
c = i+offset_u;
cdistx = distx[c];
cdisty = disty[c];
newdistx = cdistx;
newdisty = cdisty+1;
newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
if(newdist < olddist-epsilon)
{
distx[i]=newdistx;
disty[i]=newdisty;
dist[i]=newdist;
olddist=newdist;
changed = 1;
}
c = i+offset_ur;
cdistx = distx[c];
cdisty = disty[c];
newdistx = cdistx-1;
newdisty = cdisty+1;
newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
if(newdist < olddist-epsilon)
{
distx[i]=newdistx;
disty[i]=newdisty;
dist[i]=newdist;
changed = 1;
}
}
/* Rightmost pixel of row is special, has no right neighbors */
olddist = dist[i];
if(olddist > 0) // If not already zero distance
{
c = i+offset_l;
cdistx = distx[c];
cdisty = disty[c];
newdistx = cdistx+1;
newdisty = cdisty;
newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
if(newdist < olddist-epsilon)
{
distx[i]=newdistx;
disty[i]=newdisty;
dist[i]=newdist;
olddist=newdist;
changed = 1;
}
c = i+offset_lu;
cdistx = distx[c];
cdisty = disty[c];
newdistx = cdistx+1;
newdisty = cdisty+1;
newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
if(newdist < olddist-epsilon)
{
distx[i]=newdistx;
disty[i]=newdisty;
dist[i]=newdist;
olddist=newdist;
changed = 1;
}
c = i+offset_u;
cdistx = distx[c];
cdisty = disty[c];
newdistx = cdistx;
newdisty = cdisty+1;
newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
if(newdist < olddist-epsilon)
{
distx[i]=newdistx;
disty[i]=newdisty;
dist[i]=newdist;
changed = 1;
}
}
/* Move index to second rightmost pixel of current row. */
/* Rightmost pixel is skipped, it has no right neighbor. */
i = y*w + w-2;
/* scan left, propagate distance from right */
for(x=w-2; x>=0; x--, i--)
{
olddist = dist[i];
if(olddist <= 0) continue; // Already zero distance
c = i+offset_r;
cdistx = distx[c];
cdisty = disty[c];
newdistx = cdistx-1;
newdisty = cdisty;
newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
if(newdist < olddist-epsilon)
{
distx[i]=newdistx;
disty[i]=newdisty;
dist[i]=newdist;
changed = 1;
}
}
}
/* Scan rows in reverse order, except last row */
for(y=h-2; y>=0; y--)
{
/* move index to rightmost pixel of current row */
i = y*w + w-1;
/* Scan left, propagate distances from below & right */
/* Rightmost pixel is special, has no right neighbors */
olddist = dist[i];
if(olddist > 0) // If not already zero distance
{
c = i+offset_d;
cdistx = distx[c];
cdisty = disty[c];
newdistx = cdistx;
newdisty = cdisty-1;
newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
if(newdist < olddist-epsilon)
{
distx[i]=newdistx;
disty[i]=newdisty;
dist[i]=newdist;
olddist=newdist;
changed = 1;
}
c = i+offset_dl;
cdistx = distx[c];
cdisty = disty[c];
newdistx = cdistx+1;
newdisty = cdisty-1;
newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
if(newdist < olddist-epsilon)
{
distx[i]=newdistx;
disty[i]=newdisty;
dist[i]=newdist;
changed = 1;
}
}
i--;
/* Middle pixels have all neighbors */
for(x=w-2; x>0; x--, i--)
{
olddist = dist[i];
if(olddist <= 0) continue; // Already zero distance
c = i+offset_r;
cdistx = distx[c];
cdisty = disty[c];
newdistx = cdistx-1;
newdisty = cdisty;
newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
if(newdist < olddist-epsilon)
{
distx[i]=newdistx;
disty[i]=newdisty;
dist[i]=newdist;
olddist=newdist;
changed = 1;
}
c = i+offset_rd;
cdistx = distx[c];
cdisty = disty[c];
newdistx = cdistx-1;
newdisty = cdisty-1;
newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
if(newdist < olddist-epsilon)
{
distx[i]=newdistx;
disty[i]=newdisty;
dist[i]=newdist;
olddist=newdist;
changed = 1;
}
c = i+offset_d;
cdistx = distx[c];
cdisty = disty[c];
newdistx = cdistx;
newdisty = cdisty-1;
newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
if(newdist < olddist-epsilon)
{
distx[i]=newdistx;
disty[i]=newdisty;
dist[i]=newdist;
olddist=newdist;
changed = 1;
}
c = i+offset_dl;
cdistx = distx[c];
cdisty = disty[c];
newdistx = cdistx+1;
newdisty = cdisty-1;
newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
if(newdist < olddist-epsilon)
{
distx[i]=newdistx;
disty[i]=newdisty;
dist[i]=newdist;
changed = 1;
}
}
/* Leftmost pixel is special, has no left neighbors */
olddist = dist[i];
if(olddist > 0) // If not already zero distance
{
c = i+offset_r;
cdistx = distx[c];
cdisty = disty[c];
newdistx = cdistx-1;
newdisty = cdisty;
newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
if(newdist < olddist-epsilon)
{
distx[i]=newdistx;
disty[i]=newdisty;
dist[i]=newdist;
olddist=newdist;
changed = 1;
}
c = i+offset_rd;
cdistx = distx[c];
cdisty = disty[c];
newdistx = cdistx-1;
newdisty = cdisty-1;
newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
if(newdist < olddist-epsilon)
{
distx[i]=newdistx;
disty[i]=newdisty;
dist[i]=newdist;
olddist=newdist;
changed = 1;
}
c = i+offset_d;
cdistx = distx[c];
cdisty = disty[c];
newdistx = cdistx;
newdisty = cdisty-1;
newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
if(newdist < olddist-epsilon)
{
distx[i]=newdistx;
disty[i]=newdisty;
dist[i]=newdist;
changed = 1;
}
}
/* Move index to second leftmost pixel of current row. */
/* Leftmost pixel is skipped, it has no left neighbor. */
i = y*w + 1;
for(x=1; x<w; x++, i++)
{
/* scan right, propagate distance from left */
olddist = dist[i];
if(olddist <= 0) continue; // Already zero distance
c = i+offset_l;
cdistx = distx[c];
cdisty = disty[c];
newdistx = cdistx+1;
newdisty = cdisty;
newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
if(newdist < olddist-epsilon)
{
distx[i]=newdistx;
disty[i]=newdisty;
dist[i]=newdist;
changed = 1;
}
}
}
}
while(changed); // Sweep until no more updates are made
/* The transformation is completed. */
}
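/*
 * A minimal usage sketch for the two functions above; the helper name,
 * the buffer handling, and the outside-only clamp are illustrative
 * assumptions, not part of this file.
 */
#include <stdlib.h>
/* Build an (outside) distance field for a w x h antialiased image "img"
 * with values in [0,1]; "dist" receives one double per pixel. */
void make_distance_field(double *img, int w, int h, double *dist)
{
    /* calloc, not malloc: computegradient only writes interior edge pixels. */
    double *gx = (double*)calloc(w*h, sizeof(double));
    double *gy = (double*)calloc(w*h, sizeof(double));
    short *distx = (short*)malloc(w*h*sizeof(short));
    short *disty = (short*)malloc(w*h*sizeof(short));
    computegradient(img, w, h, gx, gy);
    edtaa3(img, gx, gy, w, h, distx, disty, dist);
    /* The epsilon safeguard can leave tiny negative distances; clamp them. */
    for(int i=0; i<w*h; i++) {
        if(dist[i] < 0.0) dist[i] = 0.0;
    }
    free(gx); free(gy); free(distx); free(disty);
}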

7
3rdparty/edtaa3/edtaa3func.h vendored Normal file

@@ -0,0 +1,7 @@
#ifndef EDTAA3_H_HEADER_GUARD
#define EDTAA3_H_HEADER_GUARD
extern void computegradient(double *img, int w, int h, double *gx, double *gy);
extern void edtaa3(double *img, double *gx, double *gy, int w, int h, short *distx, short *disty, double *dist);
#endif // EDTAA3_H_HEADER_GUARD

161
3rdparty/etc1/LICENSE vendored Normal file

@@ -0,0 +1,161 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction, and
distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by the
copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all other
entities that control, are controlled by, or are under common control with
that entity. For the purposes of this definition, "control" means (i) the
power, direct or indirect, to cause the direction or management of such
entity, whether by contract or otherwise, or (ii) ownership of fifty
percent (50%) or more of the outstanding shares, or (iii) beneficial
ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity exercising
permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical transformation
or translation of a Source form, including but not limited to compiled
object code, generated documentation, and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or Object
form, made available under the License, as indicated by a copyright
notice that is included in or attached to the work (an example is
provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including the original
version of the Work and any modifications or additions to that Work or
Derivative Works thereof, that is intentionally submitted to Licensor
for inclusion in the Work by the copyright owner or by an individual or
Legal Entity authorized to submit on behalf of the copyright owner. For
the purposes of this definition, "submitted" means any form of electronic,
verbal, or written communication sent to the Licensor or its
representatives, including but not limited to communication on electronic
mailing lists, source code control systems, and issue tracking systems that
are managed by, or on behalf of, the Licensor for the purpose of discussing
and improving the Work, but excluding communication that is conspicuously
marked or otherwise designated in writing by the copyright owner as "Not
a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity on
behalf of whom a Contribution has been received by Licensor and subsequently
incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of this
License, each Contributor hereby grants to You a perpetual, worldwide,
non-exclusive, no-charge, royalty-free, irrevocable copyright license to
reproduce, prepare Derivative Works of, publicly display, publicly perform,
sublicense, and distribute the Work and such Derivative Works in Source or
Object form.
3. Grant of Patent License. Subject to the terms and conditions of this
License, each Contributor hereby grants to You a perpetual, worldwide,
non-exclusive, no-charge, royalty-free, irrevocable (except as stated in
this section) patent license to make, have made, use, offer to sell, sell,
import, and otherwise transfer the Work, where such license applies only to
those patent claims licensable by such Contributor that are necessarily
infringed by their Contribution(s) alone or by combination of their
Contribution(s) with the Work to which such Contribution(s) was submitted.
If You institute patent litigation against any entity (including a cross-claim
or counterclaim in a lawsuit) alleging that the Work or a Contribution
incorporated within the Work constitutes direct or contributory patent
infringement, then any patent licenses granted to You under this License
for that Work shall terminate as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the Work or
Derivative Works thereof in any medium, with or without modifications, and
in Source or Object form, provided that You meet the following conditions:
You must give any other recipients of the Work or Derivative Works a copy of
this License; and
You must cause any modified files to carry prominent notices stating that
You changed the files; and
You must retain, in the Source form of any Derivative Works that You
distribute, all copyright, patent, trademark, and attribution notices
from the Source form of the Work, excluding those notices that do not
pertain to any part of the Derivative Works; and
If the Work includes a "NOTICE" text file as part of its distribution,
then any Derivative Works that You distribute must include a readable
copy of the attribution notices contained within such NOTICE file, excluding
those notices that do not pertain to any part of the Derivative Works, in
at least one of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or documentation, if
provided along with the Derivative Works; or, within a display generated by
the Derivative Works, if and wherever such third-party notices normally
appear. The contents of the NOTICE file are for informational purposes
only and do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside or as
an addendum to the NOTICE text from the Work, provided that such additional
attribution notices cannot be construed as modifying the License.
You may add Your own copyright statement to Your modifications and may provide
additional or different license terms and conditions for use, reproduction, or
distribution of Your modifications, or for any such Derivative Works as a
whole, provided Your use, reproduction, and distribution of the Work otherwise
complies with the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise, any
Contribution intentionally submitted for inclusion in the Work by You to the
Licensor shall be under the terms and conditions of this License, without any
additional terms or conditions. Notwithstanding the above, nothing herein
shall supersede or modify the terms of any separate license agreement you
may have executed with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade names,
trademarks, service marks, or product names of the Licensor, except as
required for reasonable and customary use in describing the origin of the
Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or agreed to
in writing, Licensor provides the Work (and each Contributor provides its
Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
ANY KIND, either express or implied, including, without limitation, any
warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or
FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining
the appropriateness of using or redistributing the Work and assume any risks
associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory, whether in
tort (including negligence), contract, or otherwise, unless required by
applicable law (such as deliberate and grossly negligent acts) or agreed to
in writing, shall any Contributor be liable to You for damages, including
any direct, indirect, special, incidental, or consequential damages of any
character arising as a result of this License or out of the use or inability
to use the Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all other
commercial damages or losses), even if such Contributor has been advised
of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing the
Work or Derivative Works thereof, You may choose to offer, and charge a
fee for, acceptance of support, warranty, indemnity, or other liability
obligations and/or rights consistent with this License. However, in accepting
such obligations, You may act only on Your own behalf and on Your sole
responsibility, not on behalf of any other Contributor, and only if You
agree to indemnify, defend, and hold each Contributor harmless for any
liability incurred by, or claims asserted against, such Contributor by
reason of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS

686
3rdparty/etc1/etc1.cpp vendored Normal file

@@ -0,0 +1,686 @@
// Copyright 2009 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//////////////////////////////////////////////////////////////////////////////////////////
// This is a fork of the AOSP project ETC1 codec. The original code can be found
// at the following web site:
// https://android.googlesource.com/platform/frameworks/native/+/master/opengl/include/ETC1/
//////////////////////////////////////////////////////////////////////////////////////////
#include "etc1.h"
#include <cstring>
/* From http://www.khronos.org/registry/gles/extensions/OES/OES_compressed_ETC1_RGB8_texture.txt
The number of bits that represent a 4x4 texel block is 64 bits if
<internalformat> is given by ETC1_RGB8_OES.
The data for a block is a number of bytes,
{q0, q1, q2, q3, q4, q5, q6, q7}
where byte q0 is located at the lowest memory address and q7 at
the highest. The 64 bits specifying the block is then represented
by the following 64 bit integer:
int64bit = 256*(256*(256*(256*(256*(256*(256*q0+q1)+q2)+q3)+q4)+q5)+q6)+q7;
ETC1_RGB8_OES:
a) bit layout in bits 63 through 32 if diffbit = 0
63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48
-----------------------------------------------
| base col1 | base col2 | base col1 | base col2 |
| R1 (4bits)| R2 (4bits)| G1 (4bits)| G2 (4bits)|
-----------------------------------------------
47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
---------------------------------------------------
| base col1 | base col2 | table | table |diff|flip|
| B1 (4bits)| B2 (4bits)| cw 1 | cw 2 |bit |bit |
---------------------------------------------------
b) bit layout in bits 63 through 32 if diffbit = 1
63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48
-----------------------------------------------
| base col1 | dcol 2 | base col1 | dcol 2 |
| R1' (5 bits) | dR2 | G1' (5 bits) | dG2 |
-----------------------------------------------
47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
---------------------------------------------------
| base col 1 | dcol 2 | table | table |diff|flip|
| B1' (5 bits) | dB2 | cw 1 | cw 2 |bit |bit |
---------------------------------------------------
c) bit layout in bits 31 through 0 (in both cases)
31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16
-----------------------------------------------
| most significant pixel index bits |
| p| o| n| m| l| k| j| i| h| g| f| e| d| c| b| a|
-----------------------------------------------
15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
--------------------------------------------------
| least significant pixel index bits |
| p| o| n| m| l| k| j| i| h| g| f| e| d| c | b | a |
--------------------------------------------------
Add table 3.17.2: Intensity modifier sets for ETC1 compressed textures:
table codeword modifier table
------------------ ----------------------
0 -8 -2 2 8
1 -17 -5 5 17
2 -29 -9 9 29
3 -42 -13 13 42
4 -60 -18 18 60
5 -80 -24 24 80
6 -106 -33 33 106
7 -183 -47 47 183
Add table 3.17.3 Mapping from pixel index values to modifier values for
ETC1 compressed textures:
pixel index value
---------------
msb lsb resulting modifier value
----- ----- -------------------------
1 1 -b (large negative value)
1 0 -a (small negative value)
0 0 a (small positive value)
0 1 b (large positive value)
*/
static const int kModifierTable[] = {
/* 0 */2, 8, -2, -8,
/* 1 */5, 17, -5, -17,
/* 2 */9, 29, -9, -29,
/* 3 */13, 42, -13, -42,
/* 4 */18, 60, -18, -60,
/* 5 */24, 80, -24, -80,
/* 6 */33, 106, -33, -106,
/* 7 */47, 183, -47, -183 };
static const int kLookup[8] = { 0, 1, 2, 3, -4, -3, -2, -1 };
static inline etc1_byte clamp(int x) {
return (etc1_byte) (x >= 0 ? (x < 255 ? x : 255) : 0);
}
static
inline int convert4To8(int b) {
int c = b & 0xf;
return (c << 4) | c;
}
static
inline int convert5To8(int b) {
int c = b & 0x1f;
return (c << 3) | (c >> 2);
}
static
inline int convert6To8(int b) {
int c = b & 0x3f;
return (c << 2) | (c >> 4);
}
static
inline int divideBy255(int d) {
return (d + 128 + (d >> 8)) >> 8;
}
static
inline int convert8To4(int b) {
int c = b & 0xff;
return divideBy255(c * 15);
}
static
inline int convert8To5(int b) {
int c = b & 0xff;
return divideBy255(c * 31);
}
static
inline int convertDiff(int base, int diff) {
return convert5To8((0x1f & base) + kLookup[0x7 & diff]);
}
static
void decode_subblock(etc1_byte* pOut, int r, int g, int b, const int* table,
etc1_uint32 low, bool second, bool flipped) {
int baseX = 0;
int baseY = 0;
if (second) {
if (flipped) {
baseY = 2;
} else {
baseX = 2;
}
}
for (int i = 0; i < 8; i++) {
int x, y;
if (flipped) {
x = baseX + (i >> 1);
y = baseY + (i & 1);
} else {
x = baseX + (i >> 2);
y = baseY + (i & 3);
}
int k = y + (x * 4);
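// Selector bits for pixel k: the lsb sits in the low 16 bits of 'low',
// the msb 16 bits higher (see table 3.17.3 in the spec comment above).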
int offset = ((low >> k) & 1) | ((low >> (k + 15)) & 2);
int delta = table[offset];
etc1_byte* q = pOut + 3 * (x + 4 * y);
*q++ = clamp(r + delta);
*q++ = clamp(g + delta);
*q++ = clamp(b + delta);
}
}
// Input is an ETC1 compressed version of the data.
// Output is a 4 x 4 square of 3-byte pixels in form R, G, B
void etc1_decode_block(const etc1_byte* pIn, etc1_byte* pOut) {
etc1_uint32 high = (pIn[0] << 24) | (pIn[1] << 16) | (pIn[2] << 8) | pIn[3];
etc1_uint32 low = (pIn[4] << 24) | (pIn[5] << 16) | (pIn[6] << 8) | pIn[7];
int r1, r2, g1, g2, b1, b2;
if (high & 2) {
// differential
int rBase = high >> 27;
int gBase = high >> 19;
int bBase = high >> 11;
r1 = convert5To8(rBase);
r2 = convertDiff(rBase, high >> 24);
g1 = convert5To8(gBase);
g2 = convertDiff(gBase, high >> 16);
b1 = convert5To8(bBase);
b2 = convertDiff(bBase, high >> 8);
} else {
// not differential
r1 = convert4To8(high >> 28);
r2 = convert4To8(high >> 24);
g1 = convert4To8(high >> 20);
g2 = convert4To8(high >> 16);
b1 = convert4To8(high >> 12);
b2 = convert4To8(high >> 8);
}
int tableIndexA = 7 & (high >> 5);
int tableIndexB = 7 & (high >> 2);
const int* tableA = kModifierTable + tableIndexA * 4;
const int* tableB = kModifierTable + tableIndexB * 4;
bool flipped = (high & 1) != 0;
decode_subblock(pOut, r1, g1, b1, tableA, low, false, flipped);
decode_subblock(pOut, r2, g2, b2, tableB, low, true, flipped);
}
typedef struct {
etc1_uint32 high;
etc1_uint32 low;
etc1_uint32 score; // Lower is more accurate
} etc_compressed;
static
inline void take_best(etc_compressed* a, const etc_compressed* b) {
if (a->score > b->score) {
*a = *b;
}
}
static
void etc_average_colors_subblock(const etc1_byte* pIn, etc1_uint32 inMask,
etc1_byte* pColors, bool flipped, bool second) {
int r = 0;
int g = 0;
int b = 0;
if (flipped) {
int by = 0;
if (second) {
by = 2;
}
for (int y = 0; y < 2; y++) {
int yy = by + y;
for (int x = 0; x < 4; x++) {
int i = x + 4 * yy;
if (inMask & (1 << i)) {
const etc1_byte* p = pIn + i * 3;
r += *(p++);
g += *(p++);
b += *(p++);
}
}
}
} else {
int bx = 0;
if (second) {
bx = 2;
}
for (int y = 0; y < 4; y++) {
for (int x = 0; x < 2; x++) {
int xx = bx + x;
int i = xx + 4 * y;
if (inMask & (1 << i)) {
const etc1_byte* p = pIn + i * 3;
r += *(p++);
g += *(p++);
b += *(p++);
}
}
}
}
pColors[0] = (etc1_byte)((r + 4) >> 3);
pColors[1] = (etc1_byte)((g + 4) >> 3);
pColors[2] = (etc1_byte)((b + 4) >> 3);
}
static
inline int square(int x) {
return x * x;
}
static etc1_uint32 chooseModifier(const etc1_byte* pBaseColors,
const etc1_byte* pIn, etc1_uint32 *pLow, int bitIndex,
const int* pModifierTable) {
etc1_uint32 bestScore = ~0;
int bestIndex = 0;
int pixelR = pIn[0];
int pixelG = pIn[1];
int pixelB = pIn[2];
int r = pBaseColors[0];
int g = pBaseColors[1];
int b = pBaseColors[2];
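// Score each of the four modifiers with perceptual weights (G=6, R=3, B=1);
// the green term is evaluated first so most candidates can be rejected early.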
for (int i = 0; i < 4; i++) {
int modifier = pModifierTable[i];
int decodedG = clamp(g + modifier);
etc1_uint32 score = (etc1_uint32) (6 * square(decodedG - pixelG));
if (score >= bestScore) {
continue;
}
int decodedR = clamp(r + modifier);
score += (etc1_uint32) (3 * square(decodedR - pixelR));
if (score >= bestScore) {
continue;
}
int decodedB = clamp(b + modifier);
score += (etc1_uint32) square(decodedB - pixelB);
if (score < bestScore) {
bestScore = score;
bestIndex = i;
}
}
etc1_uint32 lowMask = (((bestIndex >> 1) << 16) | (bestIndex & 1))
<< bitIndex;
*pLow |= lowMask;
return bestScore;
}
static
void etc_encode_subblock_helper(const etc1_byte* pIn, etc1_uint32 inMask,
etc_compressed* pCompressed, bool flipped, bool second,
const etc1_byte* pBaseColors, const int* pModifierTable) {
int score = pCompressed->score;
if (flipped) {
int by = 0;
if (second) {
by = 2;
}
for (int y = 0; y < 2; y++) {
int yy = by + y;
for (int x = 0; x < 4; x++) {
int i = x + 4 * yy;
if (inMask & (1 << i)) {
score += chooseModifier(pBaseColors, pIn + i * 3,
&pCompressed->low, yy + x * 4, pModifierTable);
}
}
}
} else {
int bx = 0;
if (second) {
bx = 2;
}
for (int y = 0; y < 4; y++) {
for (int x = 0; x < 2; x++) {
int xx = bx + x;
int i = xx + 4 * y;
if (inMask & (1 << i)) {
score += chooseModifier(pBaseColors, pIn + i * 3,
&pCompressed->low, y + xx * 4, pModifierTable);
}
}
}
}
pCompressed->score = score;
}
static bool inRange4bitSigned(int color) {
return color >= -4 && color <= 3;
}
static void etc_encodeBaseColors(etc1_byte* pBaseColors,
const etc1_byte* pColors, etc_compressed* pCompressed) {
int r1, g1, b1, r2, g2, b2; // 8 bit base colors for sub-blocks
bool differential;
{
int r51 = convert8To5(pColors[0]);
int g51 = convert8To5(pColors[1]);
int b51 = convert8To5(pColors[2]);
int r52 = convert8To5(pColors[3]);
int g52 = convert8To5(pColors[4]);
int b52 = convert8To5(pColors[5]);
r1 = convert5To8(r51);
g1 = convert5To8(g51);
b1 = convert5To8(b51);
int dr = r52 - r51;
int dg = g52 - g51;
int db = b52 - b51;
differential = inRange4bitSigned(dr) && inRange4bitSigned(dg)
&& inRange4bitSigned(db);
if (differential) {
r2 = convert5To8(r51 + dr);
g2 = convert5To8(g51 + dg);
b2 = convert5To8(b51 + db);
pCompressed->high |= (r51 << 27) | ((7 & dr) << 24) | (g51 << 19)
| ((7 & dg) << 16) | (b51 << 11) | ((7 & db) << 8) | 2;
}
}
if (!differential) {
int r41 = convert8To4(pColors[0]);
int g41 = convert8To4(pColors[1]);
int b41 = convert8To4(pColors[2]);
int r42 = convert8To4(pColors[3]);
int g42 = convert8To4(pColors[4]);
int b42 = convert8To4(pColors[5]);
r1 = convert4To8(r41);
g1 = convert4To8(g41);
b1 = convert4To8(b41);
r2 = convert4To8(r42);
g2 = convert4To8(g42);
b2 = convert4To8(b42);
pCompressed->high |= (r41 << 28) | (r42 << 24) | (g41 << 20) | (g42
<< 16) | (b41 << 12) | (b42 << 8);
}
pBaseColors[0] = r1;
pBaseColors[1] = g1;
pBaseColors[2] = b1;
pBaseColors[3] = r2;
pBaseColors[4] = g2;
pBaseColors[5] = b2;
}
static
void etc_encode_block_helper(const etc1_byte* pIn, etc1_uint32 inMask,
const etc1_byte* pColors, etc_compressed* pCompressed, bool flipped) {
pCompressed->score = ~0;
pCompressed->high = (flipped ? 1 : 0);
pCompressed->low = 0;
etc1_byte pBaseColors[6];
etc_encodeBaseColors(pBaseColors, pColors, pCompressed);
int originalHigh = pCompressed->high;
const int* pModifierTable = kModifierTable;
for (int i = 0; i < 8; i++, pModifierTable += 4) {
etc_compressed temp;
temp.score = 0;
temp.high = originalHigh | (i << 5);
temp.low = 0;
etc_encode_subblock_helper(pIn, inMask, &temp, flipped, false,
pBaseColors, pModifierTable);
take_best(pCompressed, &temp);
}
pModifierTable = kModifierTable;
etc_compressed firstHalf = *pCompressed;
for (int i = 0; i < 8; i++, pModifierTable += 4) {
etc_compressed temp;
temp.score = firstHalf.score;
temp.high = firstHalf.high | (i << 2);
temp.low = firstHalf.low;
etc_encode_subblock_helper(pIn, inMask, &temp, flipped, true,
pBaseColors + 3, pModifierTable);
if (i == 0) {
*pCompressed = temp;
} else {
take_best(pCompressed, &temp);
}
}
}
static void writeBigEndian(etc1_byte* pOut, etc1_uint32 d) {
pOut[0] = (etc1_byte)(d >> 24);
pOut[1] = (etc1_byte)(d >> 16);
pOut[2] = (etc1_byte)(d >> 8);
pOut[3] = (etc1_byte) d;
}
// Input is a 4 x 4 square of 3-byte pixels in form R, G, B
// inMask is a 16-bit mask where bit (1 << (x + y * 4)) tells whether the corresponding (x,y)
// pixel is valid or not. Invalid pixel color values are ignored when compressing.
// Output is an ETC1 compressed version of the data.
void etc1_encode_block(const etc1_byte* pIn, etc1_uint32 inMask,
etc1_byte* pOut) {
etc1_byte colors[6];
etc1_byte flippedColors[6];
etc_average_colors_subblock(pIn, inMask, colors, false, false);
etc_average_colors_subblock(pIn, inMask, colors + 3, false, true);
etc_average_colors_subblock(pIn, inMask, flippedColors, true, false);
etc_average_colors_subblock(pIn, inMask, flippedColors + 3, true, true);
etc_compressed a, b;
etc_encode_block_helper(pIn, inMask, colors, &a, false);
etc_encode_block_helper(pIn, inMask, flippedColors, &b, true);
take_best(&a, &b);
writeBigEndian(pOut, a.high);
writeBigEndian(pOut + 4, a.low);
}
// Return the size of the encoded image data (does not include size of PKM header).
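// e.g. a 17x9 image is padded to 20x12 blocks, giving (20 * 12) / 2 = 120 bytes.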
etc1_uint32 etc1_get_encoded_data_size(etc1_uint32 width, etc1_uint32 height) {
return (((width + 3) & ~3) * ((height + 3) & ~3)) >> 1;
}
// Encode an entire image.
// pIn - pointer to the image data. Formatted such that the Red component of
// pixel (x,y) is at pIn + pixelSize * x + stride * y;
// pOut - pointer to encoded data. Must be large enough to store entire encoded image.
int etc1_encode_image(const etc1_byte* pIn, etc1_uint32 width, etc1_uint32 height,
etc1_uint32 pixelSize, etc1_uint32 stride, etc1_byte* pOut) {
if (pixelSize < 2 || pixelSize > 4) {
return -1;
}
static const unsigned short kYMask[] = { 0x0, 0xf, 0xff, 0xfff, 0xffff };
static const unsigned short kXMask[] = { 0x0, 0x1111, 0x3333, 0x7777,
0xffff };
etc1_byte block[ETC1_DECODED_BLOCK_SIZE];
etc1_byte encoded[ETC1_ENCODED_BLOCK_SIZE];
etc1_uint32 encodedWidth = (width + 3) & ~3;
etc1_uint32 encodedHeight = (height + 3) & ~3;
for (etc1_uint32 y = 0; y < encodedHeight; y += 4) {
etc1_uint32 yEnd = height - y;
if (yEnd > 4) {
yEnd = 4;
}
int ymask = kYMask[yEnd];
for (etc1_uint32 x = 0; x < encodedWidth; x += 4) {
etc1_uint32 xEnd = width - x;
if (xEnd > 4) {
xEnd = 4;
}
int mask = ymask & kXMask[xEnd];
for (etc1_uint32 cy = 0; cy < yEnd; cy++) {
etc1_byte* q = block + (cy * 4) * 3;
const etc1_byte* p = pIn + pixelSize * x + stride * (y + cy);
if (pixelSize >= 3) {
for (etc1_uint32 cx = 0; cx < xEnd; cx++) {
memcpy(q, p, 3);
q += 3;
p += pixelSize;
}
} else {
for (etc1_uint32 cx = 0; cx < xEnd; cx++) {
int pixel = (p[1] << 8) | p[0];
*q++ = convert5To8(pixel >> 11);
*q++ = convert6To8(pixel >> 5);
*q++ = convert5To8(pixel);
p += pixelSize;
}
}
}
etc1_encode_block(block, mask, encoded);
memcpy(pOut, encoded, sizeof(encoded));
pOut += sizeof(encoded);
}
}
return 0;
}
// Decode an entire image.
// pIn - pointer to encoded data.
// pOut - pointer to the image data. Will be written such that the Red component of
// pixel (x,y) is at pOut + pixelSize * x + stride * y. Must be
// large enough to store entire image.
int etc1_decode_image(const etc1_byte* pIn, etc1_byte* pOut,
etc1_uint32 width, etc1_uint32 height,
etc1_uint32 pixelSize, etc1_uint32 stride) {
if (pixelSize < 2 || pixelSize > 4) {
return -1;
}
etc1_byte block[ETC1_DECODED_BLOCK_SIZE];
etc1_uint32 encodedWidth = (width + 3) & ~3;
etc1_uint32 encodedHeight = (height + 3) & ~3;
for (etc1_uint32 y = 0; y < encodedHeight; y += 4) {
etc1_uint32 yEnd = height - y;
if (yEnd > 4) {
yEnd = 4;
}
for (etc1_uint32 x = 0; x < encodedWidth; x += 4) {
etc1_uint32 xEnd = width - x;
if (xEnd > 4) {
xEnd = 4;
}
etc1_decode_block(pIn, block);
pIn += ETC1_ENCODED_BLOCK_SIZE;
for (etc1_uint32 cy = 0; cy < yEnd; cy++) {
const etc1_byte* q = block + (cy * 4) * 3;
etc1_byte* p = pOut + pixelSize * x + stride * (y + cy);
if (pixelSize >= 3) {
for (etc1_uint32 cx = 0; cx < xEnd; cx++) {
memcpy(p, q, 3);
q += 3;
p += pixelSize;
}
} else {
for (etc1_uint32 cx = 0; cx < xEnd; cx++) {
etc1_byte r = *q++;
etc1_byte g = *q++;
etc1_byte b = *q++;
etc1_uint32 pixel = ((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3);
*p++ = (etc1_byte) pixel;
*p++ = (etc1_byte) (pixel >> 8);
}
}
}
}
}
return 0;
}
static const char kMagic[] = { 'P', 'K', 'M', ' ', '1', '0' };
static const etc1_uint32 ETC1_PKM_FORMAT_OFFSET = 6;
static const etc1_uint32 ETC1_PKM_ENCODED_WIDTH_OFFSET = 8;
static const etc1_uint32 ETC1_PKM_ENCODED_HEIGHT_OFFSET = 10;
static const etc1_uint32 ETC1_PKM_WIDTH_OFFSET = 12;
static const etc1_uint32 ETC1_PKM_HEIGHT_OFFSET = 14;
static const etc1_uint32 ETC1_RGB_NO_MIPMAPS = 0;
static void writeBEUint16(etc1_byte* pOut, etc1_uint32 data) {
pOut[0] = (etc1_byte) (data >> 8);
pOut[1] = (etc1_byte) data;
}
static etc1_uint32 readBEUint16(const etc1_byte* pIn) {
return (pIn[0] << 8) | pIn[1];
}
// Format a PKM header
void etc1_pkm_format_header(etc1_byte* pHeader, etc1_uint32 width, etc1_uint32 height) {
memcpy(pHeader, kMagic, sizeof(kMagic));
etc1_uint32 encodedWidth = (width + 3) & ~3;
etc1_uint32 encodedHeight = (height + 3) & ~3;
writeBEUint16(pHeader + ETC1_PKM_FORMAT_OFFSET, ETC1_RGB_NO_MIPMAPS);
writeBEUint16(pHeader + ETC1_PKM_ENCODED_WIDTH_OFFSET, encodedWidth);
writeBEUint16(pHeader + ETC1_PKM_ENCODED_HEIGHT_OFFSET, encodedHeight);
writeBEUint16(pHeader + ETC1_PKM_WIDTH_OFFSET, width);
writeBEUint16(pHeader + ETC1_PKM_HEIGHT_OFFSET, height);
}
// Check if a PKM header is correctly formatted.
etc1_bool etc1_pkm_is_valid(const etc1_byte* pHeader) {
if (memcmp(pHeader, kMagic, sizeof(kMagic))) {
return false;
}
etc1_uint32 format = readBEUint16(pHeader + ETC1_PKM_FORMAT_OFFSET);
etc1_uint32 encodedWidth = readBEUint16(pHeader + ETC1_PKM_ENCODED_WIDTH_OFFSET);
etc1_uint32 encodedHeight = readBEUint16(pHeader + ETC1_PKM_ENCODED_HEIGHT_OFFSET);
etc1_uint32 width = readBEUint16(pHeader + ETC1_PKM_WIDTH_OFFSET);
etc1_uint32 height = readBEUint16(pHeader + ETC1_PKM_HEIGHT_OFFSET);
return format == ETC1_RGB_NO_MIPMAPS &&
encodedWidth >= width && encodedWidth - width < 4 &&
encodedHeight >= height && encodedHeight - height < 4;
}
// Read the image width from a PKM header
etc1_uint32 etc1_pkm_get_width(const etc1_byte* pHeader) {
return readBEUint16(pHeader + ETC1_PKM_WIDTH_OFFSET);
}
// Read the image height from a PKM header
etc1_uint32 etc1_pkm_get_height(const etc1_byte* pHeader){
return readBEUint16(pHeader + ETC1_PKM_HEIGHT_OFFSET);
}

114
3rdparty/etc1/etc1.h vendored Normal file

@@ -0,0 +1,114 @@
// Copyright 2009 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//////////////////////////////////////////////////////////////////////////////////////////
// This is a fork of the AOSP project ETC1 codec. The original code can be found
// at the following web site:
// https://android.googlesource.com/platform/frameworks/native/+/master/opengl/libs/ETC1/
//////////////////////////////////////////////////////////////////////////////////////////
#ifndef __etc1_h__
#define __etc1_h__
#define ETC1_ENCODED_BLOCK_SIZE 8
#define ETC1_DECODED_BLOCK_SIZE 48
#ifndef ETC1_RGB8_OES
#define ETC1_RGB8_OES 0x8D64
#endif
typedef unsigned char etc1_byte;
typedef int etc1_bool;
typedef unsigned int etc1_uint32;
#ifdef __cplusplus
extern "C" {
#endif
// Encode a block of pixels.
//
// pIn is a pointer to an ETC1_DECODED_BLOCK_SIZE array of bytes that represent a
// 4 x 4 square of 3-byte pixels in form R, G, B. Byte (3 * (x + 4 * y)) is the R
// value of pixel (x, y).
//
// validPixelMask is a 16-bit mask where bit (1 << (x + y * 4)) indicates whether
// the corresponding (x,y) pixel is valid. Invalid pixel color values are ignored when compressing.
//
// pOut is an ETC1 compressed version of the data.
void etc1_encode_block(const etc1_byte* pIn, etc1_uint32 validPixelMask, etc1_byte* pOut);
// Decode a block of pixels.
//
// pIn is an ETC1 compressed version of the data.
//
// pOut is a pointer to an ETC1_DECODED_BLOCK_SIZE array of bytes that represent a
// 4 x 4 square of 3-byte pixels in form R, G, B. Byte (3 * (x + 4 * y)) is the R
// value of pixel (x, y).
void etc1_decode_block(const etc1_byte* pIn, etc1_byte* pOut);
// Return the size of the encoded image data (does not include size of PKM header).
etc1_uint32 etc1_get_encoded_data_size(etc1_uint32 width, etc1_uint32 height);
// Encode an entire image.
// pIn - pointer to the image data. Formatted such that
// pixel (x,y) is at pIn + pixelSize * x + stride * y;
// pOut - pointer to encoded data. Must be large enough to store entire encoded image.
// pixelSize can be 2 or 3. 2 is a GL_UNSIGNED_SHORT_5_6_5 image, 3 is a GL_BYTE RGB image.
// returns non-zero if there is an error.
int etc1_encode_image(const etc1_byte* pIn, etc1_uint32 width, etc1_uint32 height,
etc1_uint32 pixelSize, etc1_uint32 stride, etc1_byte* pOut);
// Decode an entire image.
// pIn - pointer to encoded data.
// pOut - pointer to the image data. Will be written such that
// pixel (x,y) is at pOut + pixelSize * x + stride * y. Must be
// large enough to store entire image.
// pixelSize can be 2 or 3. 2 is a GL_UNSIGNED_SHORT_5_6_5 image, 3 is a GL_BYTE RGB image.
// returns non-zero if there is an error.
int etc1_decode_image(const etc1_byte* pIn, etc1_byte* pOut,
etc1_uint32 width, etc1_uint32 height,
etc1_uint32 pixelSize, etc1_uint32 stride);
// Size of a PKM header, in bytes.
#define ETC_PKM_HEADER_SIZE 16
// Format a PKM header
void etc1_pkm_format_header(etc1_byte* pHeader, etc1_uint32 width, etc1_uint32 height);
// Check if a PKM header is correctly formatted.
etc1_bool etc1_pkm_is_valid(const etc1_byte* pHeader);
// Read the image width from a PKM header
etc1_uint32 etc1_pkm_get_width(const etc1_byte* pHeader);
// Read the image height from a PKM header
etc1_uint32 etc1_pkm_get_height(const etc1_byte* pHeader);
#ifdef __cplusplus
}
#endif
#endif

24
3rdparty/etc2/LICENSE.txt vendored Normal file

@@ -0,0 +1,24 @@
Copyright (c) 2013, Bartosz Taudul <wolf.pld@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the <organization> nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

90
3rdparty/etc2/Math.hpp vendored Normal file

@@ -0,0 +1,90 @@
#ifndef __DARKRL__MATH_HPP__
#define __DARKRL__MATH_HPP__
#include <algorithm>
#include <math.h>
#include "Types.hpp"
template<typename T>
inline T AlignPOT( T val )
{
if( val == 0 ) return 1;
val--;
for( unsigned int i=1; i<sizeof( T ) * 8; i <<= 1 )
{
val |= val >> i;
}
return val + 1;
}
inline int CountSetBits( uint32 val )
{
val -= ( val >> 1 ) & 0x55555555;
val = ( ( val >> 2 ) & 0x33333333 ) + ( val & 0x33333333 );
val = ( ( val >> 4 ) + val ) & 0x0f0f0f0f;
val += val >> 8;
val += val >> 16;
return val & 0x0000003f;
}
inline int CountLeadingZeros( uint32 val )
{
val |= val >> 1;
val |= val >> 2;
val |= val >> 4;
val |= val >> 8;
val |= val >> 16;
return 32 - CountSetBits( val );
}
inline float sRGB2linear( float v )
{
const float a = 0.055f;
if( v <= 0.04045f )
{
return v / 12.92f;
}
else
{
return powf( ( v + a ) / ( 1 + a ), 2.4f );
}
}
inline float linear2sRGB( float v )
{
const float a = 0.055f;
if( v <= 0.0031308f )
{
return 12.92f * v;
}
else
{
return ( 1 + a ) * pow( v, 1/2.4f ) - a;
}
}
template<class T>
inline T SmoothStep( T x )
{
return x*x*(3-2*x);
}
inline uint8 clampu8( int32 val )
{
return std::min( std::max( 0, val ), 255 );
}
template<class T>
inline T sq( T val )
{
return val * val;
}
static inline int mul8bit( int a, int b )
{
int t = a*b + 128;
return ( t + ( t >> 8 ) ) >> 8;
}
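// Illustrative expected values for the helpers above:
//   AlignPOT( 17u )         == 32  (next power of two)
//   CountSetBits( 0xF0u )   == 4
//   CountLeadingZeros( 1u ) == 31
//   clampu8( 300 )          == 255
//   mul8bit( a, b )         ~= a * b / 255, with rounding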
#endif

51
3rdparty/etc2/ProcessCommon.hpp vendored Normal file

@@ -0,0 +1,51 @@
#ifndef __PROCESSCOMMON_HPP__
#define __PROCESSCOMMON_HPP__
#include <assert.h>
#include <stddef.h>
#include "Types.hpp"
template<class T>
static size_t GetLeastError( const T* err, size_t num )
{
size_t idx = 0;
for( size_t i=1; i<num; i++ )
{
if( err[i] < err[idx] )
{
idx = i;
}
}
return idx;
}
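// Reverses the byte order within the high 32 bits of d (the selector words
// filled in by EncodeSelectors below); the low 32 bits pass through unchanged.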
static uint64 FixByteOrder( uint64 d )
{
return ( ( d & 0x00000000FFFFFFFF ) ) |
( ( d & 0xFF00000000000000 ) >> 24 ) |
( ( d & 0x000000FF00000000 ) << 24 ) |
( ( d & 0x00FF000000000000 ) >> 8 ) |
( ( d & 0x0000FF0000000000 ) << 8 );
}
template<class T, class S>
static uint64 EncodeSelectors( uint64 d, const T terr[2][8], const S tsel[16][8], const uint32* id )
{
size_t tidx[2];
tidx[0] = GetLeastError( terr[0], 8 );
tidx[1] = GetLeastError( terr[1], 8 );
d |= tidx[0] << 26;
d |= tidx[1] << 29;
for( int i=0; i<16; i++ )
{
uint64 t = tsel[i][tidx[id[i]%2]];
d |= ( t & 0x1 ) << ( i + 32 );
d |= ( t & 0x2 ) << ( i + 47 );
}
return d;
}
#endif

719
3rdparty/etc2/ProcessRGB.cpp vendored Normal file

@@ -0,0 +1,719 @@
#include <string.h>
#include "Math.hpp"
#include "ProcessCommon.hpp"
#include "ProcessRGB.hpp"
#include "Tables.hpp"
#include "Types.hpp"
#include "Vector.hpp"
#include <bx/endian.h>
#ifdef __SSE4_1__
# ifdef _MSC_VER
# include <intrin.h>
# include <Windows.h>
# else
# include <x86intrin.h>
# endif
#endif
namespace
{
typedef uint16 v4i[4];
void Average( const uint8* data, v4i* a )
{
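// Accumulates the four 2x2 quadrants of the 4x4 block, then averages the
// bottom, top, right and left half-blocks (8 pixels each; +4 rounds, >>3
// divides by 8).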
#ifdef __SSE4_1__
__m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0);
__m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1);
__m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2);
__m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3);
__m128i d0l = _mm_unpacklo_epi8(d0, _mm_setzero_si128());
__m128i d0h = _mm_unpackhi_epi8(d0, _mm_setzero_si128());
__m128i d1l = _mm_unpacklo_epi8(d1, _mm_setzero_si128());
__m128i d1h = _mm_unpackhi_epi8(d1, _mm_setzero_si128());
__m128i d2l = _mm_unpacklo_epi8(d2, _mm_setzero_si128());
__m128i d2h = _mm_unpackhi_epi8(d2, _mm_setzero_si128());
__m128i d3l = _mm_unpacklo_epi8(d3, _mm_setzero_si128());
__m128i d3h = _mm_unpackhi_epi8(d3, _mm_setzero_si128());
__m128i sum0 = _mm_add_epi16(d0l, d1l);
__m128i sum1 = _mm_add_epi16(d0h, d1h);
__m128i sum2 = _mm_add_epi16(d2l, d3l);
__m128i sum3 = _mm_add_epi16(d2h, d3h);
__m128i sum0l = _mm_unpacklo_epi16(sum0, _mm_setzero_si128());
__m128i sum0h = _mm_unpackhi_epi16(sum0, _mm_setzero_si128());
__m128i sum1l = _mm_unpacklo_epi16(sum1, _mm_setzero_si128());
__m128i sum1h = _mm_unpackhi_epi16(sum1, _mm_setzero_si128());
__m128i sum2l = _mm_unpacklo_epi16(sum2, _mm_setzero_si128());
__m128i sum2h = _mm_unpackhi_epi16(sum2, _mm_setzero_si128());
__m128i sum3l = _mm_unpacklo_epi16(sum3, _mm_setzero_si128());
__m128i sum3h = _mm_unpackhi_epi16(sum3, _mm_setzero_si128());
__m128i b0 = _mm_add_epi32(sum0l, sum0h);
__m128i b1 = _mm_add_epi32(sum1l, sum1h);
__m128i b2 = _mm_add_epi32(sum2l, sum2h);
__m128i b3 = _mm_add_epi32(sum3l, sum3h);
__m128i a0 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b2, b3), _mm_set1_epi32(4)), 3);
__m128i a1 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b0, b1), _mm_set1_epi32(4)), 3);
__m128i a2 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b1, b3), _mm_set1_epi32(4)), 3);
__m128i a3 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b0, b2), _mm_set1_epi32(4)), 3);
_mm_storeu_si128((__m128i*)&a[0], _mm_packus_epi32(_mm_shuffle_epi32(a0, _MM_SHUFFLE(3, 0, 1, 2)), _mm_shuffle_epi32(a1, _MM_SHUFFLE(3, 0, 1, 2))));
_mm_storeu_si128((__m128i*)&a[2], _mm_packus_epi32(_mm_shuffle_epi32(a2, _MM_SHUFFLE(3, 0, 1, 2)), _mm_shuffle_epi32(a3, _MM_SHUFFLE(3, 0, 1, 2))));
#else
uint32 r[4];
uint32 g[4];
uint32 b[4];
memset(r, 0, sizeof(r));
memset(g, 0, sizeof(g));
memset(b, 0, sizeof(b));
for( int j=0; j<4; j++ )
{
for( int i=0; i<4; i++ )
{
int index = (j & 2) + (i >> 1);
b[index] += *data++;
g[index] += *data++;
r[index] += *data++;
data++;
}
}
a[0][0] = uint16( (r[2] + r[3] + 4) / 8 );
a[0][1] = uint16( (g[2] + g[3] + 4) / 8 );
a[0][2] = uint16( (b[2] + b[3] + 4) / 8 );
a[0][3] = 0;
a[1][0] = uint16( (r[0] + r[1] + 4) / 8 );
a[1][1] = uint16( (g[0] + g[1] + 4) / 8 );
a[1][2] = uint16( (b[0] + b[1] + 4) / 8 );
a[1][3] = 0;
a[2][0] = uint16( (r[1] + r[3] + 4) / 8 );
a[2][1] = uint16( (g[1] + g[3] + 4) / 8 );
a[2][2] = uint16( (b[1] + b[3] + 4) / 8 );
a[2][3] = 0;
a[3][0] = uint16( (r[0] + r[2] + 4) / 8 );
a[3][1] = uint16( (g[0] + g[2] + 4) / 8 );
a[3][2] = uint16( (b[0] + b[2] + 4) / 8 );
a[3][3] = 0;
#endif
}
void CalcErrorBlock( const uint8* data, uint err[4][4] )
{
#ifdef __SSE4_1__
__m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0);
__m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1);
__m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2);
__m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3);
__m128i dm0 = _mm_and_si128(d0, _mm_set1_epi32(0x00FFFFFF));
__m128i dm1 = _mm_and_si128(d1, _mm_set1_epi32(0x00FFFFFF));
__m128i dm2 = _mm_and_si128(d2, _mm_set1_epi32(0x00FFFFFF));
__m128i dm3 = _mm_and_si128(d3, _mm_set1_epi32(0x00FFFFFF));
__m128i d0l = _mm_unpacklo_epi8(dm0, _mm_setzero_si128());
__m128i d0h = _mm_unpackhi_epi8(dm0, _mm_setzero_si128());
__m128i d1l = _mm_unpacklo_epi8(dm1, _mm_setzero_si128());
__m128i d1h = _mm_unpackhi_epi8(dm1, _mm_setzero_si128());
__m128i d2l = _mm_unpacklo_epi8(dm2, _mm_setzero_si128());
__m128i d2h = _mm_unpackhi_epi8(dm2, _mm_setzero_si128());
__m128i d3l = _mm_unpacklo_epi8(dm3, _mm_setzero_si128());
__m128i d3h = _mm_unpackhi_epi8(dm3, _mm_setzero_si128());
__m128i sum0 = _mm_add_epi16(d0l, d1l);
__m128i sum1 = _mm_add_epi16(d0h, d1h);
__m128i sum2 = _mm_add_epi16(d2l, d3l);
__m128i sum3 = _mm_add_epi16(d2h, d3h);
__m128i sum0l = _mm_unpacklo_epi16(sum0, _mm_setzero_si128());
__m128i sum0h = _mm_unpackhi_epi16(sum0, _mm_setzero_si128());
__m128i sum1l = _mm_unpacklo_epi16(sum1, _mm_setzero_si128());
__m128i sum1h = _mm_unpackhi_epi16(sum1, _mm_setzero_si128());
__m128i sum2l = _mm_unpacklo_epi16(sum2, _mm_setzero_si128());
__m128i sum2h = _mm_unpackhi_epi16(sum2, _mm_setzero_si128());
__m128i sum3l = _mm_unpacklo_epi16(sum3, _mm_setzero_si128());
__m128i sum3h = _mm_unpackhi_epi16(sum3, _mm_setzero_si128());
__m128i b0 = _mm_add_epi32(sum0l, sum0h);
__m128i b1 = _mm_add_epi32(sum1l, sum1h);
__m128i b2 = _mm_add_epi32(sum2l, sum2h);
__m128i b3 = _mm_add_epi32(sum3l, sum3h);
__m128i a0 = _mm_add_epi32(b2, b3);
__m128i a1 = _mm_add_epi32(b0, b1);
__m128i a2 = _mm_add_epi32(b1, b3);
__m128i a3 = _mm_add_epi32(b0, b2);
_mm_storeu_si128((__m128i*)&err[0], a0);
_mm_storeu_si128((__m128i*)&err[1], a1);
_mm_storeu_si128((__m128i*)&err[2], a2);
_mm_storeu_si128((__m128i*)&err[3], a3);
#else
uint terr[4][4];
memset(terr, 0, 16 * sizeof(uint));
for( int j=0; j<4; j++ )
{
for( int i=0; i<4; i++ )
{
int index = (j & 2) + (i >> 1);
uint d = *data++;
terr[index][0] += d;
d = *data++;
terr[index][1] += d;
d = *data++;
terr[index][2] += d;
data++;
}
}
for( int i=0; i<3; i++ )
{
err[0][i] = terr[2][i] + terr[3][i];
err[1][i] = terr[0][i] + terr[1][i];
err[2][i] = terr[1][i] + terr[3][i];
err[3][i] = terr[0][i] + terr[2][i];
}
for( int i=0; i<4; i++ )
{
err[i][3] = 0;
}
#endif
}
uint CalcError( const uint block[4], const v4i& average )
{
uint err = 0x3FFFFFFF; // Big value to prevent negative values, but small enough to prevent overflow
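// Note (annotation): up to terms that are identical for every candidate average
// (the summed squared pixel values, folded into the 0x3FFFFFFF offset), this
// accumulates sum((pixel - average)^2) over the 8 pixels and 3 channels: each
// block[] entry is a per-channel sum over 8 pixels, giving the
// -2*avg*sum(p) + 8*avg^2 terms below.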
err -= block[0] * 2 * average[2];
err -= block[1] * 2 * average[1];
err -= block[2] * 2 * average[0];
err += 8 * ( sq( average[0] ) + sq( average[1] ) + sq( average[2] ) );
return err;
}
void ProcessAverages( v4i* a )
{
#ifdef __SSE4_1__
for( int i=0; i<2; i++ )
{
__m128i d = _mm_loadu_si128((__m128i*)a[i*2]);
__m128i t = _mm_add_epi16(_mm_mullo_epi16(d, _mm_set1_epi16(31)), _mm_set1_epi16(128));
__m128i c = _mm_srli_epi16(_mm_add_epi16(t, _mm_srli_epi16(t, 8)), 8);
__m128i c1 = _mm_shuffle_epi32(c, _MM_SHUFFLE(3, 2, 3, 2));
__m128i diff = _mm_sub_epi16(c, c1);
diff = _mm_max_epi16(diff, _mm_set1_epi16(-4));
diff = _mm_min_epi16(diff, _mm_set1_epi16(3));
__m128i co = _mm_add_epi16(c1, diff);
c = _mm_blend_epi16(co, c, 0xF0);
__m128i a0 = _mm_or_si128(_mm_slli_epi16(c, 3), _mm_srli_epi16(c, 2));
_mm_storeu_si128((__m128i*)a[4+i*2], a0);
}
for( int i=0; i<2; i++ )
{
__m128i d = _mm_loadu_si128((__m128i*)a[i*2]);
__m128i t0 = _mm_add_epi16(_mm_mullo_epi16(d, _mm_set1_epi16(15)), _mm_set1_epi16(128));
__m128i t1 = _mm_srli_epi16(_mm_add_epi16(t0, _mm_srli_epi16(t0, 8)), 8);
__m128i t2 = _mm_or_si128(t1, _mm_slli_epi16(t1, 4));
_mm_storeu_si128((__m128i*)a[i*2], t2);
}
#else
for( int i=0; i<2; i++ )
{
for( int j=0; j<3; j++ )
{
int32 c1 = mul8bit( a[i*2+1][j], 31 );
int32 c2 = mul8bit( a[i*2][j], 31 );
int32 diff = c2 - c1;
if( diff > 3 ) diff = 3;
else if( diff < -4 ) diff = -4;
int32 co = c1 + diff;
a[5+i*2][j] = ( c1 << 3 ) | ( c1 >> 2 );
a[4+i*2][j] = ( co << 3 ) | ( co >> 2 );
}
}
for( int i=0; i<4; i++ )
{
a[i][0] = g_avg2[mul8bit( a[i][0], 15 )];
a[i][1] = g_avg2[mul8bit( a[i][1], 15 )];
a[i][2] = g_avg2[mul8bit( a[i][2], 15 )];
}
#endif
}
void EncodeAverages( uint64& _d, const v4i* a, size_t idx )
{
uint64 d = _d;
d |= ( idx << 24 );
size_t base = idx << 1;
if( ( idx & 0x2 ) == 0 )
{
for( int i=0; i<3; i++ )
{
d |= uint64( a[base+0][i] >> 4 ) << ( i*8 );
d |= uint64( a[base+1][i] >> 4 ) << ( i*8 + 4 );
}
}
else
{
for( int i=0; i<3; i++ )
{
d |= uint64( a[base+1][i] & 0xF8 ) << ( i*8 );
int32 c = ( ( a[base+0][i] & 0xF8 ) - ( a[base+1][i] & 0xF8 ) ) >> 3;
c &= ~0xFFFFFFF8;
d |= ((uint64)c) << ( i*8 );
}
}
_d = d;
}
uint64 CheckSolid( const uint8* src )
{
#ifdef __SSE4_1__
__m128i d0 = _mm_loadu_si128(((__m128i*)src) + 0);
__m128i d1 = _mm_loadu_si128(((__m128i*)src) + 1);
__m128i d2 = _mm_loadu_si128(((__m128i*)src) + 2);
__m128i d3 = _mm_loadu_si128(((__m128i*)src) + 3);
__m128i c = _mm_shuffle_epi32(d0, _MM_SHUFFLE(0, 0, 0, 0));
__m128i c0 = _mm_cmpeq_epi8(d0, c);
__m128i c1 = _mm_cmpeq_epi8(d1, c);
__m128i c2 = _mm_cmpeq_epi8(d2, c);
__m128i c3 = _mm_cmpeq_epi8(d3, c);
__m128i m0 = _mm_and_si128(c0, c1);
__m128i m1 = _mm_and_si128(c2, c3);
__m128i m = _mm_and_si128(m0, m1);
if (!_mm_testc_si128(m, _mm_set1_epi32(-1)))
{
return 0;
}
#else
const uint8* ptr = src + 4;
for( int i=1; i<16; i++ )
{
if( memcmp( src, ptr, 4 ) != 0 )
{
return 0;
}
ptr += 4;
}
#endif
return 0x02000000 |
( uint( src[0] & 0xF8 ) << 16 ) |
( uint( src[1] & 0xF8 ) << 8 ) |
( uint( src[2] & 0xF8 ) );
}
void PrepareAverages( v4i a[8], const uint8* src, uint err[4] )
{
Average( src, a );
ProcessAverages( a );
uint errblock[4][4];
CalcErrorBlock( src, errblock );
for( int i=0; i<4; i++ )
{
err[i/2] += CalcError( errblock[i], a[i] );
err[2+i/2] += CalcError( errblock[i], a[i+4] );
}
}
void FindBestFit( uint64 terr[2][8], uint16 tsel[16][8], v4i a[8], const uint32* id, const uint8* data )
{
for( size_t i=0; i<16; i++ )
{
uint16* sel = tsel[i];
uint bid = id[i];
uint64* ter = terr[bid%2];
uint8 b = *data++;
uint8 g = *data++;
uint8 r = *data++;
data++;
int dr = a[bid][0] - r;
int dg = a[bid][1] - g;
int db = a[bid][2] - b;
#ifdef __SSE4_1__
// Reference implementation
__m128i pix = _mm_set1_epi32(dr * 77 + dg * 151 + db * 28);
// Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
__m128i error0 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[0]));
__m128i error1 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[1]));
__m128i error2 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[0]));
__m128i error3 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[1]));
__m128i index0 = _mm_and_si128(_mm_cmplt_epi32(error1, error0), _mm_set1_epi32(1));
__m128i minError0 = _mm_min_epi32(error0, error1);
__m128i index1 = _mm_sub_epi32(_mm_set1_epi32(2), _mm_cmplt_epi32(error3, error2));
__m128i minError1 = _mm_min_epi32(error2, error3);
__m128i minIndex0 = _mm_blendv_epi8(index0, index1, _mm_cmplt_epi32(minError1, minError0));
__m128i minError = _mm_min_epi32(minError0, minError1);
// Squaring the minimum error to produce correct values when adding
__m128i minErrorLow = _mm_shuffle_epi32(minError, _MM_SHUFFLE(1, 1, 0, 0));
__m128i squareErrorLow = _mm_mul_epi32(minErrorLow, minErrorLow);
squareErrorLow = _mm_add_epi64(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 0));
_mm_storeu_si128(((__m128i*)ter) + 0, squareErrorLow);
__m128i minErrorHigh = _mm_shuffle_epi32(minError, _MM_SHUFFLE(3, 3, 2, 2));
__m128i squareErrorHigh = _mm_mul_epi32(minErrorHigh, minErrorHigh);
squareErrorHigh = _mm_add_epi64(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 1));
_mm_storeu_si128(((__m128i*)ter) + 1, squareErrorHigh);
// Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
error0 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[2]));
error1 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[3]));
error2 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[2]));
error3 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[3]));
index0 = _mm_and_si128(_mm_cmplt_epi32(error1, error0), _mm_set1_epi32(1));
minError0 = _mm_min_epi32(error0, error1);
index1 = _mm_sub_epi32(_mm_set1_epi32(2), _mm_cmplt_epi32(error3, error2));
minError1 = _mm_min_epi32(error2, error3);
__m128i minIndex1 = _mm_blendv_epi8(index0, index1, _mm_cmplt_epi32(minError1, minError0));
minError = _mm_min_epi32(minError0, minError1);
// Squaring the minimum error to produce correct values when adding
minErrorLow = _mm_shuffle_epi32(minError, _MM_SHUFFLE(1, 1, 0, 0));
squareErrorLow = _mm_mul_epi32(minErrorLow, minErrorLow);
squareErrorLow = _mm_add_epi64(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 2));
_mm_storeu_si128(((__m128i*)ter) + 2, squareErrorLow);
minErrorHigh = _mm_shuffle_epi32(minError, _MM_SHUFFLE(3, 3, 2, 2));
squareErrorHigh = _mm_mul_epi32(minErrorHigh, minErrorHigh);
squareErrorHigh = _mm_add_epi64(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 3));
_mm_storeu_si128(((__m128i*)ter) + 3, squareErrorHigh);
__m128i minIndex = _mm_packs_epi32(minIndex0, minIndex1);
_mm_storeu_si128((__m128i*)sel, minIndex);
#else
int pix = dr * 77 + dg * 151 + db * 28;
for( int t=0; t<8; t++ )
{
const int64* tab = g_table256[t];
uint idx = 0;
uint64 err = sq( tab[0] + pix );
for( int j=1; j<4; j++ )
{
uint64 local = sq( tab[j] + pix );
if( local < err )
{
err = local;
idx = j;
}
}
*sel++ = idx;
*ter++ += err;
}
#endif
}
}
#ifdef __SSE4_1__
// Non-reference implementation, but faster. Produces the same results as the AVX2 version
void FindBestFit( uint32 terr[2][8], uint16 tsel[16][8], v4i a[8], const uint32* id, const uint8* data )
{
for( size_t i=0; i<16; i++ )
{
uint16* sel = tsel[i];
uint bid = id[i];
uint32* ter = terr[bid%2];
uint8 b = *data++;
uint8 g = *data++;
uint8 r = *data++;
data++;
int dr = a[bid][0] - r;
int dg = a[bid][1] - g;
int db = a[bid][2] - b;
// The scaling values are divided by two and rounded, to allow the differences to be in the range of signed int16
// This produces slightly different results, but is significantly faster
__m128i pixel = _mm_set1_epi16(dr * 38 + dg * 76 + db * 14);
__m128i pix = _mm_abs_epi16(pixel);
// Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
// Since the selector table is symmetrical, we need to calculate the difference only for half of the entries.
__m128i error0 = _mm_abs_epi16(_mm_sub_epi16(pix, g_table128_SIMD[0]));
__m128i error1 = _mm_abs_epi16(_mm_sub_epi16(pix, g_table128_SIMD[1]));
__m128i index = _mm_and_si128(_mm_cmplt_epi16(error1, error0), _mm_set1_epi16(1));
__m128i minError = _mm_min_epi16(error0, error1);
// Exploit the symmetry of the selector table and use the sign bit
// This produces slightly different results, but is needed to produce the same results as the AVX2 implementation
__m128i indexBit = _mm_andnot_si128(_mm_srli_epi16(pixel, 15), _mm_set1_epi8(-1));
__m128i minIndex = _mm_or_si128(index, _mm_add_epi16(indexBit, indexBit));
// Squaring the minimum error to produce correct values when adding
__m128i squareErrorLo = _mm_mullo_epi16(minError, minError);
__m128i squareErrorHi = _mm_mulhi_epi16(minError, minError);
__m128i squareErrorLow = _mm_unpacklo_epi16(squareErrorLo, squareErrorHi);
__m128i squareErrorHigh = _mm_unpackhi_epi16(squareErrorLo, squareErrorHi);
squareErrorLow = _mm_add_epi32(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 0));
_mm_storeu_si128(((__m128i*)ter) + 0, squareErrorLow);
squareErrorHigh = _mm_add_epi32(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 1));
_mm_storeu_si128(((__m128i*)ter) + 1, squareErrorHigh);
_mm_storeu_si128((__m128i*)sel, minIndex);
}
}
#endif
uint8_t convert6(float f)
{
int i = (std::min(std::max(static_cast<int>(f), 0), 1023) - 15) >> 1;
return (i + 11 - ((i + 11) >> 7) - ((i + 4) >> 7)) >> 3;
}
uint8_t convert7(float f)
{
int i = (std::min(std::max(static_cast<int>(f), 0), 1023) - 15) >> 1;
return (i + 9 - ((i + 9) >> 8) - ((i + 6) >> 8)) >> 2;
}
std::pair<uint64, uint64> Planar(const uint8* src)
{
int32 r = 0;
int32 g = 0;
int32 b = 0;
for (int i = 0; i < 16; ++i)
{
b += src[i * 4 + 0];
g += src[i * 4 + 1];
r += src[i * 4 + 2];
}
int32 difRyz = 0;
int32 difGyz = 0;
int32 difByz = 0;
int32 difRxz = 0;
int32 difGxz = 0;
int32 difBxz = 0;
const int32 scaling[] = { -255, -85, 85, 255 };
for (int i = 0; i < 16; ++i)
{
int32 difB = (static_cast<int>(src[i * 4 + 0]) << 4) - b;
int32 difG = (static_cast<int>(src[i * 4 + 1]) << 4) - g;
int32 difR = (static_cast<int>(src[i * 4 + 2]) << 4) - r;
difRyz += difR * scaling[i % 4];
difGyz += difG * scaling[i % 4];
difByz += difB * scaling[i % 4];
difRxz += difR * scaling[i / 4];
difGxz += difG * scaling[i / 4];
difBxz += difB * scaling[i / 4];
}
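// Note (annotation): scaling[] holds the centered coordinates of the 4x4 grid,
// so the dif*yz and dif*xz sums are per-channel covariances of color with the
// two block axes; combined with the channel sums r/g/b they amount to a
// least-squares plane fit RGB(x, y) = a*x + b*y + d over the block.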
const float scale = -4.0f / ((255 * 255 * 8.0f + 85 * 85 * 8.0f) * 16.0f);
float aR = difRxz * scale;
float aG = difGxz * scale;
float aB = difBxz * scale;
float bR = difRyz * scale;
float bG = difGyz * scale;
float bB = difByz * scale;
float dR = r * (4.0f / 16.0f);
float dG = g * (4.0f / 16.0f);
float dB = b * (4.0f / 16.0f);
// Calculate the three colors RGBO (origin), RGBH (horizontal), and RGBV (vertical). RGB = df - af * x - bf * y;
float cofR = (aR * 255.0f + (bR * 255.0f + dR));
float cofG = (aG * 255.0f + (bG * 255.0f + dG));
float cofB = (aB * 255.0f + (bB * 255.0f + dB));
float chfR = (aR * -425.0f + (bR * 255.0f + dR));
float chfG = (aG * -425.0f + (bG * 255.0f + dG));
float chfB = (aB * -425.0f + (bB * 255.0f + dB));
float cvfR = (aR * 255.0f + (bR * -425.0f + dR));
float cvfG = (aG * 255.0f + (bG * -425.0f + dG));
float cvfB = (aB * 255.0f + (bB * -425.0f + dB));
// convert to r6g7b6
int32 coR = convert6(cofR);
int32 coG = convert7(cofG);
int32 coB = convert6(cofB);
int32 chR = convert6(chfR);
int32 chG = convert7(chfG);
int32 chB = convert6(chfB);
int32 cvR = convert6(cvfR);
int32 cvG = convert7(cvfG);
int32 cvB = convert6(cvfB);
// Error calculation
int32 ro0 = coR;
int32 go0 = coG;
int32 bo0 = coB;
int32 ro1 = (ro0 >> 4) | (ro0 << 2);
int32 go1 = (go0 >> 6) | (go0 << 1);
int32 bo1 = (bo0 >> 4) | (bo0 << 2);
int32 ro2 = (ro1 << 2) + 2;
int32 go2 = (go1 << 2) + 2;
int32 bo2 = (bo1 << 2) + 2;
int32 rh0 = chR;
int32 gh0 = chG;
int32 bh0 = chB;
int32 rh1 = (rh0 >> 4) | (rh0 << 2);
int32 gh1 = (gh0 >> 6) | (gh0 << 1);
int32 bh1 = (bh0 >> 4) | (bh0 << 2);
int32 rh2 = rh1 - ro1;
int32 gh2 = gh1 - go1;
int32 bh2 = bh1 - bo1;
int32 rv0 = cvR;
int32 gv0 = cvG;
int32 bv0 = cvB;
int32 rv1 = (rv0 >> 4) | (rv0 << 2);
int32 gv1 = (gv0 >> 6) | (gv0 << 1);
int32 bv1 = (bv0 >> 4) | (bv0 << 2);
int32 rv2 = rv1 - ro1;
int32 gv2 = gv1 - go1;
int32 bv2 = bv1 - bo1;
uint64 error = 0;
for (int i = 0; i < 16; ++i)
{
int32 cR = clampu8((rh2 * (i / 4) + rv2 * (i % 4) + ro2) >> 2);
int32 cG = clampu8((gh2 * (i / 4) + gv2 * (i % 4) + go2) >> 2);
int32 cB = clampu8((bh2 * (i / 4) + bv2 * (i % 4) + bo2) >> 2);
int32 difB = static_cast<int>(src[i * 4 + 0]) - cB;
int32 difG = static_cast<int>(src[i * 4 + 1]) - cG;
int32 difR = static_cast<int>(src[i * 4 + 2]) - cR;
int32 dif = difR * 38 + difG * 76 + difB * 14;
error += dif * dif;
}
/**/
uint32 rgbv = cvB | (cvG << 6) | (cvR << 13);
uint32 rgbh = chB | (chG << 6) | (chR << 13);
uint32 hi = rgbv | ((rgbh & 0x1FFF) << 19);
uint32 lo = (chR & 0x1) | 0x2 | ((chR << 1) & 0x7C);
lo |= ((coB & 0x07) << 7) | ((coB & 0x18) << 8) | ((coB & 0x20) << 11);
lo |= ((coG & 0x3F) << 17) | ((coG & 0x40) << 18);
lo |= coR << 25;
const int32 idx = (coR & 0x20) | ((coG & 0x20) >> 1) | ((coB & 0x1E) >> 1);
lo |= g_flags[idx];
uint64 result = static_cast<uint32>(bx::endianSwap(lo));
result |= static_cast<uint64>(static_cast<uint32>(bx::endianSwap(hi))) << 32;
return std::make_pair(result, error);
}
template<class T, class S>
uint64 EncodeSelectors( uint64 d, const T terr[2][8], const S tsel[16][8], const uint32* id, const uint64 value, const uint64 error)
{
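// Note (annotation): for each half of the block pick the modifier-table index
// with the least accumulated error; if their combined error is no better than
// that of the candidate already encoded in 'value' (the planar block in
// ProcessRGB_ETC2), keep that candidate instead.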
size_t tidx[2];
tidx[0] = GetLeastError( terr[0], 8 );
tidx[1] = GetLeastError( terr[1], 8 );
if ((terr[0][tidx[0]] + terr[1][tidx[1]]) >= error)
{
return value;
}
d |= tidx[0] << 26;
d |= tidx[1] << 29;
for( int i=0; i<16; i++ )
{
uint64 t = tsel[i][tidx[id[i]%2]];
d |= ( t & 0x1 ) << ( i + 32 );
d |= ( t & 0x2 ) << ( i + 47 );
}
return FixByteOrder(d);
}
}
uint64 ProcessRGB( const uint8* src )
{
uint64 d = CheckSolid( src );
if( d != 0 ) return d;
v4i a[8];
uint err[4] = {};
PrepareAverages( a, src, err );
size_t idx = GetLeastError( err, 4 );
EncodeAverages( d, a, idx );
#if defined __SSE4_1__ && !defined REFERENCE_IMPLEMENTATION
uint32 terr[2][8] = {};
#else
uint64 terr[2][8] = {};
#endif
uint16 tsel[16][8];
const uint32* id = g_id[idx];
FindBestFit( terr, tsel, a, id, src );
return FixByteOrder( EncodeSelectors( d, terr, tsel, id ) );
}
uint64 ProcessRGB_ETC2( const uint8* src )
{
std::pair<uint64, uint64> result = Planar( src );
uint64 d = 0;
v4i a[8];
uint err[4] = {};
PrepareAverages( a, src, err );
size_t idx = GetLeastError( err, 4 );
EncodeAverages( d, a, idx );
uint64 terr[2][8] = {};
uint16 tsel[16][8];
const uint32* id = g_id[idx];
FindBestFit( terr, tsel, a, id, src );
return EncodeSelectors( d, terr, tsel, id, result.first, result.second );
}
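For orientation, a minimal caller sketch (illustrative only; FillBlockBGRA is a hypothetical helper): both entry points consume one 4x4 block of 8-bit BGRA pixels, 64 bytes row-major with the blue byte first and alpha ignored, exactly as read by CheckSolid and Average above, and return the packed 64-bit ETC block.
uint8 block[4*4*4];
FillBlockBGRA( block );                  // hypothetical: write 16 BGRA pixels
uint64 etc1 = ProcessRGB( block );       // ETC1 individual/differential modes
uint64 etc2 = ProcessRGB_ETC2( block );  // additionally tries the ETC2 planar mode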

9
3rdparty/etc2/ProcessRGB.hpp vendored Normal file
View File

@@ -0,0 +1,9 @@
#ifndef __PROCESSRGB_HPP__
#define __PROCESSRGB_HPP__
#include "Types.hpp"
uint64 ProcessRGB( const uint8* src );
uint64 ProcessRGB_ETC2( const uint8* src );
#endif

109
3rdparty/etc2/Tables.cpp vendored Normal file
View File

@@ -0,0 +1,109 @@
#include "Tables.hpp"
const int32 g_table[8][4] = {
{ 2, 8, -2, -8 },
{ 5, 17, -5, -17 },
{ 9, 29, -9, -29 },
{ 13, 42, -13, -42 },
{ 18, 60, -18, -60 },
{ 24, 80, -24, -80 },
{ 33, 106, -33, -106 },
{ 47, 183, -47, -183 }
};
const int64 g_table256[8][4] = {
{ 2*256, 8*256, -2*256, -8*256 },
{ 5*256, 17*256, -5*256, -17*256 },
{ 9*256, 29*256, -9*256, -29*256 },
{ 13*256, 42*256, -13*256, -42*256 },
{ 18*256, 60*256, -18*256, -60*256 },
{ 24*256, 80*256, -24*256, -80*256 },
{ 33*256, 106*256, -33*256, -106*256 },
{ 47*256, 183*256, -47*256, -183*256 }
};
const uint32 g_id[4][16] = {
{ 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 },
{ 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2 },
{ 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4 },
{ 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6 }
};
const uint32 g_avg2[16] = {
0x00,
0x11,
0x22,
0x33,
0x44,
0x55,
0x66,
0x77,
0x88,
0x99,
0xAA,
0xBB,
0xCC,
0xDD,
0xEE,
0xFF
};
const uint32 g_flags[64] = {
0x80800402, 0x80800402, 0x80800402, 0x80800402,
0x80800402, 0x80800402, 0x80800402, 0x8080E002,
0x80800402, 0x80800402, 0x8080E002, 0x8080E002,
0x80800402, 0x8080E002, 0x8080E002, 0x8080E002,
0x80000402, 0x80000402, 0x80000402, 0x80000402,
0x80000402, 0x80000402, 0x80000402, 0x8000E002,
0x80000402, 0x80000402, 0x8000E002, 0x8000E002,
0x80000402, 0x8000E002, 0x8000E002, 0x8000E002,
0x00800402, 0x00800402, 0x00800402, 0x00800402,
0x00800402, 0x00800402, 0x00800402, 0x0080E002,
0x00800402, 0x00800402, 0x0080E002, 0x0080E002,
0x00800402, 0x0080E002, 0x0080E002, 0x0080E002,
0x00000402, 0x00000402, 0x00000402, 0x00000402,
0x00000402, 0x00000402, 0x00000402, 0x0000E002,
0x00000402, 0x00000402, 0x0000E002, 0x0000E002,
0x00000402, 0x0000E002, 0x0000E002, 0x0000E002
};
#ifdef __SSE4_1__
const uint8 g_flags_AVX2[64] =
{
0x63, 0x63, 0x63, 0x63,
0x63, 0x63, 0x63, 0x7D,
0x63, 0x63, 0x7D, 0x7D,
0x63, 0x7D, 0x7D, 0x7D,
0x43, 0x43, 0x43, 0x43,
0x43, 0x43, 0x43, 0x5D,
0x43, 0x43, 0x5D, 0x5D,
0x43, 0x5D, 0x5D, 0x5D,
0x23, 0x23, 0x23, 0x23,
0x23, 0x23, 0x23, 0x3D,
0x23, 0x23, 0x3D, 0x3D,
0x23, 0x3D, 0x3D, 0x3D,
0x03, 0x03, 0x03, 0x03,
0x03, 0x03, 0x03, 0x1D,
0x03, 0x03, 0x1D, 0x1D,
0x03, 0x1D, 0x1D, 0x1D,
};
const __m128i g_table_SIMD[2] =
{
_mm_setr_epi16( 2, 5, 9, 13, 18, 24, 33, 47),
_mm_setr_epi16( 8, 17, 29, 42, 60, 80, 106, 183)
};
const __m128i g_table128_SIMD[2] =
{
_mm_setr_epi16( 2*128, 5*128, 9*128, 13*128, 18*128, 24*128, 33*128, 47*128),
_mm_setr_epi16( 8*128, 17*128, 29*128, 42*128, 60*128, 80*128, 106*128, 183*128)
};
const __m128i g_table256_SIMD[4] =
{
_mm_setr_epi32( 2*256, 5*256, 9*256, 13*256),
_mm_setr_epi32( 8*256, 17*256, 29*256, 42*256),
_mm_setr_epi32( 18*256, 24*256, 33*256, 47*256),
_mm_setr_epi32( 60*256, 80*256, 106*256, 183*256)
};
#endif

25
3rdparty/etc2/Tables.hpp vendored Normal file
View File

@@ -0,0 +1,25 @@
#ifndef __TABLES_HPP__
#define __TABLES_HPP__
#include "Types.hpp"
#ifdef __SSE4_1__
#include <smmintrin.h>
#endif
extern const int32 g_table[8][4];
extern const int64 g_table256[8][4];
extern const uint32 g_id[4][16];
extern const uint32 g_avg2[16];
extern const uint32 g_flags[64];
#ifdef __SSE4_1__
extern const uint8 g_flags_AVX2[64];
extern const __m128i g_table_SIMD[2];
extern const __m128i g_table128_SIMD[2];
extern const __m128i g_table256_SIMD[4];
#endif
#endif

17
3rdparty/etc2/Types.hpp vendored Normal file
View File

@@ -0,0 +1,17 @@
#ifndef __DARKRL__TYPES_HPP__
#define __DARKRL__TYPES_HPP__
#include <stdint.h>
typedef int8_t int8;
typedef uint8_t uint8;
typedef int16_t int16;
typedef uint16_t uint16;
typedef int32_t int32;
typedef uint32_t uint32;
typedef int64_t int64;
typedef uint64_t uint64;
typedef unsigned int uint;
#endif

222
3rdparty/etc2/Vector.hpp vendored Normal file
View File

@@ -0,0 +1,222 @@
#ifndef __DARKRL__VECTOR_HPP__
#define __DARKRL__VECTOR_HPP__
#include <assert.h>
#include <algorithm>
#include <math.h>
#include "Math.hpp"
#include "Types.hpp"
template<class T>
struct Vector2
{
Vector2() : x( 0 ), y( 0 ) {}
Vector2( T v ) : x( v ), y( v ) {}
Vector2( T _x, T _y ) : x( _x ), y( _y ) {}
bool operator==( const Vector2<T>& rhs ) const { return x == rhs.x && y == rhs.y; }
bool operator!=( const Vector2<T>& rhs ) const { return !( *this == rhs ); }
Vector2<T>& operator+=( const Vector2<T>& rhs )
{
x += rhs.x;
y += rhs.y;
return *this;
}
Vector2<T>& operator-=( const Vector2<T>& rhs )
{
x -= rhs.x;
y -= rhs.y;
return *this;
}
Vector2<T>& operator*=( const Vector2<T>& rhs )
{
x *= rhs.x;
y *= rhs.y;
return *this;
}
T x, y;
};
template<class T>
Vector2<T> operator+( const Vector2<T>& lhs, const Vector2<T>& rhs )
{
return Vector2<T>( lhs.x + rhs.x, lhs.y + rhs.y );
}
template<class T>
Vector2<T> operator-( const Vector2<T>& lhs, const Vector2<T>& rhs )
{
return Vector2<T>( lhs.x - rhs.x, lhs.y - rhs.y );
}
template<class T>
Vector2<T> operator*( const Vector2<T>& lhs, const float& rhs )
{
return Vector2<T>( lhs.x * rhs, lhs.y * rhs );
}
template<class T>
Vector2<T> operator/( const Vector2<T>& lhs, const T& rhs )
{
return Vector2<T>( lhs.x / rhs, lhs.y / rhs );
}
typedef Vector2<int32> v2i;
typedef Vector2<float> v2f;
template<class T>
struct Vector3
{
Vector3() : x( 0 ), y( 0 ), z( 0 ) {}
Vector3( T v ) : x( v ), y( v ), z( v ) {}
Vector3( T _x, T _y, T _z ) : x( _x ), y( _y ), z( _z ) {}
template<class Y>
Vector3( const Vector3<Y>& v ) : x( T( v.x ) ), y( T( v.y ) ), z( T( v.z ) ) {}
T Luminance() const { return T( x * 0.3f + y * 0.59f + z * 0.11f ); }
void Clamp()
{
x = std::min( T(1), std::max( T(0), x ) );
y = std::min( T(1), std::max( T(0), y ) );
z = std::min( T(1), std::max( T(0), z ) );
}
bool operator==( const Vector3<T>& rhs ) const { return x == rhs.x && y == rhs.y && z == rhs.z; }
bool operator!=( const Vector2<T>& rhs ) const { return !( *this == rhs ); }
T& operator[]( uint idx ) { assert( idx < 3 ); return ((T*)this)[idx]; }
const T& operator[]( uint idx ) const { assert( idx < 3 ); return ((T*)this)[idx]; }
Vector3<T> operator+=( const Vector3<T>& rhs )
{
x += rhs.x;
y += rhs.y;
z += rhs.z;
return *this;
}
Vector3<T> operator*=( const Vector3<T>& rhs )
{
x *= rhs.x;
y *= rhs.y;
z *= rhs.z;
return *this;
}
Vector3<T> operator*=( const float& rhs )
{
x *= rhs;
y *= rhs;
z *= rhs;
return *this;
}
T x, y, z;
T padding;
};
template<class T>
Vector3<T> operator+( const Vector3<T>& lhs, const Vector3<T>& rhs )
{
return Vector3<T>( lhs.x + rhs.x, lhs.y + rhs.y, lhs.z + rhs.z );
}
template<class T>
Vector3<T> operator-( const Vector3<T>& lhs, const Vector3<T>& rhs )
{
return Vector3<T>( lhs.x - rhs.x, lhs.y - rhs.y, lhs.z - rhs.z );
}
template<class T>
Vector3<T> operator*( const Vector3<T>& lhs, const Vector3<T>& rhs )
{
return Vector3<T>( lhs.x * rhs.x, lhs.y * rhs.y, lhs.z * rhs.z );
}
template<class T>
Vector3<T> operator*( const Vector3<T>& lhs, const float& rhs )
{
return Vector3<T>( T( lhs.x * rhs ), T( lhs.y * rhs ), T( lhs.z * rhs ) );
}
template<class T>
Vector3<T> operator/( const Vector3<T>& lhs, const T& rhs )
{
return Vector3<T>( lhs.x / rhs, lhs.y / rhs, lhs.z / rhs );
}
template<class T>
bool operator<( const Vector3<T>& lhs, const Vector3<T>& rhs )
{
return lhs.Luminance() < rhs.Luminance();
}
typedef Vector3<int32> v3i;
typedef Vector3<float> v3f;
typedef Vector3<uint8> v3b;
static inline v3b v3f_to_v3b( const v3f& v )
{
return v3b( uint8( std::min( 1.f, v.x ) * 255 ), uint8( std::min( 1.f, v.y ) * 255 ), uint8( std::min( 1.f, v.z ) * 255 ) );
}
template<class T>
Vector3<T> Mix( const Vector3<T>& v1, const Vector3<T>& v2, float amount )
{
return v1 + ( v2 - v1 ) * amount;
}
template<>
inline v3b Mix( const v3b& v1, const v3b& v2, float amount )
{
return v3b( v3f( v1 ) + ( v3f( v2 ) - v3f( v1 ) ) * amount );
}
template<class T>
Vector3<T> Desaturate( const Vector3<T>& v )
{
T l = v.Luminance();
return Vector3<T>( l, l, l );
}
template<class T>
Vector3<T> Desaturate( const Vector3<T>& v, float mul )
{
T l = T( v.Luminance() * mul );
return Vector3<T>( l, l, l );
}
template<class T>
Vector3<T> pow( const Vector3<T>& base, float exponent )
{
return Vector3<T>(
pow( base.x, exponent ),
pow( base.y, exponent ),
pow( base.z, exponent ) );
}
template<class T>
Vector3<T> sRGB2linear( const Vector3<T>& v )
{
return Vector3<T>(
sRGB2linear( v.x ),
sRGB2linear( v.y ),
sRGB2linear( v.z ) );
}
template<class T>
Vector3<T> linear2sRGB( const Vector3<T>& v )
{
return Vector3<T>(
linear2sRGB( v.x ),
linear2sRGB( v.y ),
linear2sRGB( v.z ) );
}
#endif
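A quick usage sketch of the helpers above (illustrative only, not part of the vendored file):
v3f a( 1.0f, 0.5f, 0.0f );
v3f b( 0.0f, 0.5f, 1.0f );
v3f mid = Mix( a, b, 0.5f );   // component-wise lerp -> (0.5f, 0.5f, 0.5f)
float l = mid.Luminance();     // 0.3*x + 0.59*y + 0.11*z
v3b q = v3f_to_v3b( mid );     // clamp to [0,1] and scale to the 0..255 byte range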

32
3rdparty/iqa/LICENSE vendored Normal file
View File

@@ -0,0 +1,32 @@
/*
* Copyright (c) 2011, Tom Distler (http://tdistler.com)
* All rights reserved.
*
* The BSD License
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* - Neither the name of the tdistler.com nor the names of its contributors may
* be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/

36
3rdparty/iqa/README.txt vendored Normal file
View File

@@ -0,0 +1,36 @@
Doxygen documentation can be found at: http://tdistler.com/iqa
BUILD:
All build artifacts end up in build/<configuration>, where <configuration> is
'debug' or 'release'.
Windows:
- Open iqa.sln, select 'Debug' or 'Release', and build. The output is a
static library 'iqa.lib'.
- To run the tests under the debugger, first right-click the 'test' project,
select Properties -> Configuration Properties -> Debugging and set
'Working Directory' to '$(OutDir)'. Then start the application.
Linux:
- Change directories into the root of the IQA branch you want to build.
- Type `make` for a debug build, or `make RELEASE=1` for a release build.
The output is a static library 'libiqa.a'.
- Type `make test` (or `make test RELEASE=1`) to build the unit tests.
- Type `make clean` (or `make clean RELEASE=1`) to delete all build
artifacts.
- To run the tests, `cd` to the build/<configuration> directory and type
`./test`.
USE:
- Include 'iqa.h' in your source file.
- Call iqa_* methods.
- Link against the IQA library.
HELP & SUPPORT:
Further help can be found at: https://sourceforge.net/projects/iqa/support
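EXAMPLE:
A minimal caller might look like this (a sketch, not from the library's
documentation; 'ref' and 'cmp' are two equal-sized 8-bit grayscale buffers,
and the prototypes are the ones declared in iqa.h):
  #include "iqa.h"
  /* stride == width here; pass the real byte stride if lines are padded */
  float psnr  = iqa_psnr(ref, cmp, w, h, w);
  /* 1 = 11x11 Gaussian window, 0 = default algorithm arguments */
  float mssim = iqa_ssim(ref, cmp, w, h, w, 1, 0);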

111
3rdparty/iqa/include/convolve.h vendored Normal file
View File

@@ -0,0 +1,111 @@
/*
* Copyright (c) 2011, Tom Distler (http://tdistler.com)
* All rights reserved.
*
* The BSD License
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* - Neither the name of the tdistler.com nor the names of its contributors may
* be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _CONVOLVE_H_
#define _CONVOLVE_H_
typedef float (*_iqa_get_pixel)(const float *img, int w, int h, int x, int y, float bnd_const);
/** Out-of-bounds array values are a mirrored reflection of the border values*/
float KBND_SYMMETRIC(const float *img, int w, int h, int x, int y, float bnd_const);
/** Out-of-bounds array values are set to the nearest border value */
float KBND_REPLICATE(const float *img, int w, int h, int x, int y, float bnd_const);
/** Out-of-bounds array values are set to 'bnd_const' */
float KBND_CONSTANT(const float *img, int w, int h, int x, int y, float bnd_const);
/** Defines a convolution kernel */
struct _kernel {
float *kernel; /**< Pointer to the kernel values */
int w; /**< The kernel width */
int h; /**< The kernel height */
int normalized; /**< 1 if the kernel values add up to 1. 0 otherwise */
_iqa_get_pixel bnd_opt; /**< Defines how out-of-bounds image values are handled */
float bnd_const; /**< If 'bnd_opt' is KBND_CONSTANT, this specifies the out-of-bounds value */
};
/**
* @brief Applies the specified kernel to the image.
* The kernel will be applied to all areas where it fits completely within
* the image. The resulting image will therefore be smaller than the input:
* (w - kw + 1) by (h - kh + 1) pixels, where kw and kh are the kernel size.
*
* @param img Image to modify
* @param w Image width
* @param h Image height
* @param k The kernel to apply
* @param result Buffer to hold the resulting image ((w-kw+1)*(h-kh+1), where kw
* and kh are the kernel width and height). If 0, the result
* will be written to the original image buffer.
* @param rw Optional. The width of the resulting image will be stored here.
* @param rh Optional. The height of the resulting image will be stored here.
*/
void _iqa_convolve(float *img, int w, int h, const struct _kernel *k, float *result, int *rw, int *rh);
/**
* The same as _iqa_convolve() except the kernel is applied to the entire image.
* In other words, the kernel is applied to all areas where the top-left corner
* of the kernel is in the image. Out-of-bound pixel value (off the right and
* bottom edges) are chosen based on the 'bnd_opt' and 'bnd_const' members of
* the kernel structure. The resulting array is the same size as the input
* image.
*
* @param img Image to modify
* @param w Image width
* @param h Image height
* @param k The kernel to apply
* @param result Buffer to hold the resulting image (w*h, the same size as the
* input image). If 0, the result will be written to the
* original image buffer.
* @return 0 if successful. Non-zero otherwise.
*/
int _iqa_img_filter(float *img, int w, int h, const struct _kernel *k, float *result);
/**
* Returns the filtered version of the specified pixel. If no kernel is given,
* the raw pixel value is returned.
*
* @param img Source image
* @param w Image width
* @param h Image height
* @param x The x location of the pixel to filter
* @param y The y location of the pixel to filter
* @param k Optional. The convolution kernel to apply to the pixel.
* @param kscale The scale of the kernel (for normalization). 1 for normalized
* kernels. Required if 'k' is not null.
* @return The filtered pixel value.
*/
float _iqa_filter_pixel(const float *img, int w, int h, int x, int y, const struct _kernel *k, const float kscale);
#endif /*_CONVOLVE_H_*/
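To make the contract above concrete, a hypothetical caller of _iqa_convolve could look like this (a sketch under the declarations above; box3 and blur are made-up names):
#include "convolve.h"
/* 3x3 box filter, already normalized so the taps sum to 1 */
static float box3[9] = {
    1/9.0f, 1/9.0f, 1/9.0f,
    1/9.0f, 1/9.0f, 1/9.0f,
    1/9.0f, 1/9.0f, 1/9.0f
};
void blur(float *img, int w, int h, float *out, int *rw, int *rh)
{
    struct _kernel k;
    k.kernel     = box3;
    k.w          = 3;
    k.h          = 3;
    k.normalized = 1;
    k.bnd_opt    = KBND_SYMMETRIC; /* only consulted by _iqa_img_filter */
    k.bnd_const  = 0.0f;
    /* result is (w-2) x (h-2); rw/rh receive the output size */
    _iqa_convolve(img, w, h, &k, out, rw, rh);
}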

55
3rdparty/iqa/include/decimate.h vendored Normal file
View File

@@ -0,0 +1,55 @@
/*
* Copyright (c) 2011, Tom Distler (http://tdistler.com)
* All rights reserved.
*
* The BSD License
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* - Neither the name of the tdistler.com nor the names of its contributors may
* be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _DECIMATE_H_
#define _DECIMATE_H_
#include "convolve.h"
/**
* @brief Downsamples (decimates) an image.
*
* @param img Image to modify
* @param w Image width
* @param h Image height
* @param factor Decimation factor
* @param k The kernel to apply (e.g. low-pass filter). Can be 0.
* @param result Buffer to hold the resulting image (w/factor*h/factor). If 0,
* the result will be written to the original image buffer.
* @param rw Optional. The width of the resulting image will be stored here.
* @param rh Optional. The height of the resulting image will be stored here.
* @return 0 on success.
*/
int _iqa_decimate(float *img, int w, int h, int factor, const struct _kernel *k, float *result, int *rw, int *rh);
#endif /*_DECIMATE_H_*/

134
3rdparty/iqa/include/iqa.h vendored Normal file
View File

@@ -0,0 +1,134 @@
/*
* Copyright (c) 2011, Tom Distler (http://tdistler.com)
* All rights reserved.
*
* The BSD License
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* - Neither the name of the tdistler.com nor the names of its contributors may
* be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _IQA_H_
#define _IQA_H_
#include "iqa_os.h"
/**
* Allows fine-grain control of the SSIM algorithm.
*/
struct iqa_ssim_args {
float alpha; /**< luminance exponent */
float beta; /**< contrast exponent */
float gamma; /**< structure exponent */
int L; /**< dynamic range (2^8 - 1)*/
float K1; /**< stabilization constant 1 */
float K2; /**< stabilization constant 2 */
int f; /**< scale factor. 0=default scaling, 1=no scaling */
};
/**
* Allows fine-grain control of the MS-SSIM algorithm.
*/
struct iqa_ms_ssim_args {
int wang; /**< 1=original algorithm by Wang, et al. 0=MS-SSIM* by Rouse/Hemami (default). */
int gaussian; /**< 1=11x11 Gaussian window (default). 0=8x8 linear window. */
int scales; /**< Number of scaled images to use. Default is 5. */
const float *alphas; /**< Pointer to array of alpha values for each scale. Required if 'scales' isn't 5. */
const float *betas; /**< Pointer to array of beta values for each scale. Required if 'scales' isn't 5. */
const float *gammas; /**< Pointer to array of gamma values for each scale. Required if 'scales' isn't 5. */
};
/**
* Calculates the Mean Squared Error between 2 equal-sized 8-bit images.
* @note The images must have the same width, height, and stride.
* @param ref Original reference image
* @param cmp Distorted image
* @param w Width of the images
* @param h Height of the images
* @param stride The length (in bytes) of each horizontal line in the image.
* This may be different from the image width.
* @return The MSE.
*/
float iqa_mse(const unsigned char *ref, const unsigned char *cmp, int w, int h, int stride);
/**
* Calculates the Peak Signal-to-Noise-Ratio between 2 equal-sized 8-bit
* images.
* @note The images must have the same width, height, and stride.
* @param ref Original reference image
* @param cmp Distorted image
* @param w Width of the images
* @param h Height of the images
* @param stride The length (in bytes) of each horizontal line in the image.
* This may be different from the image width.
* @return The PSNR.
*/
float iqa_psnr(const unsigned char *ref, const unsigned char *cmp, int w, int h, int stride);
/**
* Calculates the Structural SIMilarity between 2 equal-sized 8-bit images.
*
* See https://ece.uwaterloo.ca/~z70wang/publications/ssim.html
* @note The images must have the same width, height, and stride.
* @param ref Original reference image
* @param cmp Distorted image
* @param w Width of the images
* @param h Height of the images
* @param stride The length (in bytes) of each horizontal line in the image.
* This may be different from the image width.
* @param gaussian 0 = 8x8 square window, 1 = 11x11 circular-symmetric Gaussian
* weighting.
* @param args Optional SSIM arguments for fine control of the algorithm. 0 for
* defaults. Defaults are a=b=g=1.0, L=255, K1=0.01, K2=0.03
* @return The mean SSIM over the entire image (MSSIM), or INFINITY if error.
*/
float iqa_ssim(const unsigned char *ref, const unsigned char *cmp, int w, int h, int stride,
int gaussian, const struct iqa_ssim_args *args);
/**
* Calculates the Multi-Scale Structural SIMilarity between 2 equal-sized 8-bit
* images. The default algorithm is MS-SSIM* proposed by Rouse/Hemami 2008.
*
* See https://ece.uwaterloo.ca/~z70wang/publications/msssim.pdf and
* http://foulard.ece.cornell.edu/publications/dmr_hvei2008_paper.pdf
*
* @note 1. The images must have the same width, height, and stride.
* @note 2. The minimum image width or height is 2^(scales-1) * filter, where 'filter' is 11
* if a Gaussian window is being used, or 9 otherwise.
* @param ref Original reference image
* @param cmp Distorted image
* @param w Width of the images.
* @param h Height of the images.
* @param stride The length (in bytes) of each horizontal line in the image.
* This may be different from the image width.
* @param args Optional MS-SSIM arguments for fine control of the algorithm. 0
* for defaults. Defaults are wang=0, scales=5, gaussian=1.
* @return The mean MS-SSIM over the entire image, or INFINITY if error.
*/
float iqa_ms_ssim(const unsigned char *ref, const unsigned char *cmp, int w, int h, int stride,
const struct iqa_ms_ssim_args *args);
#endif /*_IQA_H_*/

68
3rdparty/iqa/include/iqa_os.h vendored Normal file
View File

@@ -0,0 +1,68 @@
/*
* Copyright (c) 2011, Tom Distler (http://tdistler.com)
* All rights reserved.
*
* The BSD License
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* - Neither the name of the tdistler.com nor the names of its contributors may
* be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _OS_H_
#define _OS_H_
/* Microsoft tends to implement features early, but they have a high legacy
* cost because they won't break existing implementations. As such, certain
* features we take for granted on other platforms (like C99) aren't fully
* implemented. This file is meant to rectify that.
*/
#ifdef WIN32
#include <windows.h>
#include <math.h>
#define IQA_INLINE __inline
#ifndef INFINITY
#define INFINITY (float)HUGE_VAL /**< Defined in C99 (Windows is C89) */
#endif /*INFINITY*/
#ifndef NAN
static const unsigned long __nan[2] = {0xffffffff, 0x7fffffff};
#define NAN (*(const float *) __nan) /**< Defined in C99 (Windows is C89) */
#endif
#define IQA_EXPORT __declspec(dllexport)
#else /* !Windows */
#define IQA_INLINE inline
#define IQA_EXPORT
#endif
#endif /* _OS_H_ */

64
3rdparty/iqa/include/math_utils.h vendored Normal file
View File

@@ -0,0 +1,64 @@
/*
* Copyright (c) 2011, Tom Distler (http://tdistler.com)
* All rights reserved.
*
* The BSD License
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* - Neither the name of the tdistler.com nor the names of its contributors may
* be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _MATH_UTILS_H_
#define _MATH_UTILS_H_
#include "iqa_os.h"
#include <math.h>
/**
* Rounds a float to the nearest integer.
*/
IQA_EXPORT int _round(float a);
IQA_EXPORT int _max(int x, int y);
IQA_EXPORT int _min(int x, int y);
/**
* Compares 2 floats to the specified digit of precision.
* @return 0 if equal, 1 otherwise.
*/
IQA_EXPORT int _cmp_float(float a, float b, int digits);
/**
* Compares 2 matrices with the specified precision. 'b' is assumed to be the
* same size as 'a' or smaller.
* @return 0 if equal, 1 otherwise
*/
IQA_EXPORT int _matrix_cmp(const float *a, const float *b, int w, int h, int digits);
#endif /*_MATH_UTILS_H_*/
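For illustration, assuming the documented round-at-the-requested-digit semantics (a sketch, not from the library):
/* both round to 1.234 at 3 digits of precision -> equal -> 0 */
int same = _cmp_float(1.2341f, 1.2342f, 3);
/* differ in the 3rd digit -> 1 */
int diff = _cmp_float(1.234f, 1.236f, 3);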

117
3rdparty/iqa/include/ssim.h vendored Normal file
View File

@@ -0,0 +1,117 @@
/*
* Copyright (c) 2011, Tom Distler (http://tdistler.com)
* All rights reserved.
*
* The BSD License
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* - Neither the name of the tdistler.com nor the names of its contributors may
* be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SSIM_H_
#define _SSIM_H_
#include "convolve.h"
/*
* Circular-symmetric Gaussian weighting.
* h(x,y) = hg(x,y)/SUM(SUM(hg)) , for normalization to 1.0
* hg(x,y) = e^( -0.5*( (x^2+y^2)/sigma^2 ) ) , where sigma was 1.5
*/
#define GAUSSIAN_LEN 11
static const float g_gaussian_window[GAUSSIAN_LEN][GAUSSIAN_LEN] = {
{0.000001f, 0.000008f, 0.000037f, 0.000112f, 0.000219f, 0.000274f, 0.000219f, 0.000112f, 0.000037f, 0.000008f, 0.000001f},
{0.000008f, 0.000058f, 0.000274f, 0.000831f, 0.001619f, 0.002021f, 0.001619f, 0.000831f, 0.000274f, 0.000058f, 0.000008f},
{0.000037f, 0.000274f, 0.001296f, 0.003937f, 0.007668f, 0.009577f, 0.007668f, 0.003937f, 0.001296f, 0.000274f, 0.000037f},
{0.000112f, 0.000831f, 0.003937f, 0.011960f, 0.023294f, 0.029091f, 0.023294f, 0.011960f, 0.003937f, 0.000831f, 0.000112f},
{0.000219f, 0.001619f, 0.007668f, 0.023294f, 0.045371f, 0.056662f, 0.045371f, 0.023294f, 0.007668f, 0.001619f, 0.000219f},
{0.000274f, 0.002021f, 0.009577f, 0.029091f, 0.056662f, 0.070762f, 0.056662f, 0.029091f, 0.009577f, 0.002021f, 0.000274f},
{0.000219f, 0.001619f, 0.007668f, 0.023294f, 0.045371f, 0.056662f, 0.045371f, 0.023294f, 0.007668f, 0.001619f, 0.000219f},
{0.000112f, 0.000831f, 0.003937f, 0.011960f, 0.023294f, 0.029091f, 0.023294f, 0.011960f, 0.003937f, 0.000831f, 0.000112f},
{0.000037f, 0.000274f, 0.001296f, 0.003937f, 0.007668f, 0.009577f, 0.007668f, 0.003937f, 0.001296f, 0.000274f, 0.000037f},
{0.000008f, 0.000058f, 0.000274f, 0.000831f, 0.001619f, 0.002021f, 0.001619f, 0.000831f, 0.000274f, 0.000058f, 0.000008f},
{0.000001f, 0.000008f, 0.000037f, 0.000112f, 0.000219f, 0.000274f, 0.000219f, 0.000112f, 0.000037f, 0.000008f, 0.000001f},
};
/*
* Equal weight square window.
* Each pixel is equally weighted (1/64) so that SUM(x) = 1.0
*/
#define SQUARE_LEN 8
static const float g_square_window[SQUARE_LEN][SQUARE_LEN] = {
{0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f},
{0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f},
{0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f},
{0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f},
{0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f},
{0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f},
{0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f},
{0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f},
};
/* Holds intermediate SSIM values for map-reduce operation. */
struct _ssim_int {
double l;
double c;
double s;
};
/* Defines the pointers to the map-reduce functions. */
typedef int (*_map)(const struct _ssim_int *, void *);
typedef float (*_reduce)(int, int, void *);
/* Arguments for map-reduce. The 'context' is user-defined. */
struct _map_reduce {
_map map;
_reduce reduce;
void *context;
};
/**
* Private method that calculates the SSIM value on a pre-processed image.
*
* The input images must have stride==width. This method does not scale.
*
* @note Image buffers are modified.
*
* Map-reduce is used for doing the final SSIM calculation. The map function is
* called for every pixel, and the reduce is called at the end. The context is
* caller-defined and *not* modified by this method.
*
* @param ref Original reference image
* @param cmp Distorted image
* @param w Width of the images
* @param h Height of the images
* @param k The kernel used as the window function
* @param mr Optional map-reduce functions to use to calculate SSIM. Required
* if 'args' is not null. Ignored if 'args' is null.
* @param args Optional SSIM arguments for fine control of the algorithm. 0 for defaults.
* Defaults are a=b=g=1.0, L=255, K1=0.01, K2=0.03
* @return The mean SSIM over the entire image (MSSIM), or INFINITY if error.
*/
float _iqa_ssim(float *ref, float *cmp, int w, int h, const struct _kernel *k, const struct _map_reduce *mr, const struct iqa_ssim_args *args);
#endif /* _SSIM_H_ */

195
3rdparty/iqa/source/convolve.c vendored Normal file
View File

@@ -0,0 +1,195 @@
/*
* Copyright (c) 2011, Tom Distler (http://tdistler.com)
* All rights reserved.
*
* The BSD License
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* - Neither the name of the tdistler.com nor the names of its contributors may
* be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "convolve.h"
#include <stdlib.h>
float KBND_SYMMETRIC(const float *img, int w, int h, int x, int y, float bnd_const)
{
(void)bnd_const;
if (x<0) x=-1-x;
else if (x>=w) x=(w-(x-w))-1;
if (y<0) y=-1-y;
else if (y>=h) y=(h-(y-h))-1;
return img[y*w + x];
}
float KBND_REPLICATE(const float *img, int w, int h, int x, int y, float bnd_const)
{
(void)bnd_const;
if (x<0) x=0;
if (x>=w) x=w-1;
if (y<0) y=0;
if (y>=h) y=h-1;
return img[y*w + x];
}
float KBND_CONSTANT(const float *img, int w, int h, int x, int y, float bnd_const)
{
if (x<0) x=0;
if (y<0) y=0;
if (x>=w || y>=h)
return bnd_const;
return img[y*w + x];
}
static float _calc_scale(const struct _kernel *k)
{
int ii,k_len;
double sum=0.0;
if (k->normalized)
return 1.0f;
else {
k_len = k->w * k->h;
for (ii=0; ii<k_len; ++ii)
sum += k->kernel[ii];
if (sum != 0.0)
return (float)(1.0 / sum);
return 1.0f;
}
}
void _iqa_convolve(float *img, int w, int h, const struct _kernel *k, float *result, int *rw, int *rh)
{
int x,y,kx,ky,u,v;
int uc = k->w/2;
int vc = k->h/2;
int kw_even = (k->w&1)?0:1;
int kh_even = (k->h&1)?0:1;
int dst_w = w - k->w + 1;
int dst_h = h - k->h + 1;
int img_offset,k_offset;
double sum;
float scale, *dst=result;
if (!dst)
dst = img; /* Convolve in-place */
/* Kernel is applied to all positions where the kernel is fully contained
* in the image */
scale = _calc_scale(k);
for (y=0; y < dst_h; ++y) {
for (x=0; x < dst_w; ++x) {
sum = 0.0;
k_offset = 0;
ky = y+vc;
kx = x+uc;
for (v=-vc; v <= vc-kh_even; ++v) {
img_offset = (ky+v)*w + kx;
for (u=-uc; u <= uc-kw_even; ++u, ++k_offset) {
sum += img[img_offset+u] * k->kernel[k_offset];
}
}
dst[y*dst_w + x] = (float)(sum * scale);
}
}
if (rw) *rw = dst_w;
if (rh) *rh = dst_h;
}
int _iqa_img_filter(float *img, int w, int h, const struct _kernel *k, float *result)
{
int x,y;
int img_offset;
float scale, *dst=result;
if (!k || !k->bnd_opt)
return 1;
if (!dst) {
dst = (float*)malloc(w*h*sizeof(float));
if (!dst)
return 2;
}
scale = _calc_scale(k);
/* Kernel is applied to all positions where top-left corner is in the image */
for (y=0; y < h; ++y) {
for (x=0; x < w; ++x) {
dst[y*w + x] = _iqa_filter_pixel(img, w, h, x, y, k, scale);
}
}
/* If no result buffer given, copy results to image buffer */
if (!result) {
for (y=0; y<h; ++y) {
img_offset = y*w;
for (x=0; x<w; ++x, ++img_offset) {
img[img_offset] = dst[img_offset];
}
}
free(dst);
}
return 0;
}
float _iqa_filter_pixel(const float *img, int w, int h, int x, int y, const struct _kernel *k, const float kscale)
{
int u,v,uc,vc;
int kw_even,kh_even;
int x_edge_left,x_edge_right,y_edge_top,y_edge_bottom;
int edge,img_offset,k_offset;
double sum;
if (!k)
return img[y*w + x];
uc = k->w/2;
vc = k->h/2;
kw_even = (k->w&1)?0:1;
kh_even = (k->h&1)?0:1;
x_edge_left = uc;
x_edge_right = w-uc;
y_edge_top = vc;
y_edge_bottom = h-vc;
edge = 0;
if (x < x_edge_left || y < y_edge_top || x >= x_edge_right || y >= y_edge_bottom)
edge = 1;
sum = 0.0;
k_offset = 0;
for (v=-vc; v <= vc-kh_even; ++v) {
img_offset = (y+v)*w + x;
for (u=-uc; u <= uc-kw_even; ++u, ++k_offset) {
if (!edge)
sum += img[img_offset+u] * k->kernel[k_offset];
else
sum += k->bnd_opt(img, w, h, x+u, y+v, k->bnd_const) * k->kernel[k_offset];
}
}
return (float)(sum * kscale);
}

59
3rdparty/iqa/source/decimate.c vendored Normal file
View File

@@ -0,0 +1,59 @@
/*
* Copyright (c) 2011, Tom Distler (http://tdistler.com)
* All rights reserved.
*
* The BSD License
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* - Neither the name of the tdistler.com nor the names of its contributors may
* be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "decimate.h"
#include <stdlib.h>
int _iqa_decimate(float *img, int w, int h, int factor, const struct _kernel *k, float *result, int *rw, int *rh)
{
int x,y;
int sw = w/factor + (w&1);
int sh = h/factor + (h&1);
int dst_offset;
float *dst=img;
if (result)
dst = result;
/* Downsample */
for (y=0; y<sh; ++y) {
dst_offset = y*sw;
for (x=0; x<sw; ++x,++dst_offset) {
dst[dst_offset] = _iqa_filter_pixel(img, w, h, x*factor, y*factor, k, 1.0f);
}
}
if (rw) *rw = sw;
if (rh) *rh = sh;
return 0;
}
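The filter is evaluated at every factor-th pixel, so a w x h image shrinks to roughly w/factor x h/factor, with the (w&1) term keeping an extra row or column for odd sizes. A quick standalone sketch of how the dimensions evolve across repeated factor-2 decimation (mirroring the cur_w/cur_h updates in the MS-SSIM code; no iqa calls):

#include <cstdio>

int main()
{
    /* Mirror of the per-scale size update: cur = cur/2 + (cur & 1). */
    int w = 640, h = 353;
    for (int scale = 0; scale < 5; ++scale) {
        printf("scale %d: %d x %d\n", scale, w, h);
        w = w / 2 + (w & 1);
        h = h / 2 + (h & 1);
    }
    return 0; /* prints 640x353, 320x177, 160x89, 80x45, 40x23 */
}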

82
3rdparty/iqa/source/math_utils.c vendored Normal file

@@ -0,0 +1,82 @@
/*
* Copyright (c) 2011, Tom Distler (http://tdistler.com)
* All rights reserved.
*
* The BSD License
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* - Neither the name of the tdistler.com nor the names of its contributors may
* be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "math_utils.h"
#include <math.h>
/* Rounds to nearest, with halves away from zero. Note: only correct for
 * non-negative inputs (a negative fractional part never reaches 0.5),
 * which is all this library passes in. */
int _round(float a)
{
int sign_a = a > 0.0f ? 1 : -1;
return a-(int)a >= 0.5 ? (int)a + sign_a : (int)a;
}
int _max(int x, int y)
{
return x >= y ? x : y;
}
int _min(int x, int y)
{
return x <= y ? x : y;
}
int _cmp_float(float a, float b, int digits)
{
/* Round */
int sign_a = a > 0.0f ? 1 : -1;
int sign_b = b > 0.0f ? 1 : -1;
double scale = pow(10.0, (double)digits);
double ax = a * scale;
double bx = b * scale;
int ai = ax-(int)ax >= 0.5 ? (int)ax + sign_a : (int)ax;
int bi = bx-(int)bx >= 0.5 ? (int)bx + sign_b : (int)bx;
/* Compare */
return ai == bi ? 0 : 1;
}
int _matrix_cmp(const float *a, const float *b, int w, int h, int digits)
{
int offset;
int result=0;
int len=w*h;
for (offset=0; offset<len; ++offset) {
if (_cmp_float(a[offset], b[offset], digits)) {
result = 1;
break;
}
}
return result;
}
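_cmp_float treats two floats as equal when they agree after rounding to the given number of decimal digits: both values are scaled by 10^digits, rounded away from zero, and the resulting integers compared (0 means equal). A hypothetical standalone equivalent, using std::lround in place of the manual rounding above:

#include <cmath>
#include <cstdio>

/* Same idea as _cmp_float: equal iff the values agree to `digits` decimals. */
static int cmp_to_digits(float a, float b, int digits)
{
    double scale = std::pow(10.0, (double)digits);
    long ai = std::lround((double)a * scale);
    long bi = std::lround((double)b * scale);
    return ai == bi ? 0 : 1;
}

int main()
{
    printf("%d\n", cmp_to_digits(0.33333f, 0.33334f, 4)); /* 0: equal to 4 digits */
    printf("%d\n", cmp_to_digits(0.33333f, 0.33334f, 5)); /* 1: differ at the 5th */
    return 0;
}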

277
3rdparty/iqa/source/ms_ssim.c vendored Normal file

@@ -0,0 +1,277 @@
/*
* Copyright (c) 2011, Tom Distler (http://tdistler.com)
* All rights reserved.
*
* The BSD License
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* - Neither the name of the tdistler.com nor the names of its contributors may
* be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "iqa.h"
#include "ssim.h"
#include "decimate.h"
#include <math.h>
#include <stdlib.h>
#include <string.h>
/* Default number of scales */
#define SCALES 5
/* Low-pass filter for down-sampling (9/7 biorthogonal wavelet filter) */
#define LPF_LEN 9
static const float g_lpf[LPF_LEN][LPF_LEN] = {
{ 0.000714f,-0.000450f,-0.002090f, 0.007132f, 0.016114f, 0.007132f,-0.002090f,-0.000450f, 0.000714f},
{-0.000450f, 0.000283f, 0.001316f,-0.004490f,-0.010146f,-0.004490f, 0.001316f, 0.000283f,-0.000450f},
{-0.002090f, 0.001316f, 0.006115f,-0.020867f,-0.047149f,-0.020867f, 0.006115f, 0.001316f,-0.002090f},
{ 0.007132f,-0.004490f,-0.020867f, 0.071207f, 0.160885f, 0.071207f,-0.020867f,-0.004490f, 0.007132f},
{ 0.016114f,-0.010146f,-0.047149f, 0.160885f, 0.363505f, 0.160885f,-0.047149f,-0.010146f, 0.016114f},
{ 0.007132f,-0.004490f,-0.020867f, 0.071207f, 0.160885f, 0.071207f,-0.020867f,-0.004490f, 0.007132f},
{-0.002090f, 0.001316f, 0.006115f,-0.020867f,-0.047149f,-0.020867f, 0.006115f, 0.001316f,-0.002090f},
{-0.000450f, 0.000283f, 0.001316f,-0.004490f,-0.010146f,-0.004490f, 0.001316f, 0.000283f,-0.000450f},
{ 0.000714f,-0.000450f,-0.002090f, 0.007132f, 0.016114f, 0.007132f,-0.002090f,-0.000450f, 0.000714f},
};
/* Alpha, beta, and gamma values for each scale */
static float g_alphas[] = { 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.1333f };
static float g_betas[] = { 0.0448f, 0.2856f, 0.3001f, 0.2363f, 0.1333f };
static float g_gammas[] = { 0.0448f, 0.2856f, 0.3001f, 0.2363f, 0.1333f };
struct _context {
double l; /* Luminance */
double c; /* Contrast */
double s; /* Structure */
float alpha;
float beta;
float gamma;
};
/* Called for each pixel */
int _ms_ssim_map(const struct _ssim_int *si, void *ctx)
{
struct _context *ms_ctx = (struct _context*)ctx;
ms_ctx->l += si->l;
ms_ctx->c += si->c;
ms_ctx->s += si->s;
return 0;
}
/* Called to calculate the final result */
float _ms_ssim_reduce(int w, int h, void *ctx)
{
double size = (double)(w*h);
struct _context *ms_ctx = (struct _context*)ctx;
ms_ctx->l = pow(ms_ctx->l / size, (double)ms_ctx->alpha);
ms_ctx->c = pow(ms_ctx->c / size, (double)ms_ctx->beta);
ms_ctx->s = pow(fabs(ms_ctx->s / size), (double)ms_ctx->gamma);
return (float)(ms_ctx->l * ms_ctx->c * ms_ctx->s);
}
/* Releases the scaled buffers */
void _free_buffers(float **buf, int scales)
{
int idx;
for (idx=0; idx<scales; ++idx)
free(buf[idx]);
}
/* Allocates the scaled buffers. On error, any buffers already allocated are freed */
int _alloc_buffers(float **buf, int w, int h, int scales)
{
int idx;
int cur_w = w;
int cur_h = h;
for (idx=0; idx<scales; ++idx) {
buf[idx] = (float*)malloc(cur_w*cur_h*sizeof(float));
if (!buf[idx]) {
_free_buffers(buf, idx);
return 1;
}
cur_w = cur_w/2 + (cur_w&1);
cur_h = cur_h/2 + (cur_h&1);
}
return 0;
}
/*
 * MS_SSIM(X,Y) = LM(x,y)^aM * MULT[j=1->M]( Cj(x,y)^bj * Sj(x,y)^gj )
 * where,
 *  L = luminance comparison (based on the means)
 *  C = contrast comparison (based on the variances)
 *  S = structure comparison (based on the cross-correlation)
 *
 * b1=g1=0.0448, b2=g2=0.2856, b3=g3=0.3001, b4=g4=0.2363, a5=b5=g5=0.1333
 */
float iqa_ms_ssim(const unsigned char *ref, const unsigned char *cmp, int w, int h,
int stride, const struct iqa_ms_ssim_args *args)
{
int wang=0;
int scales=SCALES;
int gauss=1;
const float *alphas=g_alphas, *betas=g_betas, *gammas=g_gammas;
int idx,x,y,cur_w,cur_h;
int offset,src_offset;
float **ref_imgs, **cmp_imgs; /* Array of pointers to scaled images */
float msssim;
struct _kernel lpf, window;
struct iqa_ssim_args s_args;
struct _map_reduce mr;
struct _context ms_ctx;
if (args) {
wang = args->wang;
gauss = args->gaussian;
scales = args->scales;
if (args->alphas)
alphas = args->alphas;
if (args->betas)
betas = args->betas;
if (args->gammas)
gammas = args->gammas;
}
/* Make sure we won't scale below 1x1 */
cur_w = w;
cur_h = h;
for (idx=0; idx<scales; ++idx) {
if ( gauss ? cur_w<GAUSSIAN_LEN || cur_h<GAUSSIAN_LEN : cur_w<LPF_LEN || cur_h<LPF_LEN )
return INFINITY;
cur_w /= 2;
cur_h /= 2;
}
window.kernel = (float*)g_square_window;
window.w = window.h = SQUARE_LEN;
window.normalized = 1;
window.bnd_opt = KBND_SYMMETRIC;
if (gauss) {
window.kernel = (float*)g_gaussian_window;
window.w = window.h = GAUSSIAN_LEN;
}
mr.map = _ms_ssim_map;
mr.reduce = _ms_ssim_reduce;
/* Allocate the scaled image buffers */
ref_imgs = (float**)malloc(scales*sizeof(float*));
cmp_imgs = (float**)malloc(scales*sizeof(float*));
if (!ref_imgs || !cmp_imgs) {
if (ref_imgs) free(ref_imgs);
if (cmp_imgs) free(cmp_imgs);
return INFINITY;
}
if (_alloc_buffers(ref_imgs, w, h, scales)) {
free(ref_imgs);
free(cmp_imgs);
return INFINITY;
}
if (_alloc_buffers(cmp_imgs, w, h, scales)) {
_free_buffers(ref_imgs, scales);
free(ref_imgs);
free(cmp_imgs);
return INFINITY;
}
/* Copy original images into first scale buffer, forcing stride = width. */
for (y=0; y<h; ++y) {
src_offset = y*stride;
offset = y*w;
for (x=0; x<w; ++x, ++offset, ++src_offset) {
ref_imgs[0][offset] = (float)ref[src_offset];
cmp_imgs[0][offset] = (float)cmp[src_offset];
}
}
/* Create scaled versions of the images */
cur_w=w;
cur_h=h;
lpf.kernel = (float*)g_lpf;
lpf.w = lpf.h = LPF_LEN;
lpf.normalized = 1;
lpf.bnd_opt = KBND_SYMMETRIC;
for (idx=1; idx<scales; ++idx) {
if (_iqa_decimate(ref_imgs[idx-1], cur_w, cur_h, 2, &lpf, ref_imgs[idx], 0, 0) ||
_iqa_decimate(cmp_imgs[idx-1], cur_w, cur_h, 2, &lpf, cmp_imgs[idx], &cur_w, &cur_h))
{
_free_buffers(ref_imgs, scales);
_free_buffers(cmp_imgs, scales);
free(ref_imgs);
free(cmp_imgs);
return INFINITY;
}
}
cur_w=w;
cur_h=h;
msssim = 1.0;
for (idx=0; idx<scales; ++idx) {
ms_ctx.l = 0;
ms_ctx.c = 0;
ms_ctx.s = 0;
ms_ctx.alpha = alphas[idx];
ms_ctx.beta = betas[idx];
ms_ctx.gamma = gammas[idx];
if (!wang) {
/* MS-SSIM* (Rouse/Hemami) */
s_args.alpha = 1.0f;
s_args.beta = 1.0f;
s_args.gamma = 1.0f;
s_args.K1 = 0.0f; /* Force stabilization constants to 0 */
s_args.K2 = 0.0f;
s_args.L = 255;
s_args.f = 1; /* Don't resize */
mr.context = &ms_ctx;
msssim *= _iqa_ssim(ref_imgs[idx], cmp_imgs[idx], cur_w, cur_h, &window, &mr, &s_args);
}
else {
/* MS-SSIM (Wang) */
s_args.alpha = 1.0f;
s_args.beta = 1.0f;
s_args.gamma = 1.0f;
s_args.K1 = 0.01f;
s_args.K2 = 0.03f;
s_args.L = 255;
s_args.f = 1; /* Don't resize */
mr.context = &ms_ctx;
msssim *= _iqa_ssim(ref_imgs[idx], cmp_imgs[idx], cur_w, cur_h, &window, &mr, &s_args);
}
if (msssim == INFINITY)
break;
cur_w = cur_w/2 + (cur_w&1);
cur_h = cur_h/2 + (cur_h&1);
}
_free_buffers(ref_imgs, scales);
_free_buffers(cmp_imgs, scales);
free(ref_imgs);
free(cmp_imgs);
return msssim;
}
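A typical call passes a null args pointer to get the defaults seen above (5 scales, the Gaussian window, and the Rouse/Hemami MS-SSIM* weighting). The buffers below are illustrative; note that the image must stay at least window-sized across all scales or the function returns INFINITY, which 256x256 comfortably satisfies:

#include <cstdio>
#include <cstring>
#include "iqa.h" /* assumes the iqa headers are on the include path */

int main()
{
    enum { W = 256, H = 256 };
    static unsigned char ref[W * H], cmp[W * H];
    memset(ref, 128, sizeof(ref));
    memcpy(cmp, ref, sizeof(cmp));
    cmp[0] = 120; /* a single-pixel distortion */

    /* stride is bytes per row; args = 0 selects the defaults */
    float score = iqa_ms_ssim(ref, cmp, W, H, W, 0);
    printf("MS-SSIM = %f\n", score);
    return 0;
}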

50
3rdparty/iqa/source/mse.c vendored Normal file

@@ -0,0 +1,50 @@
/*
* Copyright (c) 2011, Tom Distler (http://tdistler.com)
* All rights reserved.
*
* The BSD License
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* - Neither the name of the tdistler.com nor the names of its contributors may
* be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "iqa.h"
/* MSE(a,b) = 1/N * SUM((a-b)^2) */
float iqa_mse(const unsigned char *ref, const unsigned char *cmp, int w, int h, int stride)
{
int error, offset;
unsigned long long sum=0;
int ww,hh;
for (hh=0; hh<h; ++hh) {
offset = hh*stride;
for (ww=0; ww<w; ++ww, ++offset) {
error = ref[offset] - cmp[offset];
sum += error * error;
}
}
return (float)( (double)sum / (double)(w*h) );
}
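A worked call: for the 2x2 buffers below the per-pixel errors are -1, 0, 2, and -4, so MSE = (1 + 0 + 4 + 16)/4 = 5.25. (Buffer contents are illustrative.)

#include <cstdio>
#include "iqa.h" /* assumes the iqa headers are on the include path */

int main()
{
    const unsigned char ref[4] = { 10, 20, 30, 40 };
    const unsigned char cmp[4] = { 11, 20, 28, 44 };
    /* errors -1, 0, 2, -4 -> squared 1, 0, 4, 16 -> mean 5.25 */
    printf("MSE = %f\n", iqa_mse(ref, cmp, 2, 2, 2));
    return 0;
}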

42
3rdparty/iqa/source/psnr.c vendored Normal file

@@ -0,0 +1,42 @@
/*
* Copyright (c) 2011, Tom Distler (http://tdistler.com)
* All rights reserved.
*
* The BSD License
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* - Neither the name of the tdistler.com nor the names of its contributors may
* be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "iqa.h"
#include <math.h>
/* PSNR(a,b) = 10*log10(L^2 / MSE(a,b)), where L = 2^bitdepth - 1 (for 8-bit images, L = 255) */
float iqa_psnr(const unsigned char *ref, const unsigned char *cmp, int w, int h, int stride)
{
const int L_sqd = 255 * 255;
return (float)( 10.0 * log10( L_sqd / iqa_mse(ref,cmp,w,h,stride) ) );
}
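Continuing the MSE example: MSE = 5.25 and L = 255 give PSNR = 10*log10(65025/5.25), about 40.93 dB. Identical images yield MSE = 0, so the quotient (and the reported PSNR) diverges to infinity. The same pair through the API:

#include <cstdio>
#include "iqa.h" /* assumes the iqa headers are on the include path */

int main()
{
    const unsigned char ref[4] = { 10, 20, 30, 40 };
    const unsigned char cmp[4] = { 11, 20, 28, 44 };
    /* MSE = 5.25 -> PSNR = 10*log10(255^2 / 5.25) ~= 40.93 dB */
    printf("PSNR = %f dB\n", iqa_psnr(ref, cmp, 2, 2, 2));
    return 0;
}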

322
3rdparty/iqa/source/ssim.c vendored Normal file

@@ -0,0 +1,322 @@
/*
* Copyright (c) 2011, Tom Distler (http://tdistler.com)
* All rights reserved.
*
* The BSD License
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* - Neither the name of the tdistler.com nor the names of its contributors may
* be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "iqa.h"
#include "convolve.h"
#include "decimate.h"
#include "math_utils.h"
#include "ssim.h"
#include <stdlib.h>
#include <math.h>
/* Forward declarations. */
IQA_INLINE static double _calc_luminance(float, float, float, float);
IQA_INLINE static double _calc_contrast(double, float, float, float, float);
IQA_INLINE static double _calc_structure(float, double, float, float, float, float);
static int _ssim_map(const struct _ssim_int *, void *);
static float _ssim_reduce(int, int, void *);
/*
 * SSIM(x,y) = ((2*ux*uy + C1)*(2*sxy + C2)) / ((ux^2 + uy^2 + C1)*(sx^2 + sy^2 + C2))
 * where,
 *  ux = SUM(w*x)
 *  sx = (SUM(w*(x-ux)^2))^0.5
 *  sxy = SUM(w*(x-ux)*(y-uy))
 *
 * Returns the mean SSIM. MSSIM(X,Y) = 1/M * SUM(SSIM(x,y))
 */
float iqa_ssim(const unsigned char *ref, const unsigned char *cmp, int w, int h, int stride,
int gaussian, const struct iqa_ssim_args *args)
{
int scale;
int x,y,src_offset,offset;
float *ref_f,*cmp_f;
struct _kernel low_pass;
struct _kernel window;
float result;
double ssim_sum=0.0;
struct _map_reduce mr;
/* Initialize algorithm parameters */
scale = _max( 1, _round( (float)_min(w,h) / 256.0f ) );
if (args) {
if(args->f)
scale = args->f;
mr.map = _ssim_map;
mr.reduce = _ssim_reduce;
mr.context = (void*)&ssim_sum;
}
window.kernel = (float*)g_square_window;
window.w = window.h = SQUARE_LEN;
window.normalized = 1;
window.bnd_opt = KBND_SYMMETRIC;
if (gaussian) {
window.kernel = (float*)g_gaussian_window;
window.w = window.h = GAUSSIAN_LEN;
}
/* Convert image values to floats. Forcing stride = width. */
ref_f = (float*)malloc(w*h*sizeof(float));
cmp_f = (float*)malloc(w*h*sizeof(float));
if (!ref_f || !cmp_f) {
if (ref_f) free(ref_f);
if (cmp_f) free(cmp_f);
return INFINITY;
}
for (y=0; y<h; ++y) {
src_offset = y*stride;
offset = y*w;
for (x=0; x<w; ++x, ++offset, ++src_offset) {
ref_f[offset] = (float)ref[src_offset];
cmp_f[offset] = (float)cmp[src_offset];
}
}
/* Scale the images down if required */
if (scale > 1) {
/* Generate simple low-pass filter */
low_pass.kernel = (float*)malloc(scale*scale*sizeof(float));
if (!low_pass.kernel) {
free(ref_f);
free(cmp_f);
return INFINITY;
}
low_pass.w = low_pass.h = scale;
low_pass.normalized = 0;
low_pass.bnd_opt = KBND_SYMMETRIC;
for (offset=0; offset<scale*scale; ++offset)
low_pass.kernel[offset] = 1.0f/(scale*scale);
/* Resample */
if (_iqa_decimate(ref_f, w, h, scale, &low_pass, 0, 0, 0) ||
_iqa_decimate(cmp_f, w, h, scale, &low_pass, 0, &w, &h)) { /* Update w/h */
free(ref_f);
free(cmp_f);
free(low_pass.kernel);
return INFINITY;
}
free(low_pass.kernel);
}
result = _iqa_ssim(ref_f, cmp_f, w, h, &window, &mr, args);
free(ref_f);
free(cmp_f);
return result;
}
/* _iqa_ssim */
float _iqa_ssim(float *ref, float *cmp, int w, int h, const struct _kernel *k, const struct _map_reduce *mr, const struct iqa_ssim_args *args)
{
float alpha=1.0f, beta=1.0f, gamma=1.0f;
int L=255;
float K1=0.01f, K2=0.03f;
float C1,C2,C3;
int x,y,offset;
float *ref_mu,*cmp_mu,*ref_sigma_sqd,*cmp_sigma_sqd,*sigma_both;
double ssim_sum, numerator, denominator;
double luminance_comp, contrast_comp, structure_comp, sigma_root;
struct _ssim_int sint;
/* Initialize algorithm parameters */
if (args) {
if (!mr)
return INFINITY;
alpha = args->alpha;
beta = args->beta;
gamma = args->gamma;
L = args->L;
K1 = args->K1;
K2 = args->K2;
}
C1 = (K1*L)*(K1*L);
C2 = (K2*L)*(K2*L);
C3 = C2 / 2.0f;
ref_mu = (float*)malloc(w*h*sizeof(float));
cmp_mu = (float*)malloc(w*h*sizeof(float));
ref_sigma_sqd = (float*)malloc(w*h*sizeof(float));
cmp_sigma_sqd = (float*)malloc(w*h*sizeof(float));
sigma_both = (float*)malloc(w*h*sizeof(float));
if (!ref_mu || !cmp_mu || !ref_sigma_sqd || !cmp_sigma_sqd || !sigma_both) {
if (ref_mu) free(ref_mu);
if (cmp_mu) free(cmp_mu);
if (ref_sigma_sqd) free(ref_sigma_sqd);
if (cmp_sigma_sqd) free(cmp_sigma_sqd);
if (sigma_both) free(sigma_both);
return INFINITY;
}
/* Calculate mean */
_iqa_convolve(ref, w, h, k, ref_mu, 0, 0);
_iqa_convolve(cmp, w, h, k, cmp_mu, 0, 0);
for (y=0; y<h; ++y) {
offset = y*w;
for (x=0; x<w; ++x, ++offset) {
ref_sigma_sqd[offset] = ref[offset] * ref[offset];
cmp_sigma_sqd[offset] = cmp[offset] * cmp[offset];
sigma_both[offset] = ref[offset] * cmp[offset];
}
}
/* Calculate sigma */
_iqa_convolve(ref_sigma_sqd, w, h, k, 0, 0, 0);
_iqa_convolve(cmp_sigma_sqd, w, h, k, 0, 0, 0);
_iqa_convolve(sigma_both, w, h, k, 0, &w, &h); /* Update the width and height */
/* The convolution results are smaller than the input by kernel size - 1 in each dimension */
for (y=0; y<h; ++y) {
offset = y*w;
for (x=0; x<w; ++x, ++offset) {
ref_sigma_sqd[offset] -= ref_mu[offset] * ref_mu[offset];
cmp_sigma_sqd[offset] -= cmp_mu[offset] * cmp_mu[offset];
sigma_both[offset] -= ref_mu[offset] * cmp_mu[offset];
}
}
ssim_sum = 0.0;
for (y=0; y<h; ++y) {
offset = y*w;
for (x=0; x<w; ++x, ++offset) {
if (!args) {
/* The default case */
numerator = (2.0 * ref_mu[offset] * cmp_mu[offset] + C1) * (2.0 * sigma_both[offset] + C2);
denominator = (ref_mu[offset]*ref_mu[offset] + cmp_mu[offset]*cmp_mu[offset] + C1) *
(ref_sigma_sqd[offset] + cmp_sigma_sqd[offset] + C2);
ssim_sum += numerator / denominator;
}
else {
/* User tweaked alpha, beta, or gamma */
/* passing a negative number to sqrt() causes a domain error */
if (ref_sigma_sqd[offset] < 0.0f)
ref_sigma_sqd[offset] = 0.0f;
if (cmp_sigma_sqd[offset] < 0.0f)
cmp_sigma_sqd[offset] = 0.0f;
sigma_root = sqrt(ref_sigma_sqd[offset] * cmp_sigma_sqd[offset]);
luminance_comp = _calc_luminance(ref_mu[offset], cmp_mu[offset], C1, alpha);
contrast_comp = _calc_contrast(sigma_root, ref_sigma_sqd[offset], cmp_sigma_sqd[offset], C2, beta);
structure_comp = _calc_structure(sigma_both[offset], sigma_root, ref_sigma_sqd[offset], cmp_sigma_sqd[offset], C3, gamma);
sint.l = luminance_comp;
sint.c = contrast_comp;
sint.s = structure_comp;
if (mr->map(&sint, mr->context)) {
/* don't leak the intermediate buffers on a map failure */
free(ref_mu);
free(cmp_mu);
free(ref_sigma_sqd);
free(cmp_sigma_sqd);
free(sigma_both);
return INFINITY;
}
}
}
}
free(ref_mu);
free(cmp_mu);
free(ref_sigma_sqd);
free(cmp_sigma_sqd);
free(sigma_both);
if (!args)
return (float)(ssim_sum / (double)(w*h));
return mr->reduce(w, h, mr->context);
}
/* _ssim_map */
int _ssim_map(const struct _ssim_int *si, void *ctx)
{
double *ssim_sum = (double*)ctx;
*ssim_sum += si->l * si->c * si->s;
return 0;
}
/* _ssim_reduce */
float _ssim_reduce(int w, int h, void *ctx)
{
double *ssim_sum = (double*)ctx;
return (float)(*ssim_sum / (double)(w*h));
}
/* _calc_luminance */
IQA_INLINE static double _calc_luminance(float mu1, float mu2, float C1, float alpha)
{
double result;
float sign;
/* For MS-SSIM* */
if (C1 == 0 && mu1*mu1 == 0 && mu2*mu2 == 0)
return 1.0;
result = (2.0 * mu1 * mu2 + C1) / (mu1*mu1 + mu2*mu2 + C1);
if (alpha == 1.0f)
return result;
sign = result < 0.0 ? -1.0f : 1.0f;
return sign * pow(fabs(result),(double)alpha);
}
/* _calc_contrast */
IQA_INLINE static double _calc_contrast(double sigma_comb_12, float sigma1_sqd, float sigma2_sqd, float C2, float beta)
{
double result;
float sign;
/* For MS-SSIM* */
if (C2 == 0 && sigma1_sqd + sigma2_sqd == 0)
return 1.0;
result = (2.0 * sigma_comb_12 + C2) / (sigma1_sqd + sigma2_sqd + C2);
if (beta == 1.0f)
return result;
sign = result < 0.0 ? -1.0f : 1.0f;
return sign * pow(fabs(result),(double)beta);
}
/* _calc_structure */
IQA_INLINE static double _calc_structure(float sigma_12, double sigma_comb_12, float sigma1, float sigma2, float C3, float gamma)
{
double result;
float sign;
/* For MS-SSIM* */
if (C3 == 0 && sigma_comb_12 == 0) {
if (sigma1 == 0 && sigma2 == 0)
return 1.0;
else if (sigma1 == 0 || sigma2 == 0)
return 0.0;
}
result = (sigma_12 + C3) / (sigma_comb_12 + C3);
if (gamma == 1.0f)
return result;
sign = result < 0.0 ? -1.0f : 1.0f;
return sign * pow(fabs(result),(double)gamma);
}
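A typical call of the public entry point: gaussian=1 selects the Gaussian window, and args=0 keeps the standard constants (K1=0.01, K2=0.03, L=255) and the automatic downsampling. Buffers are illustrative:

#include <cstdio>
#include <cstring>
#include "iqa.h" /* assumes the iqa headers are on the include path */

int main()
{
    enum { W = 128, H = 128 };
    static unsigned char ref[W * H], cmp[W * H];
    for (int i = 0; i < W * H; ++i)
        ref[i] = (unsigned char)(i & 0xff);
    memcpy(cmp, ref, sizeof(cmp));
    cmp[W + 1] ^= 0x10; /* perturb a single pixel */

    float score = iqa_ssim(ref, cmp, W, H, W /*stride*/, 1 /*gaussian*/, 0);
    printf("SSIM = %f\n", score); /* close to, but below, 1.0 */
    return 0;
}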

20
3rdparty/libsquish/LICENSE vendored Normal file

@@ -0,0 +1,20 @@
Copyright (c) 2006 Simon Brown si@sjbrown.co.uk
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

35
3rdparty/libsquish/README vendored Normal file

@@ -0,0 +1,35 @@
LICENSE
-------
The squish library is distributed under the terms and conditions of the MIT
license. This license is specified at the top of each source file and must be
preserved in its entirety.
BUILDING AND INSTALLING THE LIBRARY
-----------------------------------
If you are using Visual Studio 2003 or above under Windows then load the Visual
Studio 2003 project in the vs7 folder. By default, the library is built using
SSE2 optimisations. To change this either change or remove the SQUISH_USE_SSE=2
from the preprocessor symbols.
If you are using a Mac then load the Xcode 2.2 project in the distribution. By
default, the library is built using Altivec optimisations. To change this,
either modify or remove SQUISH_USE_ALTIVEC=1 from the preprocessor symbols. I
guess I'll have to think about changing this for the new Intel Macs that are
rolling out...
If you are using Unix, first edit the config file in the base directory of
the distribution, enabling Altivec or SSE with the USE_ALTIVEC or USE_SSE
variables, and editing the optimisation flags passed to the C++ compiler if
necessary. Then make can be used to build the library, and make install (from
the superuser account) can be used to install (into /usr/local by default).
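As an illustrative example (assuming g++, and reusing the SQUISH_USE_SSE symbol
mentioned above), a direct build along the lines of
g++ -O2 -DSQUISH_USE_SSE=2 -c *.cpp
would compile the library objects with SSE2 enabled.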
REPORTING BUGS OR FEATURE REQUESTS
----------------------------------
Feedback can be sent to Simon Brown (the developer) at si@sjbrown.co.uk
New releases are announced on the squish library homepage at
http://sjbrown.co.uk/?code=squish

350
3rdparty/libsquish/alpha.cpp vendored Normal file

@@ -0,0 +1,350 @@
/* -----------------------------------------------------------------------------
Copyright (c) 2006 Simon Brown si@sjbrown.co.uk
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------- */
#include "alpha.h"
#include <climits>
#include <algorithm>
namespace squish {
static int FloatToInt( float a, int limit )
{
// use ANSI round-to-zero behaviour to get round-to-nearest
int i = ( int )( a + 0.5f );
// clamp to the limit
if( i < 0 )
i = 0;
else if( i > limit )
i = limit;
// done
return i;
}
void CompressAlphaDxt3( u8 const* rgba, int mask, void* block )
{
u8* bytes = reinterpret_cast< u8* >( block );
// quantise and pack the alpha values pairwise
for( int i = 0; i < 8; ++i )
{
// quantise down to 4 bits
float alpha1 = ( float )rgba[8*i + 3] * ( 15.0f/255.0f );
float alpha2 = ( float )rgba[8*i + 7] * ( 15.0f/255.0f );
int quant1 = FloatToInt( alpha1, 15 );
int quant2 = FloatToInt( alpha2, 15 );
// set alpha to zero where masked
int bit1 = 1 << ( 2*i );
int bit2 = 1 << ( 2*i + 1 );
if( ( mask & bit1 ) == 0 )
quant1 = 0;
if( ( mask & bit2 ) == 0 )
quant2 = 0;
// pack into the byte
bytes[i] = ( u8 )( quant1 | ( quant2 << 4 ) );
}
}
void DecompressAlphaDxt3( u8* rgba, void const* block )
{
u8 const* bytes = reinterpret_cast< u8 const* >( block );
// unpack the alpha values pairwise
for( int i = 0; i < 8; ++i )
{
// quantise down to 4 bits
u8 quant = bytes[i];
// unpack the values
u8 lo = quant & 0x0f;
u8 hi = quant & 0xf0;
// convert back up to bytes
rgba[8*i + 3] = lo | ( lo << 4 );
rgba[8*i + 7] = hi | ( hi >> 4 );
}
}
static void FixRange( int& min, int& max, int steps )
{
if( max - min < steps )
max = std::min( min + steps, 255 );
if( max - min < steps )
min = std::max( 0, max - steps );
}
static int FitCodes( u8 const* rgba, int mask, u8 const* codes, u8* indices )
{
// fit each alpha value to the codebook
int err = 0;
for( int i = 0; i < 16; ++i )
{
// check this pixel is valid
int bit = 1 << i;
if( ( mask & bit ) == 0 )
{
// use the first code
indices[i] = 0;
continue;
}
// find the least error and corresponding index
int value = rgba[4*i + 3];
int least = INT_MAX;
int index = 0;
for( int j = 0; j < 8; ++j )
{
// get the squared error from this code
int dist = ( int )value - ( int )codes[j];
dist *= dist;
// compare with the best so far
if( dist < least )
{
least = dist;
index = j;
}
}
// save this index and accumulate the error
indices[i] = ( u8 )index;
err += least;
}
// return the total error
return err;
}
static void WriteAlphaBlock( int alpha0, int alpha1, u8 const* indices, void* block )
{
u8* bytes = reinterpret_cast< u8* >( block );
// write the first two bytes
bytes[0] = ( u8 )alpha0;
bytes[1] = ( u8 )alpha1;
// pack the indices with 3 bits each
u8* dest = bytes + 2;
u8 const* src = indices;
for( int i = 0; i < 2; ++i )
{
// pack 8 3-bit values
int value = 0;
for( int j = 0; j < 8; ++j )
{
int index = *src++;
value |= ( index << 3*j );
}
// store in 3 bytes
for( int j = 0; j < 3; ++j )
{
int byte = ( value >> 8*j ) & 0xff;
*dest++ = ( u8 )byte;
}
}
}
static void WriteAlphaBlock5( int alpha0, int alpha1, u8 const* indices, void* block )
{
// check the relative values of the endpoints
if( alpha0 > alpha1 )
{
// swap the indices
u8 swapped[16];
for( int i = 0; i < 16; ++i )
{
u8 index = indices[i];
if( index == 0 )
swapped[i] = 1;
else if( index == 1 )
swapped[i] = 0;
else if( index <= 5 )
swapped[i] = 7 - index;
else
swapped[i] = index;
}
// write the block
WriteAlphaBlock( alpha1, alpha0, swapped, block );
}
else
{
// write the block
WriteAlphaBlock( alpha0, alpha1, indices, block );
}
}
static void WriteAlphaBlock7( int alpha0, int alpha1, u8 const* indices, void* block )
{
// check the relative values of the endpoints
if( alpha0 < alpha1 )
{
// swap the indices
u8 swapped[16];
for( int i = 0; i < 16; ++i )
{
u8 index = indices[i];
if( index == 0 )
swapped[i] = 1;
else if( index == 1 )
swapped[i] = 0;
else
swapped[i] = 9 - index;
}
// write the block
WriteAlphaBlock( alpha1, alpha0, swapped, block );
}
else
{
// write the block
WriteAlphaBlock( alpha0, alpha1, indices, block );
}
}
void CompressAlphaDxt5( u8 const* rgba, int mask, void* block )
{
// get the range for 5-alpha and 7-alpha interpolation
int min5 = 255;
int max5 = 0;
int min7 = 255;
int max7 = 0;
for( int i = 0; i < 16; ++i )
{
// check this pixel is valid
int bit = 1 << i;
if( ( mask & bit ) == 0 )
continue;
// incorporate into the min/max
int value = rgba[4*i + 3];
if( value < min7 )
min7 = value;
if( value > max7 )
max7 = value;
if( value != 0 && value < min5 )
min5 = value;
if( value != 255 && value > max5 )
max5 = value;
}
// handle the case that no valid range was found
if( min5 > max5 )
min5 = max5;
if( min7 > max7 )
min7 = max7;
// fix the range to be the minimum in each case
FixRange( min5, max5, 5 );
FixRange( min7, max7, 7 );
// set up the 5-alpha code book
u8 codes5[8];
codes5[0] = ( u8 )min5;
codes5[1] = ( u8 )max5;
for( int i = 1; i < 5; ++i )
codes5[1 + i] = ( u8 )( ( ( 5 - i )*min5 + i*max5 )/5 );
codes5[6] = 0;
codes5[7] = 255;
// set up the 7-alpha code book
u8 codes7[8];
codes7[0] = ( u8 )min7;
codes7[1] = ( u8 )max7;
for( int i = 1; i < 7; ++i )
codes7[1 + i] = ( u8 )( ( ( 7 - i )*min7 + i*max7 )/7 );
// fit the data to both code books
u8 indices5[16];
u8 indices7[16];
int err5 = FitCodes( rgba, mask, codes5, indices5 );
int err7 = FitCodes( rgba, mask, codes7, indices7 );
// save the block with least error
if( err5 <= err7 )
WriteAlphaBlock5( min5, max5, indices5, block );
else
WriteAlphaBlock7( min7, max7, indices7, block );
}
void DecompressAlphaDxt5( u8* rgba, void const* block )
{
// get the two alpha values
u8 const* bytes = reinterpret_cast< u8 const* >( block );
int alpha0 = bytes[0];
int alpha1 = bytes[1];
// compare the values to build the codebook
u8 codes[8];
codes[0] = ( u8 )alpha0;
codes[1] = ( u8 )alpha1;
if( alpha0 <= alpha1 )
{
// use 5-alpha codebook
for( int i = 1; i < 5; ++i )
codes[1 + i] = ( u8 )( ( ( 5 - i )*alpha0 + i*alpha1 )/5 );
codes[6] = 0;
codes[7] = 255;
}
else
{
// use 7-alpha codebook
for( int i = 1; i < 7; ++i )
codes[1 + i] = ( u8 )( ( ( 7 - i )*alpha0 + i*alpha1 )/7 );
}
// decode the indices
u8 indices[16];
u8 const* src = bytes + 2;
u8* dest = indices;
for( int i = 0; i < 2; ++i )
{
// grab 3 bytes
int value = 0;
for( int j = 0; j < 3; ++j )
{
int byte = *src++;
value |= ( byte << 8*j );
}
// unpack 8 3-bit values from it
for( int j = 0; j < 8; ++j )
{
int index = ( value >> 3*j ) & 0x7;
*dest++ = ( u8 )index;
}
}
// write out the indexed codebook values
for( int i = 0; i < 16; ++i )
rgba[4*i + 3] = codes[indices[i]];
}
} // namespace squish
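The two alpha encodings trade precision differently: DXT3 stores each alpha directly as 4 bits (expanded back by replicating the nibble, lo | (lo << 4)), while DXT5 stores two 8-bit endpoints plus 3-bit indices (packed 8 to every 3 bytes) into an interpolated codebook: five interpolants plus explicit 0 and 255 when alpha0 <= alpha1, seven interpolants otherwise. A standalone sketch of both codebooks (values are illustrative):

#include <cstdio>

typedef unsigned char u8;

int main()
{
    /* DXT3: 4-bit quantisation round trip; lo | (lo << 4) replicates bits. */
    u8 quant = 0x9;                           /* a 4-bit value */
    u8 expanded = (u8)(quant | (quant << 4)); /* 0x99 == 153 */
    printf("DXT3 4-bit %d -> 8-bit %d\n", quant, expanded);

    /* DXT5: build the 8-entry codebook, as DecompressAlphaDxt5 does. */
    int alpha0 = 32, alpha1 = 224;            /* alpha0 <= alpha1: 5-alpha mode */
    u8 codes[8];
    codes[0] = (u8)alpha0;
    codes[1] = (u8)alpha1;
    for (int i = 1; i < 5; ++i)
        codes[1 + i] = (u8)(((5 - i) * alpha0 + i * alpha1) / 5);
    codes[6] = 0;                             /* explicit transparent */
    codes[7] = 255;                           /* explicit opaque */
    for (int i = 0; i < 8; ++i)
        printf("code[%d] = %d\n", i, codes[i]); /* 32 224 70 108 147 185 0 255 */
    return 0;
}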

41
3rdparty/libsquish/alpha.h vendored Normal file

@@ -0,0 +1,41 @@
/* -----------------------------------------------------------------------------
Copyright (c) 2006 Simon Brown si@sjbrown.co.uk
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------- */
#ifndef SQUISH_ALPHA_H
#define SQUISH_ALPHA_H
#include "squish.h"
namespace squish {
void CompressAlphaDxt3( u8 const* rgba, int mask, void* block );
void CompressAlphaDxt5( u8 const* rgba, int mask, void* block );
void DecompressAlphaDxt3( u8* rgba, void const* block );
void DecompressAlphaDxt5( u8* rgba, void const* block );
} // namespace squish
#endif // ndef SQUISH_ALPHA_H

392
3rdparty/libsquish/clusterfit.cpp vendored Normal file

@@ -0,0 +1,392 @@
/* -----------------------------------------------------------------------------
Copyright (c) 2006 Simon Brown si@sjbrown.co.uk
Copyright (c) 2007 Ignacio Castano icastano@nvidia.com
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------- */
#include "clusterfit.h"
#include "colourset.h"
#include "colourblock.h"
#include <cfloat>
namespace squish {
ClusterFit::ClusterFit( ColourSet const* colours, int flags, float* metric )
: ColourFit( colours, flags )
{
// set the iteration count
m_iterationCount = ( m_flags & kColourIterativeClusterFit ) ? kMaxIterations : 1;
// initialise the metric (old perceptual = 0.2126f, 0.7152f, 0.0722f)
if( metric )
m_metric = Vec4( metric[0], metric[1], metric[2], 1.0f );
else
m_metric = VEC4_CONST( 1.0f );
// initialise the best error
m_besterror = VEC4_CONST( FLT_MAX );
// cache some values
int const count = m_colours->GetCount();
Vec3 const* values = m_colours->GetPoints();
// get the covariance matrix
Sym3x3 covariance = ComputeWeightedCovariance( count, values, m_colours->GetWeights() );
// compute the principle component
m_principle = ComputePrincipleComponent( covariance );
}
bool ClusterFit::ConstructOrdering( Vec3 const& axis, int iteration )
{
// cache some values
int const count = m_colours->GetCount();
Vec3 const* values = m_colours->GetPoints();
// build the list of dot products
float dps[16];
u8* order = ( u8* )m_order + 16*iteration;
for( int i = 0; i < count; ++i )
{
dps[i] = Dot( values[i], axis );
order[i] = ( u8 )i;
}
// stable sort using them
for( int i = 0; i < count; ++i )
{
for( int j = i; j > 0 && dps[j] < dps[j - 1]; --j )
{
std::swap( dps[j], dps[j - 1] );
std::swap( order[j], order[j - 1] );
}
}
// check this ordering is unique
for( int it = 0; it < iteration; ++it )
{
u8 const* prev = ( u8* )m_order + 16*it;
bool same = true;
for( int i = 0; i < count; ++i )
{
if( order[i] != prev[i] )
{
same = false;
break;
}
}
if( same )
return false;
}
// copy the ordering and weight all the points
Vec3 const* unweighted = m_colours->GetPoints();
float const* weights = m_colours->GetWeights();
m_xsum_wsum = VEC4_CONST( 0.0f );
for( int i = 0; i < count; ++i )
{
int j = order[i];
Vec4 p( unweighted[j].X(), unweighted[j].Y(), unweighted[j].Z(), 1.0f );
Vec4 w( weights[j] );
Vec4 x = p*w;
m_points_weights[i] = x;
m_xsum_wsum += x;
}
return true;
}
void ClusterFit::Compress3( void* block )
{
// declare variables
int const count = m_colours->GetCount();
Vec4 const two = VEC4_CONST( 2.0 );
Vec4 const one = VEC4_CONST( 1.0f );
Vec4 const half_half2( 0.5f, 0.5f, 0.5f, 0.25f );
Vec4 const zero = VEC4_CONST( 0.0f );
Vec4 const half = VEC4_CONST( 0.5f );
Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f );
Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
// prepare an ordering using the principle axis
ConstructOrdering( m_principle, 0 );
// check all possible clusters and iterate on the total order
Vec4 beststart = VEC4_CONST( 0.0f );
Vec4 bestend = VEC4_CONST( 0.0f );
Vec4 besterror = m_besterror;
u8 bestindices[16];
int bestiteration = 0;
int besti = 0, bestj = 0;
// loop over iterations (we avoid the case that all points in first or last cluster)
for( int iterationIndex = 0;; )
{
// first cluster [0,i) is at the start
Vec4 part0 = VEC4_CONST( 0.0f );
for( int i = 0; i < count; ++i )
{
// second cluster [i,j) is half along
Vec4 part1 = ( i == 0 ) ? m_points_weights[0] : VEC4_CONST( 0.0f );
int jmin = ( i == 0 ) ? 1 : i;
for( int j = jmin;; )
{
// last cluster [j,count) is at the end
Vec4 part2 = m_xsum_wsum - part1 - part0;
// compute least squares terms directly
Vec4 alphax_sum = MultiplyAdd( part1, half_half2, part0 );
Vec4 alpha2_sum = alphax_sum.SplatW();
Vec4 betax_sum = MultiplyAdd( part1, half_half2, part2 );
Vec4 beta2_sum = betax_sum.SplatW();
Vec4 alphabeta_sum = ( part1*half_half2 ).SplatW();
// compute the least-squares optimal points
Vec4 factor = Reciprocal( NegativeMultiplySubtract( alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum ) );
Vec4 a = NegativeMultiplySubtract( betax_sum, alphabeta_sum, alphax_sum*beta2_sum )*factor;
Vec4 b = NegativeMultiplySubtract( alphax_sum, alphabeta_sum, betax_sum*alpha2_sum )*factor;
// clamp to the grid
a = Min( one, Max( zero, a ) );
b = Min( one, Max( zero, b ) );
a = Truncate( MultiplyAdd( grid, a, half ) )*gridrcp;
b = Truncate( MultiplyAdd( grid, b, half ) )*gridrcp;
// compute the error (we skip the constant xxsum)
Vec4 e1 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
Vec4 e2 = NegativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum );
Vec4 e3 = NegativeMultiplySubtract( b, betax_sum, e2 );
Vec4 e4 = MultiplyAdd( two, e3, e1 );
// apply the metric to the error term
Vec4 e5 = e4*m_metric;
Vec4 error = e5.SplatX() + e5.SplatY() + e5.SplatZ();
// keep the solution if it wins
if( CompareAnyLessThan( error, besterror ) )
{
beststart = a;
bestend = b;
besti = i;
bestj = j;
besterror = error;
bestiteration = iterationIndex;
}
// advance
if( j == count )
break;
part1 += m_points_weights[j];
++j;
}
// advance
part0 += m_points_weights[i];
}
// stop if we didn't improve in this iteration
if( bestiteration != iterationIndex )
break;
// advance if possible
++iterationIndex;
if( iterationIndex == m_iterationCount )
break;
// stop if a new iteration is an ordering that has already been tried
Vec3 axis = ( bestend - beststart ).GetVec3();
if( !ConstructOrdering( axis, iterationIndex ) )
break;
}
// save the block if necessary
if( CompareAnyLessThan( besterror, m_besterror ) )
{
// remap the indices
u8 const* order = ( u8* )m_order + 16*bestiteration;
u8 unordered[16];
for( int m = 0; m < besti; ++m )
unordered[order[m]] = 0;
for( int m = besti; m < bestj; ++m )
unordered[order[m]] = 2;
for( int m = bestj; m < count; ++m )
unordered[order[m]] = 1;
m_colours->RemapIndices( unordered, bestindices );
// save the block
WriteColourBlock3( beststart.GetVec3(), bestend.GetVec3(), bestindices, block );
// save the error
m_besterror = besterror;
}
}
void ClusterFit::Compress4( void* block )
{
// declare variables
int const count = m_colours->GetCount();
Vec4 const two = VEC4_CONST( 2.0f );
Vec4 const one = VEC4_CONST( 1.0f );
Vec4 const onethird_onethird2( 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/9.0f );
Vec4 const twothirds_twothirds2( 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 4.0f/9.0f );
Vec4 const twonineths = VEC4_CONST( 2.0f/9.0f );
Vec4 const zero = VEC4_CONST( 0.0f );
Vec4 const half = VEC4_CONST( 0.5f );
Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f );
Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
// prepare an ordering using the principle axis
ConstructOrdering( m_principle, 0 );
// check all possible clusters and iterate on the total order
Vec4 beststart = VEC4_CONST( 0.0f );
Vec4 bestend = VEC4_CONST( 0.0f );
Vec4 besterror = m_besterror;
u8 bestindices[16];
int bestiteration = 0;
int besti = 0, bestj = 0, bestk = 0;
// loop over iterations (we avoid the case that all points in first or last cluster)
for( int iterationIndex = 0;; )
{
// first cluster [0,i) is at the start
Vec4 part0 = VEC4_CONST( 0.0f );
for( int i = 0; i < count; ++i )
{
// second cluster [i,j) is one third along
Vec4 part1 = VEC4_CONST( 0.0f );
for( int j = i;; )
{
// third cluster [j,k) is two thirds along
Vec4 part2 = ( j == 0 ) ? m_points_weights[0] : VEC4_CONST( 0.0f );
int kmin = ( j == 0 ) ? 1 : j;
for( int k = kmin;; )
{
// last cluster [k,count) is at the end
Vec4 part3 = m_xsum_wsum - part2 - part1 - part0;
// compute least squares terms directly
Vec4 const alphax_sum = MultiplyAdd( part2, onethird_onethird2, MultiplyAdd( part1, twothirds_twothirds2, part0 ) );
Vec4 const alpha2_sum = alphax_sum.SplatW();
Vec4 const betax_sum = MultiplyAdd( part1, onethird_onethird2, MultiplyAdd( part2, twothirds_twothirds2, part3 ) );
Vec4 const beta2_sum = betax_sum.SplatW();
Vec4 const alphabeta_sum = twonineths*( part1 + part2 ).SplatW();
// compute the least-squares optimal points
Vec4 factor = Reciprocal( NegativeMultiplySubtract( alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum ) );
Vec4 a = NegativeMultiplySubtract( betax_sum, alphabeta_sum, alphax_sum*beta2_sum )*factor;
Vec4 b = NegativeMultiplySubtract( alphax_sum, alphabeta_sum, betax_sum*alpha2_sum )*factor;
// clamp to the grid
a = Min( one, Max( zero, a ) );
b = Min( one, Max( zero, b ) );
a = Truncate( MultiplyAdd( grid, a, half ) )*gridrcp;
b = Truncate( MultiplyAdd( grid, b, half ) )*gridrcp;
// compute the error (we skip the constant xxsum)
Vec4 e1 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
Vec4 e2 = NegativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum );
Vec4 e3 = NegativeMultiplySubtract( b, betax_sum, e2 );
Vec4 e4 = MultiplyAdd( two, e3, e1 );
// apply the metric to the error term
Vec4 e5 = e4*m_metric;
Vec4 error = e5.SplatX() + e5.SplatY() + e5.SplatZ();
// keep the solution if it wins
if( CompareAnyLessThan( error, besterror ) )
{
beststart = a;
bestend = b;
besterror = error;
besti = i;
bestj = j;
bestk = k;
bestiteration = iterationIndex;
}
// advance
if( k == count )
break;
part2 += m_points_weights[k];
++k;
}
// advance
if( j == count )
break;
part1 += m_points_weights[j];
++j;
}
// advance
part0 += m_points_weights[i];
}
// stop if we didn't improve in this iteration
if( bestiteration != iterationIndex )
break;
// advance if possible
++iterationIndex;
if( iterationIndex == m_iterationCount )
break;
// stop if a new iteration is an ordering that has already been tried
Vec3 axis = ( bestend - beststart ).GetVec3();
if( !ConstructOrdering( axis, iterationIndex ) )
break;
}
// save the block if necessary
if( CompareAnyLessThan( besterror, m_besterror ) )
{
// remap the indices
u8 const* order = ( u8* )m_order + 16*bestiteration;
u8 unordered[16];
for( int m = 0; m < besti; ++m )
unordered[order[m]] = 0;
for( int m = besti; m < bestj; ++m )
unordered[order[m]] = 2;
for( int m = bestj; m < bestk; ++m )
unordered[order[m]] = 3;
for( int m = bestk; m < count; ++m )
unordered[order[m]] = 1;
m_colours->RemapIndices( unordered, bestindices );
// save the block
WriteColourBlock4( beststart.GetVec3(), bestend.GetVec3(), bestindices, block );
// save the error
m_besterror = besterror;
}
}
} // namespace squish
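ConstructOrdering is the core of the cluster fit: project every colour onto the principal axis, stable-sort by the projections, and let Compress3/Compress4 sweep cluster boundaries over that total order. A standalone sketch of the projection-and-sort step, using plain arrays instead of squish's Vec3 (points and axis are made up):

#include <cstdio>
#include <utility>

int main()
{
    const float axis[3] = { 1.0f, 0.0f, 0.0f }; /* stand-in principal axis */
    const float pts[4][3] = {
        { 0.9f, 0.1f, 0.1f }, { 0.1f, 0.2f, 0.3f },
        { 0.5f, 0.5f, 0.5f }, { 0.2f, 0.8f, 0.1f },
    };
    float dps[4];
    unsigned char order[4];
    for (int i = 0; i < 4; ++i) {
        dps[i] = pts[i][0]*axis[0] + pts[i][1]*axis[1] + pts[i][2]*axis[2];
        order[i] = (unsigned char)i;
    }
    /* Stable insertion sort by dot product, as in ConstructOrdering. */
    for (int i = 0; i < 4; ++i)
        for (int j = i; j > 0 && dps[j] < dps[j - 1]; --j) {
            std::swap(dps[j], dps[j - 1]);
            std::swap(order[j], order[j - 1]);
        }
    for (int i = 0; i < 4; ++i)
        printf("order[%d] = %d (dot %.2f)\n", i, order[i], dps[i]);
    return 0; /* order: 1, 3, 2, 0 */
}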

61
3rdparty/libsquish/clusterfit.h vendored Normal file

@@ -0,0 +1,61 @@
/* -----------------------------------------------------------------------------
Copyright (c) 2006 Simon Brown si@sjbrown.co.uk
Copyright (c) 2007 Ignacio Castano icastano@nvidia.com
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------- */
#ifndef SQUISH_CLUSTERFIT_H
#define SQUISH_CLUSTERFIT_H
#include "squish.h"
#include "maths.h"
#include "simd.h"
#include "colourfit.h"
namespace squish {
class ClusterFit : public ColourFit
{
public:
ClusterFit( ColourSet const* colours, int flags, float* metric );
private:
bool ConstructOrdering( Vec3 const& axis, int iteration );
virtual void Compress3( void* block );
virtual void Compress4( void* block );
enum { kMaxIterations = 8 };
int m_iterationCount;
Vec3 m_principle;
u8 m_order[16*kMaxIterations];
Vec4 m_points_weights[16];
Vec4 m_xsum_wsum;
Vec4 m_metric;
Vec4 m_besterror;
};
} // namespace squish
#endif // ndef SQUISH_CLUSTERFIT_H

214
3rdparty/libsquish/colourblock.cpp vendored Normal file

@@ -0,0 +1,214 @@
/* -----------------------------------------------------------------------------
Copyright (c) 2006 Simon Brown si@sjbrown.co.uk
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------- */
#include "colourblock.h"
namespace squish {
static int FloatToInt( float a, int limit )
{
// use ANSI round-to-zero behaviour to get round-to-nearest
int i = ( int )( a + 0.5f );
// clamp to the limit
if( i < 0 )
i = 0;
else if( i > limit )
i = limit;
// done
return i;
}
static int FloatTo565( Vec3::Arg colour )
{
// get the components in the correct range
int r = FloatToInt( 31.0f*colour.X(), 31 );
int g = FloatToInt( 63.0f*colour.Y(), 63 );
int b = FloatToInt( 31.0f*colour.Z(), 31 );
// pack into a single value
return ( r << 11 ) | ( g << 5 ) | b;
}
static void WriteColourBlock( int a, int b, u8* indices, void* block )
{
// get the block as bytes
u8* bytes = ( u8* )block;
// write the endpoints
bytes[0] = ( u8 )( a & 0xff );
bytes[1] = ( u8 )( a >> 8 );
bytes[2] = ( u8 )( b & 0xff );
bytes[3] = ( u8 )( b >> 8 );
// write the indices
for( int i = 0; i < 4; ++i )
{
u8 const* ind = indices + 4*i;
bytes[4 + i] = ind[0] | ( ind[1] << 2 ) | ( ind[2] << 4 ) | ( ind[3] << 6 );
}
}
void WriteColourBlock3( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block )
{
// get the packed values
int a = FloatTo565( start );
int b = FloatTo565( end );
// remap the indices
u8 remapped[16];
if( a <= b )
{
// use the indices directly
for( int i = 0; i < 16; ++i )
remapped[i] = indices[i];
}
else
{
// swap a and b
std::swap( a, b );
for( int i = 0; i < 16; ++i )
{
if( indices[i] == 0 )
remapped[i] = 1;
else if( indices[i] == 1 )
remapped[i] = 0;
else
remapped[i] = indices[i];
}
}
// write the block
WriteColourBlock( a, b, remapped, block );
}
void WriteColourBlock4( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block )
{
// get the packed values
int a = FloatTo565( start );
int b = FloatTo565( end );
// remap the indices
u8 remapped[16];
if( a < b )
{
// swap a and b
std::swap( a, b );
for( int i = 0; i < 16; ++i )
remapped[i] = ( indices[i] ^ 0x1 ) & 0x3;
}
else if( a == b )
{
// use index 0
for( int i = 0; i < 16; ++i )
remapped[i] = 0;
}
else
{
// use the indices directly
for( int i = 0; i < 16; ++i )
remapped[i] = indices[i];
}
// write the block
WriteColourBlock( a, b, remapped, block );
}
static int Unpack565( u8 const* packed, u8* colour )
{
// build the packed value
int value = ( int )packed[0] | ( ( int )packed[1] << 8 );
// get the components in the stored range
u8 red = ( u8 )( ( value >> 11 ) & 0x1f );
u8 green = ( u8 )( ( value >> 5 ) & 0x3f );
u8 blue = ( u8 )( value & 0x1f );
// scale up to 8 bits
colour[0] = ( red << 3 ) | ( red >> 2 );
colour[1] = ( green << 2 ) | ( green >> 4 );
colour[2] = ( blue << 3 ) | ( blue >> 2 );
colour[3] = 255;
// return the value
return value;
}
void DecompressColour( u8* rgba, void const* block, bool isDxt1 )
{
// get the block bytes
u8 const* bytes = reinterpret_cast< u8 const* >( block );
// unpack the endpoints
u8 codes[16];
int a = Unpack565( bytes, codes );
int b = Unpack565( bytes + 2, codes + 4 );
// generate the midpoints
for( int i = 0; i < 3; ++i )
{
int c = codes[i];
int d = codes[4 + i];
if( isDxt1 && a <= b )
{
codes[8 + i] = ( u8 )( ( c + d )/2 );
codes[12 + i] = 0;
}
else
{
codes[8 + i] = ( u8 )( ( 2*c + d )/3 );
codes[12 + i] = ( u8 )( ( c + 2*d )/3 );
}
}
// fill in alpha for the intermediate values
codes[8 + 3] = 255;
codes[12 + 3] = ( isDxt1 && a <= b ) ? 0 : 255;
// unpack the indices
u8 indices[16];
for( int i = 0; i < 4; ++i )
{
u8* ind = indices + 4*i;
u8 packed = bytes[4 + i];
ind[0] = packed & 0x3;
ind[1] = ( packed >> 2 ) & 0x3;
ind[2] = ( packed >> 4 ) & 0x3;
ind[3] = ( packed >> 6 ) & 0x3;
}
// store out the colours
for( int i = 0; i < 16; ++i )
{
u8 offset = 4*indices[i];
for( int j = 0; j < 4; ++j )
rgba[4*i + j] = codes[offset + j];
}
}
} // namespace squish
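FloatTo565 and Unpack565 round-trip through bit replication: the high bits of each 5- or 6-bit channel are copied into the freed low bits, so a full channel (0x1f or 0x3f) expands to exactly 255. A worked round trip for the colour (1.0, 0.5, 0.0):

#include <cstdio>

typedef unsigned char u8;

int main()
{
    /* Pack: r = round(31*1.0) = 31, g = round(63*0.5) = 32, b = 0. */
    int r = 31, g = 32, b = 0;
    int value = (r << 11) | (g << 5) | b;
    printf("packed = 0x%04x\n", value); /* 0xfc00 */

    /* Unpack with bit replication, as Unpack565 does. */
    u8 red   = (u8)((value >> 11) & 0x1f);
    u8 green = (u8)((value >> 5)  & 0x3f);
    u8 blue  = (u8)( value        & 0x1f);
    printf("rgb = %d %d %d\n",
           (red   << 3) | (red   >> 2),   /* 255 */
           (green << 2) | (green >> 4),   /* 130 */
           (blue  << 3) | (blue  >> 2));  /* 0   */
    return 0;
}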

41
3rdparty/libsquish/colourblock.h vendored Normal file

@@ -0,0 +1,41 @@
/* -----------------------------------------------------------------------------
Copyright (c) 2006 Simon Brown si@sjbrown.co.uk
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------- */
#ifndef SQUISH_COLOURBLOCK_H
#define SQUISH_COLOURBLOCK_H
#include "squish.h"
#include "maths.h"
namespace squish {
void WriteColourBlock3( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block );
void WriteColourBlock4( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block );
void DecompressColour( u8* rgba, void const* block, bool isDxt1 );
} // namespace squish
#endif // ndef SQUISH_COLOURBLOCK_H

54
3rdparty/libsquish/colourfit.cpp vendored Normal file

@@ -0,0 +1,54 @@
/* -----------------------------------------------------------------------------
Copyright (c) 2006 Simon Brown si@sjbrown.co.uk
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------- */
#include "colourfit.h"
#include "colourset.h"
namespace squish {
ColourFit::ColourFit( ColourSet const* colours, int flags )
: m_colours( colours ),
m_flags( flags )
{
}
ColourFit::~ColourFit()
{
}
void ColourFit::Compress( void* block )
{
bool isDxt1 = ( ( m_flags & kDxt1 ) != 0 );
if( isDxt1 )
{
Compress3( block );
if( !m_colours->IsTransparent() )
Compress4( block );
}
else
Compress4( block );
}
} // namespace squish

56
3rdparty/libsquish/colourfit.h vendored Normal file

@@ -0,0 +1,56 @@
/* -----------------------------------------------------------------------------
Copyright (c) 2006 Simon Brown si@sjbrown.co.uk
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------- */
#ifndef SQUISH_COLOURFIT_H
#define SQUISH_COLOURFIT_H
#include "squish.h"
#include "maths.h"
#include <climits>
namespace squish {
class ColourSet;
class ColourFit
{
public:
ColourFit( ColourSet const* colours, int flags );
virtual ~ColourFit();
void Compress( void* block );
protected:
virtual void Compress3( void* block ) = 0;
virtual void Compress4( void* block ) = 0;
ColourSet const* m_colours;
int m_flags;
};
} // namespace squish
#endif // ndef SQUISH_COLOURFIT_H

121
3rdparty/libsquish/colourset.cpp vendored Normal file

@@ -0,0 +1,121 @@
/* -----------------------------------------------------------------------------
Copyright (c) 2006 Simon Brown si@sjbrown.co.uk
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------- */
#include "colourset.h"
namespace squish {
ColourSet::ColourSet( u8 const* rgba, int mask, int flags )
: m_count( 0 ),
m_transparent( false )
{
// check the compression mode for dxt1
bool isDxt1 = ( ( flags & kDxt1 ) != 0 );
bool weightByAlpha = ( ( flags & kWeightColourByAlpha ) != 0 );
// create the minimal set
for( int i = 0; i < 16; ++i )
{
// check this pixel is enabled
int bit = 1 << i;
if( ( mask & bit ) == 0 )
{
m_remap[i] = -1;
continue;
}
// check for transparent pixels when using dxt1
if( isDxt1 && rgba[4*i + 3] < 128 )
{
m_remap[i] = -1;
m_transparent = true;
continue;
}
// loop over previous points for a match
for( int j = 0;; ++j )
{
// allocate a new point
if( j == i )
{
// normalise coordinates to [0,1]
float x = ( float )rgba[4*i] / 255.0f;
float y = ( float )rgba[4*i + 1] / 255.0f;
float z = ( float )rgba[4*i + 2] / 255.0f;
// ensure there is always non-zero weight even for zero alpha
float w = ( float )( rgba[4*i + 3] + 1 ) / 256.0f;
// add the point
m_points[m_count] = Vec3( x, y, z );
m_weights[m_count] = ( weightByAlpha ? w : 1.0f );
m_remap[i] = m_count;
// advance
++m_count;
break;
}
// check for a match
int oldbit = 1 << j;
bool match = ( ( mask & oldbit ) != 0 )
&& ( rgba[4*i] == rgba[4*j] )
&& ( rgba[4*i + 1] == rgba[4*j + 1] )
&& ( rgba[4*i + 2] == rgba[4*j + 2] )
&& ( rgba[4*j + 3] >= 128 || !isDxt1 );
if( match )
{
// get the index of the match
int index = m_remap[j];
// ensure there is always non-zero weight even for zero alpha
float w = ( float )( rgba[4*i + 3] + 1 ) / 256.0f;
// map to this point and increase the weight
m_weights[index] += ( weightByAlpha ? w : 1.0f );
m_remap[i] = index;
break;
}
}
}
// square root the weights
for( int i = 0; i < m_count; ++i )
m_weights[i] = std::sqrt( m_weights[i] );
}
void ColourSet::RemapIndices( u8 const* source, u8* target ) const
{
for( int i = 0; i < 16; ++i )
{
int j = m_remap[i];
if( j == -1 )
target[i] = 3;
else
target[i] = source[j];
}
}
} // namespace squish
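
A minimal sketch (not part of this commit, using plain ints as stand-ins for rgba values) of the deduplication idea in the ColourSet constructor above: identical enabled pixels share one palette entry, and the remap table maps each block position to that entry:

#include <cstdio>

int main()
{
    int pixels[4] = { 10, 20, 10, 30 }; // stand-ins for pixel colours
    int points[4], remap[4], count = 0;
    for( int i = 0; i < 4; ++i )
    {
        remap[i] = -1;
        for( int j = 0; j < count; ++j )
        {
            if( points[j] == pixels[i] )
                remap[i] = j; // match an existing point
        }
        if( remap[i] < 0 )
        {
            points[count] = pixels[i]; // allocate a new point
            remap[i] = count++;
        }
    }
    // prints count=3 remap=[0 1 0 2]
    std::printf( "count=%d remap=[%d %d %d %d]\n",
        count, remap[0], remap[1], remap[2], remap[3] );
    return 0;
}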

58
3rdparty/libsquish/colourset.h vendored Normal file

@@ -0,0 +1,58 @@
/* -----------------------------------------------------------------------------
Copyright (c) 2006 Simon Brown si@sjbrown.co.uk
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------- */
#ifndef SQUISH_COLOURSET_H
#define SQUISH_COLOURSET_H
#include "squish.h"
#include "maths.h"
namespace squish {
/*! @brief Represents a set of block colours
*/
class ColourSet
{
public:
ColourSet( u8 const* rgba, int mask, int flags );
int GetCount() const { return m_count; }
Vec3 const* GetPoints() const { return m_points; }
float const* GetWeights() const { return m_weights; }
bool IsTransparent() const { return m_transparent; }
void RemapIndices( u8 const* source, u8* target ) const;
private:
int m_count;
Vec3 m_points[16];
float m_weights[16];
int m_remap[16];
bool m_transparent;
};
} // namespace squish
#endif // ndef SQUISH_COLOURSET_H

49
3rdparty/libsquish/config.h vendored Normal file

@@ -0,0 +1,49 @@
/* -----------------------------------------------------------------------------
Copyright (c) 2006 Simon Brown si@sjbrown.co.uk
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------- */
#ifndef SQUISH_CONFIG_H
#define SQUISH_CONFIG_H
// Set to 1 when building squish to use Altivec instructions.
#ifndef SQUISH_USE_ALTIVEC
#define SQUISH_USE_ALTIVEC 0
#endif
// Set to 1 or 2 when building squish to use SSE or SSE2 instructions.
#ifndef SQUISH_USE_SSE
#define SQUISH_USE_SSE 0
#endif
// Internally set SQUISH_USE_SIMD when either Altivec or SSE is available.
#if SQUISH_USE_ALTIVEC && SQUISH_USE_SSE
#error "Cannot enable both Altivec and SSE!"
#endif
#if SQUISH_USE_ALTIVEC || SQUISH_USE_SSE
#define SQUISH_USE_SIMD 1
#else
#define SQUISH_USE_SIMD 0
#endif
#endif // ndef SQUISH_CONFIG_H

259
3rdparty/libsquish/maths.cpp vendored Normal file

@@ -0,0 +1,259 @@
/* -----------------------------------------------------------------------------
Copyright (c) 2006 Simon Brown si@sjbrown.co.uk
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------- */
/*! @file
The symmetric eigensystem solver algorithm is from
http://www.geometrictools.com/Documentation/EigenSymmetric3x3.pdf
*/
#include "maths.h"
#include "simd.h"
#include <cfloat>
namespace squish {
Sym3x3 ComputeWeightedCovariance( int n, Vec3 const* points, float const* weights )
{
// compute the centroid
float total = 0.0f;
Vec3 centroid( 0.0f );
for( int i = 0; i < n; ++i )
{
total += weights[i];
centroid += weights[i]*points[i];
}
if( total > FLT_EPSILON )
centroid /= total;
// accumulate the covariance matrix
Sym3x3 covariance( 0.0f );
for( int i = 0; i < n; ++i )
{
Vec3 a = points[i] - centroid;
Vec3 b = weights[i]*a;
covariance[0] += a.X()*b.X();
covariance[1] += a.X()*b.Y();
covariance[2] += a.X()*b.Z();
covariance[3] += a.Y()*b.Y();
covariance[4] += a.Y()*b.Z();
covariance[5] += a.Z()*b.Z();
}
// return it
return covariance;
}
#if 0
static Vec3 GetMultiplicity1Evector( Sym3x3 const& matrix, float evalue )
{
// compute M
Sym3x3 m;
m[0] = matrix[0] - evalue;
m[1] = matrix[1];
m[2] = matrix[2];
m[3] = matrix[3] - evalue;
m[4] = matrix[4];
m[5] = matrix[5] - evalue;
// compute U
Sym3x3 u;
u[0] = m[3]*m[5] - m[4]*m[4];
u[1] = m[2]*m[4] - m[1]*m[5];
u[2] = m[1]*m[4] - m[2]*m[3];
u[3] = m[0]*m[5] - m[2]*m[2];
u[4] = m[1]*m[2] - m[4]*m[0];
u[5] = m[0]*m[3] - m[1]*m[1];
// find the largest component
float mc = std::fabs( u[0] );
int mi = 0;
for( int i = 1; i < 6; ++i )
{
float c = std::fabs( u[i] );
if( c > mc )
{
mc = c;
mi = i;
}
}
// pick the column with this component
switch( mi )
{
case 0:
return Vec3( u[0], u[1], u[2] );
case 1:
case 3:
return Vec3( u[1], u[3], u[4] );
default:
return Vec3( u[2], u[4], u[5] );
}
}
static Vec3 GetMultiplicity2Evector( Sym3x3 const& matrix, float evalue )
{
// compute M
Sym3x3 m;
m[0] = matrix[0] - evalue;
m[1] = matrix[1];
m[2] = matrix[2];
m[3] = matrix[3] - evalue;
m[4] = matrix[4];
m[5] = matrix[5] - evalue;
// find the largest component
float mc = std::fabs( m[0] );
int mi = 0;
for( int i = 1; i < 6; ++i )
{
float c = std::fabs( m[i] );
if( c > mc )
{
mc = c;
mi = i;
}
}
// pick the first eigenvector based on this index
switch( mi )
{
case 0:
case 1:
return Vec3( -m[1], m[0], 0.0f );
case 2:
return Vec3( m[2], 0.0f, -m[0] );
case 3:
case 4:
return Vec3( 0.0f, -m[4], m[3] );
default:
return Vec3( 0.0f, -m[5], m[4] );
}
}
Vec3 ComputePrincipleComponent( Sym3x3 const& matrix )
{
// compute the cubic coefficients
float c0 = matrix[0]*matrix[3]*matrix[5]
+ 2.0f*matrix[1]*matrix[2]*matrix[4]
- matrix[0]*matrix[4]*matrix[4]
- matrix[3]*matrix[2]*matrix[2]
- matrix[5]*matrix[1]*matrix[1];
float c1 = matrix[0]*matrix[3] + matrix[0]*matrix[5] + matrix[3]*matrix[5]
- matrix[1]*matrix[1] - matrix[2]*matrix[2] - matrix[4]*matrix[4];
float c2 = matrix[0] + matrix[3] + matrix[5];
// compute the quadratic coefficients
float a = c1 - ( 1.0f/3.0f )*c2*c2;
float b = ( -2.0f/27.0f )*c2*c2*c2 + ( 1.0f/3.0f )*c1*c2 - c0;
// compute the root count check
float Q = 0.25f*b*b + ( 1.0f/27.0f )*a*a*a;
// test the multiplicity
if( FLT_EPSILON < Q )
{
// only one root, which implies we have a multiple of the identity
return Vec3( 1.0f );
}
else if( Q < -FLT_EPSILON )
{
// three distinct roots
float theta = std::atan2( std::sqrt( -Q ), -0.5f*b );
float rho = std::sqrt( 0.25f*b*b - Q );
float rt = std::pow( rho, 1.0f/3.0f );
float ct = std::cos( theta/3.0f );
float st = std::sin( theta/3.0f );
float l1 = ( 1.0f/3.0f )*c2 + 2.0f*rt*ct;
float l2 = ( 1.0f/3.0f )*c2 - rt*( ct + std::sqrt( 3.0f )*st );
float l3 = ( 1.0f/3.0f )*c2 - rt*( ct - std::sqrt( 3.0f )*st );
// pick the larger
if( std::fabs( l2 ) > std::fabs( l1 ) )
l1 = l2;
if( std::fabs( l3 ) > std::fabs( l1 ) )
l1 = l3;
// get the eigenvector
return GetMultiplicity1Evector( matrix, l1 );
}
else // if( -FLT_EPSILON <= Q && Q <= FLT_EPSILON )
{
// two roots
float rt;
if( b < 0.0f )
rt = -std::pow( -0.5f*b, 1.0f/3.0f );
else
rt = std::pow( 0.5f*b, 1.0f/3.0f );
float l1 = ( 1.0f/3.0f )*c2 + rt; // repeated
float l2 = ( 1.0f/3.0f )*c2 - 2.0f*rt;
// get the eigenvector
if( std::fabs( l1 ) > std::fabs( l2 ) )
return GetMultiplicity2Evector( matrix, l1 );
else
return GetMultiplicity1Evector( matrix, l2 );
}
}
#else
#define POWER_ITERATION_COUNT 8
Vec3 ComputePrincipleComponent( Sym3x3 const& matrix )
{
Vec4 const row0( matrix[0], matrix[1], matrix[2], 0.0f );
Vec4 const row1( matrix[1], matrix[3], matrix[4], 0.0f );
Vec4 const row2( matrix[2], matrix[4], matrix[5], 0.0f );
Vec4 v = VEC4_CONST( 1.0f );
for( int i = 0; i < POWER_ITERATION_COUNT; ++i )
{
// matrix multiply
Vec4 w = row0*v.SplatX();
w = MultiplyAdd(row1, v.SplatY(), w);
w = MultiplyAdd(row2, v.SplatZ(), w);
// get max component from xyz in all channels
Vec4 a = Max(w.SplatX(), Max(w.SplatY(), w.SplatZ()));
// divide through and advance
v = w*Reciprocal(a);
}
return v.GetVec3();
}
#endif
} // namespace squish
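
A minimal scalar sketch (not part of this commit) of the power iteration that ComputePrincipleComponent performs above: repeatedly multiply a start vector by the matrix and renormalise by the largest component, so the vector converges towards the dominant eigenvector. Like the Vec4 version, it assumes the dominant components stay positive, which holds for covariance matrices of colour data:

#include <algorithm>
#include <cstdio>

int main()
{
    // a symmetric 3x3 matrix in the same packed order as Sym3x3:
    // xx, xy, xz, yy, yz, zz
    float m[6] = { 4.0f, 1.0f, 0.0f, 3.0f, 1.0f, 2.0f };
    float v[3] = { 1.0f, 1.0f, 1.0f };
    for( int i = 0; i < 8; ++i ) // POWER_ITERATION_COUNT
    {
        // matrix multiply
        float w[3] =
        {
            m[0]*v[0] + m[1]*v[1] + m[2]*v[2],
            m[1]*v[0] + m[3]*v[1] + m[4]*v[2],
            m[2]*v[0] + m[4]*v[1] + m[5]*v[2]
        };
        // divide through by the largest component and advance
        float a = std::max( w[0], std::max( w[1], w[2] ) );
        for( int j = 0; j < 3; ++j )
            v[j] = w[j]/a;
    }
    std::printf( "principal axis ~ ( %f, %f, %f )\n", v[0], v[1], v[2] );
    return 0;
}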

233
3rdparty/libsquish/maths.h vendored Normal file

@@ -0,0 +1,233 @@
/* -----------------------------------------------------------------------------
Copyright (c) 2006 Simon Brown si@sjbrown.co.uk
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------- */
#ifndef SQUISH_MATHS_H
#define SQUISH_MATHS_H
#include <cmath>
#include <algorithm>
#include "config.h"
namespace squish {
class Vec3
{
public:
typedef Vec3 const& Arg;
Vec3()
{
}
explicit Vec3( float s )
{
m_x = s;
m_y = s;
m_z = s;
}
Vec3( float x, float y, float z )
{
m_x = x;
m_y = y;
m_z = z;
}
float X() const { return m_x; }
float Y() const { return m_y; }
float Z() const { return m_z; }
Vec3 operator-() const
{
return Vec3( -m_x, -m_y, -m_z );
}
Vec3& operator+=( Arg v )
{
m_x += v.m_x;
m_y += v.m_y;
m_z += v.m_z;
return *this;
}
Vec3& operator-=( Arg v )
{
m_x -= v.m_x;
m_y -= v.m_y;
m_z -= v.m_z;
return *this;
}
Vec3& operator*=( Arg v )
{
m_x *= v.m_x;
m_y *= v.m_y;
m_z *= v.m_z;
return *this;
}
Vec3& operator*=( float s )
{
m_x *= s;
m_y *= s;
m_z *= s;
return *this;
}
Vec3& operator/=( Arg v )
{
m_x /= v.m_x;
m_y /= v.m_y;
m_z /= v.m_z;
return *this;
}
Vec3& operator/=( float s )
{
float t = 1.0f/s;
m_x *= t;
m_y *= t;
m_z *= t;
return *this;
}
friend Vec3 operator+( Arg left, Arg right )
{
Vec3 copy( left );
return copy += right;
}
friend Vec3 operator-( Arg left, Arg right )
{
Vec3 copy( left );
return copy -= right;
}
friend Vec3 operator*( Arg left, Arg right )
{
Vec3 copy( left );
return copy *= right;
}
friend Vec3 operator*( Arg left, float right )
{
Vec3 copy( left );
return copy *= right;
}
friend Vec3 operator*( float left, Arg right )
{
Vec3 copy( right );
return copy *= left;
}
friend Vec3 operator/( Arg left, Arg right )
{
Vec3 copy( left );
return copy /= right;
}
friend Vec3 operator/( Arg left, float right )
{
Vec3 copy( left );
return copy /= right;
}
friend float Dot( Arg left, Arg right )
{
return left.m_x*right.m_x + left.m_y*right.m_y + left.m_z*right.m_z;
}
friend Vec3 Min( Arg left, Arg right )
{
return Vec3(
std::min( left.m_x, right.m_x ),
std::min( left.m_y, right.m_y ),
std::min( left.m_z, right.m_z )
);
}
friend Vec3 Max( Arg left, Arg right )
{
return Vec3(
std::max( left.m_x, right.m_x ),
std::max( left.m_y, right.m_y ),
std::max( left.m_z, right.m_z )
);
}
friend Vec3 Truncate( Arg v )
{
return Vec3(
v.m_x > 0.0f ? std::floor( v.m_x ) : std::ceil( v.m_x ),
v.m_y > 0.0f ? std::floor( v.m_y ) : std::ceil( v.m_y ),
v.m_z > 0.0f ? std::floor( v.m_z ) : std::ceil( v.m_z )
);
}
private:
float m_x;
float m_y;
float m_z;
};
inline float LengthSquared( Vec3::Arg v )
{
return Dot( v, v );
}
class Sym3x3
{
public:
Sym3x3()
{
}
Sym3x3( float s )
{
for( int i = 0; i < 6; ++i )
m_x[i] = s;
}
float operator[]( int index ) const
{
return m_x[index];
}
float& operator[]( int index )
{
return m_x[index];
}
private:
float m_x[6];
};
Sym3x3 ComputeWeightedCovariance( int n, Vec3 const* points, float const* weights );
Vec3 ComputePrincipleComponent( Sym3x3 const& matrix );
} // namespace squish
#endif // ndef SQUISH_MATHS_H

201
3rdparty/libsquish/rangefit.cpp vendored Normal file

@@ -0,0 +1,201 @@
/* -----------------------------------------------------------------------------
Copyright (c) 2006 Simon Brown si@sjbrown.co.uk
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------- */
#include "rangefit.h"
#include "colourset.h"
#include "colourblock.h"
#include <cfloat>
namespace squish {
RangeFit::RangeFit( ColourSet const* colours, int flags, float* metric )
: ColourFit( colours, flags )
{
// initialise the metric (old perceptual = 0.2126f, 0.7152f, 0.0722f)
if( metric )
m_metric = Vec3( metric[0], metric[1], metric[2] );
else
m_metric = Vec3( 1.0f );
// initialise the best error
m_besterror = FLT_MAX;
// cache some values
int const count = m_colours->GetCount();
Vec3 const* values = m_colours->GetPoints();
float const* weights = m_colours->GetWeights();
// get the covariance matrix
Sym3x3 covariance = ComputeWeightedCovariance( count, values, weights );
// compute the principle component
Vec3 principle = ComputePrincipleComponent( covariance );
// get the min and max range as the codebook endpoints
Vec3 start( 0.0f );
Vec3 end( 0.0f );
if( count > 0 )
{
float min, max;
// compute the range
start = end = values[0];
min = max = Dot( values[0], principle );
for( int i = 1; i < count; ++i )
{
float val = Dot( values[i], principle );
if( val < min )
{
start = values[i];
min = val;
}
else if( val > max )
{
end = values[i];
max = val;
}
}
}
// clamp the output to [0, 1]
Vec3 const one( 1.0f );
Vec3 const zero( 0.0f );
start = Min( one, Max( zero, start ) );
end = Min( one, Max( zero, end ) );
// clamp to the grid and save
Vec3 const grid( 31.0f, 63.0f, 31.0f );
Vec3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f );
Vec3 const half( 0.5f );
m_start = Truncate( grid*start + half )*gridrcp;
m_end = Truncate( grid*end + half )*gridrcp;
}
void RangeFit::Compress3( void* block )
{
// cache some values
int const count = m_colours->GetCount();
Vec3 const* values = m_colours->GetPoints();
// create a codebook
Vec3 codes[3];
codes[0] = m_start;
codes[1] = m_end;
codes[2] = 0.5f*m_start + 0.5f*m_end;
// match each point to the closest code
u8 closest[16];
float error = 0.0f;
for( int i = 0; i < count; ++i )
{
// find the closest code
float dist = FLT_MAX;
int idx = 0;
for( int j = 0; j < 3; ++j )
{
float d = LengthSquared( m_metric*( values[i] - codes[j] ) );
if( d < dist )
{
dist = d;
idx = j;
}
}
// save the index
closest[i] = ( u8 )idx;
// accumulate the error
error += dist;
}
// save this scheme if it wins
if( error < m_besterror )
{
// remap the indices
u8 indices[16];
m_colours->RemapIndices( closest, indices );
// save the block
WriteColourBlock3( m_start, m_end, indices, block );
// save the error
m_besterror = error;
}
}
void RangeFit::Compress4( void* block )
{
// cache some values
int const count = m_colours->GetCount();
Vec3 const* values = m_colours->GetPoints();
// create a codebook
Vec3 codes[4];
codes[0] = m_start;
codes[1] = m_end;
codes[2] = ( 2.0f/3.0f )*m_start + ( 1.0f/3.0f )*m_end;
codes[3] = ( 1.0f/3.0f )*m_start + ( 2.0f/3.0f )*m_end;
// match each point to the closest code
u8 closest[16];
float error = 0.0f;
for( int i = 0; i < count; ++i )
{
// find the closest code
float dist = FLT_MAX;
int idx = 0;
for( int j = 0; j < 4; ++j )
{
float d = LengthSquared( m_metric*( values[i] - codes[j] ) );
if( d < dist )
{
dist = d;
idx = j;
}
}
// save the index
closest[i] = ( u8 )idx;
// accumulate the error
error += dist;
}
// save this scheme if it wins
if( error < m_besterror )
{
// remap the indices
u8 indices[16];
m_colours->RemapIndices( closest, indices );
// save the block
WriteColourBlock4( m_start, m_end, indices, block );
// save the error
m_besterror = error;
}
}
} // namespace squish
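
A quick numeric sketch (not part of this commit) of the endpoint quantisation at the end of the RangeFit constructor: each channel is snapped to its 5- or 6-bit grid by scaling, rounding, and scaling back. Truncate rounds towards zero, and since grid*start + 0.5f is non-negative here, std::floor gives the same result:

#include <cmath>
#include <cstdio>

int main()
{
    float start = 0.5f;  // a normalised red value in [0,1]
    float grid = 31.0f;  // 5-bit red/blue grid; green uses 63
    float snapped = std::floor( grid*start + 0.5f )/grid;
    std::printf( "%f snaps to %f (= 16/31)\n", start, snapped );
    return 0;
}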

54
3rdparty/libsquish/rangefit.h vendored Normal file

@@ -0,0 +1,54 @@
/* -----------------------------------------------------------------------------
Copyright (c) 2006 Simon Brown si@sjbrown.co.uk
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------- */
#ifndef SQUISH_RANGEFIT_H
#define SQUISH_RANGEFIT_H
#include "squish.h"
#include "colourfit.h"
#include "maths.h"
namespace squish {
class ColourSet;
class RangeFit : public ColourFit
{
public:
RangeFit( ColourSet const* colours, int flags, float* metric );
private:
virtual void Compress3( void* block );
virtual void Compress4( void* block );
Vec3 m_metric;
Vec3 m_start;
Vec3 m_end;
float m_besterror;
};
} // namespace squish
#endif // ndef SQUISH_RANGEFIT_H

32
3rdparty/libsquish/simd.h vendored Normal file

@@ -0,0 +1,32 @@
/* -----------------------------------------------------------------------------
Copyright (c) 2006 Simon Brown si@sjbrown.co.uk
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------- */
#ifndef SQUISH_SIMD_H
#define SQUISH_SIMD_H
#include "maths.h"
#include "simd_float.h"
#endif // ndef SQUISH_SIMD_H

183
3rdparty/libsquish/simd_float.h vendored Normal file

@@ -0,0 +1,183 @@
/* -----------------------------------------------------------------------------
Copyright (c) 2006 Simon Brown si@sjbrown.co.uk
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------- */
#ifndef SQUISH_SIMD_FLOAT_H
#define SQUISH_SIMD_FLOAT_H
#include <algorithm>
namespace squish {
#define VEC4_CONST( X ) Vec4( X )
class Vec4
{
public:
typedef Vec4 const& Arg;
Vec4() {}
explicit Vec4( float s )
: m_x( s ),
m_y( s ),
m_z( s ),
m_w( s )
{
}
Vec4( float x, float y, float z, float w )
: m_x( x ),
m_y( y ),
m_z( z ),
m_w( w )
{
}
Vec3 GetVec3() const
{
return Vec3( m_x, m_y, m_z );
}
Vec4 SplatX() const { return Vec4( m_x ); }
Vec4 SplatY() const { return Vec4( m_y ); }
Vec4 SplatZ() const { return Vec4( m_z ); }
Vec4 SplatW() const { return Vec4( m_w ); }
Vec4& operator+=( Arg v )
{
m_x += v.m_x;
m_y += v.m_y;
m_z += v.m_z;
m_w += v.m_w;
return *this;
}
Vec4& operator-=( Arg v )
{
m_x -= v.m_x;
m_y -= v.m_y;
m_z -= v.m_z;
m_w -= v.m_w;
return *this;
}
Vec4& operator*=( Arg v )
{
m_x *= v.m_x;
m_y *= v.m_y;
m_z *= v.m_z;
m_w *= v.m_w;
return *this;
}
friend Vec4 operator+( Vec4::Arg left, Vec4::Arg right )
{
Vec4 copy( left );
return copy += right;
}
friend Vec4 operator-( Vec4::Arg left, Vec4::Arg right )
{
Vec4 copy( left );
return copy -= right;
}
friend Vec4 operator*( Vec4::Arg left, Vec4::Arg right )
{
Vec4 copy( left );
return copy *= right;
}
//! Returns a*b + c
friend Vec4 MultiplyAdd( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c )
{
return a*b + c;
}
//! Returns -( a*b - c )
friend Vec4 NegativeMultiplySubtract( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c )
{
return c - a*b;
}
friend Vec4 Reciprocal( Vec4::Arg v )
{
return Vec4(
1.0f/v.m_x,
1.0f/v.m_y,
1.0f/v.m_z,
1.0f/v.m_w
);
}
friend Vec4 Min( Vec4::Arg left, Vec4::Arg right )
{
return Vec4(
std::min( left.m_x, right.m_x ),
std::min( left.m_y, right.m_y ),
std::min( left.m_z, right.m_z ),
std::min( left.m_w, right.m_w )
);
}
friend Vec4 Max( Vec4::Arg left, Vec4::Arg right )
{
return Vec4(
std::max( left.m_x, right.m_x ),
std::max( left.m_y, right.m_y ),
std::max( left.m_z, right.m_z ),
std::max( left.m_w, right.m_w )
);
}
friend Vec4 Truncate( Vec4::Arg v )
{
return Vec4(
v.m_x > 0.0f ? std::floor( v.m_x ) : std::ceil( v.m_x ),
v.m_y > 0.0f ? std::floor( v.m_y ) : std::ceil( v.m_y ),
v.m_z > 0.0f ? std::floor( v.m_z ) : std::ceil( v.m_z ),
v.m_w > 0.0f ? std::floor( v.m_w ) : std::ceil( v.m_w )
);
}
friend bool CompareAnyLessThan( Vec4::Arg left, Vec4::Arg right )
{
return left.m_x < right.m_x
|| left.m_y < right.m_y
|| left.m_z < right.m_z
|| left.m_w < right.m_w;
}
private:
float m_x;
float m_y;
float m_z;
float m_w;
};
} // namespace squish
#endif // ndef SQUISH_SIMD_FLOAT_H

172
3rdparty/libsquish/singlecolourfit.cpp vendored Normal file

@@ -0,0 +1,172 @@
/* -----------------------------------------------------------------------------
Copyright (c) 2006 Simon Brown si@sjbrown.co.uk
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------- */
#include "singlecolourfit.h"
#include "colourset.h"
#include "colourblock.h"
namespace squish {
struct SourceBlock
{
u8 start;
u8 end;
u8 error;
};
struct SingleColourLookup
{
SourceBlock sources[2];
};
#include "singlecolourlookup.inl"
static int FloatToInt( float a, int limit )
{
// use ANSI round-to-zero behaviour to get round-to-nearest
int i = ( int )( a + 0.5f );
// clamp to the limit
if( i < 0 )
i = 0;
else if( i > limit )
i = limit;
// done
return i;
}
SingleColourFit::SingleColourFit( ColourSet const* colours, int flags )
: ColourFit( colours, flags )
{
// grab the single colour
Vec3 const* values = m_colours->GetPoints();
m_colour[0] = ( u8 )FloatToInt( 255.0f*values->X(), 255 );
m_colour[1] = ( u8 )FloatToInt( 255.0f*values->Y(), 255 );
m_colour[2] = ( u8 )FloatToInt( 255.0f*values->Z(), 255 );
// initialise the best error
m_besterror = INT_MAX;
}
void SingleColourFit::Compress3( void* block )
{
// build the table of lookups
SingleColourLookup const* const lookups[] =
{
lookup_5_3,
lookup_6_3,
lookup_5_3
};
// find the best end-points and index
ComputeEndPoints( lookups );
// build the block if we win
if( m_error < m_besterror )
{
// remap the indices
u8 indices[16];
m_colours->RemapIndices( &m_index, indices );
// save the block
WriteColourBlock3( m_start, m_end, indices, block );
// save the error
m_besterror = m_error;
}
}
void SingleColourFit::Compress4( void* block )
{
// build the table of lookups
SingleColourLookup const* const lookups[] =
{
lookup_5_4,
lookup_6_4,
lookup_5_4
};
// find the best end-points and index
ComputeEndPoints( lookups );
// build the block if we win
if( m_error < m_besterror )
{
// remap the indices
u8 indices[16];
m_colours->RemapIndices( &m_index, indices );
// save the block
WriteColourBlock4( m_start, m_end, indices, block );
// save the error
m_besterror = m_error;
}
}
void SingleColourFit::ComputeEndPoints( SingleColourLookup const* const* lookups )
{
// check each index combination (endpoint or intermediate)
m_error = INT_MAX;
for( int index = 0; index < 2; ++index )
{
// check the error for this codebook index
SourceBlock const* sources[3];
int error = 0;
for( int channel = 0; channel < 3; ++channel )
{
// grab the lookup table and index for this channel
SingleColourLookup const* lookup = lookups[channel];
int target = m_colour[channel];
// store a pointer to the source for this channel
sources[channel] = lookup[target].sources + index;
// accumulate the error
int diff = sources[channel]->error;
error += diff*diff;
}
// keep it if the error is lower
if( error < m_error )
{
m_start = Vec3(
( float )sources[0]->start/31.0f,
( float )sources[1]->start/63.0f,
( float )sources[2]->start/31.0f
);
m_end = Vec3(
( float )sources[0]->end/31.0f,
( float )sources[1]->end/63.0f,
( float )sources[2]->end/31.0f
);
m_index = ( u8 )( 2*index );
m_error = error;
}
}
}
} // namespace squish

58
3rdparty/libsquish/singlecolourfit.h vendored Normal file

@@ -0,0 +1,58 @@
/* -----------------------------------------------------------------------------
Copyright (c) 2006 Simon Brown si@sjbrown.co.uk
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------- */
#ifndef SQUISH_SINGLECOLOURFIT_H
#define SQUISH_SINGLECOLOURFIT_H
#include "squish.h"
#include "colourfit.h"
namespace squish {
class ColourSet;
struct SingleColourLookup;
class SingleColourFit : public ColourFit
{
public:
SingleColourFit( ColourSet const* colours, int flags );
private:
virtual void Compress3( void* block );
virtual void Compress4( void* block );
void ComputeEndPoints( SingleColourLookup const* const* lookups );
u8 m_colour[3];
Vec3 m_start;
Vec3 m_end;
u8 m_index;
int m_error;
int m_besterror;
};
} // namespace squish
#endif // ndef SQUISH_SINGLECOLOURFIT_H

1064
3rdparty/libsquish/singlecolourlookup.inl vendored Normal file

File diff suppressed because it is too large

260
3rdparty/libsquish/squish.cpp vendored Normal file

@@ -0,0 +1,260 @@
/* -----------------------------------------------------------------------------
Copyright (c) 2006 Simon Brown si@sjbrown.co.uk
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------- */
#include "squish.h"
#include "colourset.h"
#include "maths.h"
#include "rangefit.h"
#include "clusterfit.h"
#include "colourblock.h"
#include "alpha.h"
#include "singlecolourfit.h"
namespace squish {
static int FixFlags( int flags )
{
// grab the flag bits
int method = flags & ( kDxt1 | kDxt3 | kDxt5 | kBc4 | kBc5 );
int fit = flags & ( kColourIterativeClusterFit | kColourClusterFit | kColourRangeFit );
int extra = flags & kWeightColourByAlpha;
// set defaults
if ( method != kDxt3
&& method != kDxt5
&& method != kBc4
&& method != kBc5 )
{
method = kDxt1;
}
if( fit != kColourRangeFit && fit != kColourIterativeClusterFit )
fit = kColourClusterFit;
// done
return method | fit | extra;
}
void CompressMasked( u8 const* rgba, int mask, void* block, int flags, float* metric )
{
// fix any bad flags
flags = FixFlags( flags );
if ( ( flags & ( kBc4 | kBc5 ) ) != 0 )
{
u8 alpha[16*4];
for( int i = 0; i < 16; ++i )
{
alpha[i*4 + 3] = rgba[i*4 + 0]; // copy R to A
}
u8* rBlock = reinterpret_cast< u8* >( block );
CompressAlphaDxt5( alpha, mask, rBlock );
if ( ( flags & ( kBc5 ) ) != 0 )
{
for( int i = 0; i < 16; ++i )
{
alpha[i*4 + 3] = rgba[i*4 + 1]; // copy G to A
}
u8* gBlock = reinterpret_cast< u8* >( block ) + 8;
CompressAlphaDxt5( alpha, mask, gBlock );
}
return;
}
// get the block locations
void* colourBlock = block;
void* alphaBlock = block;
if( ( flags & ( kDxt3 | kDxt5 ) ) != 0 )
colourBlock = reinterpret_cast< u8* >( block ) + 8;
// create the minimal point set
ColourSet colours( rgba, mask, flags );
// check the compression type and compress colour
if( colours.GetCount() == 1 )
{
// always do a single colour fit
SingleColourFit fit( &colours, flags );
fit.Compress( colourBlock );
}
else if( ( flags & kColourRangeFit ) != 0 || colours.GetCount() == 0 )
{
// do a range fit
RangeFit fit( &colours, flags, metric );
fit.Compress( colourBlock );
}
else
{
// default to a cluster fit (could be iterative or not)
ClusterFit fit( &colours, flags, metric );
fit.Compress( colourBlock );
}
// compress alpha separately if necessary
if( ( flags & kDxt3 ) != 0 )
CompressAlphaDxt3( rgba, mask, alphaBlock );
else if( ( flags & kDxt5 ) != 0 )
CompressAlphaDxt5( rgba, mask, alphaBlock );
}
void Decompress( u8* rgba, void const* block, int flags )
{
// fix any bad flags
flags = FixFlags( flags );
// get the block locations
void const* colourBlock = block;
void const* alphaBlock = block;
if( ( flags & ( kDxt3 | kDxt5 ) ) != 0 )
colourBlock = reinterpret_cast< u8 const* >( block ) + 8;
// decompress colour
DecompressColour( rgba, colourBlock, ( flags & kDxt1 ) != 0 );
// decompress alpha separately if necessary
if( ( flags & kDxt3 ) != 0 )
DecompressAlphaDxt3( rgba, alphaBlock );
else if( ( flags & kDxt5 ) != 0 )
DecompressAlphaDxt5( rgba, alphaBlock );
}
int GetStorageRequirements( int width, int height, int flags )
{
// fix any bad flags
flags = FixFlags( flags );
// compute the storage requirements
int blockcount = ( ( width + 3 )/4 ) * ( ( height + 3 )/4 );
int blocksize = ( ( flags & ( kDxt1 | kBc4 ) ) != 0 ) ? 8 : 16;
return blockcount*blocksize;
}
void CompressImage( u8 const* rgba, int width, int height, void* blocks, int flags, float* metric )
{
// fix any bad flags
flags = FixFlags( flags );
// initialise the block output
u8* targetBlock = reinterpret_cast< u8* >( blocks );
int bytesPerBlock = ( ( flags & ( kDxt1 | kBc4 ) ) != 0 ) ? 8 : 16;
// loop over blocks
for( int y = 0; y < height; y += 4 )
{
for( int x = 0; x < width; x += 4 )
{
// build the 4x4 block of pixels
u8 sourceRgba[16*4];
u8* targetPixel = sourceRgba;
int mask = 0;
for( int py = 0; py < 4; ++py )
{
for( int px = 0; px < 4; ++px )
{
// get the source pixel in the image
int sx = x + px;
int sy = y + py;
// enable if we're in the image
if( sx < width && sy < height )
{
// copy the rgba value
u8 const* sourcePixel = rgba + 4*( width*sy + sx );
for( int i = 0; i < 4; ++i )
*targetPixel++ = *sourcePixel++;
// enable this pixel
mask |= ( 1 << ( 4*py + px ) );
}
else
{
// skip this pixel as it's outside the image
targetPixel += 4;
}
}
}
// compress it into the output
CompressMasked( sourceRgba, mask, targetBlock, flags, metric );
// advance
targetBlock += bytesPerBlock;
}
}
}
void DecompressImage( u8* rgba, int width, int height, void const* blocks, int flags )
{
// fix any bad flags
flags = FixFlags( flags );
// initialise the block input
u8 const* sourceBlock = reinterpret_cast< u8 const* >( blocks );
int bytesPerBlock = ( ( flags & ( kDxt1 | kBc4 ) ) != 0 ) ? 8 : 16;
// loop over blocks
for( int y = 0; y < height; y += 4 )
{
for( int x = 0; x < width; x += 4 )
{
// decompress the block
u8 targetRgba[4*16];
Decompress( targetRgba, sourceBlock, flags );
// write the decompressed pixels to the correct image locations
u8 const* sourcePixel = targetRgba;
for( int py = 0; py < 4; ++py )
{
for( int px = 0; px < 4; ++px )
{
// get the target location
int sx = x + px;
int sy = y + py;
if( sx < width && sy < height )
{
u8* targetPixel = rgba + 4*( width*sy + sx );
// copy the rgba value
for( int i = 0; i < 4; ++i )
*targetPixel++ = *sourcePixel++;
}
else
{
// skip this pixel as it's outside the image
sourcePixel += 4;
}
}
}
// advance
sourceBlock += bytesPerBlock;
}
}
}
} // namespace squish
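
A minimal sketch (not part of this commit) of how CompressImage above builds the pixel-enable mask for a block that hangs off the image edge; for a block whose valid region is 2 pixels wide and 3 high, only those 6 bits end up set:

#include <cstdio>

int main()
{
    int validW = 2, validH = 3; // pixels of the block inside the image
    int mask = 0;
    for( int py = 0; py < 4; ++py )
    {
        for( int px = 0; px < 4; ++px )
        {
            if( px < validW && py < validH )
                mask |= 1 << ( 4*py + px );
        }
    }
    std::printf( "mask = 0x%04x\n", mask ); // prints mask = 0x0333
    return 0;
}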

269
3rdparty/libsquish/squish.h vendored Normal file

@@ -0,0 +1,269 @@
/* -----------------------------------------------------------------------------
Copyright (c) 2006 Simon Brown si@sjbrown.co.uk
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------- */
#ifndef SQUISH_H
#define SQUISH_H
//! All squish API functions live in this namespace.
namespace squish {
// -----------------------------------------------------------------------------
//! Typedef a quantity that is a single unsigned byte.
typedef unsigned char u8;
// -----------------------------------------------------------------------------
enum
{
//! Use DXT1 compression.
kDxt1 = ( 1 << 0 ),
//! Use DXT3 compression.
kDxt3 = ( 1 << 1 ),
//! Use DXT5 compression.
kDxt5 = ( 1 << 2 ),
//! Use BC4 compression.
kBc4 = ( 1 << 3 ),
//! Use BC5 compression.
kBc5 = ( 1 << 4 ),
//! Use a slow but high quality colour compressor (the default).
kColourClusterFit = ( 1 << 5 ),
//! Use a fast but low quality colour compressor.
kColourRangeFit = ( 1 << 6 ),
//! Weight the colour by alpha during cluster fit (disabled by default).
kWeightColourByAlpha = ( 1 << 7 ),
//! Use a very slow but very high quality colour compressor.
kColourIterativeClusterFit = ( 1 << 8 ),
};
// -----------------------------------------------------------------------------
/*! @brief Compresses a 4x4 block of pixels.
@param rgba The rgba values of the 16 source pixels.
@param mask The valid pixel mask.
@param block Storage for the compressed DXT block.
@param flags Compression flags.
@param metric An optional perceptual metric.
The source pixels should be presented as a contiguous array of 16 rgba
values, one byte per component. In memory this should be:
{ r1, g1, b1, a1, .... , r16, g16, b16, a16 }
The mask parameter enables only certain pixels within the block. The lowest
bit enables the first pixel and so on up to the 16th bit. Bits beyond the
16th bit are ignored. Pixels that are not enabled are allowed to take
arbitrary colours in the output block. An example of how this can be used
is in the CompressImage function to disable pixels outside the bounds of
the image when the width or height is not divisible by 4.
The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression,
however, DXT1 will be used by default if none is specified. When using DXT1
compression, 8 bytes of storage are required for the compressed DXT block.
DXT3 and DXT5 compression require 16 bytes of storage per block.
The flags parameter can also specify a preferred colour compressor to use
when fitting the RGB components of the data. Possible colour compressors
are: kColourClusterFit (the default), kColourRangeFit (very fast, low
quality) or kColourIterativeClusterFit (slowest, best quality).
When using kColourClusterFit or kColourIterativeClusterFit, an additional
flag can be specified to weight the importance of each pixel by its alpha
value. For images that are rendered using alpha blending, this can
significantly increase the perceived quality.
The metric parameter can be used to weight the relative importance of each
colour channel, or pass NULL to use the default uniform weight of
{ 1.0f, 1.0f, 1.0f }. This replaces the previous flag-based control that
allowed either uniform or "perceptual" weights with the fixed values
{ 0.2126f, 0.7152f, 0.0722f }. If non-NULL, the metric should point to a
contiguous array of 3 floats.
*/
void CompressMasked( u8 const* rgba, int mask, void* block, int flags, float* metric = 0 );
// -----------------------------------------------------------------------------
/*! @brief Compresses a 4x4 block of pixels.
@param rgba The rgba values of the 16 source pixels.
@param block Storage for the compressed DXT block.
@param flags Compression flags.
@param metric An optional perceptual metric.
The source pixels should be presented as a contiguous array of 16 rgba
values, one byte per component. In memory this should be:
{ r1, g1, b1, a1, .... , r16, g16, b16, a16 }
The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression,
however, DXT1 will be used by default if none is specified. When using DXT1
compression, 8 bytes of storage are required for the compressed DXT block.
DXT3 and DXT5 compression require 16 bytes of storage per block.
The flags parameter can also specify a preferred colour compressor to use
when fitting the RGB components of the data. Possible colour compressors
are: kColourClusterFit (the default), kColourRangeFit (very fast, low
quality) or kColourIterativeClusterFit (slowest, best quality).
When using kColourClusterFit or kColourIterativeClusterFit, an additional
flag can be specified to weight the importance of each pixel by its alpha
value. For images that are rendered using alpha blending, this can
significantly increase the perceived quality.
The metric parameter can be used to weight the relative importance of each
colour channel, or pass NULL to use the default uniform weight of
{ 1.0f, 1.0f, 1.0f }. This replaces the previous flag-based control that
allowed either uniform or "perceptual" weights with the fixed values
{ 0.2126f, 0.7152f, 0.0722f }. If non-NULL, the metric should point to a
contiguous array of 3 floats.
This method is an inline that calls CompressMasked with a mask of 0xffff,
provided for compatibility with older versions of squish.
*/
inline void Compress( u8 const* rgba, void* block, int flags, float* metric = 0 )
{
CompressMasked( rgba, 0xffff, block, flags, metric );
}
// -----------------------------------------------------------------------------
/*! @brief Decompresses a 4x4 block of pixels.
@param rgba Storage for the 16 decompressed pixels.
@param block The compressed DXT block.
@param flags Compression flags.
The decompressed pixels will be written as a contiguous array of 16 rgba
values, one byte per component. In memory this is:
{ r1, g1, b1, a1, .... , r16, g16, b16, a16 }
The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression,
however, DXT1 will be used by default if none is specified. All other flags
are ignored.
*/
void Decompress( u8* rgba, void const* block, int flags );
// -----------------------------------------------------------------------------
/*! @brief Computes the amount of compressed storage required.
@param width The width of the image.
@param height The height of the image.
@param flags Compression flags.
The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression,
however, DXT1 will be used by default if none is specified. All other flags
are ignored.
Most DXT images will be a multiple of 4 in each dimension, but this
function supports arbitrary size images by allowing the outer blocks to
be only partially used.
*/
int GetStorageRequirements( int width, int height, int flags );
// -----------------------------------------------------------------------------
/*! @brief Compresses an image in memory.
@param rgba The pixels of the source.
@param width The width of the source image.
@param height The height of the source image.
@param blocks Storage for the compressed output.
@param flags Compression flags.
@param metric An optional perceptual metric.
The source pixels should be presented as a contiguous array of width*height
rgba values, one byte per component. In memory this should be:
{ r1, g1, b1, a1, .... , rn, gn, bn, an } for n = width*height
The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression,
however, DXT1 will be used by default if none is specified. When using DXT1
compression, 8 bytes of storage are required for each compressed DXT block.
DXT3 and DXT5 compression require 16 bytes of storage per block.
The flags parameter can also specify a preferred colour compressor to use
when fitting the RGB components of the data. Possible colour compressors
are: kColourClusterFit (the default), kColourRangeFit (very fast, low
quality) or kColourIterativeClusterFit (slowest, best quality).
When using kColourClusterFit or kColourIterativeClusterFit, an additional
flag can be specified to weight the importance of each pixel by its alpha
value. For images that are rendered using alpha blending, this can
significantly increase the perceived quality.
The metric parameter can be used to weight the relative importance of each
colour channel, or pass NULL to use the default uniform weight of
{ 1.0f, 1.0f, 1.0f }. This replaces the previous flag-based control that
allowed either uniform or "perceptual" weights with the fixed values
{ 0.2126f, 0.7152f, 0.0722f }. If non-NULL, the metric should point to a
contiguous array of 3 floats.
Internally this function calls squish::CompressMasked for each block, which
allows for pixels outside the image to take arbitrary values. The function
squish::GetStorageRequirements can be called to compute the amount of memory
to allocate for the compressed output.
*/
void CompressImage( u8 const* rgba, int width, int height, void* blocks, int flags, float* metric = 0 );
// -----------------------------------------------------------------------------
/*! @brief Decompresses an image in memory.
@param rgba Storage for the decompressed pixels.
@param width The width of the source image.
@param height The height of the source image.
@param blocks The compressed DXT blocks.
@param flags Compression flags.
The decompressed pixels will be written as a contiguous array of width*height
rgba values, one byte per component. In memory this is:
{ r1, g1, b1, a1, .... , rn, gn, bn, an } for n = width*height
The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression,
however, DXT1 will be used by default if none is specified. All other flags
are ignored.
Internally this function calls squish::Decompress for each block.
*/
void DecompressImage( u8* rgba, int width, int height, void const* blocks, int flags );
// -----------------------------------------------------------------------------
} // namespace squish
#endif // ndef SQUISH_H
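The three entry points above are typically used together; here is a minimal round-trip sketch (an illustration, not part of this commit, assuming only the declarations in this header and its kDxt1/kDxt3/kDxt5 flags):

#include <vector>
#include "squish.h"

// Round-trip an RGBA image through DXT5: size the block buffer with
// GetStorageRequirements, compress, then decompress into a fresh buffer.
void roundTrip( squish::u8 const* rgba, int width, int height )
{
    int bytes = squish::GetStorageRequirements( width, height, squish::kDxt5 );
    std::vector<unsigned char> blocks( bytes );
    squish::CompressImage( rgba, width, height, blocks.data(), squish::kDxt5 );

    std::vector<squish::u8> pixels( size_t( width ) * height * 4 );
    squish::DecompressImage( pixels.data(), width, height, blocks.data(), squish::kDxt5 );
}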

10
3rdparty/lodepng/README.md vendored Normal file
View File

@@ -0,0 +1,10 @@
LodePNG
-------
PNG encoder and decoder in C and C++.
Home page: http://lodev.org/lodepng/
Only two files are needed to allow your program to read and write PNG files: lodepng.cpp and lodepng.h.
The other files in the project are just examples, unit tests, etc...

6224
3rdparty/lodepng/lodepng.cpp vendored Normal file

File diff suppressed because it is too large

1759
3rdparty/lodepng/lodepng.h vendored Normal file

File diff suppressed because it is too large

22
3rdparty/maratis-tcl/LICENSE vendored Normal file
View File

@@ -0,0 +1,22 @@
Maratis Tiny C library
Copyright (c) 2015 Anael Seghezzi <www.maratis3d.com>
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would
be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not
be misrepresented as being the original software.
3. This notice may not be removed or altered from any source
distribution.

2340
3rdparty/maratis-tcl/m_image.h vendored Normal file

File diff suppressed because it is too large

View File

@@ -0,0 +1,24 @@
NVIDIA Texture Tools 2.0 is licensed under the MIT license.
Copyright (c) 2007 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

75
3rdparty/nvtt/bc6h/bits.h vendored Normal file
View File

@@ -0,0 +1,75 @@
/*
Copyright 2007 nVidia, Inc.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License.
*/
#ifndef _ZOH_BITS_H
#define _ZOH_BITS_H
// read/write a bitstream
#include "nvcore/debug.h"
namespace ZOH {
class Bits
{
public:
Bits(char *data, int maxdatabits) { nvAssert (data && maxdatabits > 0); bptr = bend = 0; bits = data; maxbits = maxdatabits; readonly = 0;}
Bits(const char *data, int availdatabits) { nvAssert (data && availdatabits > 0); bptr = 0; bend = availdatabits; cbits = data; maxbits = availdatabits; readonly = 1;}
void write(int value, int nbits) {
nvAssert (nbits >= 0 && nbits < 32);
nvAssert (sizeof(int)>= 4);
for (int i=0; i<nbits; ++i)
writeone(value>>i);
}
int read(int nbits) {
nvAssert (nbits >= 0 && nbits < 32);
nvAssert (sizeof(int)>= 4);
int out = 0;
for (int i=0; i<nbits; ++i)
out |= readone() << i;
return out;
}
int getptr() { return bptr; }
void setptr(int ptr) { nvAssert (ptr >= 0 && ptr < maxbits); bptr = ptr; }
int getsize() { return bend; }
private:
int bptr; // next bit to read
int bend; // last written bit + 1
char *bits; // ptr to user bit stream
const char *cbits; // ptr to const user bit stream
int maxbits; // max size of user bit stream
char readonly; // 1 if this is a read-only stream
int readone() {
nvAssert (bptr < bend);
if (bptr >= bend) return 0;
int bit = (readonly ? cbits[bptr>>3] : bits[bptr>>3]) & (1 << (bptr & 7));
++bptr;
return bit != 0;
}
void writeone(int bit) {
nvAssert (!readonly); // "Writing a read-only bit stream"
nvAssert (bptr < maxbits);
if (bptr >= maxbits) return;
if (bit&1)
bits[bptr>>3] |= 1 << (bptr & 7);
else
bits[bptr>>3] &= ~(1 << (bptr & 7));
if (bptr++ >= bend) bend = bptr;
}
};
}
#endif
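A short sketch of how this stream is driven (an illustration, not part of this commit; 128 is the BC6H block size in bits). Values are written and read least-significant bit first, so reading back with the same field widths recovers them in order:

char block[16] = {};                 // one 128-bit BC6H block
ZOH::Bits out( block, 128 );
out.write( 0x03, 5 );                // 5-bit mode field
out.write( 11, 4 );                  // 4-bit index field

ZOH::Bits in( (const char*)block, 128 );
int mode  = in.read( 5 );            // == 0x03
int index = in.read( 4 );            // == 11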

133
3rdparty/nvtt/bc6h/shapes_two.h vendored Normal file
View File

@@ -0,0 +1,133 @@
/*
Copyright 2007 nVidia, Inc.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License.
*/
#pragma once
#ifndef _ZOH_SHAPES_TWO_H
#define _ZOH_SHAPES_TWO_H
// shapes for two regions
#define NREGIONS 2
#define NSHAPES 64
#define SHAPEBITS 6
static const int shapes[NSHAPES*16] =
{
0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1,
0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1,
0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1,
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1,
0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1,
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1,
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1,
1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,
0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1,
0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1,
1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1,
1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1,
0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0,
0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1,
1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0,
0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0,
1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1,
0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0,
0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0,
0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1,
1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1,
1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0,
1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0,
1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0,
1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0,
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1,
};
#define REGION(x,y,si) shapes[((si)&3)*4+((si)>>2)*64+(x)+(y)*16]
static const int shapeindex_to_compressed_indices[NSHAPES*2] =
{
0,15, 0,15, 0,15, 0,15,
0,15, 0,15, 0,15, 0,15,
0,15, 0,15, 0,15, 0,15,
0,15, 0,15, 0,15, 0,15,
0,15, 0, 2, 0, 8, 0, 2,
0, 2, 0, 8, 0, 8, 0,15,
0, 2, 0, 8, 0, 2, 0, 2,
0, 8, 0, 8, 0, 2, 0, 2,
0,15, 0,15, 0, 6, 0, 8,
0, 2, 0, 8, 0,15, 0,15,
0, 2, 0, 8, 0, 2, 0, 2,
0, 2, 0,15, 0,15, 0, 6,
0, 6, 0, 2, 0, 6, 0, 8,
0,15, 0,15, 0, 2, 0, 2,
0,15, 0,15, 0,15, 0,15,
0,15, 0, 2, 0, 2, 0,15
};
#define SHAPEINDEX_TO_COMPRESSED_INDICES(si,region) shapeindex_to_compressed_indices[(si)*2+(region)]
#endif
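The REGION macro above decodes the packing of the shapes table: each 64-entry row group interleaves four shapes, four columns wide each. A small sketch (an illustration, not part of this commit) that prints the 4x4 partition of any shape index:

#include <cstdio>

// Print the region id (0 or 1) of each pixel in the 4x4 tile for shape si.
static void dump_shape( int si )
{
    for (int y = 0; y < 4; ++y)
    {
        for (int x = 0; x < 4; ++x)
            printf( "%d", REGION( x, y, si ) );
        printf( "\n" );
    }
}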

82
3rdparty/nvtt/bc6h/tile.h vendored Normal file
View File

@@ -0,0 +1,82 @@
/*
Copyright 2007 nVidia, Inc.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License.
*/
#ifndef _ZOH_TILE_H
#define _ZOH_TILE_H
#include "zoh_utils.h"
#include "nvmath/vector.h"
#include <math.h>
namespace ZOH {
//#define USE_IMPORTANCE_MAP 1 // define this if you want to increase importance of some pixels in tile
class Tile
{
public:
// NOTE: this returns the appropriately-clamped BIT PATTERN of the half as an INTEGRAL float value
static float half2float(uint16 h)
{
return (float) Utils::ushort_to_format(h);
}
// NOTE: this is the inverse of the above operation
static uint16 float2half(float f)
{
return Utils::format_to_ushort((int)f);
}
// look for adjacent pixels that are identical. if there are enough of them, increase their importance
void generate_importance_map()
{
// initialize
for (int y=0; y<size_y; ++y)
for (int x=0; x<size_x; ++x)
{
// my importance is increased if I am identical to any of my 4-neighbors
importance_map[y][x] = match_4_neighbor(x,y) ? 5.0f : 1.0f;
}
}
bool is_equal(int x, int y, int xn, int yn)
{
if (xn < 0 || xn >= size_x || yn < 0 || yn >= size_y)
return false;
return( (data[y][x].x == data[yn][xn].x) &&
(data[y][x].y == data[yn][xn].y) &&
(data[y][x].z == data[yn][xn].z) );
}
#ifdef USE_IMPORTANCE_MAP
bool match_4_neighbor(int x, int y)
{
return is_equal(x,y,x-1,y) || is_equal(x,y,x+1,y) || is_equal(x,y,x,y-1) || is_equal(x,y,x,y+1);
}
#else
bool match_4_neighbor(int, int)
{
return false;
}
#endif
Tile() {};
~Tile(){};
Tile(int xs, int ys) {size_x = xs; size_y = ys;}
static const int TILE_H = 4;
static const int TILE_W = 4;
static const int TILE_TOTAL = TILE_H * TILE_W;
nv::Vector3 data[TILE_H][TILE_W];
float importance_map[TILE_H][TILE_W];
int size_x, size_y; // actual size of tile
};
}
#endif // _ZOH_TILE_H
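A worked example of the bit-pattern convention above (an illustration, not part of this commit; it assumes Utils::FORMAT has been set to UNSIGNED_F16, and uint16 comes from nvcore): the half value 1.0 has bit pattern 0x3C00, and it is that pattern, not the numeric value, that the tile carries as a float.

ZOH::Utils::FORMAT = ZOH::UNSIGNED_F16;
uint16 h = 0x3C00;                        // bit pattern of the half value 1.0
float  f = ZOH::Tile::half2float( h );    // == 15360.0f, the pattern as an integral float
uint16 r = ZOH::Tile::float2half( f );    // == 0x3C00 again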

197
3rdparty/nvtt/bc6h/zoh.cpp vendored Normal file
View File

@@ -0,0 +1,197 @@
/*
Copyright 2007 nVidia, Inc.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License.
*/
// the zoh compressor and decompressor
#include "tile.h"
#include "zoh.h"
#include <string.h> // memcpy
using namespace ZOH;
bool ZOH::isone(const char *block)
{
char code = block[0] & 0x1F;
return (code == 0x03 || code == 0x07 || code == 0x0b || code == 0x0f);
}
void ZOH::compress(const Tile &t, char *block)
{
char oneblock[ZOH::BLOCKSIZE], twoblock[ZOH::BLOCKSIZE];
float mseone = ZOH::compressone(t, oneblock);
float msetwo = ZOH::compresstwo(t, twoblock);
if (mseone <= msetwo)
memcpy(block, oneblock, ZOH::BLOCKSIZE);
else
memcpy(block, twoblock, ZOH::BLOCKSIZE);
}
void ZOH::decompress(const char *block, Tile &t)
{
if (ZOH::isone(block))
ZOH::decompressone(block, t);
else
ZOH::decompresstwo(block, t);
}
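// Usage sketch (an illustration, not part of the original file): fill a tile
// with clamped half bit patterns, compress it into one 16-byte block, and
// decompress the block back into a tile.
//
//   ZOH::Utils::FORMAT = ZOH::UNSIGNED_F16;
//   ZOH::Tile t(4, 4);
//   // ... fill t.data[y][x], then call t.generate_importance_map() ...
//   char block[ZOH::BLOCKSIZE];
//   ZOH::compress(t, block);     // tries one- and two-region, keeps lower MSE
//   ZOH::Tile out(4, 4);
//   ZOH::decompress(block, out); // routes on isone(block)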
/*
void ZOH::compress(string inf, string zohf)
{
Array2D<Rgba> pixels;
int w, h;
char block[ZOH::BLOCKSIZE];
Exr::readRgba(inf, pixels, w, h);
FILE *zohfile = fopen(zohf.c_str(), "wb");
if (zohfile == NULL) throw "Unable to open .zoh file for write";
// stuff for progress bar O.o
int ntiles = ((h+Tile::TILE_H-1)/Tile::TILE_H)*((w+Tile::TILE_W-1)/Tile::TILE_W);
int tilecnt = 0;
int ndots = 25;
int dotcnt = 0;
printf("Progress [");
for (int i=0; i<ndots;++i) printf(" ");
printf("]\rProgress ["); fflush(stdout);
// convert to tiles and compress each tile
for (int y=0; y<h; y+=Tile::TILE_H)
{
int ysize = min(Tile::TILE_H, h-y);
for (int x=0; x<w; x+=Tile::TILE_W)
{
int xsize = min(Tile::TILE_W, w-x);
Tile t(xsize, ysize);
t.insert(pixels, x, y);
ZOH::compress(t, block);
if (fwrite(block, sizeof(char), ZOH::BLOCKSIZE, zohfile) != ZOH::BLOCKSIZE)
throw "File error on write";
// progress bar
++tilecnt;
if (tilecnt > (ntiles * dotcnt)/ndots) { printf("."); fflush(stdout); ++dotcnt; }
}
}
printf("]\n"); // advance to next line finally
if (fclose(zohfile)) throw "Close failed on .zoh file";
}
static int str2int(std::string s)
{
int thing;
std::stringstream str (stringstream::in | stringstream::out);
str << s;
str >> thing;
return thing;
}
// zoh file name is ...-w-h.zoh, extract width and height
static void extract(string zohf, int &w, int &h)
{
size_t n = zohf.rfind('.', zohf.length()-1);
size_t n1 = zohf.rfind('-', n-1);
size_t n2 = zohf.rfind('-', n1-1);
string width = zohf.substr(n2+1, n1-n2-1);
w = str2int(width);
string height = zohf.substr(n1+1, n-n1-1);
h = str2int(height);
}
static int mode_to_prec[] = {
10,7,11,10,
10,7,11,11,
10,7,11,12,
10,7,9,16,
10,7,8,-1,
10,7,8,-1,
10,7,8,-1,
10,7,6,-1,
};
static int shapeindexhist[32], modehist[32], prechistone[16], prechisttwo[16], oneregion, tworegions;
static void stats(char block[ZOH::BLOCKSIZE])
{
char mode = block[0] & 0x1F; if ((mode & 0x3) == 0) mode = 0; if ((mode & 0x3) == 1) mode = 1; modehist[mode]++;
int prec = mode_to_prec[mode];
nvAssert (prec != -1);
if (!ZOH::isone(block))
{
tworegions++;
prechisttwo[prec]++;
int shapeindex = ((block[0] & 0xe0) >> 5) | ((block[1] & 0x3) << 3);
shapeindexhist[shapeindex]++;
}
else
{
oneregion++;
prechistone[prec]++;
}
}
static void printstats()
{
printf("\nPrecision histogram 10b to 16b one region: "); for (int i=10; i<=16; ++i) printf("%d,", prechistone[i]);
printf("\nPrecision histogram 6b to 11b two regions: "); for (int i=6; i<=11; ++i) printf("%d,", prechisttwo[i]);
printf("\nMode histogram: "); for (int i=0; i<32; ++i) printf("%d,", modehist[i]);
printf("\nShape index histogram: "); for (int i=0; i<32; ++i) printf("%d,", shapeindexhist[i]);
printf("\nOne region %5.2f%% Two regions %5.2f%%", 100.0*oneregion/float(oneregion+tworegions), 100.0*tworegions/float(oneregion+tworegions));
printf("\n");
}
void ZOH::decompress(string zohf, string outf)
{
Array2D<Rgba> pixels;
int w, h;
char block[ZOH::BLOCKSIZE];
extract(zohf, w, h);
FILE *zohfile = fopen(zohf.c_str(), "rb");
if (zohfile == NULL) throw "Unable to open .zoh file for read";
pixels.resizeErase(h, w);
// convert to tiles and decompress each tile
for (int y=0; y<h; y+=Tile::TILE_H)
{
int ysize = min(Tile::TILE_H, h-y);
for (int x=0; x<w; x+=Tile::TILE_W)
{
int xsize = min(Tile::TILE_W, w-x);
Tile t(xsize, ysize);
if (fread(block, sizeof(char), ZOH::BLOCKSIZE, zohfile) != ZOH::BLOCKSIZE)
throw "File error on read";
stats(block); // collect statistics
ZOH::decompress(block, t);
t.extract(pixels, x, y);
}
}
if (fclose(zohfile)) throw "Close failed on .zoh file";
Exr::writeRgba(outf, pixels, w, h);
#ifndef EXTERNAL_RELEASE
printstats(); // print statistics
#endif
}
*/

65
3rdparty/nvtt/bc6h/zoh.h vendored Normal file
View File

@@ -0,0 +1,65 @@
/*
Copyright 2007 nVidia, Inc.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License.
*/
#pragma once
#ifndef _ZOH_H
#define _ZOH_H
#include "tile.h"
namespace ZOH {
// UNUSED ZOH MODES are 0x13, 0x17, 0x1b, 0x1f
static const int NREGIONS_TWO = 2;
static const int NREGIONS_ONE = 1;
static const int NCHANNELS = 3;
struct FltEndpts
{
nv::Vector3 A;
nv::Vector3 B;
};
struct IntEndpts
{
int A[NCHANNELS];
int B[NCHANNELS];
};
struct ComprEndpts
{
uint A[NCHANNELS];
uint B[NCHANNELS];
};
static const int BLOCKSIZE=16;
static const int BITSIZE=128;
void compress(const Tile &t, char *block);
void decompress(const char *block, Tile &t);
float compressone(const Tile &t, char *block);
float compresstwo(const Tile &t, char *block);
void decompressone(const char *block, Tile &t);
void decompresstwo(const char *block, Tile &t);
float refinetwo(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_TWO], char *block);
float roughtwo(const Tile &tile, int shape, FltEndpts endpts[NREGIONS_TWO]);
float refineone(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_ONE], char *block);
float roughone(const Tile &tile, int shape, FltEndpts endpts[NREGIONS_ONE]);
bool isone(const char *block);
}
#endif // _ZOH_H

324
3rdparty/nvtt/bc6h/zoh_utils.cpp vendored Normal file
View File

@@ -0,0 +1,324 @@
/*
Copyright 2007 nVidia, Inc.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License.
*/
// Utility and common routines
#include "zoh_utils.h"
#include "nvmath/vector.inl"
#include <math.h>
using namespace nv;
using namespace ZOH;
static const int denom7_weights_64[] = {0, 9, 18, 27, 37, 46, 55, 64}; // divided by 64
static const int denom15_weights_64[] = {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64}; // divided by 64
/*static*/ Format Utils::FORMAT;
int Utils::lerp(int a, int b, int i, int denom)
{
nvDebugCheck (denom == 3 || denom == 7 || denom == 15);
nvDebugCheck (i >= 0 && i <= denom);
int round = 32, shift = 6;
const int *weights;
switch(denom)
{
case 3: denom *= 5; i *= 5; // fall through to case 15
case 15: weights = denom15_weights_64; break;
case 7: weights = denom7_weights_64; break;
default: nvDebugCheck(0);
}
return (a*weights[denom-i] +b*weights[i] + round) >> shift;
}
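// Worked example: lerp(0, 64, 3, 7) picks denom7 weights 37 and 27, giving
// (0*37 + 64*27 + 32) >> 6 == 27, i.e. exactly 64 * 27/64.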
Vector3 Utils::lerp(const Vector3& a, const Vector3 &b, int i, int denom)
{
nvDebugCheck (denom == 3 || denom == 7 || denom == 15);
nvDebugCheck (i >= 0 && i <= denom);
int shift = 6;
const int *weights;
switch(denom)
{
case 3: denom *= 5; i *= 5; // fall through to case 15
case 15: weights = denom15_weights_64; break;
case 7: weights = denom7_weights_64; break;
default: nvUnreachable();
}
// no need to round these as this is an exact division
return (a*float(weights[denom-i]) +b*float(weights[i])) / float(1 << shift);
}
/*
For unsigned f16, clamp the input to [0,F16MAX]. Thus u15.
For signed f16, clamp the input to [-F16MAX,F16MAX]. Thus s16.
The conversions proceed as follows:
unsigned f16: get bits. if high bit set, clamp to 0, else clamp to F16MAX.
signed f16: get bits. extract exp+mantissa and clamp to F16MAX. return -value if sign bit was set, else value
unsigned int: get bits. return as a positive value.
signed int. get bits. return as a value in -32768..32767.
The inverse conversions are just the inverse of the above.
*/
// clamp the 3 channels of the input vector to the allowable range based on FORMAT
// note that each channel is a float storing the allowable range as a bit pattern converted to float
// that is, for unsigned f16 say, we would clamp each channel to the range [0, F16MAX]
void Utils::clamp(Vector3 &v)
{
for (int i=0; i<3; ++i)
{
switch(Utils::FORMAT)
{
case UNSIGNED_F16:
if (v.component[i] < 0.0) v.component[i] = 0;
else if (v.component[i] > F16MAX) v.component[i] = F16MAX;
break;
case SIGNED_F16:
if (v.component[i] < -F16MAX) v.component[i] = -F16MAX;
else if (v.component[i] > F16MAX) v.component[i] = F16MAX;
break;
default:
nvUnreachable();
}
}
}
// convert a u16 value to s17 (represented as an int) based on the format expected
int Utils::ushort_to_format(unsigned short input)
{
int out, s;
// clamp to the valid range we are expecting
switch (Utils::FORMAT)
{
case UNSIGNED_F16:
if (input & F16S_MASK) out = 0;
else if (input > F16MAX) out = F16MAX;
else out = input;
break;
case SIGNED_F16:
s = input & F16S_MASK;
input &= F16EM_MASK;
if (input > F16MAX) out = F16MAX;
else out = input;
out = s ? -out : out;
break;
}
return out;
}
// convert a s17 value to u16 based on the format expected
unsigned short Utils::format_to_ushort(int input)
{
unsigned short out;
// clamp to the valid range we are expecting
switch (Utils::FORMAT)
{
case UNSIGNED_F16:
nvDebugCheck (input >= 0 && input <= F16MAX);
out = input;
break;
case SIGNED_F16:
nvDebugCheck (input >= -F16MAX && input <= F16MAX);
// convert to sign-magnitude
int s;
if (input < 0) { s = F16S_MASK; input = -input; }
else { s = 0; }
out = s | input;
break;
}
return out;
}
// quantize the input range into equal-sized bins
int Utils::quantize(float value, int prec)
{
int q, ivalue, s;
nvDebugCheck (prec > 1); // didn't bother to make it work for 1
value = (float)floor(value + 0.5);
int bias = (prec > 10) ? ((1<<(prec-1))-1) : 0; // bias precisions 11..16 to get a more accurate quantization
switch (Utils::FORMAT)
{
case UNSIGNED_F16:
nvDebugCheck (value >= 0 && value <= F16MAX);
ivalue = (int)value;
q = ((ivalue << prec) + bias) / (F16MAX+1);
nvDebugCheck (q >= 0 && q < (1 << prec));
break;
case SIGNED_F16:
nvDebugCheck (value >= -F16MAX && value <= F16MAX);
// convert to sign-magnitude
ivalue = (int)value;
if (ivalue < 0) { s = 1; ivalue = -ivalue; } else s = 0;
q = ((ivalue << (prec-1)) + bias) / (F16MAX+1);
if (s)
q = -q;
nvDebugCheck (q > -(1 << (prec-1)) && q < (1 << (prec-1)));
break;
}
return q;
}
int Utils::finish_unquantize(int q, int prec)
{
if (Utils::FORMAT == UNSIGNED_F16)
return (q * 31) >> 6; // scale the magnitude by 31/64
else if (Utils::FORMAT == SIGNED_F16)
return (q < 0) ? -(((-q) * 31) >> 5) : (q * 31) >> 5; // scale the magnitude by 31/32
else
return q;
}
// unquantize each bin to midpoint of original bin range, except
// for the end bins which we push to an endpoint of the bin range.
// we do this to ensure we can represent all possible original values.
// the asymmetric end bins do not affect PSNR for the test images.
//
// code this function assuming an arbitrary bit pattern as the encoded block
int Utils::unquantize(int q, int prec)
{
int unq, s;
nvDebugCheck (prec > 1); // not implemented for prec 1
switch (Utils::FORMAT)
{
// modify this case to move the multiplication by 31 after interpolation.
// Need to use finish_unquantize.
// since we have 16 bits available, let's unquantize this to 16 bits unsigned
// thus the scale factor is [0-7c00)/[0-10000) = 31/64
case UNSIGNED_F16:
if (prec >= 15)
unq = q;
else if (q == 0)
unq = 0;
else if (q == ((1<<prec)-1))
unq = U16MAX;
else
unq = (q * (U16MAX+1) + (U16MAX+1)/2) >> prec;
break;
// here, let's stick with S16 (no apparent quality benefit from going to S17)
// range is (-7c00..7c00)/(-8000..8000) = 31/32
case SIGNED_F16:
// don't remove this test even though it appears equivalent to the code below
// as it isn't -- the code below can overflow for prec = 16
if (prec >= 16)
unq = q;
else
{
if (q < 0) { s = 1; q = -q; } else s = 0;
if (q == 0)
unq = 0;
else if (q >= ((1<<(prec-1))-1))
unq = s ? -S16MAX : S16MAX;
else
{
unq = (q * (S16MAX+1) + (S16MAX+1)/2) >> (prec-1);
if (s)
unq = -unq;
}
}
break;
}
return unq;
}
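// Worked example (UNSIGNED_F16, prec = 10): quantize(31743.0f, 10) puts F16MAX
// in the top bin, q = 1023; unquantize(1023, 10) pushes that end bin to U16MAX
// (0xffff); and finish_unquantize scales by 31/64, (0xffff*31) >> 6 == 31743,
// recovering F16MAX exactly -- the lossless end-bin behavior described above.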
// pick a norm!
#define NORM_EUCLIDEAN 1
float Utils::norm(const Vector3 &a, const Vector3 &b)
{
#ifdef NORM_EUCLIDEAN
return lengthSquared(a - b);
#endif
#ifdef NORM_ABS
Vector3 err = a - b;
return fabs(err.x) + fabs(err.y) + fabs(err.z);
#endif
}
// parse <name>[<start>{:<end>}]{,}
// the pointer starts here ^
// name is 1 or 2 chars and matches field names. start and end are decimal numbers
void Utils::parse(const char *encoding, int &ptr, Field &field, int &endbit, int &len)
{
if (ptr <= 0) return;
--ptr;
if (encoding[ptr] == ',') --ptr;
nvDebugCheck (encoding[ptr] == ']');
--ptr;
endbit = 0;
int scale = 1;
while (encoding[ptr] != ':' && encoding[ptr] != '[')
{
nvDebugCheck(encoding[ptr] >= '0' && encoding[ptr] <= '9');
endbit += (encoding[ptr--] - '0') * scale;
scale *= 10;
}
int startbit = 0; scale = 1;
if (encoding[ptr] == '[')
startbit = endbit;
else
{
ptr--;
while (encoding[ptr] != '[')
{
nvDebugCheck(encoding[ptr] >= '0' && encoding[ptr] <= '9');
startbit += (encoding[ptr--] - '0') * scale;
scale *= 10;
}
}
len = startbit - endbit + 1; // note: startbit >= endbit
--ptr;
if (encoding[ptr] == 'm') field = FIELD_M;
else if (encoding[ptr] == 'd') field = FIELD_D;
else {
// it's wxyz
nvDebugCheck (encoding[ptr] >= 'w' && encoding[ptr] <= 'z');
int foo = encoding[ptr--] - 'w';
// now it is r g or b
if (encoding[ptr] == 'r') foo += 10;
else if (encoding[ptr] == 'g') foo += 20;
else if (encoding[ptr] == 'b') foo += 30;
else nvDebugCheck(0);
field = (Field) foo;
}
}
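// Worked example: with an encoding ending in "...rw[9:0],m[4:0]" and ptr at
// the end of the string, the first parse() call returns field = FIELD_M,
// endbit = 0, len = 5; the next returns FIELD_RW, endbit = 0, len = 10 --
// the string is consumed from the right, matching the order in which the
// header writers emit bits.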

72
3rdparty/nvtt/bc6h/zoh_utils.h vendored Normal file
View File

@@ -0,0 +1,72 @@
/*
Copyright 2007 nVidia, Inc.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License.
*/
// utility class holding common routines
#ifndef _ZOH_UTILS_H
#define _ZOH_UTILS_H
#include "nvmath/vector.h"
namespace ZOH {
inline int SIGN_EXTEND(int x, int nb) { return ((((signed(x))&(1<<((nb)-1)))?((~0)<<(nb)):0)|(signed(x))); }
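// e.g. SIGN_EXTEND(0x1F, 5) == -1, SIGN_EXTEND(0x0F, 5) == 15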
enum Field {
FIELD_M = 1, // mode
FIELD_D = 2, // distribution/shape
FIELD_RW = 10+0, FIELD_RX = 10+1, FIELD_RY = 10+2, FIELD_RZ = 10+3, // red channel endpoints or deltas
FIELD_GW = 20+0, FIELD_GX = 20+1, FIELD_GY = 20+2, FIELD_GZ = 20+3, // green channel endpoints or deltas
FIELD_BW = 30+0, FIELD_BX = 30+1, FIELD_BY = 30+2, FIELD_BZ = 30+3, // blue channel endpoints or deltas
};
// some constants
static const int F16S_MASK = 0x8000; // f16 sign mask
static const int F16EM_MASK = 0x7fff; // f16 exp & mantissa mask
static const int U16MAX = 0xffff;
static const int S16MIN = -0x8000;
static const int S16MAX = 0x7fff;
static const int INT16_MASK = 0xffff;
static const int F16MAX = 0x7bff; // MAXFLT bit pattern for halfs
enum Format { UNSIGNED_F16, SIGNED_F16 };
class Utils
{
public:
static Format FORMAT; // this is a global -- we're either handling unsigned or unsigned half values
// error metrics
static float norm(const nv::Vector3 &a, const nv::Vector3 &b);
static float mpsnr_norm(const nv::Vector3 &a, int exposure, const nv::Vector3 &b);
// conversion & clamp
static int ushort_to_format(unsigned short input);
static unsigned short format_to_ushort(int input);
// clamp to format
static void clamp(nv::Vector3 &v);
// quantization and unquantization
static int finish_unquantize(int q, int prec);
static int unquantize(int q, int prec);
static int quantize(float value, int prec);
static void parse(const char *encoding, int &ptr, Field & field, int &endbit, int &len);
// lerping
static int lerp(int a, int b, int i, int denom);
static nv::Vector3 lerp(const nv::Vector3 & a, const nv::Vector3 & b, int i, int denom);
};
}
#endif // _ZOH_UTILS_H

799
3rdparty/nvtt/bc6h/zohone.cpp vendored Normal file
View File

@@ -0,0 +1,799 @@
/*
Copyright 2007 nVidia, Inc.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License.
*/
// one region zoh compress/decompress code
// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
#include "bits.h"
#include "tile.h"
#include "zoh.h"
#include "zoh_utils.h"
#include "nvmath/vector.inl"
#include "nvmath/fitting.h"
#include <string.h> // strlen
#include <float.h> // FLT_MAX
using namespace nv;
using namespace ZOH;
#define NINDICES 16
#define INDEXBITS 4
#define HIGH_INDEXBIT (1<<(INDEXBITS-1))
#define DENOM (NINDICES-1)
#define NSHAPES 1
static const int shapes[NSHAPES] =
{
0x0000
}; // only 1 shape
#define REGION(x,y,shapeindex) ((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0)
#define POS_TO_X(pos) ((pos)&3)
#define POS_TO_Y(pos) (((pos)>>2)&3)
#define NDELTA 2
struct Chanpat
{
int prec[NDELTA]; // precision pattern for one channel
};
struct Pattern
{
Chanpat chan[NCHANNELS];// allow different bit patterns per channel -- but we still want constant precision per channel
int transformed; // if 0, deltas are unsigned and no transform; otherwise, signed and transformed
int mode; // associated mode value
int modebits; // number of mode bits
const char *encoding; // verilog description of encoding for this mode
};
#define MAXMODEBITS 5
#define MAXMODES (1<<MAXMODEBITS)
#define NPATTERNS 4
static const Pattern patterns[NPATTERNS] =
{
16,4, 16,4, 16,4, 1, 0x0f, 5, "bw[10],bw[11],bw[12],bw[13],bw[14],bw[15],bx[3:0],gw[10],gw[11],gw[12],gw[13],gw[14],gw[15],gx[3:0],rw[10],rw[11],rw[12],rw[13],rw[14],rw[15],rx[3:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
12,8, 12,8, 12,8, 1, 0x0b, 5, "bw[10],bw[11],bx[7:0],gw[10],gw[11],gx[7:0],rw[10],rw[11],rx[7:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
11,9, 11,9, 11,9, 1, 0x07, 5, "bw[10],bx[8:0],gw[10],gx[8:0],rw[10],rx[8:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
10,10, 10,10, 10,10, 0, 0x03, 5, "bx[9:0],gx[9:0],rx[9:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
};
// mapping of mode to the corresponding index in pattern
static const int mode_to_pat[MAXMODES] = {
-1,-1,-1,
3, // 0x03
-1,-1,-1,
2, // 0x07
-1,-1,-1,
1, // 0x0b
-1,-1,-1,
0, // 0x0f
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
};
#define R_0(ep) (ep)[0].A[i]
#define R_1(ep) (ep)[0].B[i]
#define MASK(n) ((1<<(n))-1)
// compress endpoints
static void compress_endpts(const IntEndpts in[NREGIONS_ONE], ComprEndpts out[NREGIONS_ONE], const Pattern &p)
{
if (p.transformed)
{
for (int i=0; i<NCHANNELS; ++i)
{
R_0(out) = R_0(in) & MASK(p.chan[i].prec[0]);
R_1(out) = (R_1(in) - R_0(in)) & MASK(p.chan[i].prec[1]);
}
}
else
{
for (int i=0; i<NCHANNELS; ++i)
{
R_0(out) = R_0(in) & MASK(p.chan[i].prec[0]);
R_1(out) = R_1(in) & MASK(p.chan[i].prec[1]);
}
}
}
// decompress endpoints
static void decompress_endpts(const ComprEndpts in[NREGIONS_ONE], IntEndpts out[NREGIONS_ONE], const Pattern &p)
{
bool issigned = Utils::FORMAT == SIGNED_F16;
if (p.transformed)
{
for (int i=0; i<NCHANNELS; ++i)
{
R_0(out) = issigned ? SIGN_EXTEND(R_0(in),p.chan[i].prec[0]) : R_0(in);
int t;
t = SIGN_EXTEND(R_1(in), p.chan[i].prec[1]);
t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);
R_1(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
}
}
else
{
for (int i=0; i<NCHANNELS; ++i)
{
R_0(out) = issigned ? SIGN_EXTEND(R_0(in),p.chan[i].prec[0]) : R_0(in);
R_1(out) = issigned ? SIGN_EXTEND(R_1(in),p.chan[i].prec[1]) : R_1(in);
}
}
}
static void quantize_endpts(const FltEndpts endpts[NREGIONS_ONE], int prec, IntEndpts q_endpts[NREGIONS_ONE])
{
for (int region = 0; region < NREGIONS_ONE; ++region)
{
q_endpts[region].A[0] = Utils::quantize(endpts[region].A.x, prec);
q_endpts[region].A[1] = Utils::quantize(endpts[region].A.y, prec);
q_endpts[region].A[2] = Utils::quantize(endpts[region].A.z, prec);
q_endpts[region].B[0] = Utils::quantize(endpts[region].B.x, prec);
q_endpts[region].B[1] = Utils::quantize(endpts[region].B.y, prec);
q_endpts[region].B[2] = Utils::quantize(endpts[region].B.z, prec);
}
}
// swap endpoints as needed to ensure that the indices at index_positions have a 0 high-order bit
// position 0 is pixel x=0 y=0 and position 15 is pixel x=3 y=3, so y = (pos >> 2) & 3 and x = pos & 3
static void swap_indices(IntEndpts endpts[NREGIONS_ONE], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
{
int index_positions[NREGIONS_ONE];
index_positions[0] = 0; // since WLOG we have the high bit of the shapes at 0
for (int region = 0; region < NREGIONS_ONE; ++region)
{
int x = index_positions[region] & 3;
int y = (index_positions[region] >> 2) & 3;
nvDebugCheck(REGION(x,y,shapeindex) == region); // double check the table
if (indices[y][x] & HIGH_INDEXBIT)
{
// high bit is set, swap the endpts and indices for this region
int t;
for (int i=0; i<NCHANNELS; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; }
for (int y = 0; y < Tile::TILE_H; y++)
for (int x = 0; x < Tile::TILE_W; x++)
if (REGION(x,y,shapeindex) == region)
indices[y][x] = NINDICES - 1 - indices[y][x];
}
}
}
// endpoints fit only if the compression was lossless
static bool endpts_fit(const IntEndpts orig[NREGIONS_ONE], const ComprEndpts compressed[NREGIONS_ONE], const Pattern &p)
{
IntEndpts uncompressed[NREGIONS_ONE];
decompress_endpts(compressed, uncompressed, p);
for (int j=0; j<NREGIONS_ONE; ++j)
for (int i=0; i<NCHANNELS; ++i)
{
if (orig[j].A[i] != uncompressed[j].A[i]) return false;
if (orig[j].B[i] != uncompressed[j].B[i]) return false;
}
return true;
}
static void write_header(const ComprEndpts endpts[NREGIONS_ONE], const Pattern &p, Bits &out)
{
// interpret the verilog backwards and process it
int m = p.mode;
int rw = endpts[0].A[0], rx = endpts[0].B[0];
int gw = endpts[0].A[1], gx = endpts[0].B[1];
int bw = endpts[0].A[2], bx = endpts[0].B[2];
int ptr = int(strlen(p.encoding));
while (ptr)
{
Field field;
int endbit, len;
// !!!UNDONE: get rid of string parsing!!!
Utils::parse(p.encoding, ptr, field, endbit, len);
switch(field)
{
case FIELD_M: out.write( m >> endbit, len); break;
case FIELD_RW: out.write(rw >> endbit, len); break;
case FIELD_RX: out.write(rx >> endbit, len); break;
case FIELD_GW: out.write(gw >> endbit, len); break;
case FIELD_GX: out.write(gx >> endbit, len); break;
case FIELD_BW: out.write(bw >> endbit, len); break;
case FIELD_BX: out.write(bx >> endbit, len); break;
case FIELD_D:
case FIELD_RY:
case FIELD_RZ:
case FIELD_GY:
case FIELD_GZ:
case FIELD_BY:
case FIELD_BZ:
default: nvUnreachable();
}
}
}
static void read_header(Bits &in, ComprEndpts endpts[NREGIONS_ONE], Pattern &p)
{
// reading isn't quite symmetric with writing -- we don't know the encoding until we decode the mode
int mode = in.read(2);
if (mode != 0x00 && mode != 0x01)
mode = (in.read(3) << 2) | mode;
int pat_index = mode_to_pat[mode];
nvDebugCheck (pat_index >= 0 && pat_index < NPATTERNS);
nvDebugCheck (in.getptr() == patterns[pat_index].modebits);
p = patterns[pat_index];
int d;
int rw, rx;
int gw, gx;
int bw, bx;
d = 0;
rw = rx = 0;
gw = gx = 0;
bw = bx = 0;
int ptr = int(strlen(p.encoding));
while (ptr)
{
Field field;
int endbit, len;
// !!!UNDONE: get rid of string parsing!!!
Utils::parse(p.encoding, ptr, field, endbit, len);
switch(field)
{
case FIELD_M: break; // already processed so ignore
case FIELD_RW: rw |= in.read(len) << endbit; break;
case FIELD_RX: rx |= in.read(len) << endbit; break;
case FIELD_GW: gw |= in.read(len) << endbit; break;
case FIELD_GX: gx |= in.read(len) << endbit; break;
case FIELD_BW: bw |= in.read(len) << endbit; break;
case FIELD_BX: bx |= in.read(len) << endbit; break;
case FIELD_D:
case FIELD_RY:
case FIELD_RZ:
case FIELD_GY:
case FIELD_GZ:
case FIELD_BY:
case FIELD_BZ:
default: nvUnreachable();
}
}
nvDebugCheck (in.getptr() == 128 - 63);
endpts[0].A[0] = rw; endpts[0].B[0] = rx;
endpts[0].A[1] = gw; endpts[0].B[1] = gx;
endpts[0].A[2] = bw; endpts[0].B[2] = bx;
}
// compress index 0
static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
{
for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
{
int x = POS_TO_X(pos);
int y = POS_TO_Y(pos);
out.write(indices[y][x], INDEXBITS - ((pos == 0) ? 1 : 0));
}
}
static void emit_block(const ComprEndpts endpts[NREGIONS_ONE], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
{
Bits out(block, ZOH::BITSIZE);
write_header(endpts, p, out);
write_indices(indices, shapeindex, out);
nvDebugCheck(out.getptr() == ZOH::BITSIZE);
}
static void generate_palette_quantized(const IntEndpts &endpts, int prec, Vector3 palette[NINDICES])
{
// scale endpoints
int a, b; // really need a IntVector3...
a = Utils::unquantize(endpts.A[0], prec);
b = Utils::unquantize(endpts.B[0], prec);
// interpolate
for (int i = 0; i < NINDICES; ++i)
palette[i].x = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
a = Utils::unquantize(endpts.A[1], prec);
b = Utils::unquantize(endpts.B[1], prec);
// interpolate
for (int i = 0; i < NINDICES; ++i)
palette[i].y = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
a = Utils::unquantize(endpts.A[2], prec);
b = Utils::unquantize(endpts.B[2], prec);
// interpolate
for (int i = 0; i < NINDICES; ++i)
palette[i].z = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
}
// position 0 was compressed
static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
{
for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
{
int x = POS_TO_X(pos);
int y = POS_TO_Y(pos);
indices[y][x]= in.read(INDEXBITS - ((pos == 0) ? 1 : 0));
}
}
void ZOH::decompressone(const char *block, Tile &t)
{
Bits in(block, ZOH::BITSIZE);
Pattern p;
IntEndpts endpts[NREGIONS_ONE];
ComprEndpts compr_endpts[NREGIONS_ONE];
read_header(in, compr_endpts, p);
int shapeindex = 0; // only one shape
decompress_endpts(compr_endpts, endpts, p);
Vector3 palette[NREGIONS_ONE][NINDICES];
for (int r = 0; r < NREGIONS_ONE; ++r)
generate_palette_quantized(endpts[r], p.chan[0].prec[0], &palette[r][0]);
// read indices
int indices[Tile::TILE_H][Tile::TILE_W];
read_indices(in, shapeindex, indices);
nvDebugCheck(in.getptr() == ZOH::BITSIZE);
// lookup
for (int y = 0; y < Tile::TILE_H; y++)
for (int x = 0; x < Tile::TILE_W; x++)
t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
}
// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
static float map_colors(const Vector3 colors[], const float importance[], int np, const IntEndpts &endpts, int prec)
{
Vector3 palette[NINDICES];
float toterr = 0;
Vector3 err;
generate_palette_quantized(endpts, prec, palette);
for (int i = 0; i < np; ++i)
{
float err, besterr;
besterr = Utils::norm(colors[i], palette[0]) * importance[i];
for (int j = 1; j < NINDICES && besterr > 0; ++j)
{
err = Utils::norm(colors[i], palette[j]) * importance[i];
if (err > besterr) // error increased, so we're done searching
break;
if (err < besterr)
besterr = err;
}
toterr += besterr;
}
return toterr;
}
// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
static void assign_indices(const Tile &tile, int shapeindex, IntEndpts endpts[NREGIONS_ONE], int prec,
int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS_ONE])
{
// build list of possibles
Vector3 palette[NREGIONS_ONE][NINDICES];
for (int region = 0; region < NREGIONS_ONE; ++region)
{
generate_palette_quantized(endpts[region], prec, &palette[region][0]);
toterr[region] = 0;
}
Vector3 err;
for (int y = 0; y < tile.size_y; y++)
for (int x = 0; x < tile.size_x; x++)
{
int region = REGION(x,y,shapeindex);
float err, besterr;
besterr = Utils::norm(tile.data[y][x], palette[region][0]);
indices[y][x] = 0;
for (int i = 1; i < NINDICES && besterr > 0; ++i)
{
err = Utils::norm(tile.data[y][x], palette[region][i]);
if (err > besterr) // error increased, so we're done searching
break;
if (err < besterr)
{
besterr = err;
indices[y][x] = i;
}
}
toterr[region] += besterr;
}
}
static float perturb_one(const Vector3 colors[], const float importance[], int np, int ch, int prec, const IntEndpts &old_endpts, IntEndpts &new_endpts,
float old_err, int do_b)
{
// we have the old endpoints: old_endpts
// we have the perturbed endpoints: new_endpts
// we have the temporary endpoints: temp_endpts
IntEndpts temp_endpts;
float min_err = old_err; // start with the best current error
int beststep;
// copy real endpoints so we can perturb them
for (int i=0; i<NCHANNELS; ++i) { temp_endpts.A[i] = new_endpts.A[i] = old_endpts.A[i]; temp_endpts.B[i] = new_endpts.B[i] = old_endpts.B[i]; }
// do a logarithmic search for the best error for this endpoint (which)
for (int step = 1 << (prec-1); step; step >>= 1)
{
bool improved = false;
for (int sign = -1; sign <= 1; sign += 2)
{
if (do_b == 0)
{
temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
continue;
}
else
{
temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
continue;
}
float err = map_colors(colors, importance, np, temp_endpts, prec);
if (err < min_err)
{
improved = true;
min_err = err;
beststep = sign * step;
}
}
// if this was an improvement, move the endpoint and continue search from there
if (improved)
{
if (do_b == 0)
new_endpts.A[ch] += beststep;
else
new_endpts.B[ch] += beststep;
}
}
return min_err;
}
static void optimize_one(const Vector3 colors[], const float importance[], int np, float orig_err, const IntEndpts &orig_endpts, int prec, IntEndpts &opt_endpts)
{
float opt_err = orig_err;
for (int ch = 0; ch < NCHANNELS; ++ch)
{
opt_endpts.A[ch] = orig_endpts.A[ch];
opt_endpts.B[ch] = orig_endpts.B[ch];
}
/*
err0 = perturb(rgb0, delta0)
err1 = perturb(rgb1, delta1)
if (err0 < err1)
if (err0 >= initial_error) break
rgb0 += delta0
next = 1
else
if (err1 >= initial_error) break
rgb1 += delta1
next = 0
initial_err = map()
for (;;)
err = perturb(next ? rgb1:rgb0, delta)
if (err >= initial_err) break
next? rgb1 : rgb0 += delta
initial_err = err
*/
IntEndpts new_a, new_b;
IntEndpts new_endpt;
int do_b;
// now optimize each channel separately
for (int ch = 0; ch < NCHANNELS; ++ch)
{
// figure out which endpoint when perturbed gives the most improvement and start there
// if we just alternate, we can easily end up in a local minima
float err0 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_a, opt_err, 0); // perturb endpt A
float err1 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_b, opt_err, 1); // perturb endpt B
if (err0 < err1)
{
if (err0 >= opt_err)
continue;
opt_endpts.A[ch] = new_a.A[ch];
opt_err = err0;
do_b = 1; // do B next
}
else
{
if (err1 >= opt_err)
continue;
opt_endpts.B[ch] = new_b.B[ch];
opt_err = err1;
do_b = 0; // do A next
}
// now alternate endpoints and keep trying until there is no improvement
for (;;)
{
float err = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_endpt, opt_err, do_b);
if (err >= opt_err)
break;
if (do_b == 0)
opt_endpts.A[ch] = new_endpt.A[ch];
else
opt_endpts.B[ch] = new_endpt.B[ch];
opt_err = err;
do_b = 1 - do_b; // now move the other endpoint
}
}
}
static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS_ONE],
const IntEndpts orig_endpts[NREGIONS_ONE], int prec, IntEndpts opt_endpts[NREGIONS_ONE])
{
Vector3 pixels[Tile::TILE_TOTAL];
float importance[Tile::TILE_TOTAL];
float err = 0;
for (int region=0; region<NREGIONS_ONE; ++region)
{
// collect the pixels in the region
int np = 0;
for (int y = 0; y < tile.size_y; y++) {
for (int x = 0; x < tile.size_x; x++) {
if (REGION(x, y, shapeindex) == region) {
pixels[np] = tile.data[y][x];
importance[np] = tile.importance_map[y][x];
++np;
}
}
}
optimize_one(pixels, importance, np, orig_err[region], orig_endpts[region], prec, opt_endpts[region]);
}
}
/* optimization algorithm
for each pattern
convert endpoints using pattern precision
assign indices and get initial error
compress indices (and possibly reorder endpoints)
transform endpoints
if transformed endpoints fit pattern
get original endpoints back
optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
compress new indices
transform new endpoints
if new endpoints fit pattern AND if error is improved
emit compressed block with new data
else
emit compressed block with original data // to try to preserve maximum endpoint precision
*/
float ZOH::refineone(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_ONE], char *block)
{
float orig_err[NREGIONS_ONE], opt_err[NREGIONS_ONE], orig_toterr, opt_toterr;
IntEndpts orig_endpts[NREGIONS_ONE], opt_endpts[NREGIONS_ONE];
ComprEndpts compr_orig[NREGIONS_ONE], compr_opt[NREGIONS_ONE];
int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
for (int sp = 0; sp < NPATTERNS; ++sp)
{
// precisions for all channels need to be the same
for (int i=1; i<NCHANNELS; ++i) nvDebugCheck (patterns[sp].chan[0].prec[0] == patterns[sp].chan[i].prec[0]);
quantize_endpts(endpts, patterns[sp].chan[0].prec[0], orig_endpts);
assign_indices(tile, shapeindex_best, orig_endpts, patterns[sp].chan[0].prec[0], orig_indices, orig_err);
swap_indices(orig_endpts, orig_indices, shapeindex_best);
compress_endpts(orig_endpts, compr_orig, patterns[sp]);
if (endpts_fit(orig_endpts, compr_orig, patterns[sp]))
{
optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, patterns[sp].chan[0].prec[0], opt_endpts);
assign_indices(tile, shapeindex_best, opt_endpts, patterns[sp].chan[0].prec[0], opt_indices, opt_err);
swap_indices(opt_endpts, opt_indices, shapeindex_best);
compress_endpts(opt_endpts, compr_opt, patterns[sp]);
orig_toterr = opt_toterr = 0;
for (int i=0; i < NREGIONS_ONE; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
if (endpts_fit(opt_endpts, compr_opt, patterns[sp]) && opt_toterr < orig_toterr)
{
emit_block(compr_opt, shapeindex_best, patterns[sp], opt_indices, block);
return opt_toterr;
}
else
{
// either it stopped fitting when we optimized it, or there was no improvement
// so go back to the unoptimized endpoints which we know will fit
emit_block(compr_orig, shapeindex_best, patterns[sp], orig_indices, block);
return orig_toterr;
}
}
}
nvAssert (false); // "No candidate found, should never happen (refineone.)";
return FLT_MAX;
}
static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS_ONE], Vector3 palette[NREGIONS_ONE][NINDICES])
{
for (int region = 0; region < NREGIONS_ONE; ++region)
for (int i = 0; i < NINDICES; ++i)
palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, DENOM);
}
// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS_ONE])
{
// build list of possibles
Vector3 palette[NREGIONS_ONE][NINDICES];
generate_palette_unquantized(endpts, palette);
float toterr = 0;
Vector3 err;
for (int y = 0; y < tile.size_y; y++)
for (int x = 0; x < tile.size_x; x++)
{
int region = REGION(x,y,shapeindex);
float err, besterr;
besterr = Utils::norm(tile.data[y][x], palette[region][0]) * tile.importance_map[y][x];
for (int i = 1; i < NINDICES && besterr > 0; ++i)
{
err = Utils::norm(tile.data[y][x], palette[region][i]) * tile.importance_map[y][x];
if (err > besterr) // error increased, so we're done searching
break;
if (err < besterr)
besterr = err;
}
toterr += besterr;
}
return toterr;
}
float ZOH::roughone(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_ONE])
{
for (int region=0; region<NREGIONS_ONE; ++region)
{
int np = 0;
Vector3 colors[Tile::TILE_TOTAL];
Vector3 mean(0,0,0);
for (int y = 0; y < tile.size_y; y++) {
for (int x = 0; x < tile.size_x; x++) {
if (REGION(x,y,shapeindex) == region)
{
colors[np] = tile.data[y][x];
mean += tile.data[y][x];
++np;
}
}
}
// handle simple cases
if (np == 0)
{
Vector3 zero(0,0,0);
endpts[region].A = zero;
endpts[region].B = zero;
continue;
}
else if (np == 1)
{
endpts[region].A = colors[0];
endpts[region].B = colors[0];
continue;
}
else if (np == 2)
{
endpts[region].A = colors[0];
endpts[region].B = colors[1];
continue;
}
mean /= float(np);
Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
// project each pixel value along the principal direction
float minp = FLT_MAX, maxp = -FLT_MAX;
for (int i = 0; i < np; i++)
{
float dp = dot(colors[i]-mean, direction);
if (dp < minp) minp = dp;
if (dp > maxp) maxp = dp;
}
// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
endpts[region].A = mean + minp*direction;
endpts[region].B = mean + maxp*direction;
// clamp endpoints
// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
// shape based on endpoints being clamped
Utils::clamp(endpts[region].A);
Utils::clamp(endpts[region].B);
}
return map_colors(tile, shapeindex, endpts);
}
float ZOH::compressone(const Tile &t, char *block)
{
int shapeindex_best = 0;
FltEndpts endptsbest[NREGIONS_ONE], tempendpts[NREGIONS_ONE];
float msebest = FLT_MAX;
/*
collect the mse values that are within 5% of the best values
optimize each one and choose the best
*/
// hack for now -- just use the best value WORK
for (int i=0; i<NSHAPES && msebest>0.0; ++i)
{
float mse = roughone(t, i, tempendpts);
if (mse < msebest)
{
msebest = mse;
shapeindex_best = i;
memcpy(endptsbest, tempendpts, sizeof(endptsbest));
}
}
return refineone(t, shapeindex_best, endptsbest, block);
}

883
3rdparty/nvtt/bc6h/zohtwo.cpp vendored Normal file
View File

@@ -0,0 +1,883 @@
/*
Copyright 2007 nVidia, Inc.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License.
*/
// two regions zoh compress/decompress code
// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
/* optimization algorithm
get initial float endpoints
convert endpoints using 16 bit precision, transform, and get bit delta. choose likely endpoint compression candidates.
note that there will be 1 or 2 candidates; 2 will be chosen when the delta values are close to the max possible.
for each EC candidate in order from max precision to smaller precision
convert endpoints using the appropriate precision.
optimize the endpoints and minimize square error. save the error and index assignments. apply index compression as well.
(thus the endpoints and indices are in final form.)
transform and get bit delta.
if the bit delta fits, exit
if we ended up with no candidates somehow, choose the tail set of EC candidates and retry. this should happen hardly ever.
add a state variable and nvDebugCheck that we only do this once.
convert to bit stream.
return the error.
Global optimization
order all tiles based on their errors
do something special for high-error tiles
the goal here is to try to avoid tiling artifacts. but I think this is a research problem. let's just generate an error image...
display an image that shows partitioning and precision selected for each tile
*/
#include "bits.h"
#include "tile.h"
#include "zoh.h"
#include "zoh_utils.h"
#include "nvmath/fitting.h"
#include "nvmath/vector.inl"
#include <string.h> // strlen
#include <float.h> // FLT_MAX
using namespace nv;
using namespace ZOH;
#define NINDICES 8
#define INDEXBITS 3
#define HIGH_INDEXBIT (1<<(INDEXBITS-1))
#define DENOM (NINDICES-1)
// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
// i.e. can we search shapes in a particular order so we can see the global error minima easily and
// stop without having to touch all shapes?
#include "shapes_two.h"
// use only the first 32 available shapes
#undef NSHAPES
#undef SHAPEBITS
#define NSHAPES 32
#define SHAPEBITS 5
#define POS_TO_X(pos) ((pos)&3)
#define POS_TO_Y(pos) (((pos)>>2)&3)
#define NDELTA 4
struct Chanpat
{
int prec[NDELTA]; // precision pattern for one channel
};
struct Pattern
{
Chanpat chan[NCHANNELS]; // allow different bit patterns per channel -- but we still want constant precision per channel
int transformed; // if 0, deltas are unsigned and no transform; otherwise, signed and transformed
int mode; // associated mode value
int modebits; // number of mode bits
const char *encoding; // verilog description of encoding for this mode
};
#define MAXMODEBITS 5
#define MAXMODES (1<<MAXMODEBITS)
#define NPATTERNS 10
static const Pattern patterns[NPATTERNS] =
{
11,5,5,5, 11,4,4,4, 11,4,4,4, 1, 0x02, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bw[10],bx[3:0],gz[3:0],bz[0],gw[10],gx[3:0],gy[3:0],rw[10],rx[4:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
11,4,4,4, 11,5,5,5, 11,4,4,4, 1, 0x06, 5, "d[4:0],bz[3],gy[4],rz[3:0],bz[2],bz[0],ry[3:0],by[3:0],bz[1],bw[10],bx[3:0],gz[3:0],gw[10],gx[4:0],gy[3:0],gz[4],rw[10],rx[3:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
11,4,4,4, 11,4,4,4, 11,5,5,5, 1, 0x0a, 5, "d[4:0],bz[3],bz[4],rz[3:0],bz[2:1],ry[3:0],by[3:0],bw[10],bx[4:0],gz[3:0],bz[0],gw[10],gx[3:0],gy[3:0],by[4],rw[10],rx[3:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
10,5,5,5, 10,5,5,5, 10,5,5,5, 1, 0x00, 2, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bw[9:0],gw[9:0],rw[9:0],bz[4],by[4],gy[4],m[1:0]",
9,5,5,5, 9,5,5,5, 9,5,5,5, 1, 0x0e, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bz[4],bw[8:0],gy[4],gw[8:0],by[4],rw[8:0],m[4:0]",
8,6,6,6, 8,5,5,5, 8,5,5,5, 1, 0x12, 5, "d[4:0],rz[5:0],ry[5:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],rx[5:0],bz[4:3],bw[7:0],gy[4],bz[2],gw[7:0],by[4],gz[4],rw[7:0],m[4:0]",
8,5,5,5, 8,6,6,6, 8,5,5,5, 1, 0x16, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],gx[5:0],gy[3:0],gz[4],rx[4:0],bz[4],gz[5],bw[7:0],gy[4],gy[5],gw[7:0],by[4],bz[0],rw[7:0],m[4:0]",
8,5,5,5, 8,5,5,5, 8,6,6,6, 1, 0x1a, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bx[5:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bz[4],bz[5],bw[7:0],gy[4],by[5],gw[7:0],by[4],bz[1],rw[7:0],m[4:0]",
7,6,6,6, 7,6,6,6, 7,6,6,6, 1, 0x01, 2, "d[4:0],rz[5:0],ry[5:0],by[3:0],bx[5:0],gz[3:0],gx[5:0],gy[3:0],rx[5:0],bz[4],bz[5],bz[3],bw[6:0],gy[4],bz[2],by[5],gw[6:0],by[4],bz[1:0],rw[6:0],gz[5:4],gy[5],m[1:0]",
6,6,6,6, 6,6,6,6, 6,6,6,6, 0, 0x1e, 5, "d[4:0],rz[5:0],ry[5:0],by[3:0],bx[5:0],gz[3:0],gx[5:0],gy[3:0],rx[5:0],bz[4],bz[5],bz[3],gz[5],bw[5:0],gy[4],bz[2],by[5],gy[5],gw[5:0],by[4],bz[1:0],gz[4],rw[5:0],m[4:0]",
};
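// Editor's note (derived from struct Pattern above, not in the original source):
// each row lists the four precisions for red, green and blue, then transformed,
// mode and modebits. E.g. the first row is mode 0x02: an 11-bit base endpoint per
// channel with 5-bit red / 4-bit green / 4-bit blue signed deltas, followed by
// that mode's bit-packing string.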
// mapping of mode to the corresponding index in pattern
// UNUSED ZOH MODES are 0x13, 0x17, 0x1b, 0x1f -- return -2 for these
static const int mode_to_pat[MAXMODES] = {
3, // 0x00
8, // 0x01
0, // 0x02
-1,-1,-1,
1, // 0x06
-1,-1,-1,
2, // 0x0a
-1,-1,-1,
4, // 0x0e
-1,-1,-1,
5, // 0x12
-2,-1,-1,
6, // 0x16
-2,-1,-1,
7, // 0x1a
-2,-1,-1,
9, // 0x1e
-2
};
#define R_0(ep) (ep)[0].A[i]
#define R_1(ep) (ep)[0].B[i]
#define R_2(ep) (ep)[1].A[i]
#define R_3(ep) (ep)[1].B[i]
#define MASK(n) ((1<<(n))-1)
// compress endpoints
static void compress_endpts(const IntEndpts in[NREGIONS_TWO], ComprEndpts out[NREGIONS_TWO], const Pattern &p)
{
if (p.transformed)
{
for (int i=0; i<NCHANNELS; ++i)
{
R_0(out) = R_0(in) & MASK(p.chan[i].prec[0]);
R_1(out) = (R_1(in) - R_0(in)) & MASK(p.chan[i].prec[1]);
R_2(out) = (R_2(in) - R_0(in)) & MASK(p.chan[i].prec[2]);
R_3(out) = (R_3(in) - R_0(in)) & MASK(p.chan[i].prec[3]);
}
}
else
{
for (int i=0; i<NCHANNELS; ++i)
{
R_0(out) = R_0(in) & MASK(p.chan[i].prec[0]);
R_1(out) = R_1(in) & MASK(p.chan[i].prec[1]);
R_2(out) = R_2(in) & MASK(p.chan[i].prec[2]);
R_3(out) = R_3(in) & MASK(p.chan[i].prec[3]);
}
}
}
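// Editor's sketch (illustrative, not part of the original source): with the first
// pattern above (red precisions {11,5,5,5}) and p.transformed set, endpoints are
// stored as deltas from the base value R_0. For in[0].A[0] = 1000, in[0].B[0] = 1013:
//
//   R_0(out) = 1000 & MASK(11);          // base kept at full 11-bit precision
//   R_1(out) = (1013 - 1000) & MASK(5);  // 13, a 5-bit signed delta
//
// decompress_endpts() reverses this: sign-extend the delta, add the base, and mask
// back to 11 bits. The round trip is lossless only when every delta fits its
// precision, which is exactly what endpts_fit() checks before a pattern is used.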
// decompress endpoints
static void decompress_endpts(const ComprEndpts in[NREGIONS_TWO], IntEndpts out[NREGIONS_TWO], const Pattern &p)
{
bool issigned = Utils::FORMAT == SIGNED_F16;
if (p.transformed)
{
for (int i=0; i<NCHANNELS; ++i)
{
R_0(out) = issigned ? SIGN_EXTEND(R_0(in),p.chan[i].prec[0]) : R_0(in);
int t;
t = SIGN_EXTEND(R_1(in), p.chan[i].prec[1]);
t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);
R_1(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
t = SIGN_EXTEND(R_2(in), p.chan[i].prec[2]);
t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);
R_2(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
t = SIGN_EXTEND(R_3(in), p.chan[i].prec[3]);
t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);
R_3(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
}
}
else
{
for (int i=0; i<NCHANNELS; ++i)
{
R_0(out) = issigned ? SIGN_EXTEND(R_0(in),p.chan[i].prec[0]) : R_0(in);
R_1(out) = issigned ? SIGN_EXTEND(R_1(in),p.chan[i].prec[1]) : R_1(in);
R_2(out) = issigned ? SIGN_EXTEND(R_2(in),p.chan[i].prec[2]) : R_2(in);
R_3(out) = issigned ? SIGN_EXTEND(R_3(in),p.chan[i].prec[3]) : R_3(in);
}
}
}
static void quantize_endpts(const FltEndpts endpts[NREGIONS_TWO], int prec, IntEndpts q_endpts[NREGIONS_TWO])
{
for (int region = 0; region < NREGIONS_TWO; ++region)
{
q_endpts[region].A[0] = Utils::quantize(endpts[region].A.x, prec);
q_endpts[region].A[1] = Utils::quantize(endpts[region].A.y, prec);
q_endpts[region].A[2] = Utils::quantize(endpts[region].A.z, prec);
q_endpts[region].B[0] = Utils::quantize(endpts[region].B.x, prec);
q_endpts[region].B[1] = Utils::quantize(endpts[region].B.y, prec);
q_endpts[region].B[2] = Utils::quantize(endpts[region].B.z, prec);
}
}
// swap endpoints as needed to ensure that the indices at index_positions have a 0 high-order bit
static void swap_indices(IntEndpts endpts[NREGIONS_TWO], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
{
for (int region = 0; region < NREGIONS_TWO; ++region)
{
int position = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
int x = POS_TO_X(position);
int y = POS_TO_Y(position);
nvDebugCheck(REGION(x,y,shapeindex) == region); // double check the table
if (indices[y][x] & HIGH_INDEXBIT)
{
// high bit is set, swap the endpts and indices for this region
int t;
for (int i=0; i<NCHANNELS; ++i)
{
t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t;
}
for (int y = 0; y < Tile::TILE_H; y++)
for (int x = 0; x < Tile::TILE_W; x++)
if (REGION(x,y,shapeindex) == region)
indices[y][x] = NINDICES - 1 - indices[y][x];
}
}
}
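// Editor's note (derived, not in the original source): flipping an index i to
// NINDICES-1-i mirrors the palette, so swapping A and B leaves the reconstruction
// unchanged:
//
//   lerp(A, B, i, DENOM) == lerp(B, A, DENOM - i, DENOM)   // DENOM == NINDICES-1
//
// That freedom is what lets the encoder force each region's anchor texel to have
// a 0 high-order index bit and drop one bit per region from the block.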
// endpoints fit only if the compression was lossless
static bool endpts_fit(const IntEndpts orig[NREGIONS_TWO], const ComprEndpts compressed[NREGIONS_TWO], const Pattern &p)
{
IntEndpts uncompressed[NREGIONS_TWO];
decompress_endpts(compressed, uncompressed, p);
for (int j=0; j<NREGIONS_TWO; ++j)
{
for (int i=0; i<NCHANNELS; ++i)
{
if (orig[j].A[i] != uncompressed[j].A[i]) return false;
if (orig[j].B[i] != uncompressed[j].B[i]) return false;
}
}
return true;
}
static void write_header(const ComprEndpts endpts[NREGIONS_TWO], int shapeindex, const Pattern &p, Bits &out)
{
// interpret the verilog backwards and process it
int m = p.mode;
int d = shapeindex;
int rw = endpts[0].A[0], rx = endpts[0].B[0], ry = endpts[1].A[0], rz = endpts[1].B[0];
int gw = endpts[0].A[1], gx = endpts[0].B[1], gy = endpts[1].A[1], gz = endpts[1].B[1];
int bw = endpts[0].A[2], bx = endpts[0].B[2], by = endpts[1].A[2], bz = endpts[1].B[2];
int ptr = int(strlen(p.encoding));
while (ptr)
{
Field field;
int endbit, len;
// !!!UNDONE: get rid of string parsing!!!
Utils::parse(p.encoding, ptr, field, endbit, len);
switch(field)
{
case FIELD_M: out.write( m >> endbit, len); break;
case FIELD_D: out.write( d >> endbit, len); break;
case FIELD_RW: out.write(rw >> endbit, len); break;
case FIELD_RX: out.write(rx >> endbit, len); break;
case FIELD_RY: out.write(ry >> endbit, len); break;
case FIELD_RZ: out.write(rz >> endbit, len); break;
case FIELD_GW: out.write(gw >> endbit, len); break;
case FIELD_GX: out.write(gx >> endbit, len); break;
case FIELD_GY: out.write(gy >> endbit, len); break;
case FIELD_GZ: out.write(gz >> endbit, len); break;
case FIELD_BW: out.write(bw >> endbit, len); break;
case FIELD_BX: out.write(bx >> endbit, len); break;
case FIELD_BY: out.write(by >> endbit, len); break;
case FIELD_BZ: out.write(bz >> endbit, len); break;
default: nvUnreachable();
}
}
}
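// Editor's sketch of how one field of the encoding string is consumed (the exact
// behavior of Utils::parse lives in zoh_utils and is assumed here). The string is
// walked right to left, so the mode field at the end of each pattern is emitted first:
//
//   "rw[9:0]" -> field = FIELD_RW, endbit = 0, len = 10 -> out.write(rw >> 0, 10);
//   "bz[3]"   -> field = FIELD_BZ, endbit = 3, len = 1  -> out.write(bz >> 3, 1);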
static bool read_header(Bits &in, ComprEndpts endpts[NREGIONS_TWO], int &shapeindex, Pattern &p)
{
// reading isn't quite symmetric with writing -- we don't know the encoding until we decode the mode
int mode = in.read(2);
if (mode != 0x00 && mode != 0x01)
mode = (in.read(3) << 2) | mode;
int pat_index = mode_to_pat[mode];
if (pat_index == -2)
return false; // reserved mode found
nvDebugCheck (pat_index >= 0 && pat_index < NPATTERNS);
nvDebugCheck (in.getptr() == patterns[pat_index].modebits);
p = patterns[pat_index];
int d;
int rw, rx, ry, rz;
int gw, gx, gy, gz;
int bw, bx, by, bz;
d = 0;
rw = rx = ry = rz = 0;
gw = gx = gy = gz = 0;
bw = bx = by = bz = 0;
int ptr = int(strlen(p.encoding));
while (ptr)
{
Field field;
int endbit, len;
// !!!UNDONE: get rid of string parsing!!!
Utils::parse(p.encoding, ptr, field, endbit, len);
switch(field)
{
case FIELD_M: break; // already processed so ignore
case FIELD_D: d |= in.read(len) << endbit; break;
case FIELD_RW: rw |= in.read(len) << endbit; break;
case FIELD_RX: rx |= in.read(len) << endbit; break;
case FIELD_RY: ry |= in.read(len) << endbit; break;
case FIELD_RZ: rz |= in.read(len) << endbit; break;
case FIELD_GW: gw |= in.read(len) << endbit; break;
case FIELD_GX: gx |= in.read(len) << endbit; break;
case FIELD_GY: gy |= in.read(len) << endbit; break;
case FIELD_GZ: gz |= in.read(len) << endbit; break;
case FIELD_BW: bw |= in.read(len) << endbit; break;
case FIELD_BX: bx |= in.read(len) << endbit; break;
case FIELD_BY: by |= in.read(len) << endbit; break;
case FIELD_BZ: bz |= in.read(len) << endbit; break;
default: nvUnreachable();
}
}
nvDebugCheck (in.getptr() == 128 - 46);
shapeindex = d;
endpts[0].A[0] = rw; endpts[0].B[0] = rx; endpts[1].A[0] = ry; endpts[1].B[0] = rz;
endpts[0].A[1] = gw; endpts[0].B[1] = gx; endpts[1].A[1] = gy; endpts[1].B[1] = gz;
endpts[0].A[2] = bw; endpts[0].B[2] = bx; endpts[1].A[2] = by; endpts[1].B[2] = bz;
return true;
}
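// Editor's note on the bit accounting above (derived, not from the source): a
// two-region tile has 16 texels at INDEXBITS == 3 bits each, minus one implicit
// high bit per region's anchor texel, i.e. 16*3 - 2 = 46 index bits. The header
// (mode, shape, endpoints) must therefore end exactly at bit 128 - 46 = 82, which
// is what the nvDebugCheck verifies.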
static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
{
int positions[NREGIONS_TWO];
for (int r = 0; r < NREGIONS_TWO; ++r)
positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
{
int x = POS_TO_X(pos);
int y = POS_TO_Y(pos);
bool match = false;
for (int r = 0; r < NREGIONS_TWO; ++r)
if (positions[r] == pos) { match = true; break; }
out.write(indices[y][x], INDEXBITS - (match ? 1 : 0));
}
}
static void emit_block(const ComprEndpts compr_endpts[NREGIONS_TWO], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
{
Bits out(block, ZOH::BITSIZE);
write_header(compr_endpts, shapeindex, p, out);
write_indices(indices, shapeindex, out);
nvDebugCheck(out.getptr() == ZOH::BITSIZE);
}
static void generate_palette_quantized(const IntEndpts &endpts, int prec, Vector3 palette[NINDICES])
{
// scale endpoints
int a, b; // really need an IntVector3...
a = Utils::unquantize(endpts.A[0], prec);
b = Utils::unquantize(endpts.B[0], prec);
// interpolate
for (int i = 0; i < NINDICES; ++i)
palette[i].x = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
a = Utils::unquantize(endpts.A[1], prec);
b = Utils::unquantize(endpts.B[1], prec);
// interpolate
for (int i = 0; i < NINDICES; ++i)
palette[i].y = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
a = Utils::unquantize(endpts.A[2], prec);
b = Utils::unquantize(endpts.B[2], prec);
// interpolate
for (int i = 0; i < NINDICES; ++i)
palette[i].z = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
}
static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
{
int positions[NREGIONS_TWO];
for (int r = 0; r < NREGIONS_TWO; ++r)
positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
{
int x = POS_TO_X(pos);
int y = POS_TO_Y(pos);
bool match = false;
for (int r = 0; r < NREGIONS_TWO; ++r)
if (positions[r] == pos) { match = true; break; }
indices[y][x]= in.read(INDEXBITS - (match ? 1 : 0));
}
}
void ZOH::decompresstwo(const char *block, Tile &t)
{
Bits in(block, ZOH::BITSIZE);
Pattern p;
IntEndpts endpts[NREGIONS_TWO];
ComprEndpts compr_endpts[NREGIONS_TWO];
int shapeindex;
if (!read_header(in, compr_endpts, shapeindex, p))
{
// reserved mode, return all zeroes
for (int y = 0; y < Tile::TILE_H; y++)
for (int x = 0; x < Tile::TILE_W; x++)
t.data[y][x] = Vector3(0.0f);
return;
}
decompress_endpts(compr_endpts, endpts, p);
Vector3 palette[NREGIONS_TWO][NINDICES];
for (int r = 0; r < NREGIONS_TWO; ++r)
generate_palette_quantized(endpts[r], p.chan[0].prec[0], &palette[r][0]);
int indices[Tile::TILE_H][Tile::TILE_W];
read_indices(in, shapeindex, indices);
nvDebugCheck(in.getptr() == ZOH::BITSIZE);
// lookup
for (int y = 0; y < Tile::TILE_H; y++)
for (int x = 0; x < Tile::TILE_W; x++)
t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
}
// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
static float map_colors(const Vector3 colors[], const float importance[], int np, const IntEndpts &endpts, int prec)
{
Vector3 palette[NINDICES];
float toterr = 0;
generate_palette_quantized(endpts, prec, palette);
for (int i = 0; i < np; ++i)
{
float err, besterr;
besterr = Utils::norm(colors[i], palette[0]) * importance[i];
for (int j = 1; j < NINDICES && besterr > 0; ++j)
{
err = Utils::norm(colors[i], palette[j]) * importance[i];
if (err > besterr) // error increased, so we're done searching
break;
if (err < besterr)
besterr = err;
}
toterr += besterr;
}
return toterr;
}
// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
static void assign_indices(const Tile &tile, int shapeindex, IntEndpts endpts[NREGIONS_TWO], int prec,
int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS_TWO])
{
// build list of possibles
Vector3 palette[NREGIONS_TWO][NINDICES];
for (int region = 0; region < NREGIONS_TWO; ++region)
{
generate_palette_quantized(endpts[region], prec, &palette[region][0]);
toterr[region] = 0;
}
for (int y = 0; y < tile.size_y; y++)
for (int x = 0; x < tile.size_x; x++)
{
int region = REGION(x,y,shapeindex);
float err, besterr;
besterr = Utils::norm(tile.data[y][x], palette[region][0]);
indices[y][x] = 0;
for (int i = 1; i < NINDICES && besterr > 0; ++i)
{
err = Utils::norm(tile.data[y][x], palette[region][i]);
if (err > besterr) // error increased, so we're done searching
break;
if (err < besterr)
{
besterr = err;
indices[y][x] = i;
}
}
toterr[region] += besterr;
}
}
static float perturb_one(const Vector3 colors[], const float importance[], int np, int ch, int prec, const IntEndpts &old_endpts, IntEndpts &new_endpts,
float old_err, int do_b)
{
// we have the old endpoints: old_endpts
// we have the perturbed endpoints: new_endpts
// we have the temporary endpoints: temp_endpts
IntEndpts temp_endpts;
float min_err = old_err; // start with the best current error
int beststep;
// copy real endpoints so we can perturb them
for (int i=0; i<NCHANNELS; ++i) { temp_endpts.A[i] = new_endpts.A[i] = old_endpts.A[i]; temp_endpts.B[i] = new_endpts.B[i] = old_endpts.B[i]; }
// do a logarithmic search for the best error for this endpoint (A or B, selected by do_b)
for (int step = 1 << (prec-1); step; step >>= 1)
{
bool improved = false;
for (int sign = -1; sign <= 1; sign += 2)
{
if (do_b == 0)
{
temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
continue;
}
else
{
temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
continue;
}
float err = map_colors(colors, importance, np, temp_endpts, prec);
if (err < min_err)
{
improved = true;
min_err = err;
beststep = sign * step;
}
}
// if this was an improvement, move the endpoint and continue search from there
if (improved)
{
if (do_b == 0)
new_endpts.A[ch] += beststep;
else
new_endpts.B[ch] += beststep;
}
}
return min_err;
}
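// Editor's note (illustrative): the loop above is a coarse-to-fine search. For
// prec == 10 it tries steps 512, 256, ..., 1 in both directions, keeping a step
// only when map_colors() reports a lower error, so roughly 2*prec error
// evaluations stand in for an exhaustive scan of all 2^prec values of one
// endpoint channel.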
static void optimize_one(const Vector3 colors[], const float importance[], int np, float orig_err, const IntEndpts &orig_endpts, int prec, IntEndpts &opt_endpts)
{
float opt_err = orig_err;
for (int ch = 0; ch < NCHANNELS; ++ch)
{
opt_endpts.A[ch] = orig_endpts.A[ch];
opt_endpts.B[ch] = orig_endpts.B[ch];
}
/*
err0 = perturb(rgb0, delta0)
err1 = perturb(rgb1, delta1)
if (err0 < err1)
if (err0 >= initial_error) break
rgb0 += delta0
next = 1
else
if (err1 >= initial_error) break
rgb1 += delta1
next = 0
initial_err = map()
for (;;)
err = perturb(next ? rgb1:rgb0, delta)
if (err >= initial_err) break
next? rgb1 : rgb0 += delta
initial_err = err
*/
IntEndpts new_a, new_b;
IntEndpts new_endpt;
int do_b;
// now optimize each channel separately
for (int ch = 0; ch < NCHANNELS; ++ch)
{
// figure out which endpoint when perturbed gives the most improvement and start there
// if we just alternate, we can easily end up in a local minima
float err0 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_a, opt_err, 0); // perturb endpt A
float err1 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_b, opt_err, 1); // perturb endpt B
if (err0 < err1)
{
if (err0 >= opt_err)
continue;
opt_endpts.A[ch] = new_a.A[ch];
opt_err = err0;
do_b = 1; // do B next
}
else
{
if (err1 >= opt_err)
continue;
opt_endpts.B[ch] = new_b.B[ch];
opt_err = err1;
do_b = 0; // do A next
}
// now alternate endpoints and keep trying until there is no improvement
for (;;)
{
float err = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_endpt, opt_err, do_b);
if (err >= opt_err)
break;
if (do_b == 0)
opt_endpts.A[ch] = new_endpt.A[ch];
else
opt_endpts.B[ch] = new_endpt.B[ch];
opt_err = err;
do_b = 1 - do_b; // now move the other endpoint
}
}
}
static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS_TWO],
const IntEndpts orig_endpts[NREGIONS_TWO], int prec, IntEndpts opt_endpts[NREGIONS_TWO])
{
Vector3 pixels[Tile::TILE_TOTAL];
float importance[Tile::TILE_TOTAL];
float err = 0;
for (int region=0; region<NREGIONS_TWO; ++region)
{
// collect the pixels in the region
int np = 0;
for (int y = 0; y < tile.size_y; y++)
for (int x = 0; x < tile.size_x; x++)
if (REGION(x,y,shapeindex) == region)
{
pixels[np] = tile.data[y][x];
importance[np] = tile.importance_map[y][x];
++np;
}
optimize_one(pixels, importance, np, orig_err[region], orig_endpts[region], prec, opt_endpts[region]);
}
}
/* optimization algorithm
for each pattern
convert endpoints using pattern precision
assign indices and get initial error
compress indices (and possibly reorder endpoints)
transform endpoints
if transformed endpoints fit pattern
get original endpoints back
optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
compress new indices
transform new endpoints
if new endpoints fit pattern AND if error is improved
emit compressed block with new data
else
emit compressed block with original data // to try to preserve maximum endpoint precision
*/
float ZOH::refinetwo(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_TWO], char *block)
{
float orig_err[NREGIONS_TWO], opt_err[NREGIONS_TWO], orig_toterr, opt_toterr;
IntEndpts orig_endpts[NREGIONS_TWO], opt_endpts[NREGIONS_TWO];
ComprEndpts compr_orig[NREGIONS_TWO], compr_opt[NREGIONS_TWO];
int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
for (int sp = 0; sp < NPATTERNS; ++sp)
{
// precisions for all channels need to be the same
for (int i=1; i<NCHANNELS; ++i) nvDebugCheck (patterns[sp].chan[0].prec[0] == patterns[sp].chan[i].prec[0]);
quantize_endpts(endpts, patterns[sp].chan[0].prec[0], orig_endpts);
assign_indices(tile, shapeindex_best, orig_endpts, patterns[sp].chan[0].prec[0], orig_indices, orig_err);
swap_indices(orig_endpts, orig_indices, shapeindex_best);
compress_endpts(orig_endpts, compr_orig, patterns[sp]);
if (endpts_fit(orig_endpts, compr_orig, patterns[sp]))
{
optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, patterns[sp].chan[0].prec[0], opt_endpts);
assign_indices(tile, shapeindex_best, opt_endpts, patterns[sp].chan[0].prec[0], opt_indices, opt_err);
swap_indices(opt_endpts, opt_indices, shapeindex_best);
compress_endpts(opt_endpts, compr_opt, patterns[sp]);
orig_toterr = opt_toterr = 0;
for (int i=0; i < NREGIONS_TWO; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
if (endpts_fit(opt_endpts, compr_opt, patterns[sp]) && opt_toterr < orig_toterr)
{
emit_block(compr_opt, shapeindex_best, patterns[sp], opt_indices, block);
return opt_toterr;
}
else
{
// either it stopped fitting when we optimized it, or there was no improvement
// so go back to the unoptimized endpoints which we know will fit
emit_block(compr_orig, shapeindex_best, patterns[sp], orig_indices, block);
return orig_toterr;
}
}
}
nvAssert(false); //throw "No candidate found, should never happen (refinetwo)";
return FLT_MAX;
}
static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS_TWO], Vector3 palette[NREGIONS_TWO][NINDICES])
{
for (int region = 0; region < NREGIONS_TWO; ++region)
for (int i = 0; i < NINDICES; ++i)
palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, DENOM);
}
// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS_TWO])
{
// build list of possibles
Vector3 palette[NREGIONS_TWO][NINDICES];
generate_palette_unquantized(endpts, palette);
float toterr = 0;
for (int y = 0; y < tile.size_y; y++)
for (int x = 0; x < tile.size_x; x++)
{
int region = REGION(x,y,shapeindex);
float err, besterr;
besterr = Utils::norm(tile.data[y][x], palette[region][0]) * tile.importance_map[y][x];
for (int i = 1; i < NINDICES && besterr > 0; ++i)
{
err = Utils::norm(tile.data[y][x], palette[region][i]) * tile.importance_map[y][x];
if (err > besterr) // error increased, so we're done searching
break;
if (err < besterr)
besterr = err;
}
toterr += besterr;
}
return toterr;
}
float ZOH::roughtwo(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_TWO])
{
for (int region=0; region<NREGIONS_TWO; ++region)
{
int np = 0;
Vector3 colors[Tile::TILE_TOTAL];
Vector3 mean(0,0,0);
for (int y = 0; y < tile.size_y; y++)
for (int x = 0; x < tile.size_x; x++)
if (REGION(x,y,shapeindex) == region)
{
colors[np] = tile.data[y][x];
mean += tile.data[y][x];
++np;
}
// handle simple cases
if (np == 0)
{
Vector3 zero(0,0,0);
endpts[region].A = zero;
endpts[region].B = zero;
continue;
}
else if (np == 1)
{
endpts[region].A = colors[0];
endpts[region].B = colors[0];
continue;
}
else if (np == 2)
{
endpts[region].A = colors[0];
endpts[region].B = colors[1];
continue;
}
mean /= float(np);
Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
// project each pixel value along the principal direction
float minp = FLT_MAX, maxp = -FLT_MAX;
for (int i = 0; i < np; i++)
{
float dp = dot(colors[i]-mean, direction);
if (dp < minp) minp = dp;
if (dp > maxp) maxp = dp;
}
// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
endpts[region].A = mean + minp*direction;
endpts[region].B = mean + maxp*direction;
// clamp endpoints
// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
// shape based on endpoints being clamped
Utils::clamp(endpts[region].A);
Utils::clamp(endpts[region].B);
}
return map_colors(tile, shapeindex, endpts);
}
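// Editor's note (derived summary, not in the source): roughtwo() is a PCA fit --
// the endpoints are the extremes of the region's colors projected onto the first
// principal axis:
//
//   dp = dot(color - mean, direction);  A = mean + minp*direction;  B = mean + maxp*direction;
//
// so every color's closest point on the line lies between A and B before clamping
// and quantization, a cheap but serviceable starting guess for refinetwo().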
float ZOH::compresstwo(const Tile &t, char *block)
{
int shapeindex_best = 0;
FltEndpts endptsbest[NREGIONS_TWO], tempendpts[NREGIONS_TWO];
float msebest = FLT_MAX;
/*
collect the mse values that are within 5% of the best values
optimize each one and choose the best
*/
// hack for now -- just use the best value WORK
for (int i=0; i<NSHAPES && msebest>0.0; ++i)
{
float mse = roughtwo(t, i, tempendpts);
if (mse < msebest)
{
msebest = mse;
shapeindex_best = i;
memcpy(endptsbest, tempendpts, sizeof(endptsbest));
}
}
return refinetwo(t, shapeindex_best, endptsbest, block);
}

264
3rdparty/nvtt/bc7/avpcl.cpp vendored Normal file
View File

@@ -0,0 +1,264 @@
/*
Copyright 2007 nVidia, Inc.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License.
*/
// the avpcl compressor and decompressor
#include "tile.h"
#include "avpcl.h"
#include "nvcore/debug.h"
#include "nvmath/vector.inl"
#include <string.h>
#include <float.h>
using namespace nv;
using namespace AVPCL;
// global flags
bool AVPCL::flag_premult = false;
bool AVPCL::flag_nonuniform = false;
bool AVPCL::flag_nonuniform_ati = false;
// global mode
bool AVPCL::mode_rgb = false; // true if image had constant alpha = 255
void AVPCL::compress(const Tile &t, char *block)
{
char tempblock[AVPCL::BLOCKSIZE];
float msebest = FLT_MAX;
float mse_mode0 = AVPCL::compress_mode0(t, tempblock); if(mse_mode0 < msebest) { msebest = mse_mode0; memcpy(block, tempblock, AVPCL::BLOCKSIZE); }
float mse_mode1 = AVPCL::compress_mode1(t, tempblock); if(mse_mode1 < msebest) { msebest = mse_mode1; memcpy(block, tempblock, AVPCL::BLOCKSIZE); }
float mse_mode2 = AVPCL::compress_mode2(t, tempblock); if(mse_mode2 < msebest) { msebest = mse_mode2; memcpy(block, tempblock, AVPCL::BLOCKSIZE); }
float mse_mode3 = AVPCL::compress_mode3(t, tempblock); if(mse_mode3 < msebest) { msebest = mse_mode3; memcpy(block, tempblock, AVPCL::BLOCKSIZE); }
float mse_mode4 = AVPCL::compress_mode4(t, tempblock); if(mse_mode4 < msebest) { msebest = mse_mode4; memcpy(block, tempblock, AVPCL::BLOCKSIZE); }
float mse_mode5 = AVPCL::compress_mode5(t, tempblock); if(mse_mode5 < msebest) { msebest = mse_mode5; memcpy(block, tempblock, AVPCL::BLOCKSIZE); }
float mse_mode6 = AVPCL::compress_mode6(t, tempblock); if(mse_mode6 < msebest) { msebest = mse_mode6; memcpy(block, tempblock, AVPCL::BLOCKSIZE); }
float mse_mode7 = AVPCL::compress_mode7(t, tempblock); if(mse_mode7 < msebest) { msebest = mse_mode7; memcpy(block, tempblock, AVPCL::BLOCKSIZE); }
/*if (errfile)
{
float errs[21];
int nerrs = 8;
errs[0] = mse_mode0;
errs[1] = mse_mode1;
errs[2] = mse_mode2;
errs[3] = mse_mode3;
errs[4] = mse_mode4;
errs[5] = mse_mode5;
errs[6] = mse_mode6;
errs[7] = mse_mode7;
if (fwrite(errs, sizeof(float), nerrs, errfile) != nerrs)
throw "Write error on error file";
}*/
}
/*
static int getbit(char *b, int start)
{
if (start < 0 || start >= 128) return 0; // out of range
int ix = start >> 3;
return (b[ix] & (1 << (start & 7))) != 0;
}
static int getbits(char *b, int start, int len)
{
int out = 0;
for (int i=0; i<len; ++i)
out |= getbit(b, start+i) << i;
return out;
}
static void setbit(char *b, int start, int bit)
{
if (start < 0 || start >= 128) return; // out of range
int ix = start >> 3;
if (bit & 1)
b[ix] |= (1 << (start & 7));
else
b[ix] &= ~(1 << (start & 7));
}
static void setbits(char *b, int start, int len, int bits)
{
for (int i=0; i<len; ++i)
setbit(b, start+i, bits >> i);
}
*/
void AVPCL::decompress(const char *cblock, Tile &t)
{
char block[AVPCL::BLOCKSIZE];
memcpy(block, cblock, AVPCL::BLOCKSIZE);
switch(getmode(block))
{
case 0: AVPCL::decompress_mode0(block, t); break;
case 1: AVPCL::decompress_mode1(block, t); break;
case 2: AVPCL::decompress_mode2(block, t); break;
case 3: AVPCL::decompress_mode3(block, t); break;
case 4: AVPCL::decompress_mode4(block, t); break;
case 5: AVPCL::decompress_mode5(block, t); break;
case 6: AVPCL::decompress_mode6(block, t); break;
case 7: AVPCL::decompress_mode7(block, t); break;
case 8: // return a black tile if you get a reserved mode
for (int y=0; y<Tile::TILE_H; ++y)
for (int x=0; x<Tile::TILE_W; ++x)
t.data[y][x].set(0, 0, 0, 0);
break;
default: nvUnreachable();
}
}
/*
void AVPCL::compress(string inf, string avpclf, string errf)
{
Array2D<RGBA> pixels;
int w, h;
char block[AVPCL::BLOCKSIZE];
Targa::read(inf, pixels, w, h);
FILE *avpclfile = fopen(avpclf.c_str(), "wb");
if (avpclfile == NULL) throw "Unable to open .avpcl file for write";
FILE *errfile = NULL;
if (errf != "")
{
errfile = fopen(errf.c_str(), "wb");
if (errfile == NULL) throw "Unable to open error file for write";
}
// Look at alpha channel and override the premult flag if alpha is constant (but only if premult is set)
if (AVPCL::flag_premult)
{
if (AVPCL::mode_rgb)
{
AVPCL::flag_premult = false;
cout << endl << "NOTE: Source image alpha is constant 255, turning off premultiplied-alpha error metric." << endl << endl;
}
}
// stuff for progress bar O.o
int ntiles = ((h+Tile::TILE_H-1)/Tile::TILE_H)*((w+Tile::TILE_W-1)/Tile::TILE_W);
int tilecnt = 0;
clock_t start, prev, cur;
start = prev = clock();
// convert to tiles and compress each tile
for (int y=0; y<h; y+=Tile::TILE_H)
{
int ysize = min(Tile::TILE_H, h-y);
for (int x=0; x<w; x+=Tile::TILE_W)
{
if ((tilecnt%100) == 0) { cur = clock(); printf("Progress %d of %d, %5.2f seconds per 100 tiles\r", tilecnt, ntiles, float(cur-prev)/CLOCKS_PER_SEC); fflush(stdout); prev = cur; }
int xsize = min(Tile::TILE_W, w-x);
Tile t(xsize, ysize);
t.insert(pixels, x, y);
AVPCL::compress(t, block, errfile);
if (fwrite(block, sizeof(char), AVPCL::BLOCKSIZE, avpclfile) != AVPCL::BLOCKSIZE)
throw "File error on write";
// progress bar
++tilecnt;
}
}
cur = clock();
printf("\nTotal time to compress: %.2f seconds\n\n", float(cur-start)/CLOCKS_PER_SEC); // advance to next line finally
if (fclose(avpclfile)) throw "Close failed on .avpcl file";
if (errfile && fclose(errfile)) throw "Close failed on error file";
}
static int str2int(std::string s)
{
int thing;
std::stringstream str (stringstream::in | stringstream::out);
str << s;
str >> thing;
return thing;
}
// avpcl file name is ...-w-h-RGB[A].avpcl, extract width and height
static void extract(string avpclf, int &w, int &h, bool &mode_rgb)
{
size_t n = avpclf.rfind('.', avpclf.length()-1);
size_t n1 = avpclf.rfind('-', n-1);
size_t n2 = avpclf.rfind('-', n1-1);
size_t n3 = avpclf.rfind('-', n2-1);
// ...-wwww-hhhh-RGB[A].avpcl
// ^ ^ ^ ^
// n3 n2 n1 n n3<n2<n1<n
string width = avpclf.substr(n3+1, n2-n3-1);
w = str2int(width);
string height = avpclf.substr(n2+1, n1-n2-1);
h = str2int(height);
string mode = avpclf.substr(n1+1, n-n1-1);
mode_rgb = mode == "RGB";
}
static int modehist[8];
static void stats(char block[AVPCL::BLOCKSIZE])
{
int m = AVPCL::getmode(block);
modehist[m]++;
}
static void printstats()
{
printf("\nMode histogram: "); for (int i=0; i<8; ++i) { printf("%d,", modehist[i]); }
printf("\n");
}
void AVPCL::decompress(string avpclf, string outf)
{
Array2D<RGBA> pixels;
int w, h;
char block[AVPCL::BLOCKSIZE];
extract(avpclf, w, h, AVPCL::mode_rgb);
FILE *avpclfile = fopen(avpclf.c_str(), "rb");
if (avpclfile == NULL) throw "Unable to open .avpcl file for read";
pixels.resizeErase(h, w);
// convert to tiles and decompress each tile
for (int y=0; y<h; y+=Tile::TILE_H)
{
int ysize = min(Tile::TILE_H, h-y);
for (int x=0; x<w; x+=Tile::TILE_W)
{
int xsize = min(Tile::TILE_W, w-x);
Tile t(xsize, ysize);
if (fread(block, sizeof(char), AVPCL::BLOCKSIZE, avpclfile) != AVPCL::BLOCKSIZE)
throw "File error on read";
stats(block); // collect statistics
AVPCL::decompress(block, t);
t.extract(pixels, x, y);
}
}
if (fclose(avpclfile)) throw "Close failed on .avpcl file";
Targa::write(outf, pixels, w, h);
printstats(); // print statistics
}
*/

99
3rdparty/nvtt/bc7/avpcl.h vendored Normal file
View File

@@ -0,0 +1,99 @@
/*
Copyright 2007 nVidia, Inc.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License.
*/
#ifndef _AVPCL_H
#define _AVPCL_H
#include "tile.h"
#include "bits.h"
#define DISABLE_EXHAUSTIVE 1 // define this if you don't want to spend a lot of time on exhaustive compression
#define USE_ZOH_INTERP 1 // use zoh interpolator, otherwise use exact avpcl interpolators
#define USE_ZOH_INTERP_ROUNDED 1 // use the rounded versions!
namespace AVPCL {
static const int NREGIONS_TWO = 2;
static const int NREGIONS_THREE = 3;
static const int BLOCKSIZE=16;
static const int BITSIZE=128;
// global flags
extern bool flag_premult;
extern bool flag_nonuniform;
extern bool flag_nonuniform_ati;
// global mode
extern bool mode_rgb; // true if image had constant alpha = 255
void compress(const Tile &t, char *block);
void decompress(const char *block, Tile &t);
float compress_mode0(const Tile &t, char *block);
void decompress_mode0(const char *block, Tile &t);
float compress_mode1(const Tile &t, char *block);
void decompress_mode1(const char *block, Tile &t);
float compress_mode2(const Tile &t, char *block);
void decompress_mode2(const char *block, Tile &t);
float compress_mode3(const Tile &t, char *block);
void decompress_mode3(const char *block, Tile &t);
float compress_mode4(const Tile &t, char *block);
void decompress_mode4(const char *block, Tile &t);
float compress_mode5(const Tile &t, char *block);
void decompress_mode5(const char *block, Tile &t);
float compress_mode6(const Tile &t, char *block);
void decompress_mode6(const char *block, Tile &t);
float compress_mode7(const Tile &t, char *block);
void decompress_mode7(const char *block, Tile &t);
inline int getmode(Bits &in)
{
int mode = 0;
if (in.read(1)) mode = 0;
else if (in.read(1)) mode = 1;
else if (in.read(1)) mode = 2;
else if (in.read(1)) mode = 3;
else if (in.read(1)) mode = 4;
else if (in.read(1)) mode = 5;
else if (in.read(1)) mode = 6;
else if (in.read(1)) mode = 7;
else mode = 8; // reserved
return mode;
}
inline int getmode(const char *block)
{
int bits = block[0], mode = 0;
if (bits & 1) mode = 0;
else if ((bits&3) == 2) mode = 1;
else if ((bits&7) == 4) mode = 2;
else if ((bits & 0xF) == 8) mode = 3;
else if ((bits & 0x1F) == 16) mode = 4;
else if ((bits & 0x3F) == 32) mode = 5;
else if ((bits & 0x7F) == 64) mode = 6;
else if ((bits & 0xFF) == 128) mode = 7;
else mode = 8; // reserved
return mode;
}
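// Editor's note (derived from the two getmode() variants above): the BC7 mode
// number is a unary prefix code -- mode m is written as m zero bits followed by a
// one bit, read LSB-first. E.g. block[0] == 0x04 (binary 00000100) has two zeros
// before the first set bit, so getmode() returns 2; a block whose first byte is 0
// has no mode bit within 8 bits and decodes as the reserved mode 8.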
}
#endif

1066
3rdparty/nvtt/bc7/avpcl_mode0.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

1047
3rdparty/nvtt/bc7/avpcl_mode1.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

1004
3rdparty/nvtt/bc7/avpcl_mode2.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

1059
3rdparty/nvtt/bc7/avpcl_mode3.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

1214
3rdparty/nvtt/bc7/avpcl_mode4.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

1216
3rdparty/nvtt/bc7/avpcl_mode5.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

1055
3rdparty/nvtt/bc7/avpcl_mode6.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

1094
3rdparty/nvtt/bc7/avpcl_mode7.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

389
3rdparty/nvtt/bc7/avpcl_utils.cpp vendored Normal file
View File

@@ -0,0 +1,389 @@
/*
Copyright 2007 nVidia, Inc.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License.
*/
// Utility and common routines
#include "avpcl_utils.h"
#include "avpcl.h"
#include "nvmath/vector.inl"
#include <math.h>
using namespace nv;
using namespace AVPCL;
static const int denom7_weights[] = {0, 9, 18, 27, 37, 46, 55, 64}; // divided by 64
static const int denom15_weights[] = {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64}; // divided by 64
int Utils::lerp(int a, int b, int i, int bias, int denom)
{
#ifdef USE_ZOH_INTERP
nvAssert (denom == 3 || denom == 7 || denom == 15);
nvAssert (i >= 0 && i <= denom);
nvAssert (bias >= 0 && bias <= denom/2);
nvAssert (a >= 0 && b >= 0);
int round = 0;
#ifdef USE_ZOH_INTERP_ROUNDED
round = 32;
#endif
switch (denom)
{
case 3: denom *= 5; i *= 5; // fall through to case 15
case 15:return (a*denom15_weights[denom-i] + b*denom15_weights[i] + round) >> 6;
case 7: return (a*denom7_weights[denom-i] + b*denom7_weights[i] + round) >> 6;
default: nvUnreachable(); return 0;
}
#else
return (((a)*((denom)-i)+(b)*(i)+(bias))/(denom)); // simple exact interpolation
#endif
}
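// Editor's note (illustrative): the "case 3" fallthrough works because the 2-bit
// weight table is an exact subsample of the 4-bit one -- i/3 == (5*i)/15 -- so
// indices 0,1,2,3 map to denom15_weights entries 0,5,10,15, i.e. weights 0,21,43,64.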
Vector4 Utils::lerp(Vector4::Arg a, Vector4::Arg b, int i, int bias, int denom)
{
#ifdef USE_ZOH_INTERP
nvAssert (denom == 3 || denom == 7 || denom == 15);
nvAssert (i >= 0 && i <= denom);
nvAssert (bias >= 0 && bias <= denom/2);
// nvAssert (a >= 0 && b >= 0);
// no need to bias these as this is an exact division
switch (denom)
{
case 3: denom *= 5; i *= 5; // fall through to case 15
case 15:return (a*float(denom15_weights[denom-i]) + b*float(denom15_weights[i])) / 64.0f;
case 7: return (a*float(denom7_weights[denom-i]) + b*float(denom7_weights[i])) / 64.0f;
default: nvUnreachable(); return Vector4(0);
}
#else
return (((a)*((denom)-i)+(b)*(i)+(bias))/(denom)); // simple exact interpolation
#endif
}
int Utils::unquantize(int q, int prec)
{
int unq;
nvAssert (prec > 3); // we only want to do one replicate
#ifdef USE_ZOH_QUANT
if (prec >= 8)
unq = q;
else if (q == 0)
unq = 0;
else if (q == ((1<<prec)-1))
unq = 255;
else
unq = (q * 256 + 128) >> prec;
#else
// avpcl unquantizer -- bit replicate
unq = (q << (8-prec)) | (q >> (2*prec-8));
#endif
return unq;
}
// quantize to the best value -- i.e., minimize unquantize error
int Utils::quantize(float value, int prec)
{
int q, unq;
nvAssert (prec > 3); // we only want to do one replicate
unq = (int)floor(value + 0.5f);
nvAssert (unq <= 255);
#ifdef USE_ZOH_QUANT
q = (prec >= 8) ? unq : (unq << prec) / 256;
#else
// avpcl quantizer -- scale properly for best possible bit-replicated result
q = (unq * ((1<<prec)-1) + 127)/255;
#endif
nvAssert (q >= 0 && q < (1 << prec));
return q;
}
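// Editor's sketch (not part of the source): on the avpcl bit-replicate path above,
// replication spreads the quantized range over [0,255] so 0 -> 0 and the maximum
// code -> 255. For prec == 5:
//
//   int q   = 0x15;                    // 10101b
//   int unq = (q << 3) | (q >> 2);     // 10101101b == 0xAD == 173
//
// and quantize(173.0f, 5) == (173*31 + 127)/255 == 21 == 0x15, so the pair
// round-trips exactly.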
float Utils::metric4(Vector4::Arg a, Vector4::Arg b)
{
Vector4 err = a - b;
// if nonuniform, select weights and weigh away
if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
{
float rwt, gwt, bwt;
if (AVPCL::flag_nonuniform)
{
rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
}
else /*if (AVPCL::flag_nonuniform_ati)*/
{
rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
}
// weigh the components
err.x *= rwt;
err.y *= gwt;
err.z *= bwt;
}
return lengthSquared(err);
}
// WORK -- implement rotatemode for the below -- that changes where the rwt, gwt, and bwt weights go.
float Utils::metric3(Vector3::Arg a, Vector3::Arg b, int rotatemode)
{
Vector3 err = a - b;
// if nonuniform, select weights and weigh away
if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
{
float rwt, gwt, bwt;
if (AVPCL::flag_nonuniform)
{
rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
}
else if (AVPCL::flag_nonuniform_ati)
{
rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
}
// adjust weights based on rotatemode
switch(rotatemode)
{
case ROTATEMODE_RGBA_RGBA: break;
case ROTATEMODE_RGBA_AGBR: rwt = 1.0f; break;
case ROTATEMODE_RGBA_RABG: gwt = 1.0f; break;
case ROTATEMODE_RGBA_RGAB: bwt = 1.0f; break;
default: nvUnreachable();
}
// weigh the components
err.x *= rwt;
err.y *= gwt;
err.z *= bwt;
}
return lengthSquared(err);
}
float Utils::metric1(const float a, const float b, int rotatemode)
{
float err = a - b;
// if nonuniform, select weights and weigh away
if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
{
float rwt, gwt, bwt, awt;
if (AVPCL::flag_nonuniform)
{
rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
}
else if (AVPCL::flag_nonuniform_ati)
{
rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
}
// adjust weights based on rotatemode
switch(rotatemode)
{
case ROTATEMODE_RGBA_RGBA: awt = 1.0f; break;
case ROTATEMODE_RGBA_AGBR: awt = rwt; break;
case ROTATEMODE_RGBA_RABG: awt = gwt; break;
case ROTATEMODE_RGBA_RGAB: awt = bwt; break;
default: nvUnreachable();
}
// weigh the components
err *= awt;
}
return err * err;
}
float Utils::premult(float r, float a)
{
// note that the args are really integers stored in floats
int R = int(r), A = int(a);
nvAssert ((R==r) && (A==a));
return float((R*A + 127)/255);
}
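// Editor's worked example (illustrative): (R*A + 127)/255 is integer scaling with
// round-to-nearest, e.g. premult(128.0f, 255.0f) == (32640+127)/255 == 128 and
// premult(128.0f, 128.0f) == (16384+127)/255 == 64.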
static void premult4(Vector4& rgba)
{
rgba.x = Utils::premult(rgba.x, rgba.w);
rgba.y = Utils::premult(rgba.y, rgba.w);
rgba.z = Utils::premult(rgba.z, rgba.w);
}
static void premult3(Vector3& rgb, float a)
{
rgb.x = Utils::premult(rgb.x, a);
rgb.y = Utils::premult(rgb.y, a);
rgb.z = Utils::premult(rgb.z, a);
}
float Utils::metric4premult(Vector4::Arg a, Vector4::Arg b)
{
Vector4 pma = a, pmb = b;
premult4(pma);
premult4(pmb);
Vector4 err = pma - pmb;
// if nonuniform, select weights and weigh away
if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
{
float rwt, gwt, bwt;
if (AVPCL::flag_nonuniform)
{
rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
}
else /*if (AVPCL::flag_nonuniform_ati)*/
{
rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
}
// weigh the components
err.x *= rwt;
err.y *= gwt;
err.z *= bwt;
}
return lengthSquared(err);
}
float Utils::metric3premult_alphaout(Vector3::Arg rgb0, float a0, Vector3::Arg rgb1, float a1)
{
Vector3 pma = rgb0, pmb = rgb1;
premult3(pma, a0);
premult3(pmb, a1);
Vector3 err = pma - pmb;
// if nonuniform, select weights and weigh away
if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
{
float rwt, gwt, bwt;
if (AVPCL::flag_nonuniform)
{
rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
}
else /*if (AVPCL::flag_nonuniform_ati)*/
{
rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
}
// weigh the components
err.x *= rwt;
err.y *= gwt;
err.z *= bwt;
}
return lengthSquared(err);
}
float Utils::metric3premult_alphain(Vector3::Arg rgb0, Vector3::Arg rgb1, int rotatemode)
{
Vector3 pma = rgb0, pmb = rgb1;
switch(rotatemode)
{
case ROTATEMODE_RGBA_RGBA:
// this function isn't supposed to be called for this rotatemode
nvUnreachable();
break;
case ROTATEMODE_RGBA_AGBR:
pma.y = premult(pma.y, pma.x);
pma.z = premult(pma.z, pma.x);
pmb.y = premult(pmb.y, pmb.x);
pmb.z = premult(pmb.z, pmb.x);
break;
case ROTATEMODE_RGBA_RABG:
pma.x = premult(pma.x, pma.y);
pma.z = premult(pma.z, pma.y);
pmb.x = premult(pmb.x, pmb.y);
pmb.z = premult(pmb.z, pmb.y);
break;
case ROTATEMODE_RGBA_RGAB:
pma.x = premult(pma.x, pma.z);
pma.y = premult(pma.y, pma.z);
pmb.x = premult(pmb.x, pmb.z);
pmb.y = premult(pmb.y, pmb.z);
break;
default: nvUnreachable();
}
Vector3 err = pma - pmb;
// if nonuniform, select weights and weigh away
if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
{
float rwt, gwt, bwt;
if (AVPCL::flag_nonuniform)
{
rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
}
else /*if (AVPCL::flag_nonuniform_ati)*/
{
rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
}
// weigh the components
err.x *= rwt;
err.y *= gwt;
err.z *= bwt;
}
return lengthSquared(err);
}
float Utils::metric1premult(float rgb0, float a0, float rgb1, float a1, int rotatemode)
{
float err = premult(rgb0, a0) - premult(rgb1, a1);
// if nonuniform, select weights and weigh away
if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
{
float rwt, gwt, bwt, awt;
if (AVPCL::flag_nonuniform)
{
rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
}
else if (AVPCL::flag_nonuniform_ati)
{
rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
}
// adjust weights based on rotatemode
switch(rotatemode)
{
case ROTATEMODE_RGBA_RGBA: awt = 1.0f; break;
case ROTATEMODE_RGBA_AGBR: awt = rwt; break;
case ROTATEMODE_RGBA_RABG: awt = gwt; break;
case ROTATEMODE_RGBA_RGAB: awt = bwt; break;
default: nvUnreachable();
}
// weigh the components
err *= awt;
}
return err * err;
}

61
3rdparty/nvtt/bc7/avpcl_utils.h vendored Normal file
View File

@@ -0,0 +1,61 @@
/*
Copyright 2007 nVidia, Inc.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License.
*/
// utility class holding common routines
#ifndef _AVPCL_UTILS_H
#define _AVPCL_UTILS_H
#include "nvmath/vector.h"
namespace AVPCL {
inline int SIGN_EXTEND(int x, int nb) { return ((((x)&(1<<((nb)-1)))?((~0)<<(nb)):0)|(x)); }
static const int INDEXMODE_BITS = 1; // 2 different index modes
static const int NINDEXMODES = (1<<(INDEXMODE_BITS));
static const int INDEXMODE_ALPHA_IS_3BITS = 0;
static const int INDEXMODE_ALPHA_IS_2BITS = 1;
static const int ROTATEMODE_BITS = 2; // 4 different rotate modes
static const int NROTATEMODES = (1<<(ROTATEMODE_BITS));
static const int ROTATEMODE_RGBA_RGBA = 0;
static const int ROTATEMODE_RGBA_AGBR = 1;
static const int ROTATEMODE_RGBA_RABG = 2;
static const int ROTATEMODE_RGBA_RGAB = 3;
class Utils
{
public:
// error metrics
static float metric4(nv::Vector4::Arg a, nv::Vector4::Arg b);
static float metric3(nv::Vector3::Arg a, nv::Vector3::Arg b, int rotatemode);
static float metric1(float a, float b, int rotatemode);
static float metric4premult(nv::Vector4::Arg rgba0, nv::Vector4::Arg rgba1);
static float metric3premult_alphaout(nv::Vector3::Arg rgb0, float a0, nv::Vector3::Arg rgb1, float a1);
static float metric3premult_alphain(nv::Vector3::Arg rgb0, nv::Vector3::Arg rgb1, int rotatemode);
static float metric1premult(float rgb0, float a0, float rgb1, float a1, int rotatemode);
static float premult(float r, float a);
// quantization and unquantization
static int unquantize(int q, int prec);
static int quantize(float value, int prec);
// lerping
static int lerp(int a, int b, int i, int bias, int denom);
static nv::Vector4 lerp(nv::Vector4::Arg a, nv::Vector4::Arg b, int i, int bias, int denom);
};
}
#endif

76
3rdparty/nvtt/bc7/bits.h vendored Normal file
View File

@@ -0,0 +1,76 @@
/*
Copyright 2007 nVidia, Inc.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License.
*/
#ifndef _AVPCL_BITS_H
#define _AVPCL_BITS_H
// read/write a bitstream
#include "nvcore/debug.h"
namespace AVPCL {
class Bits
{
public:
Bits(char *data, int maxdatabits) { nvAssert (data && maxdatabits > 0); bptr = bend = 0; bits = data; maxbits = maxdatabits; readonly = 0;}
Bits(const char *data, int availdatabits) { nvAssert (data && availdatabits > 0); bptr = 0; bend = availdatabits; cbits = data; maxbits = availdatabits; readonly = 1;}
void write(int value, int nbits) {
nvAssert (nbits >= 0 && nbits < 32);
nvAssert (sizeof(int)>= 4);
for (int i=0; i<nbits; ++i)
writeone(value>>i);
}
int read(int nbits) {
nvAssert (nbits >= 0 && nbits < 32);
nvAssert (sizeof(int)>= 4);
int out = 0;
for (int i=0; i<nbits; ++i)
out |= readone() << i;
return out;
}
int getptr() { return bptr; }
void setptr(int ptr) { nvAssert (ptr >= 0 && ptr < maxbits); bptr = ptr; }
int getsize() { return bend; }
private:
int bptr; // next bit to read
int bend; // last written bit + 1
char *bits; // ptr to user bit stream
const char *cbits; // ptr to const user bit stream
int maxbits; // max size of user bit stream
char readonly; // 1 if this is a read-only stream
int readone() {
nvAssert (bptr < bend);
if (bptr >= bend) return 0;
int bit = (readonly ? cbits[bptr>>3] : bits[bptr>>3]) & (1 << (bptr & 7));
++bptr;
return bit != 0;
}
void writeone(int bit) {
nvAssert (!readonly); // "Writing a read-only bit stream"
nvAssert (bptr < maxbits);
if (bptr >= maxbits) return;
if (bit&1)
bits[bptr>>3] |= 1 << (bptr & 7);
else
bits[bptr>>3] &= ~(1 << (bptr & 7));
if (bptr++ >= bend) bend = bptr;
}
};
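// Editor's usage sketch (illustrative only, not part of the original source):
// round-tripping a value through the stream. Bits are LSB-first within each byte,
// matching the block layouts used by the codecs above.
//
//   char block[16] = {};
//   Bits w(block, 128);
//   w.write(0x2A, 6);                    // writes the low 6 bits of 0x2A
//   Bits r((const char*)block, 128);
//   int v = r.read(6);                   // v == 0x2A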
}
#endif

81
3rdparty/nvtt/bc7/endpts.h vendored Normal file
View File

@@ -0,0 +1,81 @@
/*
Copyright 2007 nVidia, Inc.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License.
*/
#ifndef _AVPCL_ENDPTS_H
#define _AVPCL_ENDPTS_H
// endpoint definitions and routines to search through endpoint space
#include "nvmath/vector.h"
namespace AVPCL {
static const int NCHANNELS_RGB = 3;
static const int NCHANNELS_RGBA = 4;
static const int CHANNEL_R = 0;
static const int CHANNEL_G = 1;
static const int CHANNEL_B = 2;
static const int CHANNEL_A = 3;
struct FltEndpts
{
nv::Vector4 A;
nv::Vector4 B;
};
struct IntEndptsRGB
{
int A[NCHANNELS_RGB];
int B[NCHANNELS_RGB];
};
struct IntEndptsRGB_1
{
int A[NCHANNELS_RGB];
int B[NCHANNELS_RGB];
int lsb; // shared lsb for A and B
};
struct IntEndptsRGB_2
{
int A[NCHANNELS_RGB];
int B[NCHANNELS_RGB];
int a_lsb; // lsb for A
int b_lsb; // lsb for B
};
struct IntEndptsRGBA
{
int A[NCHANNELS_RGBA];
int B[NCHANNELS_RGBA];
};
struct IntEndptsRGBA_2
{
int A[NCHANNELS_RGBA];
int B[NCHANNELS_RGBA];
int a_lsb; // lsb for A
int b_lsb; // lsb for B
};
struct IntEndptsRGBA_2a
{
int A[NCHANNELS_RGBA];
int B[NCHANNELS_RGBA];
int a_lsb; // lsb for RGB channels of A
int b_lsb; // lsb for RGB channels of B
};
}
#endif

132
3rdparty/nvtt/bc7/shapes_three.h vendored Normal file
View File

@@ -0,0 +1,132 @@
/*
Copyright 2007 nVidia, Inc.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License.
*/
#ifndef _AVPCL_SHAPES_THREE_H
#define _AVPCL_SHAPES_THREE_H
// shapes for 3 regions
#define NREGIONS 3
#define NSHAPES 64
#define SHAPEBITS 6
static int shapes[NSHAPES*16] =
{
0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 2, 2,
0, 0, 1, 1, 0, 0, 1, 1, 2, 0, 0, 1, 0, 0, 2, 2,
0, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 0, 0, 1, 1,
2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 0, 1, 1, 1,
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 1, 1,
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 1, 1,
1, 1, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1,
1, 1, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2,
0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 2,
1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 1, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 1, 2,
0, 1, 1, 2, 0, 1, 2, 2, 0, 0, 1, 1, 0, 0, 1, 1,
0, 1, 1, 2, 0, 1, 2, 2, 0, 1, 1, 2, 2, 0, 0, 1,
0, 1, 1, 2, 0, 1, 2, 2, 1, 1, 2, 2, 2, 2, 0, 0,
0, 1, 1, 2, 0, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 0,
0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 2, 2,
0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 2, 2,
0, 1, 1, 2, 2, 0, 0, 1, 1, 1, 2, 2, 0, 0, 2, 2,
1, 1, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2, 1, 1, 1, 1,
0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0,
0, 2, 2, 2, 2, 2, 2, 1, 0, 1, 2, 2, 2, 2, 1, 0,
0, 2, 2, 2, 2, 2, 2, 1, 0, 1, 2, 2, 2, 2, 1, 0,
0, 1, 2, 2, 0, 0, 1, 2, 0, 1, 1, 0, 0, 0, 0, 0,
0, 1, 2, 2, 0, 0, 1, 2, 1, 2, 2, 1, 0, 1, 1, 0,
0, 0, 1, 1, 1, 1, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1,
0, 0, 0, 0, 2, 2, 2, 2, 0, 1, 1, 0, 1, 2, 2, 1,
0, 0, 2, 2, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
1, 1, 0, 2, 0, 1, 1, 0, 0, 1, 2, 2, 2, 0, 0, 0,
1, 1, 0, 2, 2, 0, 0, 2, 0, 1, 2, 2, 2, 2, 1, 1,
0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2, 2, 1,
0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 1, 1, 0, 1, 2, 0,
0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 1, 2, 0, 1, 2, 0,
1, 1, 2, 2, 0, 0, 1, 2, 0, 0, 2, 2, 0, 1, 2, 0,
1, 2, 2, 2, 0, 0, 1, 1, 0, 2, 2, 2, 0, 1, 2, 0,
0, 0, 0, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 0, 1, 1,
1, 1, 1, 1, 1, 2, 0, 1, 2, 0, 1, 2, 2, 2, 0, 0,
2, 2, 2, 2, 2, 0, 1, 2, 1, 2, 0, 1, 1, 1, 2, 2,
0, 0, 0, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 0, 1, 1,
0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 2, 2,
1, 1, 2, 2, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 2, 2,
2, 2, 0, 0, 2, 2, 2, 2, 2, 1, 2, 1, 0, 0, 2, 2,
0, 0, 1, 1, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 2,
0, 0, 2, 2, 0, 2, 2, 0, 0, 1, 0, 1, 0, 0, 0, 0,
0, 0, 1, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1,
0, 0, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 1, 2, 1,
0, 0, 1, 1, 1, 2, 2, 1, 0, 1, 0, 1, 2, 1, 2, 1,
0, 1, 0, 1, 0, 2, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0,
0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2,
0, 1, 0, 1, 0, 2, 2, 2, 0, 0, 0, 2, 2, 1, 1, 2,
2, 2, 2, 2, 0, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2,
0, 2, 2, 2, 0, 0, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 0, 2, 1, 1, 2,
0, 2, 2, 2, 0, 0, 0, 2, 2, 2, 2, 2, 2, 1, 1, 2,
0, 1, 1, 0, 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 0, 0,
0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 0, 0,
2, 2, 2, 2, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 0, 0,
2, 2, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 2, 1, 1, 2,
0, 0, 0, 2, 0, 2, 2, 2, 0, 1, 0, 1, 0, 1, 1, 1,
0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 1,
0, 0, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1,
0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0,
};
#define REGION(x,y,si) shapes[((si)&3)*4+((si)>>2)*64+(x)+(y)*16]
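// Editor's note (derived from the macro above): the table packs shapes four
// across -- (si)&3 picks the shape within a band of four, (si)>>2 picks the band
// (64 ints == four 16-int rows), and each listed row of 16 ints holds one
// scanline of four shapes. E.g. REGION(2,1,5) == shapes[1*4 + 1*64 + 2 + 1*16]
// == shapes[86].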
static int shapeindex_to_compressed_indices[NSHAPES*3] =
{
0, 3,15, 0, 3, 8, 0,15, 8, 0,15, 3,
0, 8,15, 0, 3,15, 0,15, 3, 0,15, 8,
0, 8,15, 0, 8,15, 0, 6,15, 0, 6,15,
0, 6,15, 0, 5,15, 0, 3,15, 0, 3, 8,
0, 3,15, 0, 3, 8, 0, 8,15, 0,15, 3,
0, 3,15, 0, 3, 8, 0, 6,15, 0,10, 8,
0, 5, 3, 0, 8,15, 0, 8, 6, 0, 6,10,
0, 8,15, 0, 5,15, 0,15,10, 0,15, 8,
0, 8,15, 0,15, 3, 0, 3,15, 0, 5,10,
0, 6,10, 0,10, 8, 0, 8, 9, 0,15,10,
0,15, 6, 0, 3,15, 0,15, 8, 0, 5,15,
0,15, 3, 0,15, 6, 0,15, 6, 0,15, 8,
0, 3,15, 0,15, 3, 0, 5,15, 0, 5,15,
0, 5,15, 0, 8,15, 0, 5,15, 0,10,15,
0, 5,15, 0,10,15, 0, 8,15, 0,13,15,
0,15, 3, 0,12,15, 0, 3,15, 0, 3, 8
};
#define SHAPEINDEX_TO_COMPRESSED_INDICES(si,region) shapeindex_to_compressed_indices[(si)*3+(region)]
#endif

133
3rdparty/nvtt/bc7/shapes_two.h vendored Normal file
View File

@@ -0,0 +1,133 @@
/*
Copyright 2007 nVidia, Inc.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License.
*/
#ifndef _AVPCL_SHAPES_TWO_H
#define _AVPCL_SHAPES_TWO_H
// shapes for two regions
#define NREGIONS 2
#define NSHAPES 64
#define SHAPEBITS 6
static int shapes[NSHAPES*16] =
{
0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1,
0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1,
0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1,
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1,
0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1,
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1,
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1,
1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,
0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1,
0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1,
1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1,
1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1,
0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0,
0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1,
1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0,
0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0,
1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1,
0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0,
0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0,
0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1,
1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1,
1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0,
1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0,
1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0,
1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0,
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1,
};
#define REGION(x,y,si) shapes[((si)&3)*4+((si)>>2)*64+(x)+(y)*16]
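// Illustrative helper, not part of the original header: count how many of the
// 16 texels in a 4x4 tile fall into region 1 for a given shape index.
// A minimal sketch of how REGION() is meant to be indexed.
static inline int region_one_count(int si)
{
int n = 0;
for (int y = 0; y < 4; y++)
for (int x = 0; x < 4; x++)
n += REGION(x, y, si); // table entries are 0 or 1 for two regions
return n;
}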
static int shapeindex_to_compressed_indices[NSHAPES*2] =
{
0,15, 0,15, 0,15, 0,15,
0,15, 0,15, 0,15, 0,15,
0,15, 0,15, 0,15, 0,15,
0,15, 0,15, 0,15, 0,15,
0,15, 0, 2, 0, 8, 0, 2,
0, 2, 0, 8, 0, 8, 0,15,
0, 2, 0, 8, 0, 2, 0, 2,
0, 8, 0, 8, 0, 2, 0, 2,
0,15, 0,15, 0, 6, 0, 8,
0, 2, 0, 8, 0,15, 0,15,
0, 2, 0, 8, 0, 2, 0, 2,
0, 2, 0,15, 0,15, 0, 6,
0, 6, 0, 2, 0, 6, 0, 8,
0,15, 0,15, 0, 2, 0, 2,
0,15, 0,15, 0,15, 0,15,
0,15, 0, 2, 0, 2, 0,15
};
#define SHAPEINDEX_TO_COMPRESSED_INDICES(si,region) shapeindex_to_compressed_indices[(si)*2+(region)]
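// Usage sketch (illustrative, not in the original header): the first entry of
// each pair above is always 0 (region 0's anchor texel), while the second
// entry is the anchor texel of region 1 for that shape.
static inline void get_anchor_indices(int si, int & anchor0, int & anchor1)
{
anchor0 = SHAPEINDEX_TO_COMPRESSED_INDICES(si, 0); // always 0 in this table
anchor1 = SHAPEINDEX_TO_COMPRESSED_INDICES(si, 1);
}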
#endif

41
3rdparty/nvtt/bc7/tile.h vendored Normal file

@@ -0,0 +1,41 @@
/*
Copyright 2007 nVidia, Inc.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License.
*/
#ifndef _AVPCL_TILE_H
#define _AVPCL_TILE_H
#include "nvmath/vector.h"
#include <math.h>
#include "avpcl_utils.h"
namespace AVPCL {
// extract a tile of pixels from an array
class Tile
{
public:
static const int TILE_H = 4;
static const int TILE_W = 4;
static const int TILE_TOTAL = TILE_H * TILE_W;
nv::Vector4 data[TILE_H][TILE_W];
float importance_map[TILE_H][TILE_W];
int size_x, size_y; // actual size of tile
Tile() {}
~Tile() {}
Tile(int xs, int ys) { size_x = xs; size_y = ys; }
};
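// Usage sketch (illustrative, not part of the original file; assumes the
// four-component nv::Vector4 constructor): fill a full 4x4 tile with opaque
// white and uniform importance before handing it to the compressor.
inline void fillWhite(Tile & t)
{
t.size_x = Tile::TILE_W;
t.size_y = Tile::TILE_H;
for (int y = 0; y < Tile::TILE_H; y++)
for (int x = 0; x < Tile::TILE_W; x++)
{
t.data[y][x] = nv::Vector4(255.0f, 255.0f, 255.0f, 255.0f);
t.importance_map[y][x] = 1.0f;
}
}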
}
#endif

181
3rdparty/nvtt/nvcore/array.h vendored Normal file

@@ -0,0 +1,181 @@
// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
#ifndef NV_CORE_ARRAY_H
#define NV_CORE_ARRAY_H
/*
This array class requires the elements to be relocatable; it uses memmove and realloc. Ideally I should be
using swap, but I honestly don't care. The only thing that you should be aware of is that internal pointers
are not supported.
Note also that push_back and resize do not support inserting elements that are already in the same
container. This is forbidden to prevent an extra copy.
*/
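// Illustrative hazard (not in the original header): because the buffer can be
// reallocated, pushing an element of the array into itself is forbidden:
//
//     nv::Array<int> a;
//     a.pushBack(1);
//     //a.pushBack(a[0]); // WRONG: a[0] may be invalidated by the realloc.
//     int tmp = a[0];     // Take a copy first,
//     a.pushBack(tmp);    // then push the copy.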
#include "memory.h"
#include "debug.h"
#include "foreach.h" // pseudoindex
namespace nv
{
class Stream;
/**
* Replacement for std::vector that is easier to debug and provides
* some nice foreach enumerators.
*/
template<typename T>
class NVCORE_CLASS Array {
public:
typedef uint size_type;
// Default constructor.
NV_FORCEINLINE Array() : m_buffer(NULL), m_capacity(0), m_size(0) {}
// Copy constructor.
NV_FORCEINLINE Array(const Array & a) : m_buffer(NULL), m_capacity(0), m_size(0) {
copy(a.m_buffer, a.m_size);
}
// Constructor that initializes the vector with the given elements.
NV_FORCEINLINE Array(const T * ptr, uint num) : m_buffer(NULL), m_capacity(0), m_size(0) {
copy(ptr, num);
}
// Allocate array.
NV_FORCEINLINE explicit Array(uint capacity) : m_buffer(NULL), m_capacity(0), m_size(0) {
setArrayCapacity(capacity);
}
// Destructor.
NV_FORCEINLINE ~Array() {
clear();
free<T>(m_buffer);
}
/// Const element access.
NV_FORCEINLINE const T & operator[]( uint index ) const
{
nvDebugCheck(index < m_size);
return m_buffer[index];
}
NV_FORCEINLINE const T & at( uint index ) const
{
nvDebugCheck(index < m_size);
return m_buffer[index];
}
/// Element access.
NV_FORCEINLINE T & operator[] ( uint index )
{
nvDebugCheck(index < m_size);
return m_buffer[index];
}
NV_FORCEINLINE T & at( uint index )
{
nvDebugCheck(index < m_size);
return m_buffer[index];
}
/// Get vector size.
NV_FORCEINLINE uint size() const { return m_size; }
/// Get vector size.
NV_FORCEINLINE uint count() const { return m_size; }
/// Get vector capacity.
NV_FORCEINLINE uint capacity() const { return m_capacity; }
/// Get const vector pointer.
NV_FORCEINLINE const T * buffer() const { return m_buffer; }
/// Get vector pointer.
NV_FORCEINLINE T * buffer() { return m_buffer; }
/// Provide begin/end pointers for C++11 range-based for loops.
NV_FORCEINLINE T * begin() { return m_buffer; }
NV_FORCEINLINE T * end() { return m_buffer + m_size; }
NV_FORCEINLINE const T * begin() const { return m_buffer; }
NV_FORCEINLINE const T * end() const { return m_buffer + m_size; }
/// Is vector empty.
NV_FORCEINLINE bool isEmpty() const { return m_size == 0; }
/// Is a null vector.
NV_FORCEINLINE bool isNull() const { return m_buffer == NULL; }
T & append();
void push_back( const T & val );
void pushBack( const T & val );
Array<T> & append( const T & val );
Array<T> & operator<< ( T & t );
void pop_back();
void popBack(uint count = 1);
void popFront(uint count = 1);
const T & back() const;
T & back();
const T & front() const;
T & front();
bool contains(const T & e) const;
bool find(const T & element, uint * indexPtr) const;
bool find(const T & element, uint begin, uint end, uint * indexPtr) const;
void removeAt(uint index);
bool remove(const T & element);
void insertAt(uint index, const T & val = T());
void append(const Array<T> & other);
void append(const T other[], uint count);
void replaceWithLast(uint index);
void resize(uint new_size);
void resize(uint new_size, const T & elem);
void fill(const T & elem);
void clear();
void shrink();
void reserve(uint desired_size);
void copy(const T * data, uint count);
Array<T> & operator=( const Array<T> & a );
T * release();
// Array enumerator.
typedef uint PseudoIndex;
NV_FORCEINLINE PseudoIndex start() const { return 0; }
NV_FORCEINLINE bool isDone(const PseudoIndex & i) const { nvDebugCheck(i <= this->m_size); return i == this->m_size; }
NV_FORCEINLINE void advance(PseudoIndex & i) const { nvDebugCheck(i <= this->m_size); i++; }
#if NV_CC_MSVC
NV_FORCEINLINE T & operator[]( const PseudoIndexWrapper & i ) {
return m_buffer[i(this)];
}
NV_FORCEINLINE const T & operator[]( const PseudoIndexWrapper & i ) const {
return m_buffer[i(this)];
}
#endif
// Friends.
template <typename Typ>
friend Stream & operator<< ( Stream & s, Array<Typ> & p );
template <typename Typ>
friend void swap(Array<Typ> & a, Array<Typ> & b);
protected:
void setArraySize(uint new_size);
void setArrayCapacity(uint new_capacity);
T * m_buffer;
uint m_capacity;
uint m_size;
};
} // nv namespace
#endif // NV_CORE_ARRAY_H

437
3rdparty/nvtt/nvcore/array.inl vendored Normal file

@@ -0,0 +1,437 @@
// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
#ifndef NV_CORE_ARRAY_INL
#define NV_CORE_ARRAY_INL
#include "array.h"
#include "stream.h"
#include "utils.h" // swap
#include <string.h> // memmove
#include <new> // for placement new
namespace nv
{
template <typename T>
NV_FORCEINLINE T & Array<T>::append()
{
uint old_size = m_size;
uint new_size = m_size + 1;
setArraySize(new_size);
construct_range(m_buffer, new_size, old_size);
return m_buffer[old_size]; // Return reference to last element.
}
// Push an element at the end of the vector.
template <typename T>
NV_FORCEINLINE void Array<T>::push_back( const T & val )
{
#if 1
nvDebugCheck(&val < m_buffer || &val >= m_buffer+m_size);
uint old_size = m_size;
uint new_size = m_size + 1;
setArraySize(new_size);
construct_range(m_buffer, new_size, old_size, val);
#else
uint new_size = m_size + 1;
if (new_size > m_capacity)
{
// @@ Is there any way to avoid this copy?
// @@ Can we create a copy without side effects? Ie. without calls to constructor/destructor. Use alloca + memcpy?
// @@ Assert instead of copy?
const T copy(val); // create a copy in case value is inside of this array.
setArraySize(new_size);
new (m_buffer+new_size-1) T(copy);
}
else
{
m_size = new_size;
new(m_buffer+new_size-1) T(val);
}
#endif // 0/1
}
template <typename T>
NV_FORCEINLINE void Array<T>::pushBack( const T & val )
{
push_back(val);
}
template <typename T>
NV_FORCEINLINE Array<T> & Array<T>::append( const T & val )
{
push_back(val);
return *this;
}
// Qt like push operator.
template <typename T>
NV_FORCEINLINE Array<T> & Array<T>::operator<< ( T & t )
{
push_back(t);
return *this;
}
// Pop the element at the end of the vector.
template <typename T>
NV_FORCEINLINE void Array<T>::pop_back()
{
nvDebugCheck( m_size > 0 );
resize( m_size - 1 );
}
template <typename T>
NV_FORCEINLINE void Array<T>::popBack(uint count)
{
nvDebugCheck(m_size >= count);
resize(m_size - count);
}
template <typename T>
NV_FORCEINLINE void Array<T>::popFront(uint count)
{
nvDebugCheck(m_size >= count);
//resize(m_size - count);
if (m_size == count) {
clear();
}
else {
destroy_range(m_buffer, 0, count);
memmove(m_buffer, m_buffer + count, sizeof(T) * (m_size - count));
m_size -= count;
}
}
// Get back element.
template <typename T>
NV_FORCEINLINE const T & Array<T>::back() const
{
nvDebugCheck( m_size > 0 );
return m_buffer[m_size-1];
}
// Get back element.
template <typename T>
NV_FORCEINLINE T & Array<T>::back()
{
nvDebugCheck( m_size > 0 );
return m_buffer[m_size-1];
}
// Get front element.
template <typename T>
NV_FORCEINLINE const T & Array<T>::front() const
{
nvDebugCheck( m_size > 0 );
return m_buffer[0];
}
// Get front element.
template <typename T>
NV_FORCEINLINE T & Array<T>::front()
{
nvDebugCheck( m_size > 0 );
return m_buffer[0];
}
// Check if the given element is contained in the array.
template <typename T>
NV_FORCEINLINE bool Array<T>::contains(const T & e) const
{
return find(e, NULL);
}
// Return true if element found.
template <typename T>
NV_FORCEINLINE bool Array<T>::find(const T & element, uint * indexPtr) const
{
return find(element, 0, m_size, indexPtr);
}
// Return true if element found within the given range.
template <typename T>
NV_FORCEINLINE bool Array<T>::find(const T & element, uint begin, uint end, uint * indexPtr) const
{
return ::nv::find(element, m_buffer, begin, end, indexPtr);
}
// Remove the element at the given index. This is an expensive operation!
template <typename T>
void Array<T>::removeAt(uint index)
{
nvDebugCheck(index < m_size);
if (m_size == 1) {
clear();
}
else {
m_buffer[index].~T();
memmove(m_buffer+index, m_buffer+index+1, sizeof(T) * (m_size - 1 - index));
m_size--;
}
}
// Remove the first instance of the given element.
template <typename T>
bool Array<T>::remove(const T & element)
{
uint index;
if (find(element, &index)) {
removeAt(index);
return true;
}
return false;
}
// Insert the given element at the given index shifting all the elements up.
template <typename T>
void Array<T>::insertAt(uint index, const T & val/*=T()*/)
{
nvDebugCheck( index <= m_size );
setArraySize(m_size + 1);
if (index < m_size - 1) {
memmove(m_buffer+index+1, m_buffer+index, sizeof(T) * (m_size - 1 - index));
}
// Copy-construct into the newly opened slot.
new(m_buffer+index) T(val);
}
// Append the given data to our vector.
template <typename T>
NV_FORCEINLINE void Array<T>::append(const Array<T> & other)
{
append(other.m_buffer, other.m_size);
}
// Append the given data to our vector.
template <typename T>
void Array<T>::append(const T other[], uint count)
{
if (count > 0) {
const uint old_size = m_size;
setArraySize(m_size + count);
for (uint i = 0; i < count; i++ ) {
new(m_buffer + old_size + i) T(other[i]);
}
}
}
// Remove the given element by replacing it with the last one.
template <typename T>
void Array<T>::replaceWithLast(uint index)
{
nvDebugCheck( index < m_size );
nv::swap(m_buffer[index], back()); // @@ Is this OK when index == size-1?
(m_buffer+m_size-1)->~T();
m_size--;
}
// Resize the vector preserving existing elements.
template <typename T>
void Array<T>::resize(uint new_size)
{
uint old_size = m_size;
// Destruct old elements (if we're shrinking).
destroy_range(m_buffer, new_size, old_size);
setArraySize(new_size);
// Call default constructors
construct_range(m_buffer, new_size, old_size);
}
// Resize the vector preserving existing elements and initializing the
// new ones with the given value.
template <typename T>
void Array<T>::resize(uint new_size, const T & elem)
{
nvDebugCheck(&elem < m_buffer || &elem > m_buffer+m_size);
uint old_size = m_size;
// Destruct old elements (if we're shrinking).
destroy_range(m_buffer, new_size, old_size);
setArraySize(new_size);
// Call copy constructors
construct_range(m_buffer, new_size, old_size, elem);
}
// Fill array with the given value.
template <typename T>
void Array<T>::fill(const T & elem)
{
fill(m_buffer, m_size, elem);
}
// Clear the buffer.
template <typename T>
NV_FORCEINLINE void Array<T>::clear()
{
nvDebugCheck(isValidPtr(m_buffer));
// Destruct old elements
destroy_range(m_buffer, 0, m_size);
m_size = 0;
}
// Shrink the allocated vector.
template <typename T>
NV_FORCEINLINE void Array<T>::shrink()
{
if (m_size < m_capacity) {
setArrayCapacity(m_size);
}
}
// Preallocate space.
template <typename T>
NV_FORCEINLINE void Array<T>::reserve(uint desired_size)
{
if (desired_size > m_capacity) {
setArrayCapacity(desired_size);
}
}
// Copy elements to this array. Resizes it if needed.
template <typename T>
NV_FORCEINLINE void Array<T>::copy(const T * data, uint count)
{
#if 1 // Simpler, but maybe not as efficient?
destroy_range(m_buffer, 0, m_size);
setArraySize(count);
construct_range(m_buffer, count, 0, data);
#else
const uint old_size = m_size;
destroy_range(m_buffer, count, old_size);
setArraySize(count);
copy_range(m_buffer, data, old_size);
construct_range(m_buffer, count, old_size, data);
#endif
}
// Assignment operator.
template <typename T>
NV_FORCEINLINE Array<T> & Array<T>::operator=( const Array<T> & a )
{
copy(a.m_buffer, a.m_size);
return *this;
}
// Release ownership of allocated memory and returns pointer to it.
template <typename T>
T * Array<T>::release() {
T * tmp = m_buffer;
m_buffer = NULL;
m_capacity = 0;
m_size = 0;
return tmp;
}
// Change array size.
template <typename T>
inline void Array<T>::setArraySize(uint new_size) {
m_size = new_size;
if (new_size > m_capacity) {
uint new_buffer_size;
if (m_capacity == 0) {
// first allocation is exact
new_buffer_size = new_size;
}
else {
// following allocations grow array by 25%
new_buffer_size = new_size + (new_size >> 2);
}
setArrayCapacity( new_buffer_size );
}
}
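// Growth sketch (illustrative): starting from an empty array, repeated
// pushBack calls produce capacities 1, 2, 3, 5, 7, 10, 13, 17, ...: the first
// allocation is exact, later ones reserve new_size + new_size/4 (25% headroom).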
// Change array capacity.
template <typename T>
inline void Array<T>::setArrayCapacity(uint new_capacity) {
nvDebugCheck(new_capacity >= m_size);
if (new_capacity == 0) {
// free the buffer.
if (m_buffer != NULL) {
free<T>(m_buffer);
m_buffer = NULL;
}
}
else {
// realloc the buffer
m_buffer = realloc<T>(m_buffer, new_capacity);
}
m_capacity = new_capacity;
}
// Array serialization.
template <typename Typ>
inline Stream & operator<< ( Stream & s, Array<Typ> & p )
{
if (s.isLoading()) {
uint size;
s << size;
p.resize( size );
}
else {
s << p.m_size;
}
for (uint i = 0; i < p.m_size; i++) {
s << p.m_buffer[i];
}
return s;
}
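// Usage sketch (illustrative, assuming the stream classes from
// nvcore/stdstream.h): the same operator saves or loads depending on the
// stream direction.
//
//     nv::Array<uint32> values;
//     values.pushBack(42);
//     StdOutputStream out("values.bin");
//     out << values;   // writes the size, then each element
//     StdInputStream in("values.bin");
//     in << values;    // resizes the array, then reads elements back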
// Swap the members of the two given vectors.
template <typename Typ>
inline void swap(Array<Typ> & a, Array<Typ> & b)
{
nv::swap(a.m_buffer, b.m_buffer);
nv::swap(a.m_capacity, b.m_capacity);
nv::swap(a.m_size, b.m_size);
}
} // nv namespace
#endif // NV_CORE_ARRAY_INL

216
3rdparty/nvtt/nvcore/debug.h vendored Normal file

@@ -0,0 +1,216 @@
// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
#ifndef NV_CORE_DEBUG_H
#define NV_CORE_DEBUG_H
#include "nvcore.h"
#include <stdarg.h> // va_list
// Make sure we are using our assert.
#undef assert
#define NV_ABORT_DEBUG 1
#define NV_ABORT_IGNORE 2
#define NV_ABORT_EXIT 3
#define nvNoAssert(exp) \
NV_MULTI_LINE_MACRO_BEGIN \
(void)sizeof(exp); \
NV_MULTI_LINE_MACRO_END
#if NV_NO_ASSERT
# define nvAssert(exp) nvNoAssert(exp)
# define nvCheck(exp) nvNoAssert(exp)
# define nvDebugAssert(exp) nvNoAssert(exp)
# define nvDebugCheck(exp) nvNoAssert(exp)
# define nvDebugBreak() nvNoAssert(0)
#else // NV_NO_ASSERT
# if NV_CC_MSVC
// @@ Does this work in msvc-6 and earlier?
# define nvDebugBreak() __debugbreak()
//# define nvDebugBreak() __asm { int 3 }
# elif NV_OS_ORBIS
# define nvDebugBreak() __debugbreak()
# elif NV_CC_GNUC
# define nvDebugBreak() __builtin_trap()
# else
# error "No nvDebugBreak()!"
# endif
/*
# elif NV_CC_GNUC || NV_CPU_PPC && NV_OS_DARWIN
// @@ Use __builtin_trap() on GCC
# define nvDebugBreak() __asm__ volatile ("trap")
# elif (NV_CC_GNUC || NV_CPU_X86 || NV_CPU_X86_64) && NV_OS_DARWIN
# define nvDebugBreak() __asm__ volatile ("int3")
# elif NV_CC_GNUC || NV_CPU_X86 || NV_CPU_X86_64
# define nvDebugBreak() __asm__ ( "int %0" : :"I"(3) )
# else
# include <signal.h>
# define nvDebugBreak() raise(SIGTRAP)
# endif
*/
#define nvDebugBreakOnce() \
NV_MULTI_LINE_MACRO_BEGIN \
static bool firstTime = true; \
if (firstTime) { firstTime = false; nvDebugBreak(); } \
NV_MULTI_LINE_MACRO_END
#define nvAssertMacro(exp) \
NV_MULTI_LINE_MACRO_BEGIN \
if (!(exp)) { \
if (nvAbort(#exp, __FILE__, __LINE__, __FUNC__) == NV_ABORT_DEBUG) { \
nvDebugBreak(); \
} \
} \
NV_MULTI_LINE_MACRO_END
// GCC, LLVM need "##" before the __VA_ARGS__, MSVC doesn't care
#define nvAssertMacroWithIgnoreAll(exp,...) \
NV_MULTI_LINE_MACRO_BEGIN \
static bool ignoreAll = false; \
if (!ignoreAll && !(exp)) { \
int result = nvAbort(#exp, __FILE__, __LINE__, __FUNC__, ##__VA_ARGS__); \
if (result == NV_ABORT_DEBUG) { \
nvDebugBreak(); \
} else if (result == NV_ABORT_IGNORE) { \
ignoreAll = true; \
} \
} \
NV_MULTI_LINE_MACRO_END
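// Usage sketch (illustrative): the variadic form lets a failing assert print
// context, and answering "ignore" (NV_ABORT_IGNORE) silences further hits of
// the same call site:
//
//     nvAssertMacroWithIgnoreAll(index < count, "index=%d count=%d", index, count);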
// Interesting assert macro from Insomniac:
// http://www.gdcvault.com/play/1015319/Developing-Imperfect-Software-How-to
// Used as follows:
// if (nvCheck(i < count)) {
// normal path
// } else {
// fixup code.
// }
// This style of macro could be combined with __builtin_expect to let the compiler know failure is unlikely.
#define nvCheckMacro(exp) \
(\
(exp) ? true : ( \
(nvAbort(#exp, __FILE__, __LINE__, __FUNC__) == NV_ABORT_DEBUG) ? (nvDebugBreak(), true) : ( false ) \
) \
)
#define nvAssert(exp) nvAssertMacro(exp)
#define nvCheck(exp) nvAssertMacro(exp)
#if defined(_DEBUG)
# define nvDebugAssert(exp) nvAssertMacro(exp)
# define nvDebugCheck(exp) nvAssertMacro(exp)
#else // _DEBUG
# define nvDebugAssert(exp) nvNoAssert(exp)
# define nvDebugCheck(exp) nvNoAssert(exp)
#endif // _DEBUG
#endif // NV_NO_ASSERT
// Use nvAssume for very simple expressions only: nvAssume(0), nvAssume(value == true), etc.
/*#if !defined(_DEBUG)
# if NV_CC_MSVC
# define nvAssume(exp) __assume(exp)
# else
# define nvAssume(exp) nvCheck(exp)
# endif
#else
# define nvAssume(exp) nvCheck(exp)
#endif*/
#if defined(_DEBUG)
# if NV_CC_MSVC
# define nvUnreachable() nvAssert(0 && "unreachable"); __assume(0)
# else
# define nvUnreachable() nvAssert(0 && "unreachable"); __builtin_unreachable()
# endif
#else
# if NV_CC_MSVC
# define nvUnreachable() __assume(0)
# else
# define nvUnreachable() __builtin_unreachable()
# endif
#endif
#define nvError(x) nvAbort(x, __FILE__, __LINE__, __FUNC__)
#define nvWarning(x) nvDebugPrint("*** Warning %s/%d: %s\n", __FILE__, __LINE__, (x))
#ifndef NV_DEBUG_PRINT
#define NV_DEBUG_PRINT 1 //defined(_DEBUG)
#endif
#if NV_DEBUG_PRINT
#define nvDebug(...) nvDebugPrint(__VA_ARGS__)
#else
#if NV_CC_MSVC
#define nvDebug(...) __noop(__VA_ARGS__)
#else
#define nvDebug(...) ((void)0) // Non-msvc platforms do not evaluate arguments?
#endif
#endif
NVCORE_API int nvAbort(const char *exp, const char *file, int line, const char * func = NULL, const char * msg = NULL, ...) __attribute__((format (printf, 5, 6)));
NVCORE_API void NV_CDECL nvDebugPrint( const char *msg, ... ) __attribute__((format (printf, 1, 2)));
namespace nv
{
inline bool isValidPtr(const void * ptr) {
#if NV_CPU_X86_64
if (ptr == NULL) return true;
if (reinterpret_cast<uint64>(ptr) < 0x10000ULL) return false;
if (reinterpret_cast<uint64>(ptr) >= 0x000007FFFFFEFFFFULL) return false;
#else
if (reinterpret_cast<uint32>(ptr) == 0xcccccccc) return false;
if (reinterpret_cast<uint32>(ptr) == 0xcdcdcdcd) return false;
if (reinterpret_cast<uint32>(ptr) == 0xdddddddd) return false;
if (reinterpret_cast<uint32>(ptr) == 0xffffffff) return false;
#endif
return true;
}
// Message handler interface.
struct MessageHandler {
virtual void log(const char * str, va_list arg) = 0;
virtual ~MessageHandler() {}
};
// Assert handler interface.
struct AssertHandler {
virtual int assertion(const char *exp, const char *file, int line, const char *func, const char *msg, va_list arg) = 0;
virtual ~AssertHandler() {}
};
namespace debug
{
NVCORE_API void dumpInfo();
NVCORE_API void dumpCallstack( MessageHandler *messageHandler, int callstackLevelsToSkip = 0 );
NVCORE_API void setMessageHandler( MessageHandler * messageHandler );
NVCORE_API void resetMessageHandler();
NVCORE_API void setAssertHandler( AssertHandler * assertHandler );
NVCORE_API void resetAssertHandler();
NVCORE_API void enableSigHandler(bool interactive);
NVCORE_API void disableSigHandler();
NVCORE_API bool isDebuggerPresent();
NVCORE_API bool attachToDebugger();
NVCORE_API void terminate(int code);
}
} // nv namespace
#endif // NV_CORE_DEBUG_H

57
3rdparty/nvtt/nvcore/defsgnucdarwin.h vendored Normal file

@@ -0,0 +1,57 @@
#ifndef NV_CORE_H
#error "Do not include this file directly."
#endif
#include <stdint.h> // uint8_t, int8_t, ... uintptr_t
#include <stddef.h> // operator new, size_t, NULL
#ifndef __STDC_VERSION__
# define __STDC_VERSION__ 0
#endif // __STDC_VERSION__
// Function linkage
#define DLL_IMPORT
#if __GNUC__ >= 4
# define DLL_EXPORT __attribute__((visibility("default")))
# define DLL_EXPORT_CLASS DLL_EXPORT
#else
# define DLL_EXPORT
# define DLL_EXPORT_CLASS
#endif
// Function calling modes
#if NV_CPU_X86
# define NV_CDECL __attribute__((cdecl))
# define NV_STDCALL __attribute__((stdcall))
#else
# define NV_CDECL
# define NV_STDCALL
#endif
#define NV_FASTCALL __attribute__((fastcall))
#define NV_FORCEINLINE inline
#define NV_DEPRECATED __attribute__((deprecated))
#define NV_THREAD_LOCAL //ACS: there's no "__thread" or equivalent on iOS/OSX
#if __GNUC__ > 2
#define NV_PURE __attribute__((pure))
#define NV_CONST __attribute__((const))
#else
#define NV_PURE
#define NV_CONST
#endif
#define NV_NOINLINE __attribute__((noinline))
// Define __FUNC__ properly.
#if defined(__STDC_VERSION__) && __STDC_VERSION__ < 199901L
# if __GNUC__ >= 2
# define __FUNC__ __PRETTY_FUNCTION__ // __FUNCTION__
# else
# define __FUNC__ "<unknown>"
# endif
#else
# define __FUNC__ __PRETTY_FUNCTION__
#endif
#define restrict __restrict__

63
3rdparty/nvtt/nvcore/defsgnuclinux.h vendored Normal file

@@ -0,0 +1,63 @@
#ifndef NV_CORE_H
#error "Do not include this file directly."
#endif
#include <stdint.h> // uint8_t, int8_t, ... uintptr_t
#include <stddef.h> // operator new, size_t, NULL
#ifndef __STDC_VERSION__
# define __STDC_VERSION__ 0
#endif
// Function linkage
#define DLL_IMPORT
#if __GNUC__ >= 4
# define DLL_EXPORT __attribute__((visibility("default")))
# define DLL_EXPORT_CLASS DLL_EXPORT
#else
# define DLL_EXPORT
# define DLL_EXPORT_CLASS
#endif
// Function calling modes
#if NV_CPU_X86
# define NV_CDECL __attribute__((cdecl))
# define NV_STDCALL __attribute__((stdcall))
#else
# define NV_CDECL
# define NV_STDCALL
#endif
#define NV_FASTCALL __attribute__((fastcall))
//#if __GNUC__ > 3
// It seems that GCC does not assume always_inline implies inline. I think this depends on the GCC version :(
#define NV_FORCEINLINE inline
//#else
// Some compilers complain that inline and always_inline are redundant.
//#define NV_FORCEINLINE __attribute__((always_inline))
//#endif
#define NV_DEPRECATED __attribute__((deprecated))
#define NV_THREAD_LOCAL __thread
#if __GNUC__ > 2
#define NV_PURE __attribute__((pure))
#define NV_CONST __attribute__((const))
#else
#define NV_PURE
#define NV_CONST
#endif
#define NV_NOINLINE __attribute__((noinline))
// Define __FUNC__ properly.
#if defined(__STDC_VERSION__) && __STDC_VERSION__ < 199901L
# if __GNUC__ >= 2
# define __FUNC__ __PRETTY_FUNCTION__ // __FUNCTION__
# else
# define __FUNC__ "<unknown>"
# endif
#else
# define __FUNC__ __PRETTY_FUNCTION__
#endif
#define restrict __restrict__

65
3rdparty/nvtt/nvcore/defsgnucwin32.h vendored Normal file

@@ -0,0 +1,65 @@
#ifndef NV_CORE_H
#error "Do not include this file directly."
#endif
//#include <cstddef> // size_t, NULL
// Function linkage
#define DLL_IMPORT __declspec(dllimport)
#define DLL_EXPORT __declspec(dllexport)
#define DLL_EXPORT_CLASS DLL_EXPORT
// Function calling modes
#if NV_CPU_X86
# define NV_CDECL __attribute__((cdecl))
# define NV_STDCALL __attribute__((stdcall))
#else
# define NV_CDECL
# define NV_STDCALL
#endif
#define NV_FASTCALL __attribute__((fastcall))
#define NV_FORCEINLINE inline
#define NV_DEPRECATED __attribute__((deprecated))
#if __GNUC__ > 2
#define NV_PURE __attribute__((pure))
#define NV_CONST __attribute__((const))
#else
#define NV_PURE
#define NV_CONST
#endif
#define NV_NOINLINE __attribute__((noinline))
// Define __FUNC__ properly.
#if defined(__STDC_VERSION__) && __STDC_VERSION__ < 199901L
# if __GNUC__ >= 2
# define __FUNC__ __PRETTY_FUNCTION__ // __FUNCTION__
# else
# define __FUNC__ "<unknown>"
# endif
#else
# define __FUNC__ __PRETTY_FUNCTION__
#endif
#define restrict __restrict__
/*
// Type definitions
typedef unsigned char uint8;
typedef signed char int8;
typedef unsigned short uint16;
typedef signed short int16;
typedef unsigned int uint32;
typedef signed int int32;
typedef unsigned long long uint64;
typedef signed long long int64;
// Aliases
typedef uint32 uint;
*/

94
3rdparty/nvtt/nvcore/defsvcwin32.h vendored Normal file

@@ -0,0 +1,94 @@
// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
#ifndef NV_CORE_H
#error "Do not include this file directly."
#endif
// Function linkage
#define DLL_IMPORT __declspec(dllimport)
#define DLL_EXPORT __declspec(dllexport)
#define DLL_EXPORT_CLASS DLL_EXPORT
// Function calling modes
#define NV_CDECL __cdecl
#define NV_STDCALL __stdcall
#define NV_FASTCALL __fastcall
#define NV_DEPRECATED
#define NV_PURE
#define NV_CONST
// Set standard function names.
#if _MSC_VER < 1900
# define snprintf _snprintf
#endif
#if _MSC_VER < 1500
# define vsnprintf _vsnprintf
#endif
#if _MSC_VER < 1700
# define strtoll _strtoi64
# define strtoull _strtoui64
#endif
#define chdir _chdir
#define getcwd _getcwd
#if _MSC_VER < 1800 // Not sure what version introduced this.
#define va_copy(a, b) (a) = (b)
#endif
#if !defined restrict
#define restrict
#endif
// Ignore gcc attributes.
#define __attribute__(X)
#if !defined __FUNC__
#define __FUNC__ __FUNCTION__
#endif
#define NV_NOINLINE __declspec(noinline)
#define NV_FORCEINLINE inline
#define NV_THREAD_LOCAL __declspec(thread)
/*
// Type definitions
typedef unsigned char uint8;
typedef signed char int8;
typedef unsigned short uint16;
typedef signed short int16;
typedef unsigned int uint32;
typedef signed int int32;
typedef unsigned __int64 uint64;
typedef signed __int64 int64;
// Aliases
typedef uint32 uint;
*/
// Unwanted VC++ warnings to disable.
/*
#pragma warning(disable : 4244) // conversion to float, possible loss of data
#pragma warning(disable : 4245) // conversion from 'enum ' to 'unsigned long', signed/unsigned mismatch
#pragma warning(disable : 4100) // unreferenced formal parameter
#pragma warning(disable : 4514) // unreferenced inline function has been removed
#pragma warning(disable : 4710) // inline function not expanded
#pragma warning(disable : 4127) // Conditional expression is constant
#pragma warning(disable : 4305) // truncation from 'const double' to 'float'
#pragma warning(disable : 4505) // unreferenced local function has been removed
#pragma warning(disable : 4702) // unreachable code in inline expanded function
#pragma warning(disable : 4711) // function selected for automatic inlining
#pragma warning(disable : 4725) // Pentium fdiv bug
#pragma warning(disable : 4786) // Identifier was truncated and cannot be debugged.
#pragma warning(disable : 4675) // resolved overload was found by argument-dependent lookup
*/
#pragma warning(1 : 4705) // Report unused local variables.
#pragma warning(1 : 4555) // Expression has no effect.

68
3rdparty/nvtt/nvcore/foreach.h vendored Normal file

@@ -0,0 +1,68 @@
// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
#pragma once
#ifndef NV_CORE_FOREACH_H
#define NV_CORE_FOREACH_H
/*
These foreach macros are very non-standard and somewhat confusing, but I like them.
*/
#include "nvcore.h"
#if NV_CC_GNUC // If typeof or decltype is available:
#if !NV_CC_CPP11
# define NV_DECLTYPE typeof // Use the non-standard typeof extension, which behaves like C++11 decltype.
#else
# define NV_DECLTYPE decltype
#endif
/*
Ideally we would like to write this:
#define NV_FOREACH(i, container) \
for(NV_DECLTYPE(container)::PseudoIndex i((container).start()); !(container).isDone(i); (container).advance(i))
But gcc versions prior to 4.7 required an intermediate type. See:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6709
*/
#define NV_FOREACH(i, container) \
typedef NV_DECLTYPE(container) NV_STRING_JOIN2(cont,__LINE__); \
for(NV_STRING_JOIN2(cont,__LINE__)::PseudoIndex i((container).start()); !(container).isDone(i); (container).advance(i))
#else // If typeof not available:
#include <new> // placement new
struct PseudoIndexWrapper {
template <typename T>
PseudoIndexWrapper(const T & container) {
nvStaticCheck(sizeof(typename T::PseudoIndex) <= sizeof(memory));
new (memory) typename T::PseudoIndex(container.start());
}
// PseudoIndex cannot have a dtor!
template <typename T> typename T::PseudoIndex & operator()(const T * /*container*/) {
return *reinterpret_cast<typename T::PseudoIndex *>(memory);
}
template <typename T> const typename T::PseudoIndex & operator()(const T * /*container*/) const {
return *reinterpret_cast<const typename T::PseudoIndex *>(memory);
}
uint8 memory[4]; // Increase the size if we have bigger enumerators.
};
#define NV_FOREACH(i, container) \
for(PseudoIndexWrapper i(container); !(container).isDone(i(&(container))); (container).advance(i(&(container))))
#endif
// Declare foreach keyword.
#if !defined NV_NO_USE_KEYWORDS
# define foreach NV_FOREACH
# define foreach_index NV_FOREACH
#endif
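// Usage sketch (illustrative, assuming nv::Array from nvcore/array.h):
//
//     nv::Array<int> values;
//     values.pushBack(1);
//     values.pushBack(2);
//     foreach(i, values) {
//         nvDebug("%d\n", values[i]);
//     }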
#endif // NV_CORE_FOREACH_H

83
3rdparty/nvtt/nvcore/hash.h vendored Normal file

@@ -0,0 +1,83 @@
// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
#pragma once
#ifndef NV_CORE_HASH_H
#define NV_CORE_HASH_H
#include "nvcore.h"
namespace nv
{
inline uint sdbmHash(const void * data_in, uint size, uint h = 5381)
{
const uint8 * data = (const uint8 *) data_in;
uint i = 0;
while (i < size) {
h = (h << 16) + (h << 6) - h + (uint) data[i++];
}
return h;
}
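// Usage sketch (illustrative): the h parameter chains calls, so a compound
// key can be hashed incrementally:
//
//     uint h = sdbmHash(&first, sizeof(first));
//     h = sdbmHash(&second, sizeof(second), h); // hash of (first, second)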
// Note that this hash does not handle NaN properly.
inline uint sdbmFloatHash(const float * f, uint count, uint h = 5381)
{
for (uint i = 0; i < count; i++) {
//nvDebugCheck(nv::isFinite(*f));
union { float f; uint32 i; } x = { f[i] };
if (x.i == 0x80000000) x.i = 0;
h = sdbmHash(&x, 4, h);
}
return h;
}
template <typename T>
inline uint hash(const T & t, uint h = 5381)
{
return sdbmHash(&t, sizeof(T), h);
}
template <>
inline uint hash(const float & f, uint h)
{
return sdbmFloatHash(&f, 1, h);
}
// Functors for hash table:
template <typename Key> struct Hash
{
uint operator()(const Key & k) const {
return hash(k);
}
};
template <typename Key> struct Equal
{
bool operator()(const Key & k0, const Key & k1) const {
return k0 == k1;
}
};
// @@ Move to Utils.h?
template <typename T1, typename T2>
struct Pair {
T1 first;
T2 second;
};
template <typename T1, typename T2>
bool operator==(const Pair<T1,T2> & p0, const Pair<T1,T2> & p1) {
return p0.first == p1.first && p0.second == p1.second;
}
template <typename T1, typename T2>
uint hash(const Pair<T1,T2> & p, uint h = 5381) {
return hash(p.second, hash(p.first, h));
}
} // nv namespace
#endif // NV_CORE_HASH_H

30
3rdparty/nvtt/nvcore/memory.h vendored Normal file

@@ -0,0 +1,30 @@
// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
#ifndef NV_CORE_MEMORY_H
#define NV_CORE_MEMORY_H
#include "nvcore.h"
#include <stdlib.h> // malloc, realloc, free
#include <string.h> // memset
namespace nv {
// C++ helpers.
template <typename T> inline T * malloc(size_t count) {
return (T *)::malloc(sizeof(T) * count);
}
template <typename T> inline T * realloc(T * ptr, size_t count) {
return (T *)::realloc(ptr, sizeof(T) * count);
}
template <typename T> inline void free(const T * ptr) {
::free((void *)ptr);
}
template <typename T> inline void zero(T & data) {
memset(&data, 0, sizeof(T));
}
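// Usage sketch (illustrative): the typed wrappers keep the element count
// explicit and avoid a cast at each call site.
//
//     int * buffer = nv::malloc<int>(16);    // room for 16 ints, not 16 bytes
//     buffer = nv::realloc<int>(buffer, 32);
//     nv::free(buffer);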
} // nv namespace
#endif // NV_CORE_MEMORY_H

363
3rdparty/nvtt/nvcore/nvcore.h vendored Normal file

@@ -0,0 +1,363 @@
// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
#ifndef NV_CORE_H
#define NV_CORE_H
#define NVCORE_SHARED 0
#define NV_NO_ASSERT 0
// Function linkage
#if NVCORE_SHARED
#ifdef NVCORE_EXPORTS
#define NVCORE_API DLL_EXPORT
#define NVCORE_CLASS DLL_EXPORT_CLASS
#else
#define NVCORE_API DLL_IMPORT
#define NVCORE_CLASS DLL_IMPORT
#endif
#else // NVCORE_SHARED
#define NVCORE_API
#define NVCORE_CLASS
#endif // NVCORE_SHARED
// Platform definitions
#include "posh.h"
#define NV_OS_STRING POSH_OS_STRING
#if defined POSH_OS_LINUX
# define NV_OS_LINUX 1
# define NV_OS_UNIX 1
#elif defined POSH_OS_ORBIS
# define NV_OS_ORBIS 1
#elif defined POSH_OS_FREEBSD
# define NV_OS_FREEBSD 1
# define NV_OS_UNIX 1
#elif defined POSH_OS_OPENBSD
# define NV_OS_OPENBSD 1
# define NV_OS_UNIX 1
#elif defined POSH_OS_CYGWIN32
# define NV_OS_CYGWIN 1
#elif defined POSH_OS_MINGW
# define NV_OS_MINGW 1
# define NV_OS_WIN32 1
#elif defined POSH_OS_OSX
# define NV_OS_DARWIN 1
# define NV_OS_UNIX 1
#elif defined POSH_OS_IOS
# define NV_OS_DARWIN 1 //ACS should we keep this on IOS?
# define NV_OS_UNIX 1
# define NV_OS_IOS 1
#elif defined POSH_OS_UNIX
# define NV_OS_UNIX 1
#elif defined POSH_OS_WIN64
# define NV_OS_WIN32 1
# define NV_OS_WIN64 1
#elif defined POSH_OS_WIN32
# define NV_OS_WIN32 1
#elif defined POSH_OS_XBOX
# define NV_OS_XBOX 1
#else
# error "Unsupported OS"
#endif
#ifndef NV_OS_WIN32
# define NV_OS_WIN32 0
#endif // NV_OS_WIN32
#ifndef NV_OS_WIN64
# define NV_OS_WIN64 0
#endif // NV_OS_WIN64
#ifndef NV_OS_MINGW
# define NV_OS_MINGW 0
#endif // NV_OS_MINGW
#ifndef NV_OS_CYGWIN
# define NV_OS_CYGWIN 0
#endif // NV_OS_CYGWIN
#ifndef NV_OS_LINUX
# define NV_OS_LINUX 0
#endif // NV_OS_LINUX
#ifndef NV_OS_FREEBSD
# define NV_OS_FREEBSD 0
#endif // NV_OS_FREEBSD
#ifndef NV_OS_OPENBSD
# define NV_OS_OPENBSD 0
#endif // NV_OS_OPENBSD
#ifndef NV_OS_UNIX
# define NV_OS_UNIX 0
#endif // NV_OS_UNIX
#ifndef NV_OS_DARWIN
# define NV_OS_DARWIN 0
#endif // NV_OS_DARWIN
#ifndef NV_OS_XBOX
# define NV_OS_XBOX 0
#endif // NV_OS_XBOX
#ifndef NV_OS_ORBIS
# define NV_OS_ORBIS 0
#endif // NV_OS_ORBIS
#ifndef NV_OS_IOS
# define NV_OS_IOS 0
#endif // NV_OS_IOS
// Threading:
// some platforms don't implement __thread or similar for thread-local-storage
#if NV_OS_UNIX || NV_OS_ORBIS || NV_OS_IOS //ACStodoIOS darwin instead of ios?
# define NV_OS_USE_PTHREAD 1
# if NV_OS_DARWIN || NV_OS_IOS
# define NV_OS_HAS_TLS_QUALIFIER 0
# else
# define NV_OS_HAS_TLS_QUALIFIER 1
# endif
#else
# define NV_OS_USE_PTHREAD 0
# define NV_OS_HAS_TLS_QUALIFIER 1
#endif
// CPUs:
#define NV_CPU_STRING POSH_CPU_STRING
#if defined POSH_CPU_X86_64
//# define NV_CPU_X86 1
# define NV_CPU_X86_64 1
#elif defined POSH_CPU_X86
# define NV_CPU_X86 1
#elif defined POSH_CPU_PPC
# define NV_CPU_PPC 1
#elif defined POSH_CPU_STRONGARM
# define NV_CPU_ARM 1
#elif defined POSH_CPU_AARCH64
# define NV_CPU_AARCH64 1
#else
# error "Unsupported CPU"
#endif
#ifndef NV_CPU_X86
# define NV_CPU_X86 0
#endif // NV_CPU_X86
#ifndef NV_CPU_X86_64
# define NV_CPU_X86_64 0
#endif // NV_CPU_X86_64
#ifndef NV_CPU_PPC
# define NV_CPU_PPC 0
#endif // NV_CPU_PPC
#ifndef NV_CPU_ARM
# define NV_CPU_ARM 0
#endif // NV_CPU_ARM
#ifndef NV_CPU_AARCH64
# define NV_CPU_AARCH64 0
#endif // NV_CPU_AARCH64
// Compiler:
#if defined POSH_COMPILER_CLANG
# define NV_CC_CLANG 1
# define NV_CC_GNUC 1 // Clang is compatible with GCC.
# define NV_CC_STRING "clang"
# pragma clang diagnostic ignored "-Wmissing-braces"
# pragma clang diagnostic ignored "-Wshadow"
# pragma clang diagnostic ignored "-Wunused-local-typedef"
# pragma clang diagnostic ignored "-Wunused-function"
# pragma clang diagnostic ignored "-Wunused-variable"
# pragma clang diagnostic ignored "-Wunused-parameter"
# pragma clang diagnostic ignored "-Wsometimes-uninitialized"
#elif defined POSH_COMPILER_GCC
# define NV_CC_GNUC 1
# define NV_CC_STRING "gcc"
# pragma GCC diagnostic ignored "-Wshadow"
# pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
# pragma GCC diagnostic ignored "-Wunused-function"
# pragma GCC diagnostic ignored "-Wunused-but-set-variable"
# pragma GCC diagnostic ignored "-Wunused-variable"
# pragma GCC diagnostic ignored "-Wunused-parameter"
# pragma GCC diagnostic ignored "-Warray-bounds"
#elif defined POSH_COMPILER_MSVC
# define NV_CC_MSVC 1
# define NV_CC_STRING "msvc"
#else
# error "Unsupported compiler"
#endif
#ifndef NV_CC_GNUC
# define NV_CC_GNUC 0
#endif // NV_CC_GNUC
#ifndef NV_CC_MSVC
# define NV_CC_MSVC 0
#endif // NV_CC_MSVC
#ifndef NV_CC_CLANG
# define NV_CC_CLANG 0
#endif // NV_CC_CLANG
#if NV_CC_MSVC
#define NV_CC_CPP11 (__cplusplus > 199711L || _MSC_VER >= 1800) // Visual Studio 2013 has all the features we use, but doesn't advertise full C++11 support yet.
#else
// @@ IC: This works in Clang; what about GCC?
// @@ ES: It doesn't work in gcc. These 3 features are available in GCC >= 4.4.
#ifdef __clang__
#define NV_CC_CPP11 (__has_feature(cxx_deleted_functions) && __has_feature(cxx_rvalue_references) && __has_feature(cxx_static_assert))
#elif defined __GNUC__
#define NV_CC_CPP11 ( __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4))
#endif
#endif
// Endianness:
#define NV_LITTLE_ENDIAN POSH_LITTLE_ENDIAN
#define NV_BIG_ENDIAN POSH_BIG_ENDIAN
#define NV_ENDIAN_STRING POSH_ENDIAN_STRING
// Type definitions:
typedef posh_u8_t uint8;
typedef posh_i8_t int8;
typedef posh_u16_t uint16;
typedef posh_i16_t int16;
typedef posh_u32_t uint32;
typedef posh_i32_t int32;
typedef posh_u64_t uint64;
typedef posh_i64_t int64;
// Aliases
typedef uint32 uint;
// Version string:
#define NV_VERSION_STRING \
NV_OS_STRING "/" NV_CC_STRING "/" NV_CPU_STRING"/" \
NV_ENDIAN_STRING"-endian - " __DATE__ "-" __TIME__
// Disable copy constructor and assignment operator.
#if NV_CC_CPP11
#define NV_FORBID_COPY(C) \
C( const C & ) = delete; \
C &operator=( const C & ) = delete
#else
#define NV_FORBID_COPY(C) \
private: \
C( const C & ); \
C &operator=( const C & )
#endif
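// Usage sketch (illustrative): place the macro inside the class definition;
// in the pre-C++11 branch it leaves the access level private, so follow it
// with an explicit access specifier.
//
//     class Renderer {
//         NV_FORBID_COPY(Renderer);
//     public:
//         Renderer() {}
//     };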
// Disable dynamic allocation on the heap.
// See Prohibiting Heap-Based Objects in More Effective C++.
#define NV_FORBID_HEAPALLOC() \
private: \
void *operator new(size_t size); \
void *operator new[](size_t size)
// String concatenation macros.
#define NV_STRING_JOIN2(arg1, arg2) NV_DO_STRING_JOIN2(arg1, arg2)
#define NV_DO_STRING_JOIN2(arg1, arg2) arg1 ## arg2
#define NV_STRING_JOIN3(arg1, arg2, arg3) NV_DO_STRING_JOIN3(arg1, arg2, arg3)
#define NV_DO_STRING_JOIN3(arg1, arg2, arg3) arg1 ## arg2 ## arg3
#define NV_STRING2(x) #x
#define NV_STRING(x) NV_STRING2(x)
#if NV_CC_MSVC
#define NV_MULTI_LINE_MACRO_BEGIN do {
#define NV_MULTI_LINE_MACRO_END \
__pragma(warning(push)) \
__pragma(warning(disable:4127)) \
} while(false) \
__pragma(warning(pop))
#else
#define NV_MULTI_LINE_MACRO_BEGIN do {
#define NV_MULTI_LINE_MACRO_END } while(false)
#endif
#if NV_CC_CPP11
#define nvStaticCheck(x) static_assert((x), "Static assert "#x" failed")
#else
#define nvStaticCheck(x) typedef char NV_STRING_JOIN2(__static_assert_,__LINE__)[(x)]
#endif
#define NV_COMPILER_CHECK(x) nvStaticCheck(x) // I like this name best.
// Make sure type definitions are fine.
NV_COMPILER_CHECK(sizeof(int8) == 1);
NV_COMPILER_CHECK(sizeof(uint8) == 1);
NV_COMPILER_CHECK(sizeof(int16) == 2);
NV_COMPILER_CHECK(sizeof(uint16) == 2);
NV_COMPILER_CHECK(sizeof(int32) == 4);
NV_COMPILER_CHECK(sizeof(uint32) == 4);
NV_COMPILER_CHECK(sizeof(int64) == 8);
NV_COMPILER_CHECK(sizeof(uint64) == 8);
#define NV_ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0]))
#if 0 // Disabled in The Witness.
#if NV_CC_MSVC
#define NV_MESSAGE(x) message(__FILE__ "(" NV_STRING(__LINE__) ") : " x)
#else
#define NV_MESSAGE(x) message(x)
#endif
#else
#define NV_MESSAGE(x)
#endif
// Startup initialization macro.
#define NV_AT_STARTUP(some_code) \
namespace { \
static struct NV_STRING_JOIN2(AtStartup_, __LINE__) { \
NV_STRING_JOIN2(AtStartup_, __LINE__)() { some_code; } \
} \
NV_STRING_JOIN3(AtStartup_, __LINE__, Instance); \
}
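// Usage sketch (illustrative, assuming nvDebugPrint from nvcore/debug.h):
// run registration code before main() without an explicit call site.
//
//     NV_AT_STARTUP(nvDebugPrint("module loaded\n"));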
// Tell the compiler that the parameter is not used, to suppress compiler warnings.
#define NV_UNUSED(a) ((a)=(a))
// Null index. @@ Move this somewhere else... it's only used by nvmesh.
//const unsigned int NIL = unsigned int(~0);
//#define NIL uint(~0)
// Null pointer.
#ifndef NULL
#define NULL 0
#endif
// Platform includes
#if NV_CC_MSVC
# if NV_OS_WIN32
# include "defsvcwin32.h"
# elif NV_OS_XBOX
# include "defsvcxbox.h"
# else
# error "MSVC: Platform not supported"
# endif
#elif NV_CC_GNUC
# if NV_OS_LINUX
# include "defsgnuclinux.h"
# elif NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_OPENBSD
# include "defsgnucdarwin.h"
# elif NV_OS_MINGW
# include "defsgnucwin32.h"
# elif NV_OS_CYGWIN
# error "GCC: Cygwin not supported"
# else
# error "GCC: Platform not supported"
# endif
#endif
#endif // NV_CORE_H

1030
3rdparty/nvtt/nvcore/posh.h vendored Normal file

File diff suppressed because it is too large

459
3rdparty/nvtt/nvcore/stdstream.h vendored Normal file

@@ -0,0 +1,459 @@
// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
#include "nvcore.h"
#include "stream.h"
#include "array.h"
#include <stdio.h> // fopen
#include <string.h> // memcpy
namespace nv
{
// Portable version of fopen.
inline FILE * fileOpen(const char * fileName, const char * mode)
{
nvCheck(fileName != NULL);
#if NV_CC_MSVC && _MSC_VER >= 1400
FILE * fp;
if (fopen_s(&fp, fileName, mode) == 0) {
return fp;
}
return NULL;
#else
return fopen(fileName, mode);
#endif
}
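// Usage sketch (illustrative): fileOpen returns NULL on failure on every
// path, so always check the handle before wrapping it.
//
//     FILE * fp = fileOpen("image.dds", "rb");
//     if (fp != NULL) {
//         StdInputStream stream(fp, /*autoclose=*/true);
//         uint32 magic;
//         stream << magic;
//     }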
/// Base stdio stream.
class NVCORE_CLASS StdStream : public Stream
{
NV_FORBID_COPY(StdStream);
public:
/// Ctor.
StdStream( FILE * fp, bool autoclose ) : m_fp(fp), m_autoclose(autoclose) { }
/// Dtor.
virtual ~StdStream()
{
if( m_fp != NULL && m_autoclose ) {
#if NV_OS_WIN32
_fclose_nolock( m_fp );
#else
fclose( m_fp );
#endif
}
}
/** @name Stream implementation. */
//@{
virtual void seek( uint pos )
{
nvDebugCheck(m_fp != NULL);
nvDebugCheck(pos <= size());
#if NV_OS_WIN32
_fseek_nolock(m_fp, pos, SEEK_SET);
#else
fseek(m_fp, pos, SEEK_SET);
#endif
}
virtual uint tell() const
{
nvDebugCheck(m_fp != NULL);
#if NV_OS_WIN32
return _ftell_nolock(m_fp);
#else
return (uint)ftell(m_fp);
#endif
}
virtual uint size() const
{
nvDebugCheck(m_fp != NULL);
#if NV_OS_WIN32
uint pos = _ftell_nolock(m_fp);
_fseek_nolock(m_fp, 0, SEEK_END);
uint end = _ftell_nolock(m_fp);
_fseek_nolock(m_fp, pos, SEEK_SET);
#else
uint pos = (uint)ftell(m_fp);
fseek(m_fp, 0, SEEK_END);
uint end = (uint)ftell(m_fp);
fseek(m_fp, pos, SEEK_SET);
#endif
return end;
}
virtual bool isError() const
{
return m_fp == NULL || ferror( m_fp ) != 0;
}
virtual void clearError()
{
nvDebugCheck(m_fp != NULL);
clearerr(m_fp);
}
// @@ The original implementation used feof, which only returns true once we attempt to read *past* the end of the stream.
// That is, after reading the last byte of a file, isAtEnd would still return false even though the stream pointer was at
// the file end. That was not the intent, and it was inconsistent with the MemoryStream implementation, so this version
// uses ftell and fseek to determine our location within the file.
virtual bool isAtEnd() const
{
if (m_fp == NULL) return true;
//nvDebugCheck(m_fp != NULL);
//return feof( m_fp ) != 0;
#if NV_OS_WIN32
uint pos = _ftell_nolock(m_fp);
_fseek_nolock(m_fp, 0, SEEK_END);
uint end = _ftell_nolock(m_fp);
_fseek_nolock(m_fp, pos, SEEK_SET);
#else
uint pos = (uint)ftell(m_fp);
fseek(m_fp, 0, SEEK_END);
uint end = (uint)ftell(m_fp);
fseek(m_fp, pos, SEEK_SET);
#endif
return pos == end;
}
/// Always true.
virtual bool isSeekable() const { return true; }
//@}
protected:
FILE * m_fp;
bool m_autoclose;
};
/// Standard output stream.
class NVCORE_CLASS StdOutputStream : public StdStream
{
NV_FORBID_COPY(StdOutputStream);
public:
/// Construct stream by file name.
StdOutputStream( const char * name ) : StdStream(fileOpen(name, "wb"), /*autoclose=*/true) { }
/// Construct stream by file handle.
StdOutputStream( FILE * fp, bool autoclose ) : StdStream(fp, autoclose)
{
}
/** @name Stream implementation. */
//@{
/// Write data.
virtual uint serialize( void * data, uint len )
{
nvDebugCheck(data != NULL);
nvDebugCheck(m_fp != NULL);
#if NV_OS_WIN32
return (uint)_fwrite_nolock(data, 1, len, m_fp);
#elif NV_OS_LINUX
return (uint)fwrite_unlocked(data, 1, len, m_fp);
#elif NV_OS_DARWIN
// @@ No error checking, always returns len.
for (uint i = 0; i < len; i++) {
putc_unlocked(((char *)data)[i], m_fp);
}
return len;
#else
return (uint)fwrite(data, 1, len, m_fp);
#endif
}
virtual bool isLoading() const
{
return false;
}
virtual bool isSaving() const
{
return true;
}
//@}
};
/// Standard input stream.
class NVCORE_CLASS StdInputStream : public StdStream
{
NV_FORBID_COPY(StdInputStream);
public:
/// Construct stream by file name.
StdInputStream( const char * name ) : StdStream(fileOpen(name, "rb"), /*autoclose=*/true) { }
/// Construct stream by file handle.
StdInputStream( FILE * fp, bool autoclose=true ) : StdStream(fp, autoclose)
{
}
/** @name Stream implementation. */
//@{
/// Read data.
virtual uint serialize( void * data, uint len )
{
nvDebugCheck(data != NULL);
nvDebugCheck(m_fp != NULL);
#if NV_OS_WIN32
return (uint)_fread_nolock(data, 1, len, m_fp);
#elif NV_OS_LINUX
return (uint)fread_unlocked(data, 1, len, m_fp);
#elif NV_OS_DARWIN
// @@ No error checking, always returns len.
for (uint i = 0; i < len; i++) {
((char *)data)[i] = getc_unlocked(m_fp);
}
return len;
#else
return (uint)fread(data, 1, len, m_fp);
#endif
}
virtual bool isLoading() const
{
return true;
}
virtual bool isSaving() const
{
return false;
}
//@}
};
/// Memory input stream.
class NVCORE_CLASS MemoryInputStream : public Stream
{
NV_FORBID_COPY(MemoryInputStream);
public:
/// Ctor.
MemoryInputStream( const uint8 * mem, uint size ) : m_mem(mem), m_ptr(mem), m_size(size) { }
/** @name Stream implementation. */
//@{
/// Read data.
virtual uint serialize( void * data, uint len )
{
nvDebugCheck(data != NULL);
nvDebugCheck(!isError());
uint left = m_size - tell();
if (len > left) len = left;
memcpy( data, m_ptr, len );
m_ptr += len;
return len;
}
virtual void seek( uint pos )
{
nvDebugCheck(!isError());
m_ptr = m_mem + pos;
nvDebugCheck(!isError());
}
virtual uint tell() const
{
nvDebugCheck(m_ptr >= m_mem);
return uint(m_ptr - m_mem);
}
virtual uint size() const
{
return m_size;
}
virtual bool isError() const
{
return m_mem == NULL || m_ptr > m_mem + m_size || m_ptr < m_mem;
}
virtual void clearError()
{
// Nothing to do.
}
virtual bool isAtEnd() const
{
return m_ptr == m_mem + m_size;
}
/// Always true.
virtual bool isSeekable() const
{
return true;
}
virtual bool isLoading() const
{
return true;
}
virtual bool isSaving() const
{
return false;
}
//@}
const uint8 * ptr() const { return m_ptr; }
private:
const uint8 * m_mem;
const uint8 * m_ptr;
uint m_size;
};
/// Buffer output stream.
class NVCORE_CLASS BufferOutputStream : public Stream
{
NV_FORBID_COPY(BufferOutputStream);
public:
BufferOutputStream(Array<uint8> & buffer) : m_buffer(buffer) { }
virtual uint serialize( void * data, uint len )
{
nvDebugCheck(data != NULL);
m_buffer.append((uint8 *)data, len);
return len;
}
virtual void seek( uint /*pos*/ ) { /*Not implemented*/ }
virtual uint tell() const { return m_buffer.size(); }
virtual uint size() const { return m_buffer.size(); }
virtual bool isError() const { return false; }
virtual void clearError() {}
virtual bool isAtEnd() const { return true; }
virtual bool isSeekable() const { return false; }
virtual bool isLoading() const { return false; }
virtual bool isSaving() const { return true; }
private:
Array<uint8> & m_buffer;
};
/// Protected input stream.
class NVCORE_CLASS ProtectedStream : public Stream
{
NV_FORBID_COPY(ProtectedStream);
public:
/// Ctor.
ProtectedStream( Stream & s ) : m_s(&s), m_autodelete(false)
{
}
/// Ctor.
ProtectedStream( Stream * s, bool autodelete = true ) :
m_s(s), m_autodelete(autodelete)
{
nvDebugCheck(m_s != NULL);
}
/// Dtor.
virtual ~ProtectedStream()
{
if( m_autodelete ) {
delete m_s;
}
}
/** @name Stream implementation. */
//@{
/// Read data.
virtual uint serialize( void * data, uint len )
{
nvDebugCheck(data != NULL);
len = m_s->serialize( data, len );
if( m_s->isError() ) {
throw;
}
return len;
}
virtual void seek( uint pos )
{
m_s->seek( pos );
if( m_s->isError() ) {
throw;
}
}
virtual uint tell() const
{
return m_s->tell();
}
virtual uint size() const
{
return m_s->size();
}
virtual bool isError() const
{
return m_s->isError();
}
virtual void clearError()
{
m_s->clearError();
}
virtual bool isAtEnd() const
{
return m_s->isAtEnd();
}
virtual bool isSeekable() const
{
return m_s->isSeekable();
}
virtual bool isLoading() const
{
return m_s->isLoading();
}
virtual bool isSaving() const
{
return m_s->isSaving();
}
//@}
private:
Stream * const m_s;
bool const m_autodelete;
};
} // nv namespace
//#endif // NV_CORE_STDSTREAM_H

163
3rdparty/nvtt/nvcore/stream.h vendored Normal file

@@ -0,0 +1,163 @@
// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
#ifndef NV_CORE_STREAM_H
#define NV_CORE_STREAM_H
#include "nvcore.h"
#include "debug.h"
namespace nv
{
/// Base stream class.
class NVCORE_CLASS Stream {
public:
enum ByteOrder {
LittleEndian = false,
BigEndian = true,
};
/// Get the byte order of the system.
static ByteOrder getSystemByteOrder() {
#if NV_LITTLE_ENDIAN
return LittleEndian;
#else
return BigEndian;
#endif
}
/// Ctor.
Stream() : m_byteOrder(LittleEndian) { }
/// Virtual destructor.
virtual ~Stream() {}
/// Set byte order.
void setByteOrder(ByteOrder bo) { m_byteOrder = bo; }
/// Get byte order.
ByteOrder byteOrder() const { return m_byteOrder; }
/// Serialize the given data.
virtual uint serialize( void * data, uint len ) = 0;
/// Move to the given position in the archive.
virtual void seek( uint pos ) = 0;
/// Return the current position in the archive.
virtual uint tell() const = 0;
/// Return the current size of the archive.
virtual uint size() const = 0;
/// Determine if there has been any error.
virtual bool isError() const = 0;
/// Clear errors.
virtual void clearError() = 0;
/// Return true if the stream is at the end.
virtual bool isAtEnd() const = 0;
/// Return true if the stream is seekable.
virtual bool isSeekable() const = 0;
/// Return true if this is an input stream.
virtual bool isLoading() const = 0;
/// Return true if this is an output stream.
virtual bool isSaving() const = 0;
void advance(uint offset) { seek(tell() + offset); }
// friends
friend Stream & operator<<( Stream & s, bool & c ) {
#if NV_OS_DARWIN && !NV_CC_CPP11
nvStaticCheck(sizeof(bool) == 4);
uint8 b = c ? 1 : 0;
s.serialize( &b, 1 );
c = (b == 1);
#else
nvStaticCheck(sizeof(bool) == 1);
s.serialize( &c, 1 );
#endif
return s;
}
friend Stream & operator<<( Stream & s, char & c ) {
nvStaticCheck(sizeof(char) == 1);
s.serialize( &c, 1 );
return s;
}
friend Stream & operator<<( Stream & s, uint8 & c ) {
nvStaticCheck(sizeof(uint8) == 1);
s.serialize( &c, 1 );
return s;
}
friend Stream & operator<<( Stream & s, int8 & c ) {
nvStaticCheck(sizeof(int8) == 1);
s.serialize( &c, 1 );
return s;
}
friend Stream & operator<<( Stream & s, uint16 & c ) {
nvStaticCheck(sizeof(uint16) == 2);
return s.byteOrderSerialize( &c, 2 );
}
friend Stream & operator<<( Stream & s, int16 & c ) {
nvStaticCheck(sizeof(int16) == 2);
return s.byteOrderSerialize( &c, 2 );
}
friend Stream & operator<<( Stream & s, uint32 & c ) {
nvStaticCheck(sizeof(uint32) == 4);
return s.byteOrderSerialize( &c, 4 );
}
friend Stream & operator<<( Stream & s, int32 & c ) {
nvStaticCheck(sizeof(int32) == 4);
return s.byteOrderSerialize( &c, 4 );
}
friend Stream & operator<<( Stream & s, uint64 & c ) {
nvStaticCheck(sizeof(uint64) == 8);
return s.byteOrderSerialize( &c, 8 );
}
friend Stream & operator<<( Stream & s, int64 & c ) {
nvStaticCheck(sizeof(int64) == 8);
return s.byteOrderSerialize( &c, 8 );
}
friend Stream & operator<<( Stream & s, float & c ) {
nvStaticCheck(sizeof(float) == 4);
return s.byteOrderSerialize( &c, 4 );
}
friend Stream & operator<<( Stream & s, double & c ) {
nvStaticCheck(sizeof(double) == 8);
return s.byteOrderSerialize( &c, 8 );
}
protected:
/// Serialize in the stream byte order.
Stream & byteOrderSerialize( void * v, uint len ) {
if( m_byteOrder == getSystemByteOrder() ) {
serialize( v, len );
}
else {
for( uint i = len; i > 0; i-- ) {
serialize( (uint8 *)v + i - 1, 1 );
}
}
return *this;
}
private:
ByteOrder m_byteOrder;
};
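// Usage sketch (illustrative): reading a big-endian file on a little-endian
// machine; byteOrderSerialize swaps bytes only when the two orders differ.
//
//     void loadHeader(Stream & s, uint32 & magic, uint32 & version) {
//         s.setByteOrder(Stream::BigEndian);
//         s << magic << version;
//     }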
} // nv namespace
#endif // NV_CORE_STREAM_H

Some files were not shown because too many files have changed in this diff